Writing to global or local memory increases kernel execution time by 10000%

I have the following OpenCL kernel:

kernel void ndft(
    global float *re, global float *im, int num_values,
    global float *spectrum_re, global float *spectrum_im,
    global float *spectrum_abs,
    global float *sin_array, global float *cos_array,
    float sqrt_num_values_reciprocal)
{
    // MATH MAGIC - DISREGARD FROM HERE -----------

    float x;
    float y;
    float sum_re = 0;
    float sum_im = 0;

    size_t thread_id = get_global_id(0);
    //size_t local_id = get_local_id(0);

    // num_values = 24 (live environment), 48 (test)
    for (int i = 0; i < num_values; i++)
    {
        x = cos_array[thread_id * num_values + i] * sqrt_num_values_reciprocal;
        y = sin_array[thread_id * num_values + i] * sqrt_num_values_reciprocal;
        sum_re = sum_re + re[i] * x + im[i] * y;
        sum_im = sum_im - re[i] * y + x * im[i];
    }

    // MATH MAGIC DONE ----------------------------

    //spectrum_re[thread_id] = sum_re;
    //spectrum_im[thread_id] = sum_im;
    //spectrum_abs[thread_id] = hypot(sum_re, sum_im);
    float asdf = hypot(sum_re, sum_im); // this is just a dummy calculation
}

Like this, the execution time is roughly 15 µs (work group size = 567, 14 work groups, 7938 threads in total).
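For reference, a launch like this can be timed with OpenCL profiling events along the following lines (a sketch only, not the code actually used; the helper name is illustrative and the command queue is assumed to have been created with CL_QUEUE_PROFILING_ENABLE):

#include <OpenCL/opencl.h>   // <CL/cl.h> on non-Apple platforms

// Hedged sketch: time one launch of the kernel with profiling events.
// The sizes match the figures above (14 work groups of 567 = 7938 threads).
static double time_kernel_us(cl_command_queue queue, cl_kernel kernel)
{
    size_t global_size = 7938;
    size_t local_size  = 567;
    cl_event evt;
    cl_ulong t_start, t_end;

    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size,
                           0, NULL, &evt);
    clWaitForEvents(1, &evt);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START,
                            sizeof(cl_ulong), &t_start, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,
                            sizeof(cl_ulong), &t_end, NULL);
    clReleaseEvent(evt);

    return (t_end - t_start) * 1e-3;   // nanoseconds -> microseconds
}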

However, I of course need to retrieve the results of the computation, which is what the last few (commented-out) lines are for. As soon as I include one of those memory writes (it makes no difference whether spectrum_X is global, as in the example, or local), the kernel execution time increases to 1.4-1.5 ms.

I assumed the increase in execution time was some kind of fixed overhead, so I would simply accumulate more data, so that the relative amount of time lost to this effect would be minimized. But when I double the number of values (i.e. twice the amount of data), the execution time also doubles (to 2.8-3.0 ms).

I found out that even if I only uncomment one of those lines, the execution time is the same as if I uncommented all three. Even if I add an if (thread_id == 0) around the write and run it, I get the same execution time. However, this is just way too slow (the upper limit for my application is about 30 µs). It even runs about 5 times faster as ordinary C code on my CPU.
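For clarity, the guarded write mentioned above was presumably along these lines (a sketch; the exact variant is not shown in the question):

    // Only one work-item in the entire NDRange writes to global memory,
    // yet the measured execution time is still ~1.5 ms.
    if (thread_id == 0)
        spectrum_abs[0] = hypot(sum_re, sum_im);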

Now, I am obviously doing something wrong here, but I'm not sure where to start looking for a solution.


As I mentioned in my comments on the answer below, I also did the following:

Starting from the code above, I changed the last 4 lines to:

//spectrum_re[thread_id] = sum_re;
//spectrum_im[thread_id] = sum_im;
spectrum_abs[thread_id] = hypot(sum_re, sum_im);
//float asdf = hypot(sum_re, sum_im);

As expected, execution time ~1.8 ms. The generated assembler code for my system is:

//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Tue Apr 03 12:42:39 2012 (1333449759)
// Driver 
//

.version 3.0
.target sm_21, texmode_independent
.address_size 32


.entry ndft(
    .param .u32 .ptr .global .align 4 ndft_param_0,
    .param .u32 .ptr .global .align 4 ndft_param_1,
    .param .u32 ndft_param_2,
    .param .u32 .ptr .global .align 4 ndft_param_3,
    .param .u32 .ptr .global .align 4 ndft_param_4,
    .param .u32 .ptr .global .align 4 ndft_param_5,
    .param .u32 .ptr .global .align 4 ndft_param_6,
    .param .u32 .ptr .global .align 4 ndft_param_7,
    .param .f32 ndft_param_8
)
{
    .reg .f32   %f;
    .reg .pred  %p;
    .reg .s32   %r;


    ld.param.u32    %r3, [ndft_param_2];
    // inline asm
    mov.u32     %r18, %envreg3;
    // inline asm
    // inline asm
    mov.u32     %r19, %ntid.x;
    // inline asm
    // inline asm
    mov.u32     %r20, %ctaid.x;
    // inline asm
    // inline asm
    mov.u32     %r21, %tid.x;
    // inline asm
    add.s32     %r22, %r21, %r18;
    mad.lo.s32  %r11, %r20, %r19, %r22;
    setp.gt.s32     %p1, %r3, 0;
    @%p1 bra    BB0_2;

    mov.f32     %f46, 0f00000000;
    mov.f32     %f45, %f46;
    bra.uni     BB0_4;

BB0_2:
    ld.param.u32    %r38, [ndft_param_2];
    mul.lo.s32  %r27, %r38, %r11;
    shl.b32     %r28, %r27, 2;
    ld.param.u32    %r40, [ndft_param_6];
    add.s32     %r12, %r40, %r28;
    ld.param.u32    %r41, [ndft_param_7];
    add.s32     %r13, %r41, %r28;
    mov.f32     %f46, 0f00000000;
    mov.f32     %f45, %f46;
    mov.u32     %r43, 0;
    mov.u32     %r42, %r43;

BB0_3:
    add.s32     %r29, %r13, %r42;
    ld.global.f32   %f18, [%r29];
    ld.param.f32    %f44, [ndft_param_8];
    mul.f32     %f19, %f18, %f44;
    add.s32     %r30, %r12, %r42;
    ld.global.f32   %f20, [%r30];
    mul.f32     %f21, %f20, %f44;
    ld.param.u32    %r35, [ndft_param_0];
    add.s32     %r31, %r35, %r42;
    ld.global.f32   %f22, [%r31];
    fma.rn.f32  %f23, %f22, %f19, %f46;
    ld.param.u32    %r36, [ndft_param_1];
    add.s32     %r32, %r36, %r42;
    ld.global.f32   %f24, [%r32];
    fma.rn.f32  %f46, %f24, %f21, %f23;
    neg.f32     %f25, %f22;
    fma.rn.f32  %f26, %f25, %f21, %f45;
    fma.rn.f32  %f45, %f24, %f19, %f26;
    add.s32     %r42, %r42, 4;
    add.s32     %r43, %r43, 1;
    ld.param.u32    %r37, [ndft_param_2];
    setp.lt.s32     %p2, %r43, %r37;
    @%p2 bra    BB0_3;

BB0_4:
    // inline asm
    abs.f32     %f27, %f46;
    // inline asm
    // inline asm
    abs.f32     %f29, %f45;
    // inline asm
    setp.gt.f32     %p3, %f27, %f29;
    selp.f32    %f8, %f29, %f27, %p3;
    selp.f32    %f32, %f27, %f29, %p3;
    // inline asm
    abs.f32     %f31, %f32;
    // inline asm
    setp.gt.f32     %p4, %f31, 0f7E800000;
    mov.f32     %f47, %f32;
    @%p4 bra    BB0_6;

    mov.f32     %f48, %f8;
    bra.uni     BB0_7;

BB0_6:
    mov.f32     %f33, 0f3E800000;
    mul.rn.f32  %f10, %f8, %f33;
    mul.rn.f32  %f47, %f32, %f33;
    mov.f32     %f48, %f10;

BB0_7:
    mov.f32     %f13, %f48;
    // inline asm
    div.approx.f32  %f34, %f13, %f47;
    // inline asm
    mul.rn.f32  %f39, %f34, %f34;
    add.f32     %f38, %f39, 0f3F800000;
    // inline asm
    sqrt.approx.f32     %f37, %f38;      // <-- this is part of hypot()
    // inline asm
    mul.rn.f32  %f40, %f32, %f37;
    add.f32     %f41, %f32, %f8;
    setp.eq.f32     %p5, %f32, 0f00000000;
    selp.f32    %f42, %f41, %f40, %p5;
    setp.eq.f32     %p6, %f32, 0f7F800000;
    setp.eq.f32     %p7, %f8, 0f7F800000;
    or.pred     %p8, %p6, %p7;
    selp.f32    %f43, 0f7F800000, %f42, %p8;
    shl.b32     %r33, %r11, 2;
    ld.param.u32    %r39, [ndft_param_5];
    add.s32     %r34, %r39, %r33;
    st.global.f32   [%r34], %f43;    // <-- stores the hypot s result in spectrum_abs
    ret;
}

Indeed, all my computation is there: the loads of adds/mads for the loop as well as the hypot function. From the PTX code above I removed the second-to-last line:

st.global.f32 [%r34], %f43;

which is the line that actually stores the data in the global array spectrum_abs. Then I used clCreateProgramWithBinary with the modified PTX code as input. Execution time is back down to 20 µs.
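For context, the clCreateProgramWithBinary step looks roughly like this (a sketch with an illustrative helper; the file name, error handling and the surrounding context/device handles are not from the original code):

#include <stdio.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>   // <CL/cl.h> on non-Apple platforms

// Hedged sketch: load a hand-edited PTX/binary file from disk and build an
// OpenCL program from it instead of from source.
static cl_program build_from_binary(cl_context context, cl_device_id device_id,
                                    const char *path)
{
    FILE *f = fopen(path, "rb");
    fseek(f, 0, SEEK_END);
    size_t len = (size_t)ftell(f);
    rewind(f);

    unsigned char *bin = (unsigned char *)malloc(len);
    fread(bin, 1, len, f);
    fclose(f);

    cl_int bin_status, err;
    const unsigned char *bin_ptr = bin;
    cl_program program = clCreateProgramWithBinary(context, 1, &device_id,
                                                   &len, &bin_ptr,
                                                   &bin_status, &err);
    err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    free(bin);
    return program;
}

The kernel itself is then obtained from that program with clCreateKernel(program, "ndft", &err) as usual.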

Best answer

I would guess you are seeing the effects of compiler optimization.

The NVIDIA compiler is very aggressive at eliminating "dead code" that does not directly contribute to a write to global memory. So if you never write out sum_re or sum_im, the compiler will optimize away the entire computation loop (and probably everything else too) and leave you with an empty kernel containing little more than a no-op. The 15 microsecond execution time you are seeing is mostly just kernel launch overhead, nothing else. When you do write to global memory, the compiler keeps all the computation code in place, and you see the true execution time of your code.
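One common way to keep the arithmetic alive while benchmarking, without paying for a real result write, is to make the store depend on the computed values in a way the compiler cannot prove is dead. This is a generic trick, not something the answer prescribes, and the sentinel value is arbitrary:

    // The compiler cannot prove this condition is always false, so the loop
    // producing sum_re/sum_im has to stay; in practice no work-item stores.
    if (sum_re == -1.0f && sum_im == -1.0f)
        spectrum_abs[thread_id] = 0.0f;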

So the real question you should probably be asking is how to optimize this kernel so that its execution time comes down from the current 1.5 ms toward your target of roughly 30 microseconds.


Despite the skepticism expressed about the original answer, here is a complete reproduction case which supports the claim that this is a compiler-related effect:

#include <iostream>
#include <OpenCL/opencl.h>

size_t source_size;
const char * source_str =
"kernel void ndft(\n"
"    global float *re, global float *im, int num_values,\n"
"    global float *spectrum_re, global float *spectrum_im,\n"
"    global float *spectrum_abs,\n"
"    global float *sin_array, global float *cos_array,\n"
"    float sqrt_num_values_reciprocal)\n"
"{\n"
"    // MATH MAGIC - DISREGARD FROM HERE -----------\n"
"\n"
"    float x;\n"
"    float y;\n"
"    float sum_re = 0;\n"
"    float sum_im = 0;\n"
"\n"
"    size_t thread_id = get_global_id(0);\n"
"\n"
"    for (int i = 0; i < num_values; i++)\n"
"    {\n"
"        x = cos_array[thread_id * num_values + i] * sqrt_num_values_reciprocal;\n"
"        y = sin_array[thread_id * num_values + i] * sqrt_num_values_reciprocal;\n"
"        sum_re += re[i] * x + im[i] * y;\n"
"        sum_im -= re[i] * y + x * im[i];\n"
"    }\n"
"\n"
"    // MATH MAGIC DONE ----------------------------\n"
"\n"
"    //spectrum_re[thread_id] = sum_re;\n"
"    //spectrum_im[thread_id] = sum_im;\n"
"    //spectrum_abs[thread_id] = hypot(sum_re, sum_im);\n"
"}\n";

int main(void)
{
    int err;

    cl_device_id device_id;  
    clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &err);

    err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    cl_uint program_num_devices;
    clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &program_num_devices, NULL);

    size_t * binaries_sizes = new size_t[program_num_devices];
    clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, program_num_devices*sizeof(size_t), binaries_sizes, NULL);

    char **binaries = new char*[program_num_devices];
    for (size_t i = 0; i < program_num_devices; i++)
        binaries[i] = new char[binaries_sizes[i]+1];

    clGetProgramInfo(program, CL_PROGRAM_BINARIES, program_num_devices*sizeof(size_t), binaries, NULL);
    for (size_t i = 0; i < program_num_devices; i++)
    {
        binaries[i][binaries_sizes[i]] = '\0';
        std::cout << "Program " << i << ":" << std::endl;
        std::cout << binaries[i];
    }
    return 0;
}

When compiled and run, it prints the following code, retrieved back from the OpenCL runtime:

Program 0:
bplist00?^clBinaryDriverclBinaryData_clBinaryVersionWCLH 1.0O!.version 1.5
.target sm_12
.target texmode_independent

.reg .b32  r<126>; /* define r0..125 */ 
.reg .b64  x<126>; /* define r0..125 */ 
.reg .b32  f<128>; /* define f0..127 */ 
.reg .pred p<32>; /* define p0..31 */ 
.reg .u32  sp;

.reg .b8   wb0,wb1,wb2,wb3; /* 8-bit write buffer */ 
.reg .b16  ws0,ws1,ws2,ws3; /* 16-bit write buffer */ 
.reg .b32  tb0,tb1,tb2,tb3; /* read tex buffer */ 
.reg .b64  vl0,vl1; /* 64-bit vector buffer */ 
.reg .b16  cvt16_0,cvt16_1; /* tmps for conversions */ 


.const .align 1 .b8 ndft_gid_base[52];
.local .align 16 .b8 ndft_stack[8];
.entry ndft(
    .param.b32 ndft_0  /* re */,
    .param.b32 ndft_1  /* im */,
    .param.b32 ndft_2  /* num_values */,
    .param.b32 ndft_3  /* spectrum_re */,
    .param.b32 ndft_4  /* spectrum_im */,
    .param.b32 ndft_5  /* spectrum_abs */,
    .param.b32 ndft_6  /* sin_array */,
    .param.b32 ndft_7  /* cos_array */,
    .param.f32 ndft_8  /* sqrt_num_values_reciprocal */
) {
    mov.u32 sp, ndft_stack;
    mov.u32 r0, 4294967295;
    ld.param.u32 r1, [ndft_2 + 0];
LBB1_1:
    add.u32 r0, r0, 1;
    setp.lt.s32 p0, r0, r1;
    @p0 bra LBB1_1;
LBB1_2:
    ret;
}

i.e. just a stub kernel containing no computation at all. When the last three lines writing to global memory are uncommented in the kernel source, it emits this:

Program 0:
S.version 1.5inaryDriverclBinaryData_clBinaryVersionWCLH 1.0O
.target sm_12
.target texmode_independent

.reg .b32  r<126>; /* define r0..125 */ 
.reg .b64  x<126>; /* define r0..125 */ 
.reg .b32  f<128>; /* define f0..127 */ 
.reg .pred p<32>; /* define p0..31 */ 
.reg .u32  sp;

.reg .b8   wb0,wb1,wb2,wb3; /* 8-bit write buffer */ 
.reg .b16  ws0,ws1,ws2,ws3; /* 16-bit write buffer */ 
.reg .b32  tb0,tb1,tb2,tb3; /* read tex buffer */ 
.reg .b64  vl0,vl1; /* 64-bit vector buffer */ 
.reg .b16  cvt16_0,cvt16_1; /* tmps for conversions */ 


.const .align 1 .b8 ndft_gid_base[52];
.local .align 16 .b8 ndft_stack[8];
.entry ndft(
    .param.b32 ndft_0  /* re */,
    .param.b32 ndft_1  /* im */,
    .param.b32 ndft_2  /* num_values */,
    .param.b32 ndft_3  /* spectrum_re */,
    .param.b32 ndft_4  /* spectrum_im */,
    .param.b32 ndft_5  /* spectrum_abs */,
    .param.b32 ndft_6  /* sin_array */,
    .param.b32 ndft_7  /* cos_array */,
    .param.f32 ndft_8  /* sqrt_num_values_reciprocal */
) {
    mov.u32 sp, ndft_stack;
    cvt.u32.u16 r0, %tid.x;
    cvt.u32.u16 r1, %ntid.x;
    cvt.u32.u16 r2, %ctaid.x;
    mad24.lo.u32 r0, r2, r1, r0;
    mov.u32 r1, 0;
    shl.b32 r2, r1, 2;
    mov.u32 r3, ndft_gid_base;
    add.u32 r2, r2, r3;
    ld.const.u32 r2, [r2 + 40];
    add.u32 r0, r0, r2;
    ld.param.u32 r2, [ndft_2 + 0];
    mul.lo.u32 r3, r0, r2;
    shl.b32 r3, r3, 2;
    mov.f32 f0, 0f00000000 /* 0.000000e+00 */;
    ld.param.f32 f1, [ndft_8 + 0];
    ld.param.u32 r4, [ndft_7 + 0];
    ld.param.u32 r5, [ndft_6 + 0];
    ld.param.u32 r6, [ndft_5 + 0];
    ld.param.u32 r7, [ndft_4 + 0];
    ld.param.u32 r8, [ndft_3 + 0];
    ld.param.u32 r9, [ndft_1 + 0];
    ld.param.u32 r10, [ndft_0 + 0];
    mov.u32 r11, r1;
    mov.f32 f2, f0;
LBB1_1:
    setp.ge.s32 p0, r11, r2;
    @!p0 bra    LBB1_7;
LBB1_2:
    shl.b32 r1, r0, 2;
    add.u32 r2, r8, r1;
    st.global.f32 [r2+0], f0;
    add.u32 r1, r7, r1;
    st.global.f32 [r1+0], f2;
    abs.f32 f1, f2;
    abs.f32 f0, f0;
    setp.gt.f32 p0, f0, f1;
    selp.f32 f2, f0, f1, p0;
    abs.f32 f3, f2;
    mov.f32 f4, 0f7E800000 /* 8.507059e+37 */;
    setp.gt.f32 p1, f3, f4;
    selp.f32 f0, f1, f0, p0;
    shl.b32 r0, r0, 2;
    add.u32 r0, r6, r0;
    @!p1 bra    LBB1_8;
LBB1_3:
    mul.rn.f32 f3, f2, 0f3E800000 /* 2.500000e-01 */;
    mul.rn.f32 f1, f0, 0f3E800000 /* 2.500000e-01 */;
LBB1_4:
    mov.f32 f4, 0f00000000 /* 0.000000e+00 */;
    setp.eq.f32 p0, f2, f4;
    @!p0 bra    LBB1_9;
LBB1_5:
    add.f32 f1, f2, f0;
LBB1_6:
    mov.f32 f3, 0f7F800000 /* inf */;
    setp.eq.f32 p0, f0, f3;
    setp.eq.f32 p1, f2, f3;
    or.pred p0, p1, p0;
    selp.f32 f0, f3, f1, p0;
    st.global.f32 [r0+0], f0;
    ret;
LBB1_7:
    add.u32 r12, r3, r1;
    add.u32 r13, r4, r12;
    ld.global.f32 f3, [r13+0];
    mul.rn.f32 f3, f3, f1;
    add.u32 r13, r9, r1;
    ld.global.f32 f4, [r13+0];
    mul.rn.f32 f5, f3, f4;
    add.u32 r12, r5, r12;
    ld.global.f32 f6, [r12+0];
    mul.rn.f32 f6, f6, f1;
    add.u32 r12, r10, r1;
    ld.global.f32 f7, [r12+0];
    mul.rn.f32 f8, f7, f6;
    add.f32 f5, f8, f5;
    sub.f32 f2, f2, f5;
    mul.rn.f32 f4, f4, f6;
    mul.rn.f32 f3, f7, f3;
    add.f32 f3, f3, f4;
    add.f32 f0, f0, f3;
    add.u32 r11, r11, 1;
    add.u32 r1, r1, 4;
    bra LBB1_1;
LBB1_8:
    mov.f32 f1, f0;
    mov.f32 f3, f2;
    bra LBB1_4;
LBB1_9:
    div.approx.f32 f1, f1, f3;
    mul.rn.f32 f1, f1, f1;
    add.f32 f1, f1, 0f3F800000 /* 1.000000e+00 */;
    sqrt.approx.ftz.f32 f1, f1;
    mul.rn.f32 f1, f2, f1;
    bra LBB1_6;
}

I think this is pretty much irrefutable proof that it is compiler optimization that causes the difference in execution time, and that it depends only on whether memory writes are included in the kernel code or not.


So I guess the final question becomes: why is it so slow (irrespective of the debate about whether this is caused by compiler optimization or not)? The 1.5 millisecond runtime you are seeing is a true reflection of the performance of the code, and the real question is why. From my reading of your kernel code, the answer looks to lie in memory access patterns which are pretty much terrible for the GPU. Inside the compute loop you have two global memory reads with a very large stride, like this one:

x = cos_array[thread_id * num_values + i] * sqrt_num_values_reciprocal;

According to your code, num_values is either 24 or 48. That means the memory reads cannot possibly coalesce, and the L1 cache of a Fermi GPU isn't going to help much either. This has a huge negative impact on memory bandwidth utilisation and makes the code very slow. If you are stuck with that input data ordering, then a faster solution would be to use a warp to compute one output (i.e. do a warp-wide reduction for the final sum). That reduces the read stride from 24 or 48 down to 1 and coalesces the global memory reads from those two large input arrays.
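A minimal sketch of that idea, here written with one work-group per output point and a __local tree reduction rather than a warp-level one (the kernel name, the reduced argument list and the power-of-two work-group-size assumption are illustrative, not from the answer):

kernel void ndft_coalesced(
    global const float *re, global const float *im, int num_values,
    global float *spectrum_abs,
    global const float *sin_array, global const float *cos_array,
    float sqrt_num_values_reciprocal,
    local float *scratch_re, local float *scratch_im)
{
    size_t point = get_group_id(0);   // which output this work-group computes
    size_t lid   = get_local_id(0);
    size_t lsz   = get_local_size(0);

    // Consecutive work-items read consecutive elements, so these loads coalesce.
    float sum_re = 0.0f;
    float sum_im = 0.0f;
    for (int i = (int)lid; i < num_values; i += (int)lsz)
    {
        float x = cos_array[point * num_values + i] * sqrt_num_values_reciprocal;
        float y = sin_array[point * num_values + i] * sqrt_num_values_reciprocal;
        sum_re += re[i] * x + im[i] * y;
        sum_im -= re[i] * y + x * im[i];
    }

    scratch_re[lid] = sum_re;
    scratch_im[lid] = sum_im;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction in local memory; assumes lsz is a power of two.
    for (size_t s = lsz / 2; s > 0; s >>= 1)
    {
        if (lid < s)
        {
            scratch_re[lid] += scratch_re[lid + s];
            scratch_im[lid] += scratch_im[lid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lid == 0)
        spectrum_abs[point] = hypot(scratch_re[0], scratch_im[0]);
}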

Inside the loop there are also repeated fetches from global memory of the same 24 or 48 element sections of re and im:

    sum_re += re[i] * x + im[i] * y;
    sum_im -= re[i] * y + x * im[i];

This is unnecessary, and wastes a lot of global memory bandwidth or cache efficiency (the GPU doesn't have enough registers to let the compiler hold the whole of each array in registers). It would be far better to have each work group read those two arrays into __local memory once and use the local copy inside the compute loop. If you have each work group compute multiple outputs, rather than just one, you can potentially save a lot of global memory bandwidth and amortise the initial read until it is almost free.
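A sketch of that staging idea, keeping the original indexing otherwise (illustrative names; the two __local buffers are assumed to be passed as kernel arguments sized to num_values floats each):

kernel void ndft_staged(
    global const float *re, global const float *im, int num_values,
    global float *spectrum_abs,
    global const float *sin_array, global const float *cos_array,
    float sqrt_num_values_reciprocal,
    local float *re_local, local float *im_local)
{
    size_t thread_id = get_global_id(0);
    size_t lid = get_local_id(0);

    // Cooperative copy: the work-group loads re/im into local memory once.
    for (int i = (int)lid; i < num_values; i += (int)get_local_size(0))
    {
        re_local[i] = re[i];
        im_local[i] = im[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    float sum_re = 0.0f;
    float sum_im = 0.0f;
    for (int i = 0; i < num_values; i++)
    {
        float x = cos_array[thread_id * num_values + i] * sqrt_num_values_reciprocal;
        float y = sin_array[thread_id * num_values + i] * sqrt_num_values_reciprocal;
        sum_re += re_local[i] * x + im_local[i] * y;
        sum_im -= re_local[i] * y + x * im_local[i];
    }

    spectrum_abs[thread_id] = hypot(sum_re, sum_im);
}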

Other answers

No other answers yet.



