Question

I m new to CUDA and I m probably doing something wrong. All I need is logical operation on two binary vectors. Length of vectors is 2048000. I compared speed between logical and in Matlab s C mex file and in CUDA kernel. C on CPU is ~5% faster than CUDA. Please note that I measured only kernel execution (without memory transfer). I have i7 930 and 9800GT.

##MEX file testCPU.c:##

#include "mex.h"
void mexFunction( int nlhs, mxArray *plhs[],
        int nrhs, const mxArray *prhs[] ) {
    
    int i, varLen;
    unsigned char *vars, *output;
            
    vars = mxGetPr(prhs[0]);
    plhs[0] = mxCreateLogicalMatrix(2048000, 1);
    output = mxGetPr(plhs[0]);
    for (i=0;i<2048000;i++){
        output[i] = vars[i] & vars[2048000+i];
    }
}

汇编汇编

mex testCPU.c

创建矢量

vars = ~~(randi(2,2048000,2)-1);

测量速度 :

tic;testCPU(vars);toc;

<强 > CUDA :

#CUDA file testGPU.cu#
#include "mex.h"
#include "cuda.h"

__global__ void logical_and(unsigned char* in, unsigned char* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    out[idx] = in[idx] && in[idx+N];
}


void mexFunction( int nlhs, mxArray *plhs[],
        int nrhs, const mxArray *prhs[] ) {
    
    int i;
    unsigned char *vars, *output, *gpu, *gpures;
    
    vars = (unsigned char*)mxGetData(prhs[0]);
    
    plhs[0] = mxCreateLogicalMatrix(2048000, 1);
    output = (unsigned char*)mxGetData(plhs[0]);       
       
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float dt_ms;
    
    // input GPU malloc
    cudaEventRecord(start, 0);
    cudaMalloc( (void **) &gpu, sizeof(unsigned char)*4096000);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU input malloc: %f ms, %i
", dt_ms, cudaGetLastError());
    
    // output GPU malloc
    cudaEventRecord(start, 0);
    cudaMalloc( (void **) &gpures, sizeof(unsigned char)*2048000);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU output malloc: %f ms, %i
", dt_ms, cudaGetLastError());
    
    // copy from CPU to GPU
    cudaEventRecord(start, 0);
    cudaMemcpy( gpu, vars, sizeof(unsigned char)*4096000, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("copy input from CPU to GPU: %f ms, %i
", dt_ms, cudaGetLastError());
    
    dim3 dimBlock(32);
    printf("thread count: %i
", dimBlock.x);
    dim3 dimGrid(2048000/dimBlock.x);
    printf("block count: %i
", dimGrid.x);
    
    // --- KERNEL ---
    cudaEventRecord(start, 0);
    logical_and<<<dimGrid, dimBlock>>>(gpu, gpures, 2048000);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU kernel: %f ms, %i
", dt_ms, cudaGetLastError());
    
    // result from GPU to CPU
    cudaEventRecord(start, 0);
    cudaMemcpy( output, gpures, sizeof(unsigned char)*2048000, cudaMemcpyDeviceToHost );
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("copy output from GPU to CPU: %f ms, %i
", dt_ms, cudaGetLastError());
    
    
    cudaFree(gpu);
    cudaFree(gpures);
    
}

汇编汇编:

 nvmex -f nvmexopts_9.bat testGPU.cu 
-I"C:Program FilesNVIDIA GPU Computing ToolkitCUDAv4.2include" 
-L"C:Program FilesNVIDIA GPU Computing ToolkitCUDAv4.2libx64" -lcudart -lcufft

产出:

GPU input malloc: 0.772160 ms, 0
GPU output malloc: 0.041728 ms, 0
copy input from CPU to GPU: 1.494784 ms, 0
thread count: 32
block count: 64000
*** GPU kernel: 3.761216 ms, 0 ***
copy output from GPU to CPU: 1.203488 ms, 0

代码可以吗? CPU 的速率是~ 0. 1米, 比 CUDA 内核快。我尝试过不同的线条计数( 乘数32), 最高为 512, 32 是最快的。运算符 & amp; 而不是 amp; & amp; 慢了近1米。

9800GT真的这么弱吗?用今天的主流卡(即GTX460,560)我能指望什么加速呢?

谢谢

EDIT: based on talonmies comment, I made these modifications:

核心函数 :

__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    out[idx].x = in[idx].x & in[idx+N].x;
    out[idx].y = in[idx].y & in[idx+N].y;
    out[idx].z = in[idx].z & in[idx+N].z;
    out[idx].w = in[idx].w & in[idx+N].w;
}

主函数 :

uchar4 *gpu, *gpures;

// 32 was worst, 64,128,256,512 were similar
dim3 dimBlock(128);
// block count is now 4xtimes smaller
dim3 dimGrid(512000/dimBlock.x);

产出:

GPU input malloc: 0.043360 ms, 0
GPU output malloc: 0.038592 ms, 0
copy input from CPU to GPU: 1.499584 ms, 0
thread count: 128
block count: 4000
*** GPU kernel: 0.131296 ms, 0 ***
copy output from GPU to CPU: 1.281120 ms, 0

Is that correct? Almost 30x speed-up! It seems too good to be true, but result is correct :) How faster will be GTX560 on this particular task? Thx

Edit 2:

这是这个代码吗?

__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    
    out[idx].x = in[idx].x & in[idx+N].x;
    out[idx].y = in[idx].y & in[idx+N].y;
    out[idx].z = in[idx].z & in[idx+N].z;
    out[idx].w = in[idx].w & in[idx+N].w;
}

自動轉換到 :

__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;  
    uchar4 buff;

    buff.x = in[idx].x;
    buff.y = in[idx].y;
    buff.z = in[idx].z;
    buff.w = in[idx].w;

    buff.x &= in[idx+N].x;
    buff.y &= in[idx+N].y;
    buff.z &= in[idx+N].z;
    buff.w &= in[idx+N].w;

    out[idx].x = buff.x;
    out[idx].y = buff.y;
    out[idx].z = buff.z;
    out[idx].w = buff.w;
}

编译者?

如果它是正确的, 它可以解释我对于联结访问的困惑。我认为在 [idx+N] 中 in[idx] & amp; in[idx+N] 中, in[idx] 和 in[idx+N] 中, < idx+N] 被装入两个联结步骤。 可以是16个的倍数, 因为 uchar4 是4 bytes long, 而对于联结访问地址必须是64 bytes( 在1.1 设备上) 。 Am I 对不对?

Answer 1

如>>talonmies 指出,您重新访问和处理您的数据,这远非最佳的字节,远非最佳。您可能想要考虑的一系列技术,如指示级平行主义和缓冲读/writes,由Vividia Webinar 由Vasily Volkov所汇总。

简言之,你想做的是,在每条线索中,用联结的方式读读数个 unit4 < /code>,处理它们,然后才保存它们。

<强 > 更新

如果你重写以下代码,这有什么区别吗?

__global__ void logical_and(unsigned int* in, unsigned int* out, int N) {
    int idx = blockIdx.x*blockDim.x*chunksize+threadIdx.x;
    unsigned int buff[chunksize];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        buff[k] = in[ blockDim.x*k + idx ];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        buff[k] &= in[ blockDim.x*k + idx + N ];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        out[ blockDim.x*k + idx ] = buff[k];
}

请注意,我假设 chunk size 是您在某处找到的变量 define , 例如。

#define chunksize 4

您需要将您发射的区块数和 < code> N < / code > 的区块数除以该区块数。我还使用了 < code> untigned int , 它只有四个已打包的 < code> uchar 。在您的调用功能中, 您可能需要相应投下指针。

Answer 2

我认为它发生的情况是 "http://en.wikipedia.org/wiki/False_share" rel=“nofollow”>false share 。我认为问题在于,你试图用线条写字的字节大小区域正在产生巨大的种族状况,因为不同的线条正试图写到同一个单词一致的地址。我不确定在GPU中的细节,但在CPU中,当不同的线条试图写到相同的256字节对齐区域(所谓的缓存线)的记忆时,它们会不断堵塞对方,使你们全球的性能急剧下降。

EDIT: based on talonmies comment, I made these modifications:

Edit 2:

友情链接