GPGPUによるパーソナルスーパーコンピュータの可能性
レポートはこちら
http://www.slideshare.net/waysaku/gpgpu-3774966
※はてなダイアリーでslideshareの貼付け方がわからん、、、
参考サイトを見よう見まねで書いただけだけど、ソースも載っけとく
#include <time.h> #include <sys/time.h> #include <stdio.h> #include <cutil.h> #define BLOCK 16 #define WIDTH 1024 void Host(float *a, float *b, float *c); __global__ void DeviceCalculation(float *a, float *b, float *c); void SetTimer(unsigned int *t); float EndTimer(unsigned int *t); float h_a[WIDTH*WIDTH]; float h_b[WIDTH*WIDTH]; float h_c[WIDTH*WIDTH]; double gettimeofday_sec() { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + (double)tv.tv_usec*1e-6; } int main(int argc, char**argv) { int i; double t1,t2; CUT_DEVICE_INIT(argc,argv); float *d_a, *d_b, *d_c; cudaMalloc((void**) &d_a, sizeof(float)*WIDTH*WIDTH); cudaMalloc((void**) &d_b, sizeof(float)*WIDTH*WIDTH); cudaMalloc((void**) &d_c, sizeof(float)*WIDTH*WIDTH); cudaMemset(d_c, 0, sizeof(float)*WIDTH*WIDTH); for(i=0; i<WIDTH*WIDTH; i++) { h_a[i]=i; h_b[i]=i; } t1 = gettimeofday_sec(); cudaMemcpy(d_a, h_a, sizeof(float)*WIDTH*WIDTH, cudaMemcpyHostToDevice); cudaMemcpy(d_b, h_b, sizeof(float)*WIDTH*WIDTH, cudaMemcpyHostToDevice); dim3 grid(WIDTH/BLOCK, WIDTH/BLOCK, 1); dim3 threads(BLOCK, BLOCK, 1); DeviceCalculation<<< grid, threads >>>(d_a, d_b, d_c); cudaMemcpy(h_c, d_c, sizeof(float)*WIDTH*WIDTH,cudaMemcpyDeviceToHost); t2 = gettimeofday_sec(); printf("計算時間 = %10.30f\n", t2 - t1); printf(" 計算結果=%f\n",h_c[WIDTH*WIDTH-1]); cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); t1 = gettimeofday_sec(); Host(h_a, h_b, h_c); t2 = gettimeofday_sec(); printf("ホスト計算時間 = %10.30f\n", t2 - t1); printf("計算結果=%f\n",h_c[WIDTH*WIDTH-1]); //getch(); } void Host(float *A, float *B, float *C) { int i,j,k; int row,col; float tmp; for(i=0; i<WIDTH; i++){ for(j=0; j<WIDTH; j++){ tmp=0.0; for(k=0; k<WIDTH; k++){ row=k+i*WIDTH; col=j+k*WIDTH; tmp+=A[row]*B[col]; } C[j+i*WIDTH]=tmp; } } } __global__ void DeviceCalculation(float *A, float *B, float *C) { int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; float tmp = 0; __shared__ float As[BLOCK][BLOCK]; __shared__ float Bs[BLOCK][BLOCK]; for (int a = 0, b = 0 ; a < WIDTH; a += BLOCK, b += BLOCK) { int a_adr = WIDTH * BLOCK * by + a; int b_adr = BLOCK * bx + WIDTH * b; As[ty][tx] = A[a_adr + WIDTH*ty + tx]; Bs[ty][tx] = B[b_adr + WIDTH*ty + tx]; __syncthreads(); for (int k = 0; k < BLOCK; k++) { tmp += As[ty][k] * Bs[k][tx]; } __syncthreads(); } int adr = WIDTH * BLOCK * by + BLOCK * bx; C[adr + WIDTH * ty + tx] = tmp; }