Pythonによる並列プログラミング -GPGPUも-
Pythonでの並列プログラミングを勉強した結果を載せてみます。
pycudaというCUDAのpythonバインディングも試してみました。
Pythonによる並列プログラミング -GPGPUも-
View more documents from Yusaku Watanabe
single_thread_execute.py
"""Single-threaded baseline: naive O(n^3) multiplication of two size x size
matrices stored as flat (row-major) 1-D arrays, with wall-clock timing."""
import time

import numpy


def matrix_multiply(arr_0, arr_1, size):
    """Return arr_0 @ arr_1 as a flat float array of length size*size.

    arr_0, arr_1 -- flat row-major size*size matrices (any numeric sequence)
    size         -- matrix dimension
    """
    arr_result = numpy.zeros(size * size)
    for i in range(size):
        for j in range(size):
            tmp = 0
            for k in range(size):
                # Row-major indexing: element (i, k) of arr_0, (k, j) of arr_1.
                row = k + (i * size)
                col = j + (k * size)
                tmp += arr_0[row] * arr_1[col]
            arr_result[j + (i * size)] = tmp
    return arr_result


def main():
    size = 256
    arr_0 = numpy.arange(size * size)
    arr_1 = numpy.arange(size * size)
    print("execute start")
    start = time.time()
    arr_result = matrix_multiply(arr_0, arr_1, size)
    print("time = " + str(time.time() - start))
    # Sum of all result elements, printed as a cheap correctness check.
    result_sum = 0
    for v in arr_result:
        result_sum += v
    print(result_sum)


if __name__ == "__main__":
    main()
multi_thread_execute.py
"""Multi-threaded matrix multiplication: one thread per output row.

Note: this is CPU-bound pure-Python work, so the GIL prevents any real
speedup over the single-threaded version -- the point of the benchmark.
"""
import threading
import time

import numpy

size = 256
arr_0 = numpy.arange(size * size)
arr_1 = numpy.arange(size * size)
arr_result = numpy.zeros(size * size)


class task(threading.Thread):
    """Compute output row i of the size x size matrix product.

    Reads module globals arr_0/arr_1 and writes its row into arr_result;
    each thread owns a disjoint slice of arr_result, so no locking needed.
    """

    def __init__(self, i):
        threading.Thread.__init__(self)
        self.i = i

    def run(self):
        i = self.i
        for j in range(size):
            tmp = 0
            for k in range(size):
                # Row-major: (i, k) of arr_0 times (k, j) of arr_1.
                tmp += arr_0[k + (i * size)] * arr_1[j + (k * size)]
            arr_result[j + (i * size)] = tmp


def main():
    print("execute start")
    start = time.time()
    print("task init")
    threads = [task(i) for i in range(size)]
    print("start tasks")
    for t in threads:
        t.start()
    # join() replaces the original busy-wait on an unsynchronized
    # task_complete counter: that spin loop burned a whole core and the
    # unlocked `task_complete += 1` was a data race.
    for t in threads:
        t.join()
    print("done")
    print("time = " + str(time.time() - start))
    result_sum = 0
    for v in arr_result:
        result_sum += v
    print(result_sum)


if __name__ == "__main__":
    main()
multi_process_execute.py
"""Multi-process matrix multiplication: 4 worker processes pull row indices
from a shared queue and write results into shared memory (true parallelism,
unlike the thread version)."""
import time
from multiprocessing import Array, Process, Queue
from queue import Empty

import numpy

size = 256
arr_0 = numpy.arange(size * size)
arr_1 = numpy.arange(size * size)


def task(q, shared_arr_result):
    """Worker loop: compute one full output row per index pulled from q.

    q                 -- queue of remaining row indices (ints)
    shared_arr_result -- flat writable sequence of size*size result slots
    """
    print("start process")
    while True:
        try:
            # get_nowait() + Empty closes the race in the original
            # empty()/get() pair: another worker could drain the queue
            # between the two calls, leaving get() blocked forever.
            i = q.get_nowait()
        except Empty:
            break
        for j in range(size):
            tmp = 0
            for k in range(size):
                # Row-major: (i, k) of arr_0 times (k, j) of arr_1.
                tmp += arr_0[k + (i * size)] * arr_1[j + (k * size)]
            shared_arr_result[j + (i * size)] = tmp


if __name__ == "__main__":
    q = Queue()
    for i in range(size):
        q.put(i)
    print("execute start")
    # "d" (float64) instead of "f": result values reach ~1e12, beyond
    # float32's 24-bit mantissa, so float32 storage silently loses digits.
    shared_arr_result = Array("d", size * size)
    workers = [Process(target=task, args=(q, shared_arr_result))
               for _ in range(4)]
    print("task init & start")
    start = time.time()
    for p in workers:
        p.start()
    # join() waits for the computation itself; the original only spun until
    # the queue drained, so the last rows could still be in flight when the
    # results were summed.
    for p in workers:
        p.join()
    print("time = " + str(time.time() - start))
    result_sum = 0
    for v in shared_arr_result:
        result_sum += v
    print(result_sum)
pycuda_execute.py
"""PyCUDA matrix multiplication: one GPU thread per output element."""
import time

import numpy
import pycuda.autoinit  # noqa: F401 -- side effect: creates the CUDA context
import pycuda.driver as drv
from pycuda.compiler import SourceModule

# Naive kernel: thread (x, y) computes output element (row y, column x).
# `size` is now a kernel parameter so the device and host always agree --
# the original hard-coded size = 256 in the kernel while the host used 128,
# which made every thread read far out of bounds.
mod = SourceModule("""
__global__ void square(int* arr_0, int* arr_1, int* arr_result, int size)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    int tmp = 0;
    for (int k = 0; k < size; k++)
    {
        int row = k + y * size;
        int col = x + k * size;
        tmp += arr_0[row] * arr_1[col];
    }
    arr_result[x + y * size] = tmp;
}
""")

size = 128
block = 4
# The kernel works on 32-bit ints, so the host buffers must be int32:
# numpy.arange defaults to int64 and numpy.zeros to float64, and passing
# those through drv.InOut silently hands the kernel misinterpreted bytes.
# NOTE(review): row sums can exceed 2**31 at this size, so the int
# accumulator may overflow -- a `long long` kernel would be safer.
arr_0 = numpy.arange(size * size, dtype=numpy.int32)
arr_1 = numpy.arange(size * size, dtype=numpy.int32)
arr_result = numpy.zeros(size * size, dtype=numpy.int32)

print("start")
print(arr_result)
start = time.time()
square = mod.get_function("square")
# In/Out instead of InOut: inputs are read-only, the result is write-only.
square(drv.In(arr_0), drv.In(arr_1), drv.Out(arr_result),
       numpy.int32(size),
       block=(block, block, 1),
       # grid dimensions must be ints: use floor division.
       grid=(size // block, size // block))
print("end... time = " + str(time.time() - start))
print(arr_result)