Parallel Programming in Python (Including GPGPU)

Here is what I learned while studying parallel programming in Python.
I also tried PyCUDA, a Python binding for CUDA.
Every script below does the same work, multiplying two square matrices stored as flat 1-D arrays (256x256 for the CPU versions, 128x128 for the PyCUDA version), and prints the elapsed time along with the result so the versions can be compared.


single_thread_execute.py

import numpy
import time

size = 256
arr_0 = numpy.arange(size*size)
arr_1 = numpy.arange(size*size)
arr_result = numpy.zeros(size*size)

print "execute start"
start = time.time()

# Naive triple loop: treat the flat arrays as size x size row-major
# matrices and compute their product element by element.
for i in range(size):
	for j in range(size):
		tmp = 0
		for k in range(size):
			row = k + (i*size)
			col = j + (k*size)
			tmp += arr_0[row] * arr_1[col]
		arr_result[j + (i*size)] = tmp

print "time = " + str((time.time() - start))

# Sum of all elements, used as a quick sanity check.
result_sum = 0
for i in arr_result:
	result_sum += i
print result_sum
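
For reference (this check is my addition, not part of the original benchmark), the triple loop above is just a dense matrix product, so numpy can compute the same thing in one optimized call; comparing its sum with result_sum confirms the loop is correct and gives a feel for how much of the measured time is pure Python overhead.

import numpy

size = 256
a = numpy.arange(size*size).reshape(size, size)
b = numpy.arange(size*size).reshape(size, size)

# numpy.dot performs the same size x size matrix product in optimized C.
expected = numpy.dot(a, b)
print expected.sum()   # same quantity as result_sum above, up to float rounding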

multi_thread_execute.py

import numpy
import threading
import time

size = 256
arr_0 = numpy.arange(size*size)
arr_1 = numpy.arange(size*size)
arr_result = numpy.zeros(size*size)

# Each worker thread computes one row (index i) of the result matrix.
class task(threading.Thread):
	def __init__(self, i):
		threading.Thread.__init__(self)
		self.i = i

	def run(self):
		i = self.i
		for j in range(size):
			tmp = 0
			for k in range(size):
				row = k + (i*size)
				col = j + (k*size)
				tmp += arr_0[row] * arr_1[col]
			arr_result[j + (i*size)] = tmp


print "execute start"
start = time.time()

thread_arr = []
print "task init"
for i in range(size):
	thread_arr.append(task(i))

print "start tasks"
for i in range(len(thread_arr)):
	thread_arr[i].start()

while True:
	if(task_complete  >= size):
		break

print "done"
print "time = " + str((time.time() - start))

# Sum of all elements, used as a quick sanity check (should match the
# single-threaded version).
result_sum = 0
for i in arr_result:
	result_sum += i
print result_sum
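
Because of CPython's GIL, the 256 threads above take turns executing the pure-Python inner loop, so this version cannot be expected to beat the single-threaded one; it mainly adds thread start-up overhead. A more common structure, sketched below as my own illustration (not from the original post), is a small pool of worker threads pulling row indices from a queue; the GIL caveat still applies, but the same shape maps directly onto the multiprocessing version that follows.

import numpy
import threading
import Queue

size = 256
arr_0 = numpy.arange(size*size)
arr_1 = numpy.arange(size*size)
arr_result = numpy.zeros(size*size)

row_queue = Queue.Queue()
for i in range(size):
	row_queue.put(i)

def worker():
	# Each worker keeps taking row indices until the queue is drained.
	while True:
		try:
			i = row_queue.get_nowait()
		except Queue.Empty:
			return
		for j in range(size):
			tmp = 0
			for k in range(size):
				tmp += arr_0[k + i*size] * arr_1[j + k*size]
			arr_result[j + i*size] = tmp

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
	t.start()
for t in threads:
	t.join()
print arr_result.sum()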

multi_process_execute.py

import numpy
import time
from Queue import Empty
from multiprocessing import Process, Array, Queue



size = 256
arr_0 = numpy.arange(size*size)
arr_1 = numpy.arange(size*size)


def task(q, shared_arr_result):
	# Each worker process keeps pulling row indices until the queue is empty.
	# get_nowait() avoids the race between q.empty() and q.get() when several
	# workers compete for the last item.
	print "start process"
	while True:
		try:
			i = q.get_nowait()
		except Empty:
			break
		for j in range(size):
			tmp = 0
			for k in range(size):
				row = k + (i*size)
				col = j + (k*size)
				tmp += arr_0[row] * arr_1[col]
			shared_arr_result[j + (i*size)] = tmp


if __name__ == '__main__':
	q = Queue()
	for i in range(size):
		q.put(i)
	print "execute start"

	# Shared result buffer; type code "d" (double) matches the float64
	# arrays used by the other versions.
	arr_result = numpy.zeros(size*size)
	shared_arr_result = Array("d", arr_result)

	print "task init & start"
	start = time.time()
	process_arr = []
	for i in range(4):
		p = Process(target=task, args=(q, shared_arr_result))
		p.start()
		process_arr.append(p)

	# An empty queue only means every row index has been handed out, not that
	# the workers have finished computing, so wait on the processes themselves.
	for p in process_arr:
		p.join()

	print "time = " + str((time.time() - start))
	result_sum = 0
	for i in shared_arr_result:
		result_sum += i
	print result_sum
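
The same row-by-row split can be written more compactly with multiprocessing.Pool, which manages the task queue and collects the per-row results itself. The sketch below is my own illustration (the helper compute_row is a made-up name), not code from the original post.

import numpy
from multiprocessing import Pool

size = 256
arr_0 = numpy.arange(size*size)
arr_1 = numpy.arange(size*size)

def compute_row(i):
	# Compute row i of the product and return it to the parent process.
	row = numpy.zeros(size)
	for j in range(size):
		tmp = 0
		for k in range(size):
			tmp += arr_0[k + i*size] * arr_1[j + k*size]
		row[j] = tmp
	return row

if __name__ == '__main__':
	pool = Pool(4)
	rows = pool.map(compute_row, range(size))   # one task per row, results in order
	arr_result = numpy.concatenate(rows)
	print arr_result.sum()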


pycuda_execute.py

import pycuda.autoinit
import pycuda.driver as drv
import numpy
import time
from pycuda.compiler import SourceModule

mod = SourceModule("""
	// Multiply two size x size matrices stored row-major in flat arrays.
	// One thread computes one output element. The accumulator and the
	// output are 64-bit so the integer sums do not overflow.
	__global__ void square(int* arr_0, int* arr_1, long long* arr_result)
	{
	    const int size = 128;   // must match the host-side size below
	    const int x = blockIdx.x * blockDim.x + threadIdx.x;
	    const int y = blockIdx.y * blockDim.y + threadIdx.y;

	    long long tmp = 0;
	    for(int k = 0; k < size; k++) {
	        int row = k + y * size;
	        int col = x + k * size;
	        tmp += (long long)arr_0[row] * arr_1[col];
	    }
	    arr_result[x + y * size] = tmp;
	}
""")

size = 128
block = 4

# numpy.arange gives a platform-dependent integer type and numpy.zeros gives
# float64, so fix the dtypes explicitly to match the kernel's int inputs and
# long long output.
arr_0 = numpy.arange(size*size).astype(numpy.int32)
arr_1 = numpy.arange(size*size).astype(numpy.int32)
arr_result = numpy.zeros(size*size, dtype=numpy.int64)


print "start"
print arr_result

start = time.time()
square = mod.get_function("square")
# arr_0 and arr_1 are inputs only; arr_result is copied back from the GPU
# once the kernel has finished.
square(drv.In(arr_0), drv.In(arr_1), drv.Out(arr_result),
       block=(block, block, 1), grid=(size/block, size/block))

print "end... time = " + str((time.time() - start))
print arr_result
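
As a sanity check (my addition, appended to the end of pycuda_execute.py rather than part of the original), the GPU result can be compared against the same product computed on the host with numpy:

# Host-side reference: the same size x size product via numpy.dot,
# computed in 64-bit integers to match the kernel's long long accumulator.
a = arr_0.astype(numpy.int64).reshape(size, size)
b = arr_1.astype(numpy.int64).reshape(size, size)
expected = numpy.dot(a, b).ravel()
print numpy.array_equal(expected, arr_result)   # True when the kernel matches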