#!/usr/bin/python3 import pycuda.autoinit import pycuda.driver as drv import time import os os.environ['OPENBLAS_NUM_THREADS'] = '1' os.environ['MKL_NUM_THREADS'] = '1' os.environ['NUMEXPR_NUM_THREADS'] = '1' os.environ['OMP_NUM_THREADS'] = '1' import numpy # show whether numpy use OpenBlas or Intel MKL numpy.__config__.show() from pycuda.compiler import SourceModule t0 = float(time.time()) mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { // check https://en.wikipedia.org/wiki/Thread_block_(CUDA_programming) // and https://cs.calvin.edu/courses/cs/374/CUDA/CUDA-Thread-Indexing-Cheatsheet.pdf const int i = blockIdx.x * blockDim.x + threadIdx.x; dest[i] = a[i] * log(abs(a[i])) * b[i] * log(abs(b[i])); } """) multiply_them = mod.get_function("multiply_them") t1 = float(time.time()) print("compiling kernel %f\n" % (t1-t0)) t0 = float(time.time()) a = numpy.random.randn(33554432).astype(numpy.float32) b = numpy.random.randn(33554432).astype(numpy.float32) dest = numpy.zeros_like(a) t1 = float(time.time()) print("generate random a, b & dest %f\n" % (t1-t0)) t0 = float(time.time()) # for block() and grid() limits see the above URL multiply_them( drv.Out(dest), drv.In(a), drv.In(b), block=(1024,1,1), grid=(32768,1)) t1 = float(time.time()) print("GPU execution %f\n" % (t1-t0)) print("sum of dest = %f\n" % numpy.sum(dest)) print("re-run on GPU\n") t0 = float(time.time()) a_gpu = drv.mem_alloc(a.nbytes) b_gpu = drv.mem_alloc(b.nbytes) dest_gpu = drv.mem_alloc(dest.nbytes) t1 = float(time.time()) print("alloc GPU memory %f\n" % (t1-t0)) t0 = float(time.time()) drv.memcpy_htod(a_gpu, a) drv.memcpy_htod(b_gpu, b) t1 = float(time.time()) print("load data to GPU %f\n" % (t1-t0)) t0 = float(time.time()) multiply_them( dest_gpu, a_gpu, b_gpu, block=(512,1,1), grid=(65536,1)) t1 = float(time.time()) print("run again on GPU %f\n" % (t1-t0)) t0 = float(time.time()) drv.memcpy_dtoh(dest, dest_gpu) t1 = float(time.time()) print("off load from GPU %f\n" % (t1-t0)) t0 = float(time.time()) c = a * numpy.log(abs(a)) * b * numpy.log(abs(b)) t1 = float(time.time()) print("CPU execution %f\n" % (t1-t0)) print("sum of dest = %f\n" % numpy.sum(dest)) print("sum of c = %f\n" % numpy.sum(c))