#!/usr/bin/python3

import pycuda.autoinit
import pycuda.driver as drv

import time
import os

os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['NUMEXPR_NUM_THREADS'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'

import numpy
# show whether numpy use OpenBlas or Intel MKL
numpy.__config__.show()

from pycuda.compiler import SourceModule

t0 = float(time.time())
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  // check https://en.wikipedia.org/wiki/Thread_block_(CUDA_programming)
  // and https://cs.calvin.edu/courses/cs/374/CUDA/CUDA-Thread-Indexing-Cheatsheet.pdf
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  dest[i] = a[i] * log(abs(a[i])) * b[i] * log(abs(b[i]));
}
""")

multiply_them = mod.get_function("multiply_them")
t1 = float(time.time())
print("compiling kernel %f\n" % (t1-t0))

t0 = float(time.time())
a = numpy.random.randn(33554432).astype(numpy.float32)
b = numpy.random.randn(33554432).astype(numpy.float32)
dest = numpy.zeros_like(a)
t1 = float(time.time())
print("generate random a, b & dest %f\n" % (t1-t0))

t0 = float(time.time())
# for block() and grid() limits see the above URL
multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(1024,1,1), grid=(32768,1))
t1 = float(time.time())
print("GPU execution %f\n" % (t1-t0))

print("sum of dest = %f\n" % numpy.sum(dest))
print("re-run on GPU\n")

t0 = float(time.time())
a_gpu = drv.mem_alloc(a.nbytes)
b_gpu = drv.mem_alloc(b.nbytes)
dest_gpu = drv.mem_alloc(dest.nbytes)
t1 = float(time.time())
print("alloc GPU memory %f\n" % (t1-t0))

t0 = float(time.time())
drv.memcpy_htod(a_gpu, a)
drv.memcpy_htod(b_gpu, b)
t1 = float(time.time())
print("load data to GPU %f\n" % (t1-t0))

t0 = float(time.time())
multiply_them(
        dest_gpu, a_gpu, b_gpu,
        block=(512,1,1), grid=(65536,1))
t1 = float(time.time())
print("run again on GPU %f\n" % (t1-t0))

t0 = float(time.time())
drv.memcpy_dtoh(dest, dest_gpu)
t1 = float(time.time())
print("off load from GPU %f\n" % (t1-t0))

t0 = float(time.time())
c = a * numpy.log(abs(a)) * b * numpy.log(abs(b))
t1 = float(time.time())
print("CPU execution %f\n" % (t1-t0))

print("sum of dest = %f\n" % numpy.sum(dest))
print("sum of c = %f\n" % numpy.sum(c))