examples/vmult/opencl.ss
#lang scheme
(require (planet jaymccarthy/opencl/c)
         (planet jaymccarthy/opencl/scheme)
         scheme/foreign)
(unsafe!)

(define HOW-MANY (read)) 
(define input-v (malloc _float HOW-MANY 'raw))
(define output-v (malloc _float HOW-MANY 'raw))
(define how-much-mem (* HOW-MANY (ctype-sizeof _float)))

(for ([i (in-range HOW-MANY)])
  (ptr-set! input-v _float i (read)))

(define ds (platform-devices #f 'CL_DEVICE_TYPE_GPU))
(define d (cvector-ref ds 0))

(define ctxt (clCreateContext (vector d)))

(define kernel-source
  (string->bytes/utf-8
   #<<END
__kernel square(                                                       
   __global float* input,                                              
   __global float* output,                                             
   const unsigned int count)                                           
{                                                                      
   int i = get_global_id(0);                                          
   if(i < count)                                                       
       output[i] = input[i] * input[i];                                
}                                                                      
END
   ))

(define program (clCreateProgramWithSource ctxt (vector kernel-source)))            
(define cq (clCreateCommandQueue ctxt d empty))

(clBuildProgram program (vector d) #"")

(define kernel (clCreateKernel program #"square"))
(define input (clCreateBuffer ctxt 'CL_MEM_READ_ONLY how-much-mem #f))
(define output (clCreateBuffer ctxt 'CL_MEM_WRITE_ONLY how-much-mem #f))
(define input-evt
  (clEnqueueWriteBuffer cq input 'CL_FALSE 0 how-much-mem input-v (vector)))

(clFinish cq)

(clSetKernelArg:_cl_mem kernel 0 input)
(clSetKernelArg:_cl_mem kernel 1 output)
(clSetKernelArg:_cl_uint kernel 2 HOW-MANY)

(define work-group-size (kernel-work-group-info kernel d 'CL_KERNEL_WORK_GROUP_SIZE))

(define kernel-evt
  (clEnqueueNDRangeKernel cq kernel 1 
                          (vector HOW-MANY)
                          (vector work-group-size)
                          (vector input-evt)))

(time (clFinish cq))

(define output-evt
  (clEnqueueReadBuffer cq output 'CL_FALSE 0 how-much-mem output-v (vector kernel-evt)))        

(clFinish cq)

(clReleaseMemObject input)
(clReleaseMemObject output)
(clReleaseKernel kernel)

(clReleaseProgram program)
(clReleaseCommandQueue cq)
(clReleaseContext ctxt)

(free input-v)
(free output-v)