summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xcompile.sh1
-rw-r--r--kernel.cl19
-rw-r--r--test.c422
3 files changed, 442 insertions, 0 deletions
diff --git a/compile.sh b/compile.sh
new file mode 100755
index 0000000..07cc7df
--- /dev/null
+++ b/compile.sh
@@ -0,0 +1 @@
+gcc -I/opt/AMDAPPSDK-3.0/include -lOpenCL -lpcilib test.c
diff --git a/kernel.cl b/kernel.cl
new file mode 100644
index 0000000..26a0009
--- /dev/null
+++ b/kernel.cl
@@ -0,0 +1,19 @@
+#define REG_PERF_COUNTER 0x20
+#define REG_DESCRIPTOR_ADDRESS 0x50
+
+#define RD32(addr) ((__global volatile uint*)bar)[addr / 4];
+#define WR32(addr, value) ((__global volatile uint*)bar)[addr / 4] = value;
+#define WR64(addr, value) ((__global volatile ulong*)bar)[addr / 8] = value;
+
+__kernel void process(uint iter, __global uint *input, __global uint *output, __global volatile uint *bar, ulong bus_addr)
+{
+ WR32 (REG_PERF_COUNTER, 1);
+}
+
+
+__kernel void measure(uint iter, __global uint *input, __global uint *output, __global volatile uint *bar, ulong bus_addr)
+{
+ output[0] = iter;
+ output[iter + 1] = RD32(REG_PERF_COUNTER);
+ WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
+}
diff --git a/test.c b/test.c
new file mode 100644
index 0000000..ae92c57
--- /dev/null
+++ b/test.c
@@ -0,0 +1,422 @@
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+
+#include <pcilib.h>
+#include <pcilib/mem.h>
+#include <pcilib/bar.h>
+#include <pcilib/kmem.h>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+#define KERNEL_CONTROL
+//#define OPENCL_TIMINGS
+
+#define BAR PCILIB_BAR0
+
+#define TLP_SIZE 64
+#define PAGE_SIZE 4096
+#define NUM_PAGES 16
+
+#define KMEM_DEFAULT_FLAGS (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
+
+#define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
+#define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
+
+#define REG_RESET_DMA 0x00
+#define REG_DMA 0x04
+#define REG_NUM_PACKETS_PER_DESCRIPTOR 0x10
+#define REG_PERF_COUNTER 0x20
+//#define REG_PERF_COUNTER 0x28
+#define REG_PACKET_LENGTH 0x0C
+#define REG_DESCRIPTOR_ADDRESS 0x50
+#define REG_UPDATE_ADDRESS 0x58
+#define REG_UPDATE_THRESHOLD 0x60
+#define REG_UPDATE_COUNTER 0x70
+
+
+#define REG_COUNTER 0x9000
+
+#define WR32(addr, value) *(uint32_t *) (bar + (addr)) = (value);
+#define RD32(addr) (*(uint32_t *) (bar + (addr)))
+#define WR32_sleep(addr, value) *(uint32_t *) (bar + (addr)) = (value); usleep (100);
+
+#define WR64(addr, value) *(uint64_t *) (bar + (addr)) = (value);
+#define RD64(addr) (*(uint64_t *) (bar + (addr)))
+#define WR64_sleep(addr, value) *(uint64_t *) (bar + (addr)) = (value); usleep (100);
+
+
+#define DATA_SIZE NUM_PAGES * PAGE_SIZE
+
+
+#define CL_CHECK_STATUS(error) { \
+ if ((error) != CL_SUCCESS) fprintf (stderr, "OpenCL error <%s:%i>: %i\n", __FILE__, __LINE__, (error)); }
+
+
+static void check_data(cl_command_queue queue, cl_mem mem, size_t size) {
+ uint32_t *data;
+
+ data = malloc (size);
+ memset (data, 0, size);
+
+ CL_CHECK_STATUS (clEnqueueReadBuffer (queue, mem, CL_TRUE, 0, size, data, 0, NULL, NULL));
+
+ printf("%lx\n", data[0]);
+
+ free (data);
+}
+
+
+int main(void)
+{
+ int i;
+ cl_uint j = 0;
+ cl_context context;
+ cl_command_queue command_queue;
+ cl_int err;
+ cl_uint num_of_platforms=0;
+ cl_platform_id platform_id;
+ cl_device_id device_id;
+ cl_uint num_of_devices=0;
+ cl_mem input, output;//, host;
+ cl_bus_address_amd bus_address;
+ cl_event event, event1, event2;
+
+ cl_int status;
+ cl_command_type type;
+ size_t res_size;
+
+ clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD;
+ clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD;
+
+ CL_CHECK_STATUS(clGetPlatformIDs(1, &platform_id, &num_of_platforms));
+ clEnqueueMakeBuffersResidentAMD = (clEnqueueMakeBuffersResidentAMD_fn)clGetExtensionFunctionAddressForPlatform(platform_id, "clEnqueueMakeBuffersResidentAMD");
+ clEnqueueWaitSignalAMD = clGetExtensionFunctionAddressForPlatform (platform_id, "clEnqueueWaitSignalAMD");
+
+ CL_CHECK_STATUS(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,&num_of_devices));
+
+ cl_context_properties properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform_id, 0 };
+ context = clCreateContext(properties, 1, &device_id, NULL,NULL, &err);
+ CL_CHECK_STATUS(err);
+
+ cl_queue_properties props[] = {
+ CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE/*|CL_QUEUE_ON_DEVICE|CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_ON_DEVICE_DEFAULT,
+ CL_QUEUE_SIZE, CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE*/,
+ 0};
+ command_queue = clCreateCommandQueueWithProperties(context, device_id, props, &err);
+ CL_CHECK_STATUS(err);
+
+ input = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_BUS_ADDRESSABLE_AMD, DATA_SIZE, NULL, &err);
+ CL_CHECK_STATUS(err);
+
+ output = clCreateBuffer(context, CL_MEM_READ_WRITE, 4096, NULL, &err);
+ CL_CHECK_STATUS(err);
+
+ memset(&bus_address, 0, sizeof(cl_bus_address_amd));
+
+// CL_CHECK_STATUS(clEnqueueMakeBuffersResidentAMD(command_queue, 1, &input, CL_TRUE, &bus_address, 0, 0, NULL));
+ CL_CHECK_STATUS(clEnqueueMakeBuffersResidentAMD(command_queue, 1, &input, CL_FALSE, &bus_address, 0, 0, &event));
+ CL_CHECK_STATUS (clWaitForEvents (1, &event));
+ CL_CHECK_STATUS (clReleaseEvent (event));
+
+
+ printf("bus adress : surface : 0x%lx, marker : 0x%lx\n", bus_address.surface_bus_address, bus_address.marker_bus_address);
+
+ pcilib_t *pci = pcilib_open("/dev/fpga0", PCILIB_MODEL_DETECT);
+ if (!pci) {
+ printf("pcilib_open failed\n");
+ exit(1);
+ }
+
+ volatile void *bar = pcilib_resolve_bar_address(pci, BAR, 0);
+ if (!bar) {
+ pcilib_close(pci);
+ printf("map bar\n");
+ exit(1);
+ }
+
+ const pcilib_bar_info_t *bar_info = pcilib_get_bar_info(pci, BAR);
+ if (!bar_info) {
+ pcilib_close(pci);
+ printf("get bar info\n");
+ exit(1);
+ }
+
+ cl_bus_address_amd amd_addr = {
+ .surface_bus_address = bar_info->phys_addr,
+ .marker_bus_address = bar_info->phys_addr
+ };
+
+ cl_mem bar_cl = clCreateBuffer (context, CL_MEM_EXTERNAL_PHYSICAL_AMD, bar_info->size, &amd_addr, &err);
+ if (err) {
+ pcilib_close(pci);
+ printf("Error (%i) mapping BAR to GPU\n", err);
+ exit(1);
+ }
+
+ FILE *f = fopen("kernel.cl", "rb");
+ fseek(f, 0, SEEK_END);
+ long fsize = ftell(f);
+ fseek(f, 0, SEEK_SET); //same as rewind(f);
+ char *cl_string = malloc(fsize + 1);
+ fread(cl_string, fsize, 1, f);
+ cl_string[fsize] = 0;
+ fclose(f);
+
+ cl_program program = clCreateProgramWithSource (context, 1, (const char **) &cl_string, NULL, &err);
+ CL_CHECK_STATUS (err);
+ CL_CHECK_STATUS(clBuildProgram (program, 1, &device_id, "-cl-std=CL2.0 -D CL_VERSION_2_0", NULL, NULL));
+
+ size_t work_size = 1;
+ cl_kernel process_kernel = clCreateKernel (program, "process", &err);
+ CL_CHECK_STATUS (err);
+ CL_CHECK_STATUS (clSetKernelArg (process_kernel, 0, sizeof (uint), &j));
+ CL_CHECK_STATUS (clSetKernelArg (process_kernel, 1, sizeof (cl_mem), &input));
+ CL_CHECK_STATUS (clSetKernelArg (process_kernel, 2, sizeof (cl_mem), &output));
+ CL_CHECK_STATUS (clSetKernelArg (process_kernel, 3, sizeof (cl_mem), &bar_cl));
+ CL_CHECK_STATUS (clSetKernelArg (process_kernel, 4, sizeof (ulong), &bus_address.surface_bus_address));
+
+ cl_kernel measure_kernel = clCreateKernel (program, "measure", &err);
+ CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 0, sizeof (uint), &j));
+ CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 1, sizeof (cl_mem), &input));
+ CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 2, sizeof (cl_mem), &output));
+ CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 3, sizeof (cl_mem), &bar_cl));
+ CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 4, sizeof (ulong), &bus_address.surface_bus_address));
+
+ pcilib_kmem_handle_t *desc_kmem = pcilib_alloc_kernel_memory (pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
+ uintptr_t desc_bus = pcilib_kmem_get_block_ba (pci, desc_kmem, 0);
+ volatile void *desc = (uint32_t *) pcilib_kmem_get_block_ua (pci, desc_kmem, 0);
+ memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t));
+ volatile uint64_t *hwaddr = (uint64_t*)(desc + 2 * sizeof(uint32_t));
+
+ pcilib_kmem_handle_t *kbuf_kmem = pcilib_alloc_kernel_memory(pci, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, 4096, 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
+ uintptr_t kbuf_bus = pcilib_kmem_get_block_ba (pci, kbuf_kmem, 0);
+ volatile uint32_t *kbuf = (uint32_t *) pcilib_kmem_get_block_ua (pci, kbuf_kmem, 0);
+ memset ((uint32_t *)kbuf, 0, 4096);
+
+ void *marker = pcilib_map_area(pci, bus_address.marker_bus_address, 4096);
+ if (!marker) {
+ printf("pcilib_map_area failed\n");
+ exit(1);
+ }
+
+ *(uint32_t*)marker = 0;
+
+ void *gpubuf = pcilib_map_area(pci, bus_address.surface_bus_address, 4096);
+ *(uint32_t*)gpubuf = 0x1;
+
+ check_data(command_queue, input, 4);
+
+
+ WR32 (REG_RESET_DMA, 1); usleep (100000);
+ WR32 (REG_RESET_DMA, 0); usleep (100000);
+
+ WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE));
+ WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
+ WR32_sleep (REG_UPDATE_THRESHOLD, 1);
+ WR64_sleep (REG_UPDATE_ADDRESS, desc_bus);
+ WR64_sleep (REG_UPDATE_COUNTER, bus_address.marker_bus_address);
+// WR32_sleep (REG_PERF_COUNTER, 0);
+// WR32 (REG_DMA, 1);
+
+ WR32_sleep (0x9048, 0x232);
+ WR32_sleep (REG_COUNTER, 1);
+ usleep(1000000);
+
+
+ struct timespec tss, tse, tsk;
+
+
+ for (i = 0; i < 100; i++) {
+ WR32_sleep (REG_DMA, 0);
+ WR32_sleep (REG_PERF_COUNTER, 0);
+ WR64_sleep (REG_DESCRIPTOR_ADDRESS, bus_address.surface_bus_address);
+
+ *(volatile uint32_t*)marker = 0;
+ *hwaddr = 0;
+
+ clock_gettime(CLOCK_REALTIME, &tss);
+ WR32 (REG_DMA, 1);
+
+ if (i < 50) {
+ while ((*hwaddr) == 0) {
+ }
+ } else {
+ while ((*(volatile uint32_t*)marker) < 1) {
+ }
+ }
+ clock_gettime(CLOCK_REALTIME, &tse);
+
+ CL_CHECK_STATUS (clEnqueueNDRangeKernel (command_queue, process_kernel, 1, NULL, &work_size, NULL, 0, NULL, &event));
+ CL_CHECK_STATUS (clWaitForEvents (1, &event));
+ clock_gettime(CLOCK_REALTIME, &tsk);
+
+ double lath = 4. * RD32 (0x20) / 1000;
+
+ double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
+ double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.;
+
+ printf(" iteration %u, hw: %6.3lf us, sw: %6.3lf us, +krn: %6.3lf us, maker: %u\n", i, lath, lat, latk, *(volatile uint32_t*)marker);
+
+ CL_CHECK_STATUS (clReleaseEvent (event));
+ }
+
+ usleep(100000);
+
+
+#ifdef KERNEL_CONTROL
+ cl_kernel kernel = measure_kernel;
+#else
+ cl_kernel kernel = process_kernel;
+#endif /* KERNEL_CONTROL */
+
+
+ for (i = 0; i < 10; i++) {
+ cl_event wevent[NUM_PAGES], kevent[NUM_PAGES];
+
+ printf("Iteration %i\n", i);
+ WR32_sleep (REG_DMA, 0);
+ WR32_sleep (REG_PERF_COUNTER, 0);
+ *(volatile uint32_t*)marker = 0;
+
+ clock_gettime(CLOCK_REALTIME, &tss);
+ // we rather need to trigger it every few milliseconds and see what happens.
+// CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, 0, 0, NULL, &event));
+ for (j = 0; j < NUM_PAGES; j++) {
+#ifndef KERNEL_CONTROL
+ WR64_sleep (REG_DESCRIPTOR_ADDRESS, bus_address.surface_bus_address + j * PAGE_SIZE);
+#endif
+// if (j) {
+// CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, j + 1, 1, &wevent[j-1], &wevent[j]));
+// } else {
+ CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, j + 1, 0, NULL, &wevent[j]));
+// }
+ CL_CHECK_STATUS (clSetKernelArg (kernel, 0, sizeof (uint), &j));
+ CL_CHECK_STATUS (clEnqueueNDRangeKernel (command_queue, kernel, 1, NULL, &work_size, NULL, 1, &wevent[j], &kevent[j]));
+ CL_CHECK_STATUS (clFlush(command_queue));
+ }
+
+#ifdef KERNEL_CONTROL
+ // we write one extra in the end.
+ CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, NUM_PAGES + 1, 0, NULL, NULL));
+ CL_CHECK_STATUS (clFlush(command_queue));
+#endif
+ clock_gettime(CLOCK_REALTIME, &tse);
+ double lat_sched = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
+
+// usleep(10000);
+
+ clock_gettime(CLOCK_REALTIME, &tss);
+ double lat_flush = (tss.tv_sec - tse.tv_sec)*1000000 + 1. * (tss.tv_nsec - tse.tv_nsec) / 1000.;
+#ifdef KERNEL_CONTROL
+ WR64 (REG_DESCRIPTOR_ADDRESS, bus_address.surface_bus_address);
+#endif
+ WR32 (REG_DMA, 1);
+
+ int cur;
+ for (cur = 0; (*(volatile uint32_t*)marker) < NUM_PAGES;) {
+ if (cur != (*(volatile uint32_t*)marker)) {
+ clock_gettime(CLOCK_REALTIME, &tse);
+ cur = (*(volatile uint32_t*)marker);
+ double latm = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
+ printf(" Marker %u after %6.3lf us\n", cur, latm);
+ }
+
+ }
+
+ CL_CHECK_STATUS (clWaitForEvents (1, &kevent[NUM_PAGES - 1]));
+ CL_CHECK_STATUS (clFinish(command_queue));
+
+ clock_gettime(CLOCK_REALTIME, &tse);
+ double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
+
+ printf(" Markers: 0x%lx %u\n", *hwaddr, *(volatile uint32_t*)marker);
+
+ printf(" GPU latencies: ");
+ for (j = 1; j < NUM_PAGES; j++) {
+/*
+ cl_ulong start, submit, end;
+
+ CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &submit, NULL));
+ CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL));
+ CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));
+ printf(" Page %i start-end: %6.3lf us, submit-end: %6.3lf us\n", j, 1. * (end - start) / 1000, 1. * (start - submit) / 1000);
+
+ CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &submit, NULL));
+ CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL));
+ CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));
+ printf(" Kernel %i start-end: %6.3lf us, submit-end: %6.3lf us\n", j, 1. * (end - start) / 1000, 1. * (start - submit) / 1000);*/
+
+ cl_ulong end, endk, endw, startw, startk;
+ CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j - 1], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endk, NULL));
+ CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &startw, NULL));
+ CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endw, NULL));
+ CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &startk, NULL));
+ CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));
+ printf("k-%.3lf-w-%.3lf-w-%.3lf-k-%.3lf ", 1. * (startw - endk) / 1000, 1. * (endw - startw) / 1000, 1. * (startk - endw) / 1000, 1. * (end - startk) / 1000);
+
+
+ }
+ printf("\n");
+
+ double lath = 4. * RD32 (0x20) / 1000;
+ printf(" fpga: %6.3lf us, software: %6.3lf us, sched: %6.3lf us, flush: %6.3lf us\n", lath, lat, lat_sched, lat_flush);
+
+/*
+ CL_CHECK_STATUS(clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, &res_size));
+ printf(" Event return: %i (CL_COMPLETE: %i)\n", status, CL_COMPLETE);
+
+ CL_CHECK_STATUS(clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(type), &type, &res_size));
+ printf(" Event type: 0x%x (CL_COMMAND_WAIT_SIGNAL_AMD: 0x%x)\n", type, CL_COMMAND_WAIT_SIGNAL_AMD);
+*/
+
+
+ for (j = 0; j < NUM_PAGES; j++) {
+// CL_CHECK_STATUS (clReleaseEvent (wevent[j]));
+ CL_CHECK_STATUS (clReleaseEvent (kevent[j]));
+ }
+
+// CL_CHECK_STATUS (clReleaseEvent (event));
+
+
+#ifdef KERNEL_CONTROL
+ uint data[1024];
+ CL_CHECK_STATUS (clEnqueueReadBuffer (command_queue, output, CL_TRUE, 0, 4096, data, 0, NULL, NULL));
+ printf("\nLatencies: ");
+ for (j = 0; j < NUM_PAGES + 1; j++) {
+/* if (j)
+ printf("%6.3lf ", 4. * (data[j] - data[j - 1]) / 1000);
+ else
+ printf("%6.3lf ", 4. * data[j] / 1000);*/
+ printf("%u ", data[j]);
+ }
+ printf("\n");
+#endif
+ }
+
+ WR32 (REG_COUNTER, 0);
+ WR32 (REG_DMA, 0);
+ usleep(10000);
+ WR32 (REG_RESET_DMA, 1); usleep (100000);
+ WR32 (REG_RESET_DMA, 0); usleep (100000);
+
+ pcilib_unmap_area(pci, marker, 4096);
+ pcilib_unmap_area(pci, gpubuf, 4096);
+
+ pcilib_free_kernel_memory(pci, kbuf_kmem, KMEM_DEFAULT_FLAGS);
+ pcilib_free_kernel_memory(pci, desc_kmem, KMEM_DEFAULT_FLAGS);
+
+ clReleaseMemObject(bar_cl);
+
+ pcilib_close(pci);
+
+ clReleaseKernel (process_kernel);
+ clReleaseKernel (measure_kernel);
+ clReleaseProgram (program);
+
+ clReleaseMemObject(output);
+ clReleaseMemObject(input);
+ clReleaseCommandQueue(command_queue);
+ clReleaseContext(context);
+}