diff options
-rwxr-xr-x | compile.sh | 1 | ||||
-rw-r--r-- | kernel.cl | 19 | ||||
-rw-r--r-- | test.c | 422 |
3 files changed, 442 insertions, 0 deletions
diff --git a/compile.sh b/compile.sh new file mode 100755 index 0000000..07cc7df --- /dev/null +++ b/compile.sh @@ -0,0 +1 @@ +gcc -I/opt/AMDAPPSDK-3.0/include -lOpenCL -lpcilib test.c diff --git a/kernel.cl b/kernel.cl new file mode 100644 index 0000000..26a0009 --- /dev/null +++ b/kernel.cl @@ -0,0 +1,19 @@ +#define REG_PERF_COUNTER 0x20 +#define REG_DESCRIPTOR_ADDRESS 0x50 + +#define RD32(addr) ((__global volatile uint*)bar)[addr / 4]; +#define WR32(addr, value) ((__global volatile uint*)bar)[addr / 4] = value; +#define WR64(addr, value) ((__global volatile ulong*)bar)[addr / 8] = value; + +__kernel void process(uint iter, __global uint *input, __global uint *output, __global volatile uint *bar, ulong bus_addr) +{ + WR32 (REG_PERF_COUNTER, 1); +} + + +__kernel void measure(uint iter, __global uint *input, __global uint *output, __global volatile uint *bar, ulong bus_addr) +{ + output[0] = iter; + output[iter + 1] = RD32(REG_PERF_COUNTER); + WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); +} @@ -0,0 +1,422 @@ +#include <string.h> +#include <stdio.h> +#include <time.h> + +#include <pcilib.h> +#include <pcilib/mem.h> +#include <pcilib/bar.h> +#include <pcilib/kmem.h> + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +#define KERNEL_CONTROL +//#define OPENCL_TIMINGS + +#define BAR PCILIB_BAR0 + +#define TLP_SIZE 64 +#define PAGE_SIZE 4096 +#define NUM_PAGES 16 + +#define KMEM_DEFAULT_FLAGS (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE) + +#define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1) +#define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2) + +#define REG_RESET_DMA 0x00 +#define REG_DMA 0x04 +#define REG_NUM_PACKETS_PER_DESCRIPTOR 0x10 +#define REG_PERF_COUNTER 0x20 +//#define REG_PERF_COUNTER 0x28 +#define REG_PACKET_LENGTH 0x0C +#define REG_DESCRIPTOR_ADDRESS 0x50 +#define REG_UPDATE_ADDRESS 0x58 +#define REG_UPDATE_THRESHOLD 0x60 +#define REG_UPDATE_COUNTER 0x70 + + +#define REG_COUNTER 0x9000 + +#define WR32(addr, value) *(uint32_t *) (bar + (addr)) = (value); +#define RD32(addr) (*(uint32_t *) (bar + (addr))) +#define WR32_sleep(addr, value) *(uint32_t *) (bar + (addr)) = (value); usleep (100); + +#define WR64(addr, value) *(uint64_t *) (bar + (addr)) = (value); +#define RD64(addr) (*(uint64_t *) (bar + (addr))) +#define WR64_sleep(addr, value) *(uint64_t *) (bar + (addr)) = (value); usleep (100); + + +#define DATA_SIZE NUM_PAGES * PAGE_SIZE + + +#define CL_CHECK_STATUS(error) { \ + if ((error) != CL_SUCCESS) fprintf (stderr, "OpenCL error <%s:%i>: %i\n", __FILE__, __LINE__, (error)); } + + +static void check_data(cl_command_queue queue, cl_mem mem, size_t size) { + uint32_t *data; + + data = malloc (size); + memset (data, 0, size); + + CL_CHECK_STATUS (clEnqueueReadBuffer (queue, mem, CL_TRUE, 0, size, data, 0, NULL, NULL)); + + printf("%lx\n", data[0]); + + free (data); +} + + +int main(void) +{ + int i; + cl_uint j = 0; + cl_context context; + cl_command_queue command_queue; + cl_int err; + cl_uint num_of_platforms=0; + cl_platform_id platform_id; + cl_device_id device_id; + cl_uint num_of_devices=0; + cl_mem input, output;//, host; + cl_bus_address_amd bus_address; + cl_event event, event1, event2; + + cl_int status; + cl_command_type type; + size_t res_size; + + clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD; + clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD; + + CL_CHECK_STATUS(clGetPlatformIDs(1, &platform_id, &num_of_platforms)); + clEnqueueMakeBuffersResidentAMD = (clEnqueueMakeBuffersResidentAMD_fn)clGetExtensionFunctionAddressForPlatform(platform_id, "clEnqueueMakeBuffersResidentAMD"); + clEnqueueWaitSignalAMD = clGetExtensionFunctionAddressForPlatform (platform_id, "clEnqueueWaitSignalAMD"); + + CL_CHECK_STATUS(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,&num_of_devices)); + + cl_context_properties properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform_id, 0 }; + context = clCreateContext(properties, 1, &device_id, NULL,NULL, &err); + CL_CHECK_STATUS(err); + + cl_queue_properties props[] = { + CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE/*|CL_QUEUE_ON_DEVICE|CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_ON_DEVICE_DEFAULT, + CL_QUEUE_SIZE, CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE*/, + 0}; + command_queue = clCreateCommandQueueWithProperties(context, device_id, props, &err); + CL_CHECK_STATUS(err); + + input = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_BUS_ADDRESSABLE_AMD, DATA_SIZE, NULL, &err); + CL_CHECK_STATUS(err); + + output = clCreateBuffer(context, CL_MEM_READ_WRITE, 4096, NULL, &err); + CL_CHECK_STATUS(err); + + memset(&bus_address, 0, sizeof(cl_bus_address_amd)); + +// CL_CHECK_STATUS(clEnqueueMakeBuffersResidentAMD(command_queue, 1, &input, CL_TRUE, &bus_address, 0, 0, NULL)); + CL_CHECK_STATUS(clEnqueueMakeBuffersResidentAMD(command_queue, 1, &input, CL_FALSE, &bus_address, 0, 0, &event)); + CL_CHECK_STATUS (clWaitForEvents (1, &event)); + CL_CHECK_STATUS (clReleaseEvent (event)); + + + printf("bus adress : surface : 0x%lx, marker : 0x%lx\n", bus_address.surface_bus_address, bus_address.marker_bus_address); + + pcilib_t *pci = pcilib_open("/dev/fpga0", PCILIB_MODEL_DETECT); + if (!pci) { + printf("pcilib_open failed\n"); + exit(1); + } + + volatile void *bar = pcilib_resolve_bar_address(pci, BAR, 0); + if (!bar) { + pcilib_close(pci); + printf("map bar\n"); + exit(1); + } + + const pcilib_bar_info_t *bar_info = pcilib_get_bar_info(pci, BAR); + if (!bar_info) { + pcilib_close(pci); + printf("get bar info\n"); + exit(1); + } + + cl_bus_address_amd amd_addr = { + .surface_bus_address = bar_info->phys_addr, + .marker_bus_address = bar_info->phys_addr + }; + + cl_mem bar_cl = clCreateBuffer (context, CL_MEM_EXTERNAL_PHYSICAL_AMD, bar_info->size, &amd_addr, &err); + if (err) { + pcilib_close(pci); + printf("Error (%i) mapping BAR to GPU\n", err); + exit(1); + } + + FILE *f = fopen("kernel.cl", "rb"); + fseek(f, 0, SEEK_END); + long fsize = ftell(f); + fseek(f, 0, SEEK_SET); //same as rewind(f); + char *cl_string = malloc(fsize + 1); + fread(cl_string, fsize, 1, f); + cl_string[fsize] = 0; + fclose(f); + + cl_program program = clCreateProgramWithSource (context, 1, (const char **) &cl_string, NULL, &err); + CL_CHECK_STATUS (err); + CL_CHECK_STATUS(clBuildProgram (program, 1, &device_id, "-cl-std=CL2.0 -D CL_VERSION_2_0", NULL, NULL)); + + size_t work_size = 1; + cl_kernel process_kernel = clCreateKernel (program, "process", &err); + CL_CHECK_STATUS (err); + CL_CHECK_STATUS (clSetKernelArg (process_kernel, 0, sizeof (uint), &j)); + CL_CHECK_STATUS (clSetKernelArg (process_kernel, 1, sizeof (cl_mem), &input)); + CL_CHECK_STATUS (clSetKernelArg (process_kernel, 2, sizeof (cl_mem), &output)); + CL_CHECK_STATUS (clSetKernelArg (process_kernel, 3, sizeof (cl_mem), &bar_cl)); + CL_CHECK_STATUS (clSetKernelArg (process_kernel, 4, sizeof (ulong), &bus_address.surface_bus_address)); + + cl_kernel measure_kernel = clCreateKernel (program, "measure", &err); + CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 0, sizeof (uint), &j)); + CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 1, sizeof (cl_mem), &input)); + CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 2, sizeof (cl_mem), &output)); + CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 3, sizeof (cl_mem), &bar_cl)); + CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 4, sizeof (ulong), &bus_address.surface_bus_address)); + + pcilib_kmem_handle_t *desc_kmem = pcilib_alloc_kernel_memory (pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS); + uintptr_t desc_bus = pcilib_kmem_get_block_ba (pci, desc_kmem, 0); + volatile void *desc = (uint32_t *) pcilib_kmem_get_block_ua (pci, desc_kmem, 0); + memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t)); + volatile uint64_t *hwaddr = (uint64_t*)(desc + 2 * sizeof(uint32_t)); + + pcilib_kmem_handle_t *kbuf_kmem = pcilib_alloc_kernel_memory(pci, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, 4096, 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS); + uintptr_t kbuf_bus = pcilib_kmem_get_block_ba (pci, kbuf_kmem, 0); + volatile uint32_t *kbuf = (uint32_t *) pcilib_kmem_get_block_ua (pci, kbuf_kmem, 0); + memset ((uint32_t *)kbuf, 0, 4096); + + void *marker = pcilib_map_area(pci, bus_address.marker_bus_address, 4096); + if (!marker) { + printf("pcilib_map_area failed\n"); + exit(1); + } + + *(uint32_t*)marker = 0; + + void *gpubuf = pcilib_map_area(pci, bus_address.surface_bus_address, 4096); + *(uint32_t*)gpubuf = 0x1; + + check_data(command_queue, input, 4); + + + WR32 (REG_RESET_DMA, 1); usleep (100000); + WR32 (REG_RESET_DMA, 0); usleep (100000); + + WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE)); + WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE); + WR32_sleep (REG_UPDATE_THRESHOLD, 1); + WR64_sleep (REG_UPDATE_ADDRESS, desc_bus); + WR64_sleep (REG_UPDATE_COUNTER, bus_address.marker_bus_address); +// WR32_sleep (REG_PERF_COUNTER, 0); +// WR32 (REG_DMA, 1); + + WR32_sleep (0x9048, 0x232); + WR32_sleep (REG_COUNTER, 1); + usleep(1000000); + + + struct timespec tss, tse, tsk; + + + for (i = 0; i < 100; i++) { + WR32_sleep (REG_DMA, 0); + WR32_sleep (REG_PERF_COUNTER, 0); + WR64_sleep (REG_DESCRIPTOR_ADDRESS, bus_address.surface_bus_address); + + *(volatile uint32_t*)marker = 0; + *hwaddr = 0; + + clock_gettime(CLOCK_REALTIME, &tss); + WR32 (REG_DMA, 1); + + if (i < 50) { + while ((*hwaddr) == 0) { + } + } else { + while ((*(volatile uint32_t*)marker) < 1) { + } + } + clock_gettime(CLOCK_REALTIME, &tse); + + CL_CHECK_STATUS (clEnqueueNDRangeKernel (command_queue, process_kernel, 1, NULL, &work_size, NULL, 0, NULL, &event)); + CL_CHECK_STATUS (clWaitForEvents (1, &event)); + clock_gettime(CLOCK_REALTIME, &tsk); + + double lath = 4. * RD32 (0x20) / 1000; + + double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; + double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.; + + printf(" iteration %u, hw: %6.3lf us, sw: %6.3lf us, +krn: %6.3lf us, maker: %u\n", i, lath, lat, latk, *(volatile uint32_t*)marker); + + CL_CHECK_STATUS (clReleaseEvent (event)); + } + + usleep(100000); + + +#ifdef KERNEL_CONTROL + cl_kernel kernel = measure_kernel; +#else + cl_kernel kernel = process_kernel; +#endif /* KERNEL_CONTROL */ + + + for (i = 0; i < 10; i++) { + cl_event wevent[NUM_PAGES], kevent[NUM_PAGES]; + + printf("Iteration %i\n", i); + WR32_sleep (REG_DMA, 0); + WR32_sleep (REG_PERF_COUNTER, 0); + *(volatile uint32_t*)marker = 0; + + clock_gettime(CLOCK_REALTIME, &tss); + // we rather need to trigger it every few milliseconds and see what happens. +// CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, 0, 0, NULL, &event)); + for (j = 0; j < NUM_PAGES; j++) { +#ifndef KERNEL_CONTROL + WR64_sleep (REG_DESCRIPTOR_ADDRESS, bus_address.surface_bus_address + j * PAGE_SIZE); +#endif +// if (j) { +// CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, j + 1, 1, &wevent[j-1], &wevent[j])); +// } else { + CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, j + 1, 0, NULL, &wevent[j])); +// } + CL_CHECK_STATUS (clSetKernelArg (kernel, 0, sizeof (uint), &j)); + CL_CHECK_STATUS (clEnqueueNDRangeKernel (command_queue, kernel, 1, NULL, &work_size, NULL, 1, &wevent[j], &kevent[j])); + CL_CHECK_STATUS (clFlush(command_queue)); + } + +#ifdef KERNEL_CONTROL + // we write one extra in the end. + CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, NUM_PAGES + 1, 0, NULL, NULL)); + CL_CHECK_STATUS (clFlush(command_queue)); +#endif + clock_gettime(CLOCK_REALTIME, &tse); + double lat_sched = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; + +// usleep(10000); + + clock_gettime(CLOCK_REALTIME, &tss); + double lat_flush = (tss.tv_sec - tse.tv_sec)*1000000 + 1. * (tss.tv_nsec - tse.tv_nsec) / 1000.; +#ifdef KERNEL_CONTROL + WR64 (REG_DESCRIPTOR_ADDRESS, bus_address.surface_bus_address); +#endif + WR32 (REG_DMA, 1); + + int cur; + for (cur = 0; (*(volatile uint32_t*)marker) < NUM_PAGES;) { + if (cur != (*(volatile uint32_t*)marker)) { + clock_gettime(CLOCK_REALTIME, &tse); + cur = (*(volatile uint32_t*)marker); + double latm = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; + printf(" Marker %u after %6.3lf us\n", cur, latm); + } + + } + + CL_CHECK_STATUS (clWaitForEvents (1, &kevent[NUM_PAGES - 1])); + CL_CHECK_STATUS (clFinish(command_queue)); + + clock_gettime(CLOCK_REALTIME, &tse); + double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; + + printf(" Markers: 0x%lx %u\n", *hwaddr, *(volatile uint32_t*)marker); + + printf(" GPU latencies: "); + for (j = 1; j < NUM_PAGES; j++) { +/* + cl_ulong start, submit, end; + + CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &submit, NULL)); + CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL)); + CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL)); + printf(" Page %i start-end: %6.3lf us, submit-end: %6.3lf us\n", j, 1. * (end - start) / 1000, 1. * (start - submit) / 1000); + + CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &submit, NULL)); + CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL)); + CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL)); + printf(" Kernel %i start-end: %6.3lf us, submit-end: %6.3lf us\n", j, 1. * (end - start) / 1000, 1. * (start - submit) / 1000);*/ + + cl_ulong end, endk, endw, startw, startk; + CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j - 1], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endk, NULL)); + CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &startw, NULL)); + CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endw, NULL)); + CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &startk, NULL)); + CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL)); + printf("k-%.3lf-w-%.3lf-w-%.3lf-k-%.3lf ", 1. * (startw - endk) / 1000, 1. * (endw - startw) / 1000, 1. * (startk - endw) / 1000, 1. * (end - startk) / 1000); + + + } + printf("\n"); + + double lath = 4. * RD32 (0x20) / 1000; + printf(" fpga: %6.3lf us, software: %6.3lf us, sched: %6.3lf us, flush: %6.3lf us\n", lath, lat, lat_sched, lat_flush); + +/* + CL_CHECK_STATUS(clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, &res_size)); + printf(" Event return: %i (CL_COMPLETE: %i)\n", status, CL_COMPLETE); + + CL_CHECK_STATUS(clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(type), &type, &res_size)); + printf(" Event type: 0x%x (CL_COMMAND_WAIT_SIGNAL_AMD: 0x%x)\n", type, CL_COMMAND_WAIT_SIGNAL_AMD); +*/ + + + for (j = 0; j < NUM_PAGES; j++) { +// CL_CHECK_STATUS (clReleaseEvent (wevent[j])); + CL_CHECK_STATUS (clReleaseEvent (kevent[j])); + } + +// CL_CHECK_STATUS (clReleaseEvent (event)); + + +#ifdef KERNEL_CONTROL + uint data[1024]; + CL_CHECK_STATUS (clEnqueueReadBuffer (command_queue, output, CL_TRUE, 0, 4096, data, 0, NULL, NULL)); + printf("\nLatencies: "); + for (j = 0; j < NUM_PAGES + 1; j++) { +/* if (j) + printf("%6.3lf ", 4. * (data[j] - data[j - 1]) / 1000); + else + printf("%6.3lf ", 4. * data[j] / 1000);*/ + printf("%u ", data[j]); + } + printf("\n"); +#endif + } + + WR32 (REG_COUNTER, 0); + WR32 (REG_DMA, 0); + usleep(10000); + WR32 (REG_RESET_DMA, 1); usleep (100000); + WR32 (REG_RESET_DMA, 0); usleep (100000); + + pcilib_unmap_area(pci, marker, 4096); + pcilib_unmap_area(pci, gpubuf, 4096); + + pcilib_free_kernel_memory(pci, kbuf_kmem, KMEM_DEFAULT_FLAGS); + pcilib_free_kernel_memory(pci, desc_kmem, KMEM_DEFAULT_FLAGS); + + clReleaseMemObject(bar_cl); + + pcilib_close(pci); + + clReleaseKernel (process_kernel); + clReleaseKernel (measure_kernel); + clReleaseProgram (program); + + clReleaseMemObject(output); + clReleaseMemObject(input); + clReleaseCommandQueue(command_queue); + clReleaseContext(context); +} |