From 16e0aeeed527f8452e336685f664d7aa848702d3 Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Thu, 19 May 2016 19:48:24 +0200 Subject: First test --- gdr_test.cu | 295 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 gdr_test.cu (limited to 'gdr_test.cu') diff --git a/gdr_test.cu b/gdr_test.cu new file mode 100644 index 0000000..13af482 --- /dev/null +++ b/gdr_test.cu @@ -0,0 +1,295 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +#include +#include +#include + +#include "config.h" +#include "ipedma.h" +#include "kernels.h" + +#define DEVICE "/dev/fpga0" + +#define BAR PCILIB_BAR0 + + +#define KMEM_DEFAULT_FLAGS (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE) + +#define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1) +#define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2) + +#define gdrAssert(ans) { gdrError((ans), __FILE__, __LINE__); } +inline int gdrError(int code, const char *file, int line) +{ + if (code != 0) + { + fprintf(stderr,"GDRassert: %i %s %d\n", + code, file, line); + return code; + } else { + return 0; + } +} + + +#define initAssert(ans) { initError((ans), __FILE__, __LINE__); } +inline int initError(CUresult code, const char *file, int line) +{ + if (code != CUDA_SUCCESS) + { + const char *error = NULL; + cuGetErrorString (code, &error); + fprintf(stderr,"GPUassert: %s (Code: %i) %s %d\n", + error, code, file, line); + return code; + } else { + return 0; + } +} + + +#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline int gpuAssert(cudaError_t code, const char *file, int line) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s (Code: %i) %s %d\n", + cudaGetErrorString(code), code, file, line); + return code; + } else { + return 0; + } +} + + + +int main(int argc, char *argv[]) { + int err; + + //CUDA initialization + initAssert (cuInit(0)); + + int num_gpus; + initAssert (cuDeviceGetCount (&num_gpus)); + printf ("Found %i GPUs on the system\n", num_gpus); + + + CUdevice gpu; //will be used to find the correct GPU + for (num_gpus--; num_gpus >= 0; num_gpus--) { + + CUdevice current_gpu; + initAssert (cuDeviceGet (¤t_gpu, num_gpus)); + + char gpu_name[30] = {0}; + initAssert (cuDeviceGetName (gpu_name, 30, current_gpu)); + + printf("GPU %i: %s\n", num_gpus, gpu_name); + + + if (strncmp (gpu_name, "Tesla K40", 9) == 0) { + printf ("Found a Tesla GPU! I'll use that one.\n"); + gpu = current_gpu; + break; + } + } + + //The CU_CTX_MAP_HOST is what we are interested in! + CUcontext context; + initAssert (cuCtxCreate (&context, CU_CTX_MAP_HOST | CU_CTX_SCHED_AUTO, gpu)); + initAssert (cuCtxSetCurrent (context)); + + //NOTE: API Version 3010 is problematic + //(see https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/html/group__CUDART__DRIVER.html) + unsigned int api_version; + initAssert (cuCtxGetApiVersion (context, &api_version)); + printf ("CUDA API Version: %u\n", api_version); + printf ("CUDA init done\n\n"); + + CUdevprop gpu_props; + initAssert(cuDeviceGetProperties(&gpu_props, gpu)); + printf ("Clock %lu KHz\n", gpu_props.clockRate); + + CUdeviceptr d_A, d_D; + initAssert(cuMemAlloc(&d_D, GPU_PAGE)); // Should be multiple of GPU page, or mapping of next allocation will segfault the gdrcopy module + initAssert(cuMemAlloc(&d_A, PAGE_SIZE)); + unsigned int flag = 1; + initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_D)); + initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_A)); + + gdr_mh_t A_mh, D_mh; + gdr_info_t A_info, D_info; + void *A_bar_ptr = NULL; + void *D_bar_ptr = NULL; + + + gdr_t g = gdr_open(); + gdrAssert(g == NULL); + + gdrAssert(gdr_pin_buffer(g, d_D, GPU_PAGE, 0, 0, &D_mh)); + gdrAssert(gdr_map(g, D_mh, &D_bar_ptr, GPU_PAGE)); + gdrAssert(gdr_get_info(g, D_mh, &D_info)); + + gdrAssert(gdr_pin_buffer(g, d_A, PAGE_SIZE, 0, 0, &A_mh)); + gdrAssert(gdr_map(g, A_mh, &A_bar_ptr, PAGE_SIZE)); + gdrAssert(gdr_get_info(g, A_mh, &A_info)); + + int D_bar_off = D_info.va - d_D; + volatile uint32_t *D = (uint32_t *)((char *)D_bar_ptr + D_bar_off); + + int A_bar_off = A_info.va - d_A; + volatile uint32_t *A = (uint32_t *)((char *)A_bar_ptr + A_bar_off); + + printf("DevicePtr: %lx, GDR ptr: %p, Bus ptr: %lx, (Bar: %p, Offset: %i), VA: 0x%lx, Size: %lu, Page: %lu\n", d_A, A, A_info.bus_addr, A_bar_ptr, A_bar_off, A_info.va, A_info.mapped_size, A_info.page_size); + + pcilib_t *pci; + volatile void *bar; + const pcilib_bar_info_t *bar_info; + + pci = pcilib_open(DEVICE, PCILIB_MODEL_DETECT); + if (!pci) { + printf("pcilib_open\n"); + exit(1); + } + bar = pcilib_resolve_bar_address(pci, BAR, 0); + if (!bar) { + pcilib_close(pci); + printf("map bar\n"); + exit(1); + } + printf("BAR mapped to: %p\n", bar); + + CUdeviceptr dBAR; +// initAssert (cuMemHostRegister ((void*)((((uintptr_t)bar)/65536)*65536), 65536, CU_MEMHOSTREGISTER_DEVICEMAP)); + initAssert (cuMemHostRegister ((void*)bar, 4096, CU_MEMHOSTREGISTER_IOMEMORY)); + initAssert (cuMemHostGetDevicePointer(&dBAR, (void*)bar, 0)); + + bar_info = pcilib_get_bar_info(pci, BAR); + printf("%p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size); + + pcilib_kmem_handle_t *kdesc_kmem = pcilib_alloc_kernel_memory (pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS); + uintptr_t kdesc_bus = pcilib_kmem_get_block_ba (pci, kdesc_kmem, 0); + volatile void *kdesc = (uint32_t *) pcilib_kmem_get_block_ua (pci, kdesc_kmem, 0); + + + pcilib_kmem_handle_t *kbuf_kmem = pcilib_alloc_kernel_memory(pci, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((PAGE_SIZE%4096)?(4096 * (1 + PAGE_SIZE/4096)):PAGE_SIZE), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS); + uintptr_t kbuf_bus = pcilib_kmem_get_block_ba (pci, kbuf_kmem, 0); + volatile uint32_t *kbuf = (uint32_t *) pcilib_kmem_get_block_ua (pci, kbuf_kmem, 0); + memset ((uint32_t *)kbuf, 0, PAGE_SIZE); + +#ifdef GPU_DESC + volatile void *desc = D; + uintptr_t desc_bus = D_info.bus_addr; +#else + volatile void *desc = kdesc; + uintptr_t desc_bus = kdesc_bus; +#endif + + + memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t)); + volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + 2 * sizeof(uint32_t)); + + WR32 (REG_RESET_DMA, 1); + usleep (100000); + WR32 (REG_RESET_DMA, 0); + usleep (100000); + + WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE)); + WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE); + WR32 (REG_UPDATE_THRESHOLD, 0); + WR64 (REG_UPDATE_ADDRESS, desc_bus); + WR32 (REG_DMA, 1); + WR32 (REG_COUNTER, 1); + +#ifdef VERBOSE + struct timespec tss, tse, tsk; +#else + struct timeval tvs, tve; +#endif /* VERBOSE */ + + for (int i = 0; i < ITERS; i++) { + clock_gettime(CLOCK_REALTIME, &tss); + +#ifdef GPU_DESC + ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint64_t*)d_D, (uint32_t*)d_A); +#else + WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); +// WR64 (REG_DESCRIPTOR_ADDRESS, kbuf_bus); + + do { + } while (*hwaddr == 0); + clock_gettime(CLOCK_REALTIME, &tse); + + null<<<1, 1>>>((uint32_t*)d_A); +#endif + cudaDeviceSynchronize(); + + clock_gettime(CLOCK_REALTIME, &tsk); + + *hwaddr = 0; + +#ifdef VERBOSE + initAssert(cuMemcpyDtoH((void*)kbuf, d_A, PAGE_SIZE)); + +# ifdef GPU_DESC + double lat = 1000. * kbuf[0] / gpu_props.clockRate; + double latk = 1000. * kbuf[1] / gpu_props.clockRate; + double latc = ((tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS; +#else + double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; + double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.; + double latc = 0; +#endif + + printf("Latency: %.3lf us / %.3lf us (%.3lf us) %x %x %x %x\n", lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]); +#else + if (!i) gettimeofday(&tvs, NULL); +#endif /* VERBOSE */ + + } + +#ifndef VERBOSE + gettimeofday(&tve, NULL); + size_t avglat = (tve.tv_sec - tvs.tv_sec)*1000000 + (tve.tv_usec - tvs.tv_usec); + printf("Latency: %.3lf us (average for %i iterations)\n", 1. * avglat / ITERS, ITERS); +#endif /* VERBOSE */ + + usleep(1000000); + + + + WR32 (REG_COUNTER, 0); + WR32 (REG_DMA, 0); + + WR32 (REG_RESET_DMA, 1); + usleep (100000); + WR32 (REG_RESET_DMA, 0); + usleep (100000); + + pcilib_free_kernel_memory(pci, kbuf_kmem, KMEM_DEFAULT_FLAGS); + pcilib_free_kernel_memory(pci, kdesc_kmem, KMEM_DEFAULT_FLAGS); + + pcilib_close(pci); + printf("PCI closed\n"); + + + gdr_unmap(g, A_mh, A_bar_ptr, PAGE_SIZE); + gdr_unpin_buffer(g, A_mh); + + gdr_unmap(g, D_mh, D_bar_ptr, GPU_PAGE); + gdr_unpin_buffer(g, D_mh); + + gdr_close(g); + cuMemFree(d_A); + cuMemFree(d_D); + printf("GDR closed\n"); +} -- cgit v1.2.3