diff options
-rw-r--r-- | .gitignore | 8 | ||||
-rw-r--r-- | CMakeLists.txt | 41 | ||||
-rw-r--r-- | config.h | 9 | ||||
-rw-r--r-- | gdr_test.cu | 295 | ||||
-rw-r--r-- | gdrcopy.diff | 57 | ||||
-rwxr-xr-x | insmod.sh | 42 | ||||
-rw-r--r-- | ipedma.h | 20 | ||||
-rw-r--r-- | kernels.cu | 48 | ||||
-rw-r--r-- | kernels.h | 5 |
9 files changed, 525 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b527926 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +CMakeCache.txt +CMakeFiles +CMakeScripts +Makefile +cmake_install.cmake +install_manifest.txt +CTestTestfile.cmake +gdr_test diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..f3369d9 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,41 @@ +project(ipecamera) + +set(IPECAMERA_VERSION "0.0.1") +set(IPECAMERA_ABI_VERSION "0") + +cmake_minimum_required(VERSION 2.6) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") + +add_definitions("-fPIC --std=gnu99 -Wall -O2 -gdwarf-2 -g3 -fno-omit-frame-pointer") + +find_package(CUDA REQUIRED) +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_35,code=sm_35;-rdc=true) +set(CUDA_SEPARABLE_COMPILATION ON) + + +find_package(PkgConfig REQUIRED) + +pkg_check_modules(PCILIB pcitool>=0.2 REQUIRED) + + +include_directories( + ${CMAKE_SOURCE_DIR} + ${PCILIB_INCLUDE_DIRS} + ${CUDA_INCLUDE_DIRS} +) + +link_directories( + ${PCILIB_LIBRARY_DIRS} + ${CUDA_LIBRARY_DIRS} +) + + +set(CUDA_KERNELS kernels.cu) + +#cuda_compile_ptx(cuda_ptx_files kernels.cu) +#add_custom_target(ptx ALL DEPENDS ${cuda_ptx_files} ${CUDA_KERNELS} SOURCES ${CUDA_KERNELS}) + + +cuda_add_executable(gdr_test gdr_test.cu kernels.cu) +target_link_libraries(gdr_test pcilib rt cuda gdrapi /usr/local/cuda/lib64/libcudadevrt.a) + diff --git a/config.h b/config.h new file mode 100644 index 0000000..acefc3b --- /dev/null +++ b/config.h @@ -0,0 +1,9 @@ +#define ITERS 100 +#define GPU_ITERS 1000 + +#define TLP_SIZE 64 +#define GPU_PAGE 65536 +#define PAGE_SIZE 4096 + +#define VERBOSE +#define GPU_DESC diff --git a/gdr_test.cu b/gdr_test.cu new file mode 100644 index 0000000..13af482 --- /dev/null +++ b/gdr_test.cu @@ -0,0 +1,295 @@ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdarg.h> +#include <time.h> +#include <sched.h> +#include <sys/time.h> + +#include <cuda.h> +#include <gdrapi.h> + + +#include <pcilib.h> +#include <pcilib/bar.h> +#include <pcilib/kmem.h> + +#include "config.h" +#include "ipedma.h" +#include "kernels.h" + +#define DEVICE "/dev/fpga0" + +#define BAR PCILIB_BAR0 + + +#define KMEM_DEFAULT_FLAGS (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE) + +#define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1) +#define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2) + +#define gdrAssert(ans) { gdrError((ans), __FILE__, __LINE__); } +inline int gdrError(int code, const char *file, int line) +{ + if (code != 0) + { + fprintf(stderr,"GDRassert: %i %s %d\n", + code, file, line); + return code; + } else { + return 0; + } +} + + +#define initAssert(ans) { initError((ans), __FILE__, __LINE__); } +inline int initError(CUresult code, const char *file, int line) +{ + if (code != CUDA_SUCCESS) + { + const char *error = NULL; + cuGetErrorString (code, &error); + fprintf(stderr,"GPUassert: %s (Code: %i) %s %d\n", + error, code, file, line); + return code; + } else { + return 0; + } +} + + +#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline int gpuAssert(cudaError_t code, const char *file, int line) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s (Code: %i) %s %d\n", + cudaGetErrorString(code), code, file, line); + return code; + } else { + return 0; + } +} + + + +int main(int argc, char *argv[]) { + int err; + + //CUDA initialization + initAssert (cuInit(0)); + + int num_gpus; + initAssert (cuDeviceGetCount (&num_gpus)); + printf ("Found %i GPUs on the system\n", num_gpus); + + + CUdevice gpu; //will be used to find the correct GPU + for (num_gpus--; num_gpus >= 0; num_gpus--) { + + CUdevice current_gpu; + initAssert (cuDeviceGet (¤t_gpu, num_gpus)); + + char gpu_name[30] = {0}; + initAssert (cuDeviceGetName (gpu_name, 30, current_gpu)); + + printf("GPU %i: %s\n", num_gpus, gpu_name); + + + if (strncmp (gpu_name, "Tesla K40", 9) == 0) { + printf ("Found a Tesla GPU! I'll use that one.\n"); + gpu = current_gpu; + break; + } + } + + //The CU_CTX_MAP_HOST is what we are interested in! + CUcontext context; + initAssert (cuCtxCreate (&context, CU_CTX_MAP_HOST | CU_CTX_SCHED_AUTO, gpu)); + initAssert (cuCtxSetCurrent (context)); + + //NOTE: API Version 3010 is problematic + //(see https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/html/group__CUDART__DRIVER.html) + unsigned int api_version; + initAssert (cuCtxGetApiVersion (context, &api_version)); + printf ("CUDA API Version: %u\n", api_version); + printf ("CUDA init done\n\n"); + + CUdevprop gpu_props; + initAssert(cuDeviceGetProperties(&gpu_props, gpu)); + printf ("Clock %lu KHz\n", gpu_props.clockRate); + + CUdeviceptr d_A, d_D; + initAssert(cuMemAlloc(&d_D, GPU_PAGE)); // Should be multiple of GPU page, or mapping of next allocation will segfault the gdrcopy module + initAssert(cuMemAlloc(&d_A, PAGE_SIZE)); + unsigned int flag = 1; + initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_D)); + initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_A)); + + gdr_mh_t A_mh, D_mh; + gdr_info_t A_info, D_info; + void *A_bar_ptr = NULL; + void *D_bar_ptr = NULL; + + + gdr_t g = gdr_open(); + gdrAssert(g == NULL); + + gdrAssert(gdr_pin_buffer(g, d_D, GPU_PAGE, 0, 0, &D_mh)); + gdrAssert(gdr_map(g, D_mh, &D_bar_ptr, GPU_PAGE)); + gdrAssert(gdr_get_info(g, D_mh, &D_info)); + + gdrAssert(gdr_pin_buffer(g, d_A, PAGE_SIZE, 0, 0, &A_mh)); + gdrAssert(gdr_map(g, A_mh, &A_bar_ptr, PAGE_SIZE)); + gdrAssert(gdr_get_info(g, A_mh, &A_info)); + + int D_bar_off = D_info.va - d_D; + volatile uint32_t *D = (uint32_t *)((char *)D_bar_ptr + D_bar_off); + + int A_bar_off = A_info.va - d_A; + volatile uint32_t *A = (uint32_t *)((char *)A_bar_ptr + A_bar_off); + + printf("DevicePtr: %lx, GDR ptr: %p, Bus ptr: %lx, (Bar: %p, Offset: %i), VA: 0x%lx, Size: %lu, Page: %lu\n", d_A, A, A_info.bus_addr, A_bar_ptr, A_bar_off, A_info.va, A_info.mapped_size, A_info.page_size); + + pcilib_t *pci; + volatile void *bar; + const pcilib_bar_info_t *bar_info; + + pci = pcilib_open(DEVICE, PCILIB_MODEL_DETECT); + if (!pci) { + printf("pcilib_open\n"); + exit(1); + } + bar = pcilib_resolve_bar_address(pci, BAR, 0); + if (!bar) { + pcilib_close(pci); + printf("map bar\n"); + exit(1); + } + printf("BAR mapped to: %p\n", bar); + + CUdeviceptr dBAR; +// initAssert (cuMemHostRegister ((void*)((((uintptr_t)bar)/65536)*65536), 65536, CU_MEMHOSTREGISTER_DEVICEMAP)); + initAssert (cuMemHostRegister ((void*)bar, 4096, CU_MEMHOSTREGISTER_IOMEMORY)); + initAssert (cuMemHostGetDevicePointer(&dBAR, (void*)bar, 0)); + + bar_info = pcilib_get_bar_info(pci, BAR); + printf("%p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size); + + pcilib_kmem_handle_t *kdesc_kmem = pcilib_alloc_kernel_memory (pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS); + uintptr_t kdesc_bus = pcilib_kmem_get_block_ba (pci, kdesc_kmem, 0); + volatile void *kdesc = (uint32_t *) pcilib_kmem_get_block_ua (pci, kdesc_kmem, 0); + + + pcilib_kmem_handle_t *kbuf_kmem = pcilib_alloc_kernel_memory(pci, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((PAGE_SIZE%4096)?(4096 * (1 + PAGE_SIZE/4096)):PAGE_SIZE), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS); + uintptr_t kbuf_bus = pcilib_kmem_get_block_ba (pci, kbuf_kmem, 0); + volatile uint32_t *kbuf = (uint32_t *) pcilib_kmem_get_block_ua (pci, kbuf_kmem, 0); + memset ((uint32_t *)kbuf, 0, PAGE_SIZE); + +#ifdef GPU_DESC + volatile void *desc = D; + uintptr_t desc_bus = D_info.bus_addr; +#else + volatile void *desc = kdesc; + uintptr_t desc_bus = kdesc_bus; +#endif + + + memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t)); + volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + 2 * sizeof(uint32_t)); + + WR32 (REG_RESET_DMA, 1); + usleep (100000); + WR32 (REG_RESET_DMA, 0); + usleep (100000); + + WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE)); + WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE); + WR32 (REG_UPDATE_THRESHOLD, 0); + WR64 (REG_UPDATE_ADDRESS, desc_bus); + WR32 (REG_DMA, 1); + WR32 (REG_COUNTER, 1); + +#ifdef VERBOSE + struct timespec tss, tse, tsk; +#else + struct timeval tvs, tve; +#endif /* VERBOSE */ + + for (int i = 0; i < ITERS; i++) { + clock_gettime(CLOCK_REALTIME, &tss); + +#ifdef GPU_DESC + ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint64_t*)d_D, (uint32_t*)d_A); +#else + WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); +// WR64 (REG_DESCRIPTOR_ADDRESS, kbuf_bus); + + do { + } while (*hwaddr == 0); + clock_gettime(CLOCK_REALTIME, &tse); + + null<<<1, 1>>>((uint32_t*)d_A); +#endif + cudaDeviceSynchronize(); + + clock_gettime(CLOCK_REALTIME, &tsk); + + *hwaddr = 0; + +#ifdef VERBOSE + initAssert(cuMemcpyDtoH((void*)kbuf, d_A, PAGE_SIZE)); + +# ifdef GPU_DESC + double lat = 1000. * kbuf[0] / gpu_props.clockRate; + double latk = 1000. * kbuf[1] / gpu_props.clockRate; + double latc = ((tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS; +#else + double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; + double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.; + double latc = 0; +#endif + + printf("Latency: %.3lf us / %.3lf us (%.3lf us) %x %x %x %x\n", lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]); +#else + if (!i) gettimeofday(&tvs, NULL); +#endif /* VERBOSE */ + + } + +#ifndef VERBOSE + gettimeofday(&tve, NULL); + size_t avglat = (tve.tv_sec - tvs.tv_sec)*1000000 + (tve.tv_usec - tvs.tv_usec); + printf("Latency: %.3lf us (average for %i iterations)\n", 1. * avglat / ITERS, ITERS); +#endif /* VERBOSE */ + + usleep(1000000); + + + + WR32 (REG_COUNTER, 0); + WR32 (REG_DMA, 0); + + WR32 (REG_RESET_DMA, 1); + usleep (100000); + WR32 (REG_RESET_DMA, 0); + usleep (100000); + + pcilib_free_kernel_memory(pci, kbuf_kmem, KMEM_DEFAULT_FLAGS); + pcilib_free_kernel_memory(pci, kdesc_kmem, KMEM_DEFAULT_FLAGS); + + pcilib_close(pci); + printf("PCI closed\n"); + + + gdr_unmap(g, A_mh, A_bar_ptr, PAGE_SIZE); + gdr_unpin_buffer(g, A_mh); + + gdr_unmap(g, D_mh, D_bar_ptr, GPU_PAGE); + gdr_unpin_buffer(g, D_mh); + + gdr_close(g); + cuMemFree(d_A); + cuMemFree(d_D); + printf("GDR closed\n"); +} diff --git a/gdrcopy.diff b/gdrcopy.diff new file mode 100644 index 0000000..c7043c7 --- /dev/null +++ b/gdrcopy.diff @@ -0,0 +1,57 @@ +diff --git a/gdrapi.c b/gdrapi.c +index e38fb8a..c9faeb5 100644 +--- a/gdrapi.c ++++ b/gdrapi.c +@@ -218,6 +218,7 @@ int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) + info->page_size = params.page_size; + info->tm_cycles = params.tm_cycles; + info->cycles_per_ms = params.tsc_khz; ++ info->bus_addr = params.bus_addr; + } + return ret; + } +diff --git a/gdrapi.h b/gdrapi.h +index da02719..006f7f0 100644 +--- a/gdrapi.h ++++ b/gdrapi.h +@@ -89,6 +89,7 @@ struct gdr_info { + uint32_t page_size; + uint64_t tm_cycles; + uint32_t cycles_per_ms; ++ uint64_t bus_addr; + }; + typedef struct gdr_info gdr_info_t; + int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info); +diff --git a/gdrdrv/gdrdrv.c b/gdrdrv/gdrdrv.c +index 8363051..8e78441 100644 +--- a/gdrdrv/gdrdrv.c ++++ b/gdrdrv/gdrdrv.c +@@ -443,11 +443,16 @@ static int gdrdrv_get_info(gdr_info_t *info, void __user *_params) + return -EINVAL; + } + ++ struct nvidia_p2p_page *page = mr->page_table->pages[0]; ++ unsigned long page_paddr = page->physical_address; ++ unsigned long paddr = page_paddr + mr->offset; ++ + params.va = mr->va; + params.mapped_size = mr->mapped_size; + params.page_size = mr->page_size; + params.tm_cycles = mr->tm_cycles; + params.tsc_khz = mr->tsc_khz; ++ params.bus_addr = paddr; + + if (copy_to_user(_params, ¶ms, sizeof(params))) { + gdr_err("copy_to_user failed on user pointer %p\n", _params); +diff --git a/gdrdrv/gdrdrv.h b/gdrdrv/gdrdrv.h +index 672a203..e1fd2a5 100644 +--- a/gdrdrv/gdrdrv.h ++++ b/gdrdrv/gdrdrv.h +@@ -77,6 +77,7 @@ struct GDRDRV_IOC_GET_INFO_PARAMS + __u32 page_size; + __u32 tsc_khz; + __u64 tm_cycles; ++ __u64 bus_addr; + }; + + #define GDRDRV_IOC_GET_INFO _IOWR(GDRDRV_IOCTL, 4, struct GDRDRV_IOC_GET_INFO_PARAMS *) diff --git a/insmod.sh b/insmod.sh new file mode 100755 index 0000000..de5eb3c --- /dev/null +++ b/insmod.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +THIS_DIR=$(dirname $0) + +# remove driver +grep gdrdrv /proc/devices >/dev/null && sudo /sbin/rmmod gdrdrv + +# insert driver +#sudo /sbin/insmod gdrdrv/gdrdrv.ko +modprobe gdrdrv + +# create device inodes +major=`fgrep gdrdrv /proc/devices | cut -b 1-4` +echo "INFO: driver major is $major" + +# remove old inodes just in case +if [ -e /dev/gdrdrv ]; then + sudo rm /dev/gdrdrv +fi + +echo "INFO: creating /dev/gdrdrv inode" +sudo mknod /dev/gdrdrv c $major 0 +sudo chmod a+w+r /dev/gdrdrv diff --git a/ipedma.h b/ipedma.h new file mode 100644 index 0000000..284b058 --- /dev/null +++ b/ipedma.h @@ -0,0 +1,20 @@ +#define REG_RESET_DMA 0x00 +#define REG_DMA 0x04 +#define REG_NUM_PACKETS_PER_DESCRIPTOR 0x10 +#define REG_PERF_COUNTER 0x28 +#define REG_PACKET_LENGTH 0x0C +#define REG_DESCRIPTOR_ADDRESS 0x50 +#define REG_UPDATE_ADDRESS 0x58 +#define REG_UPDATE_THRESHOLD 0x60 + + +#define REG_COUNTER 0x9000 + +#define WR32(addr, value) *(uint32_t *) (((char*)(bar)) + (addr)) = (value); +#define RD32(addr) (*(uint32_t *) (((char*)(bar)) + (addr))) +#define WR32_sleep(addr, value) *(uint32_t *) (((char*)(bar)) + (addr)) = (value); usleep (100); + +#define WR64(addr, value) *(uint64_t *) (((char*)(bar)) + (addr)) = (value); +#define RD64(addr) (*(uint64_t *) (((char*)(bar)) + (addr))) +#define WR64_sleep(addr, value) *(uint64_t *) (((char*)(bar)) + (addr)) = (value); usleep (100); + diff --git a/kernels.cu b/kernels.cu new file mode 100644 index 0000000..341bb59 --- /dev/null +++ b/kernels.cu @@ -0,0 +1,48 @@ +#include <cuda.h> +#include <stdint.h> + +#include "config.h" +#include "ipedma.h" + +__global__ void null(uint32_t *data) { +} + + +__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t *desc, uint32_t *data) { + int i; + clock_t sum = 0, sumk = 0, t1, t2, t3; + + for (i = 0; i < GPU_ITERS; i++) { + long wait = 0; + + desc[1] = 0; + + t1 = clock64(); + WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); + + do { + if (++wait > 0x10000) break; + } while (desc[1] == 0); + + t2 = clock64(); + + null<<<1,1>>>(data); + cudaDeviceSynchronize(); + t3 = clock64(); + + sum += t2 - t1; + sumk += t3 - t1; + } + + data[0] = sum / GPU_ITERS; + data[1] = sumk / GPU_ITERS; +} + + + + +/* +__global__ void do_leet (int *rin) { + *rin = 0x1337; +} +*/ diff --git a/kernels.h b/kernels.h new file mode 100644 index 0000000..74a0b44 --- /dev/null +++ b/kernels.h @@ -0,0 +1,5 @@ +__global__ void null(uint32_t *data); + +__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t *desc, uint32_t *data); + + |