【推荐】2019 Java 开发者跳槽指南.pdf(吐血整理) >>>
随机产生2000万个点的点云数据,对这些点逐一执行加法操作,比较CPU和GPU在耗时上的差异。
代码如下,文件名为main.cu。
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <boost/version.hpp>
#include <boost/numeric/conversion/cast.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/thread/condition.hpp>
#include <boost/thread.hpp>
#include <boost/thread/thread.hpp>
#include <boost/filesystem.hpp>
#include <boost/bind.hpp>
#include <boost/cstdint.hpp>
#include <boost/function.hpp>
#include <boost/tuple/tuple.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/weak_ptr.hpp>
#include <boost/mpl/fold.hpp>
#include <boost/mpl/inherit.hpp>
#include <boost/mpl/inherit_linearly.hpp>
#include <boost/mpl/joint_view.hpp>
#include <boost/mpl/transform.hpp>
#include <boost/mpl/vector.hpp>
#include <boost/algorithm/string.hpp>
#include <pcl/point_types.h>
#include <pcl/io/pcd_io.h>
#include <pcl/gpu/containers/device_array.h>
#include <pcl/gpu/containers/device_array.hpp>
/*
 * get_tid(): flattened global thread index for a launch with a 1D block and a
 * 2D grid. Blocks are numbered row by row (blockIdx.y selects the row, each
 * row holding gridDim.x blocks) and threadIdx.x is the offset inside the block.
 * It expands to CUDA built-in variables, so it is only usable in device code.
 */
#define get_tid() (blockDim.x * (blockIdx.x + blockIdx.y * gridDim.x) + threadIdx.x)
/* get_bid(): flattened block index for a 2D grid, using the same row-by-row numbering. */
#define get_bid() (blockIdx.x + blockIdx.y * gridDim.x)
/**
 * Adds the constant offset (2, 3, 4) to the x/y/z fields of the first N points
 * of cloud_device.
 *
 * Launch layout: 1D blocks on a 1D or 2D grid, i.e. the layout get_tid()
 * assumes. The loop advances by the total number of launched threads, so each
 * index in [0, N) is updated exactly once whether or not the grid covers N.
 *
 * @param cloud_device device-side view of the point buffer; it must hold at
 *                     least N points
 * @param N            number of points to update; nothing happens for N <= 0
 */
__global__ void cloud2GPU(pcl::gpu::PtrSz<pcl::PointXYZ> cloud_device, int N)
{
    // Stride = every thread of the launch. It has to include gridDim.y:
    // stepping by blockDim.x * gridDim.x alone would land on indices that
    // other rows of the 2D grid already own.
    const long long stride = (long long)blockDim.x * gridDim.x * gridDim.y;

    // 64-bit loop index so that idx + stride cannot overflow near INT_MAX.
    for (long long idx = get_tid(); idx < N; idx += stride)
    {
        // float literals keep the additions in single precision
        cloud_device[idx].x += 2.0f;
        cloud_device[idx].y += 3.0f;
        cloud_device[idx].z += 4.0f;
    }
}
/**
 * Host reference implementation: adds (2, 3, 4) to the x/y/z fields of every
 * point stored in the cloud, in place.
 *
 * @param cloud point cloud to modify
 * @return always true
 */
bool cloud2CPU(pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud)
{
    std::size_t k = 0;
    while (k < cloud->points.size ())
    {
        pcl::PointXYZ& pt = cloud->points[k];
        pt.x += 2.0;
        pt.y += 3.0;
        pt.z += 4.0;
        ++k;
    }
    return true;
}
/*
 * Warm-up kernel: does a trivial integer addition on local variables and
 * writes nothing to memory. It exists only so that something can be launched.
 */
__global__ void warmup_knl()
{
    int i = 1;
    int j = 2;
    i = i + j;
}
/**
 * Launches the warm-up kernel eight times (one block of 256 threads each) so
 * that one-time start-up cost is paid before any timed work.
 *
 * The launches are asynchronous and this function does not wait for them;
 * callers that need the device idle must synchronize afterwards. A launch
 * failure is reported on stderr but is not treated as fatal here.
 */
void warmup()
{
    const int kLaunches = 8;
    for (int i = 0; i < kLaunches; i++)
    {
        warmup_knl<<<1, 256>>>();
    }

    // A kernel launch returns no status of its own; ask the runtime explicitly
    // so a broken device/driver setup is visible instead of silently skewing
    // the measurements that follow.
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        std::cerr << "warmup launch failed: " << cudaGetErrorString(err) << std::endl;
    }
}
/**
 * Benchmark driver: fills a cloud of 20 million points with random x/y values,
 * applies the same per-point offset once on the CPU and once on the GPU, and
 * prints the time each pass took.
 *
 * The GPU figure covers upload + kernel + download. Both figures come from
 * clock(), which reports processor time consumed by the process.
 *
 * @return 0 on success, 1 if the kernel launch or its execution failed
 */
int main()
{
    const int N = 20000000;
    /* 1D block */
    const int bs = 256;
    /* 2D grid: s x s blocks, the smallest square holding at least ceil(N / bs) blocks */
    const int s = ceil(sqrt((N + bs - 1.) / bs));
    const dim3 grid = dim3(s, s);
    clock_t tStart;

    pcl::PointCloud<pcl::PointXYZ>::Ptr cloud(new pcl::PointCloud<pcl::PointXYZ>);
    cloud->width = N;
    cloud->height = 1;
    cloud->points.resize (cloud->width * cloud->height);

    // Generate the data. The multiplication is done in float: with an int
    // constant, 1024 * rand() overflows int whenever rand() exceeds
    // INT_MAX / 1024, which is undefined behaviour.
    for (std::size_t i = 0; i < cloud->points.size (); ++i)
    {
        cloud->points[i].x = 1024.0f * rand () / (RAND_MAX + 1.0f);
        cloud->points[i].y = 1024.0f * rand () / (RAND_MAX + 1.0f);
        cloud->points[i].z = 1.0f;
    }

    // CPU pass
    tStart = clock();
    cloud2CPU(cloud);
    printf("CPU Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);

    // Warm up the GPU so cold-start cost does not distort the measurement.
    warmup();
    pcl::gpu::DeviceArray<pcl::PointXYZ> cloud_device;
    // Block the CPU here until the warm-up launches have finished.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "GPU warm-up failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // GPU pass: host -> device copy, kernel, device -> host copy
    tStart = clock();
    cloud_device.upload(cloud->points);
    cloud2GPU<<<grid, bs>>>(cloud_device, N);
    // Launch-configuration errors only surface through cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cloud2GPU launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    cloud_device.download(cloud->points);
    // Faults raised while the kernel was running are reported by the next
    // synchronizing call, so its result must be checked before printing a time.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cloud2GPU execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("GPU Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
    return 0;
}
CMakeLists.txt内容如下:
# Build script for the CPU-vs-GPU point cloud demo: every *.cu file in this
# directory is compiled into one executable that links against PCL.
cmake_minimum_required(VERSION 2.8.12)
project(cloudProcess)
set(CMAKE_INCLUDE_CURRENT_DIR ON)
# NOTE(review): AUTOMOC is a Qt feature and no Qt sources are visible here;
# kept unchanged, confirm whether it is still needed.
set(CMAKE_AUTOMOC ON)
file(GLOB USER_CU *.cu)
find_package(Eigen3 3.3.7)
find_package(PCL 1.9 REQUIRED)
# cuda_add_executable() below is provided by this package, so configuration
# must stop here with a clear message when CUDA is not installed.
find_package(CUDA REQUIRED)
# Quoted so an unset variable prints an empty path instead of altering the
# argument list of message().
message(STATUS "Eigen3 include dir: ${EIGEN3_INCLUDE_DIR}")
INCLUDE_DIRECTORIES(${EIGEN3_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${PCL_INCLUDE_DIRS})
LINK_DIRECTORIES(${PCL_LIBRARY_DIRS})
add_definitions(${PCL_DEFINITIONS})
cuda_add_executable(${PROJECT_NAME} ${USER_CU})
target_link_libraries (${PROJECT_NAME} ${PCL_LIBRARIES})
来源:oschina
链接:https://my.oschina.net/u/4228078/blog/3155035