Getting started with CUDA: combining OpenCV and CUDA programming (2)


OpenCV loads the images, and the raw image data is then handed to a CUDA kernel for processing.

#include <iostream>
#include <stdio.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <cuda_runtime.h>

using namespace std;
using namespace cv;

#define NUM_BLOCK   300   // Number of thread blocks
#define NUM_THREAD  64    // Number of threads per block

// Element-wise saturated addition: a[i] = min(a[i] + b[i], 255).
// Each thread walks the buffer with a grid-stride loop.
__global__ void hello(uchar *a, uchar *b, int n, int nthreads, int nblocks)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;   // sequential thread index across the blocks
    for (int i = idx; i < n; i += nthreads*nblocks) {
        int v = a[i] + b[i];                         // add in int so the sum cannot wrap around in uchar
        a[i] = (v > 255) ? 255 : (uchar)v;
    }
}

int main()
{
    // Load both images as single-channel grayscale (flag 0)
    IplImage* img1 = cvLoadImage("test1.jpg", 0);
    IplImage* img2 = cvLoadImage("test2.jpg", 0);
    uchar* a = (uchar*)img1->imageData;
    uchar* b = (uchar*)img2->imageData;
    int N = img1->height * img1->widthStep;          // buffer size in bytes (including row padding)

    uchar *ad, *bd;
    const int csize = N*sizeof(uchar);
    const int isize = N*sizeof(uchar);
    cudaMalloc((void**)&ad, csize);                  // allocate device buffers
    cudaMalloc((void**)&bd, isize);
    cudaMemcpy(ad, a, csize, cudaMemcpyHostToDevice);
    cudaMemcpy(bd, b, isize, cudaMemcpyHostToDevice);

    dim3 dimGrid(NUM_BLOCK, 1, 1);                   // Grid dimensions
    dim3 dimBlock(NUM_THREAD, 1, 1);                 // Block dimensions
    hello<<<dimGrid, dimBlock>>>(ad, bd, N, NUM_THREAD, NUM_BLOCK);

    // Copy the summed image back into img2's buffer and display it
    cudaMemcpy(b, ad, csize, cudaMemcpyDeviceToHost);
    cudaFree(ad);
    cudaFree(bd);

    cvNamedWindow("图像显示", CV_WINDOW_AUTOSIZE);
    cvShowImage("图像显示", img2);
    cvWaitKey(0);
    return 0;
}
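The IplImage/cvLoadImage C interface used above is deprecated in newer OpenCV releases. A minimal sketch of the same host-side flow with the C++ cv::Mat interface might look like the following; the kernel name addWithSaturation, the file names, and the launch configuration are illustrative placeholders, and error checking is omitted.

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <cuda_runtime.h>

// Same element-wise saturated add as the kernel above; the name is illustrative.
__global__ void addWithSaturation(unsigned char *a, const unsigned char *b, int n)
{
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += gridDim.x*blockDim.x) {
        int v = a[i] + b[i];
        a[i] = (v > 255) ? 255 : (unsigned char)v;
    }
}

int main()
{
    cv::Mat img1 = cv::imread("test1.jpg", 0);   // load as single-channel grayscale
    cv::Mat img2 = cv::imread("test2.jpg", 0);
    if (img1.empty() || img2.empty() || img1.size() != img2.size()) return -1;

    size_t bytes = img1.total() * img1.elemSize();
    unsigned char *ad, *bd;
    cudaMalloc(&ad, bytes);
    cudaMalloc(&bd, bytes);
    cudaMemcpy(ad, img1.data, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(bd, img2.data, bytes, cudaMemcpyHostToDevice);

    addWithSaturation<<<300, 64>>>(ad, bd, (int)bytes);
    cudaMemcpy(img1.data, ad, bytes, cudaMemcpyDeviceToHost);  // overwrite img1 with the sum

    cudaFree(ad);
    cudaFree(bd);
    cv::imshow("result", img1);
    cv::waitKey(0);
    return 0;
}

The cv::Mat version also sidesteps the widthStep padding issue, since imread returns a contiguous buffer whose size is simply total() * elemSize().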
Reference code: computing π. The kernel approximates π as the integral of 4/(1+x^2) over [0,1] using the midpoint rule with NBIN bins; each thread accumulates its share of the bins into its own slot of sum, and the host adds up the per-thread partial sums.

#include <stdio.h>
#include <windows.h>
#include <cuda.h>

#define NBIN 1000000000   // Number of bins
#define NUM_BLOCK   300   // Number of thread blocks
#define NUM_THREAD  64    // Number of threads per block

int tid;
float pi = 0;

// Kernel that executes on the CUDA device
__global__ void cal_pi(float *sum, int nbin, float step, int nthreads, int nblocks)
{
    int i;
    float x;
    int idx = blockIdx.x*blockDim.x + threadIdx.x;  // Sequential thread index across the blocks
    for (i = idx; i < nbin; i += nthreads*nblocks) {
        x = (i + 0.5) * step;                       // midpoint of bin i
        sum[idx] += 4.0 / (1.0 + x*x);              // per-thread partial sum
    }
}

// Main routine that executes on the host
int main(void)
{
    LARGE_INTEGER frec, strt, ed;
    QueryPerformanceFrequency(&frec);
    QueryPerformanceCounter(&strt);

    dim3 dimGrid(NUM_BLOCK, 1, 1);   // Grid dimensions
    dim3 dimBlock(NUM_THREAD, 1, 1); // Block dimensions
    float *sumHost, *sumDev;         // Pointers to host & device arrays
    float step = 1.0 / NBIN;         // Step size
    size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float);  // Array memory size

    sumHost = (float *)malloc(size);        // Allocate array on host
    cudaMalloc((void **)&sumDev, size);     // Allocate array on device
    cudaMemset(sumDev, 0, size);            // Initialize device array to 0

    // Do the calculation on the device
    cal_pi<<<dimGrid, dimBlock>>>(sumDev, NBIN, step, NUM_THREAD, NUM_BLOCK);

    // Retrieve the per-thread partial sums and reduce them on the host
    cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost);
    for (tid = 0; tid < NUM_THREAD*NUM_BLOCK; tid++)
        pi += sumHost[tid];
    pi *= step;

    // Print results
    printf("PI = %f\n", pi);

    // Cleanup
    free(sumHost);
    cudaFree(sumDev);

    QueryPerformanceCounter(&ed);
    // Elapsed time in milliseconds; the cast keeps the %e format valid
    printf("%e ms\n", (double)(ed.QuadPart - strt.QuadPart) * 1000.0 / (double)frec.QuadPart);
    return 0;
}
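For a quick sanity check, the same midpoint-rule sum can be evaluated on the host in plain C++; this is only a sketch, and the smaller bin count below is an illustrative choice that is already enough for about six correct digits of π.

#include <cstdio>

int main()
{
    const long long nbin = 1000000;    // fewer bins than the GPU version
    const double step = 1.0 / nbin;
    double sum = 0.0;
    for (long long i = 0; i < nbin; ++i) {
        double x = (i + 0.5) * step;   // midpoint of bin i
        sum += 4.0 / (1.0 + x * x);    // integrand of 4/(1+x^2) on [0,1], whose integral is pi
    }
    printf("PI = %f\n", sum * step);
    return 0;
}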

