Difference between revisions of "CoE Cluster november 2011/CUDA"
From Teknologisk videncenter
(One intermediate revision by the same user not shown) | |||
Line 1: | Line 1: | ||
+ | We shall use the following two files (one in C++ and one in CUDA C) as the basis for further development of our parallel programming skills, using Nvidia's CUDA technology for parallel processing on board Graphics Processing Units (GPUs). The programs currently remove all blue tones from a selected still image. We will develop the idea further to explore performance and algorithmic issues when using CUDA and OpenCV (see literature list) to detect skin tones in still and moving images: | ||
+ | |||
<pre>// Read image from file, remove blue channel and write onto another file | <pre>// Read image from file, remove blue channel and write onto another file | ||
Line 44: | Line 46: | ||
return 0; | return 0; | ||
− | } </pre> | + | } </pre> |
<br> | <br> | ||
− | <pre | + | <pre> // removeblue.cu |
− | + | // Read image from file, remove blue channel and write onto another file | |
− | |||
− | unsigned char *CUDAimg; | + | //#include <stdio.h> |
+ | //#include <stdlib.h> | ||
+ | //#include <unistd.h> | ||
+ | |||
+ | __device__ void RGB(int x, int y, unsigned char* m, int step, int r, int g, int b) | ||
+ | { | ||
+ | unsigned char *p; | ||
+ | p = ((unsigned char *) (m + step*x))+3*y; | ||
+ | *p = (unsigned char) b; | ||
+ | *(p+1) = (unsigned char) g; | ||
+ | *(p+2) = (unsigned char) r; | ||
+ | } | ||
+ | __device__ void RGB(int x, int y, unsigned char* m, int step, int rgb) | ||
+ | { | ||
+ | unsigned char *p; | ||
+ | p = ((unsigned char *) (m + step*x)+3*y); | ||
+ | *p = (unsigned char) (rgb&0xff); | ||
+ | *(p+1) = (unsigned char) ((rgb>>8)&0xff); | ||
+ | *(p+2) = (unsigned char) ((rgb>>16)&0xff); | ||
+ | } | ||
+ | __device__ int RGB(int x, int y, unsigned char* m, int step) | ||
+ | { | ||
+ | unsigned char *p; | ||
+ | unsigned int b; | ||
+ | p = ((unsigned char *) (m + step*x)+3*y); | ||
+ | b = *p+((*(p+1))<<8)+((*(p+2))<<16); | ||
+ | return b; | ||
+ | } | ||
+ | |||
+ | __global__ void Count(unsigned char *img, int rows, int cols, int step) | ||
+ | { | ||
+ | float rowfac = ((float) rows)/gridDim.x; | ||
+ | int rowstart = blockIdx.x*rowfac; | ||
+ | int rowend = (blockIdx.x+1)*rowfac; | ||
+ | float colfac = ((float) cols)/blockDim.x; | ||
+ | int colstart = threadIdx.x*colfac; | ||
+ | int colend = (threadIdx.x+1)*colfac; | ||
+ | for (int i=rowstart; i<rowend; i++) | ||
+ | { | ||
+ | for (int j=colstart; j<colend; j++) | ||
+ | RGB(i, j, img, step, RGB(i, j, img, step)&0x0000ff); | ||
+ | } | ||
+ | } | ||
+ | |||
+ | void startCUDA(int blocks, int threads, unsigned char* img, int rows, int cols, int step) | ||
+ | { | ||
+ | if (img!=NULL) | ||
+ | { | ||
+ | dim3 dimBlock(threads, threads, 1); | ||
+ | dim3 dimGrid(blocks, 1, 1); | ||
+ | |||
+ | unsigned char *CUDAimg; | ||
+ | cudaMalloc((void**) &CUDAimg, rows*cols*3); | ||
+ | cudaMemcpy(CUDAimg, img, rows*cols*3, cudaMemcpyHostToDevice); | ||
+ | Count<<<dimGrid, dimBlock>>>(CUDAimg, rows, cols, step); | ||
+ | cudaMemcpy(img, CUDAimg, rows*cols*3, cudaMemcpyDeviceToHost); | ||
+ | cudaFree(CUDAimg); | ||
+ | } | ||
+ | } | ||
+ | |||
</pre> | </pre> |
Latest revision as of 10:32, 9 December 2011
We shall use the following two files (one in C++ and one in CUDA C) as the basis for further development of our parallel programming skills, using Nvidia's CUDA technology for parallel processing on board Graphics Processing Units (GPUs). The programs currently remove all blue tones from a selected still image. We will develop the idea further to explore performance and algorithmic issues when using CUDA and OpenCV (see literature list) to detect skin tones in still and moving images:
// Read image from file, remove blue channel and write onto another file include <iostream> include <string> include <sstream> include <cv.h> include <highgui.h> using namespace cv; using namespace std; include "PixelMat.hpp" void startCUDA(int blocks, int threads, unsigned char* img, int rows, int cols, int step); int main(int argc, char* argv[]) { int blocks = 0; int threads = 0; if (argc > 4) { istringstream buf1(argv[1]); buf1 >> blocks; istringstream buf2(argv[2]); buf2 >> threads; PixelMat img = (PixelMat) imread(argv[3],-1); int rows = img.rows; int cols = img.cols; int step = img.step; fprintf(stderr, "%d\n", step); startCUDA(blocks, threads, img.data, rows, cols, step); imwrite(argv[4], img); } else cout << "Usage: " << argv[0] << " <blocks> <threads> <inputfile> <outputfile>" << endl; return 0; }
// removeblue.cu
// Read image from file, remove blue channel and write onto another file

#include <stdio.h>
//#include <stdlib.h>
//#include <unistd.h>

// Every helper below addresses pixels as 3 consecutive bytes.
enum { kBytesPerPixel = 3 };

// Address of the first byte of the pixel in row `x`, column `y`.
// `step` is the distance in bytes between the starts of two consecutive rows.
static __device__ __forceinline__ unsigned char* pixelAt(unsigned char* m, int step, int x, int y)
{
    // size_t arithmetic so that step*x cannot overflow int on large images.
    return m + (size_t) step * x + (size_t) kBytesPerPixel * y;
}

// Store one pixel from separate components.
// Byte order in memory is b, g, r (offsets 0, 1, 2).
__device__ void RGB(int x, int y, unsigned char* m, int step, int r, int g, int b)
{
    unsigned char *p = pixelAt(m, step, x, y);
    p[0] = (unsigned char) b;
    p[1] = (unsigned char) g;
    p[2] = (unsigned char) r;
}

// Store one pixel from a packed value: bits 0-7 go to byte 0, bits 8-15 to
// byte 1 and bits 16-23 to byte 2 (i.e. 0xRRGGBB with the byte order above).
__device__ void RGB(int x, int y, unsigned char* m, int step, int rgb)
{
    unsigned char *p = pixelAt(m, step, x, y);
    p[0] = (unsigned char) (rgb & 0xff);
    p[1] = (unsigned char) ((rgb >> 8) & 0xff);
    p[2] = (unsigned char) ((rgb >> 16) & 0xff);
}

// Load one pixel as a packed value, the inverse of the packed store above.
__device__ int RGB(int x, int y, unsigned char* m, int step)
{
    const unsigned char *p = pixelAt(m, step, x, y);
    unsigned int packed = p[0] + (p[1] << 8) + (p[2] << 16);
    return packed;
}

// Kernel: clear the blue byte (byte 0) of every pixel of a rows x cols image.
//
// Launch layout: 1-D grid and 1-D block. Block b of gridDim.x handles the row
// range [b*rows/gridDim.x, (b+1)*rows/gridDim.x); thread t of blockDim.x
// handles the column range [t*cols/blockDim.x, (t+1)*cols/blockDim.x) inside
// those rows. No shared memory is used.
// Preconditions: `img` is device memory holding `rows` rows of `step` bytes,
// with step >= 3*cols.
__global__ void Count(unsigned char *img, int rows, int cols, int step)
{
    // Only the x dimension takes part in the partition; any extra threads of
    // a 2-D/3-D launch would merely repeat (and race on) the same pixels.
    if (threadIdx.y != 0 || threadIdx.z != 0 || blockIdx.y != 0 || blockIdx.z != 0)
        return;

    // Integer partitioning: the ranges tile [0, rows) and [0, cols) exactly.
    // The float factors used previously could round the last boundary down
    // and leave the final row or column unprocessed.
    const int rowstart = (int) (((long long) blockIdx.x * rows) / gridDim.x);
    const int rowend   = (int) (((long long) (blockIdx.x + 1) * rows) / gridDim.x);
    const int colstart = (int) (((long long) threadIdx.x * cols) / blockDim.x);
    const int colend   = (int) (((long long) (threadIdx.x + 1) * cols) / blockDim.x);

    for (int i = rowstart; i < rowend; i++)
    {
        for (int j = colstart; j < colend; j++)
        {
            // Blue is the low byte of the packed value; keep green and red.
            // (The former mask 0x0000ff kept blue and erased the other two.)
            RGB(i, j, img, step, RGB(i, j, img, step) & 0xffff00);
        }
    }
}

// Report a failed CUDA runtime call on stderr; returns true on success.
static bool cudaOk(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "startCUDA: %s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Remove the blue channel of an image in place, using the GPU.
//
//   blocks   number of CUDA blocks (rows are divided among them), > 0
//   threads  number of threads per block (columns are divided among them), > 0
//   img      host pixel buffer, 3 bytes per pixel in b, g, r order; a NULL
//            pointer makes the call a no-op
//   rows     image height in pixels
//   cols     image width in pixels
//   step     bytes between the starts of consecutive rows, >= 3*cols
//
// On any invalid argument or CUDA failure a message is written to stderr and
// `img` is left unmodified.
void startCUDA(int blocks, int threads, unsigned char* img, int rows, int cols, int step)
{
    if (img == NULL)
        return;

    if (blocks <= 0 || threads <= 0 || rows <= 0 || cols <= 0)
    {
        fprintf(stderr, "startCUDA: blocks, threads, rows and cols must all be positive\n");
        return;
    }

    const size_t rowBytes = (size_t) cols * kBytesPerPixel;
    if (step < 0 || (size_t) step < rowBytes)
    {
        fprintf(stderr, "startCUDA: step (%d) is smaller than one row of 3-byte pixels\n", step);
        return;
    }

    // The kernel partitions columns over threadIdx.x only, so the block is
    // 1-D. A threads x threads block would repeat the same work `threads`
    // times.
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);

    // The kernel addresses rows with `step`, so the device buffer uses the
    // same row pitch as the host image instead of assuming step == 3*cols.
    unsigned char *CUDAimg = NULL;
    if (!cudaOk(cudaMalloc((void**) &CUDAimg, (size_t) rows * step), "cudaMalloc"))
        return;

    // 2-D copies move only the pixel bytes of each row, never the padding.
    bool ok = cudaOk(cudaMemcpy2D(CUDAimg, step, img, step, rowBytes, rows,
                                  cudaMemcpyHostToDevice),
                     "host-to-device copy");
    if (ok)
    {
        Count<<<dimGrid, dimBlock>>>(CUDAimg, rows, cols, step);
        // A bad launch configuration is only visible through cudaGetLastError.
        ok = cudaOk(cudaGetLastError(), "kernel launch");
    }
    if (ok)
    {
        // Blocking copy: also surfaces errors raised while the kernel ran.
        cudaOk(cudaMemcpy2D(img, step, CUDAimg, step, rowBytes, rows,
                            cudaMemcpyDeviceToHost),
               "device-to-host copy");
    }

    cudaOk(cudaFree(CUDAimg), "cudaFree");
}