CoE Cluster november 2011/CUDA

We shall use the following two files (one in C++ and one in CUDA C) as the basis for further development of our parallel programming skills, using Nvidia's CUDA technology for parallel processing on board Graphics Processing Units (GPUs). The programs currently remove all blue tones from a selected still image. We will develop the idea further to explore performance and algorithmic issues when using CUDA and OpenCV (see literature list) to detect skin tones in still and moving images:

// Read image from file, remove blue channel and write onto another file

#include <iostream>

#include <string>

#include <sstream>

#include <cv.h>

#include <highgui.h>

using namespace cv; using namespace std;

#include "PixelMat.hpp"

// Implemented in removeblue.cu. Copies the pixel buffer `img` to the GPU, runs
// the Count kernel on it with `blocks` blocks and a block size derived from
// `threads`, and copies the result back into `img`.
//   rows, cols - image size in pixels
//   step       - distance in bytes between the starts of two consecutive rows
void startCUDA(int blocks, int threads, unsigned char* img, int rows, int cols, int step);

int main(int argc, char* argv[]) { 

 int blocks = 0;
int threads = 0;

 if (argc > 4)
   {
     istringstream buf1(argv[1]);
     buf1 >> blocks;
     istringstream buf2(argv[2]);
     buf2 >> threads;
     PixelMat img = (PixelMat) imread(argv[3],-1);
     int rows = img.rows;
     int cols = img.cols;
     int step = img.step;

     fprintf(stderr, "%d\n", step);

     startCUDA(blocks, threads, img.data, rows, cols, step);

     imwrite(argv[4], img);
   }
else
   cout << "Usage: " << argv[0] << " <blocks> <threads> <inputfile> <outputfile>" << endl;

 return 0;

}

 // removeblue.cu
// Read image from file, remove blue channel and write onto another file


#include <cstdio>
//#include <stdio.h>
//#include <stdlib.h>
//#include <unistd.h>

// Stores one pixel given as separate colour components.
//   x, y    - pixel position: x is scaled by `step`, y by 3 bytes per pixel
//   m       - start of the image buffer
//   step    - byte offset between consecutive values of x
//   r, g, b - component values; each is truncated to one byte and written
//             in the order b, g, r
__device__ void RGB(int x, int y, unsigned char* m, int step, int r, int g, int b)
{
  unsigned char *px = m + step*x + 3*y;
  px[0] = (unsigned char) b;
  px[1] = (unsigned char) g;
  px[2] = (unsigned char) r;
}
// Stores one pixel given as a packed integer.
//   x, y - pixel position: x is scaled by `step`, y by 3 bytes per pixel
//   m    - start of the image buffer
//   step - byte offset between consecutive values of x
//   rgb  - packed value: bits 0-7 go to the first byte of the pixel,
//          bits 8-15 to the second and bits 16-23 to the third
__device__ void RGB(int x, int y, unsigned char* m, int step, int rgb)
{
  unsigned char *px = m + step*x + 3*y;
  for (int k = 0; k < 3; k++)
    px[k] = (unsigned char) ((rgb >> (8*k)) & 0xff);
}
// Loads one pixel and returns it as a packed integer.
//   x, y - pixel position: x is scaled by `step`, y by 3 bytes per pixel
//   m    - start of the image buffer
//   step - byte offset between consecutive values of x
// Returns the first byte of the pixel in bits 0-7, the second in bits 8-15
// and the third in bits 16-23.
__device__ int RGB(int x, int y, unsigned char* m, int step)
{
  unsigned char *px = m + step*x + 3*y;
  unsigned int packed = px[0];
  packed += px[1] << 8;
  packed += px[2] << 16;
  return packed;
}

// Removes the blue component from every pixel of a 3-bytes-per-pixel image.
//
//   img  - image buffer in device memory, modified in place
//   rows - number of image rows
//   cols - number of pixels per row
//   step - byte offset between the starts of consecutive rows
//
// Launch layout: any grid and block shape is accepted. Blocks and threads are
// flattened to linear ids; block b handles rows b, b+blockCount, ... and
// thread t of a block handles pixels t, t+threadCount, ... of each such row,
// so every pixel is visited exactly once whatever the launch configuration.
// No shared memory is used.
__global__ void Count(unsigned char *img, int rows, int cols, int step)
{
  // Integer strides replace the former float-based partition, whose rounding
  // could leave trailing rows or columns unprocessed. All block dimensions are
  // folded in so that a 2D/3D launch shares the work instead of repeating it.
  const long long blockId = blockIdx.x + (long long) gridDim.x*(blockIdx.y + (long long) gridDim.y*blockIdx.z);
  const long long blockCount = (long long) gridDim.x*gridDim.y*gridDim.z;
  const long long threadId = threadIdx.x + (long long) blockDim.x*(threadIdx.y + (long long) blockDim.y*threadIdx.z);
  const long long threadCount = (long long) blockDim.x*blockDim.y*blockDim.z;

  for (long long i = blockId; i < rows; i += blockCount)
    {
      for (long long j = threadId; j < cols; j += threadCount)
        {
          // The packed value holds blue in bits 0-7 (the byte the component
          // writer labels `b`); clearing those bits removes blue and keeps
          // green and red. The former mask 0x0000ff did the opposite.
          RGB((int) i, (int) j, img, step, RGB((int) i, (int) j, img, step)&0xffff00);
        }
    }
}

// Runs the Count kernel on a host image and stores the result back in place.
//
//   blocks  - number of thread blocks to launch (one-dimensional grid)
//   threads - number of threads per block (one-dimensional block)
//   img     - host pixel buffer, 3 bytes per pixel; nothing is done when NULL
//   rows    - number of image rows
//   cols    - number of pixels per row
//   step    - byte offset between the starts of consecutive rows in img
//
// Invalid arguments and CUDA failures are reported on stderr; in that case
// img is left with whatever content it had when the failure occurred.
void startCUDA(int blocks, int threads, unsigned char* img, int rows, int cols, int step)
{
   if (img == NULL)
      return;

   if (blocks <= 0 || threads <= 0 || rows <= 0 || cols <= 0)
   {
      std::fprintf(stderr, "startCUDA: blocks, threads, rows and cols must be positive\n");
      return;
   }

   // Only the 3*cols payload bytes of each row are transferred; the host row
   // stride must be able to hold them.
   const size_t rowBytes = (size_t) cols * 3;
   if (step < 0 || (size_t) step < rowBytes)
   {
      std::fprintf(stderr, "startCUDA: step (%d) is smaller than 3*cols\n", step);
      return;
   }

   // The kernel only needs a linear thread index, so the block is 1D. The
   // former threads x threads block repeated the same work threads times and
   // failed to launch once threads*threads exceeded the per-block limit.
   dim3 dimBlock(threads, 1, 1);
   dim3 dimGrid(blocks, 1, 1);

   // Pitched allocation plus 2D copies keep host padding (step) and device
   // padding (devPitch) independent, so rows with padding are handled without
   // reading or writing outside either buffer.
   unsigned char *CUDAimg = NULL;
   size_t devPitch = 0;
   cudaError_t err = cudaMallocPitch((void**) &CUDAimg, &devPitch, rowBytes, (size_t) rows);

   if (err == cudaSuccess)
      err = cudaMemcpy2D(CUDAimg, devPitch, img, (size_t) step, rowBytes, (size_t) rows, cudaMemcpyHostToDevice);

   if (err == cudaSuccess)
   {
      Count<<<dimGrid, dimBlock>>>(CUDAimg, rows, cols, (int) devPitch);
      // A launch does not return a status itself; fetch configuration errors.
      err = cudaGetLastError();
   }

   // The blocking copy back also surfaces errors raised while the kernel ran.
   if (err == cudaSuccess)
      err = cudaMemcpy2D(img, (size_t) step, CUDAimg, devPitch, rowBytes, (size_t) rows, cudaMemcpyDeviceToHost);

   // cudaFree accepts a null pointer, so this is safe even if allocation failed.
   cudaFree(CUDAimg);

   if (err != cudaSuccess)
      std::fprintf(stderr, "startCUDA: CUDA error: %s\n", cudaGetErrorString(err));
}