CUDA by example/chapter 4
Normal single-threaded programming
#include <stdio.h>

#define N 10

void add( int *a, int *b, int *c ) {
    int tid = 0;    // this is CPU zero, so we start at zero
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += 1;   // we have one CPU, so we increment by one
    }
}

int main( void ) {
    int a[N], b[N], c[N];

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    add( a, b, c );

    // display the results
    for (int i=0; i<N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    return 0;
}
Basic multithreaded application

The same loop can be split across two CPUs: each CPU starts at a different offset and strides by the number of CPUs, so CPU 1 handles the even elements and CPU 2 the odd elements.

CPU 1:

void add( int *a, int *b, int *c )
{
    int tid = 0;    // CPU 1 starts at element 0
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += 2;   // two CPUs, so increment by two
    }
}

CPU 2:

void add( int *a, int *b, int *c )
{
    int tid = 1;    // CPU 2 starts at element 1
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += 2;   // two CPUs, so increment by two
    }
}
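The chapter does not show how the two copies are actually started. As a rough sketch (using POSIX threads, which the book does not cover; the add_args struct and add_worker function are hypothetical names introduced here), a host program could launch them like this:

#include <pthread.h>

#define N 10

/* hypothetical helper: arguments for one worker thread */
typedef struct {
    int start;          /* starting index: 0 for "CPU 1", 1 for "CPU 2" */
    int *a, *b, *c;
} add_args;

/* runs the same strided loop as above, starting at args->start */
static void *add_worker( void *p ) {
    add_args *args = (add_args*)p;
    int tid = args->start;
    while (tid < N) {
        args->c[tid] = args->a[tid] + args->b[tid];
        tid += 2;       /* two threads, so increment by two */
    }
    return NULL;
}

void add( int *a, int *b, int *c ) {
    pthread_t t0, t1;
    add_args args0 = { 0, a, b, c };   /* "CPU 1": even elements */
    add_args args1 = { 1, a, b, c };   /* "CPU 2": odd elements  */
    pthread_create( &t0, NULL, add_worker, &args0 );
    pthread_create( &t1, NULL, add_worker, &args1 );
    pthread_join( t0, NULL );
    pthread_join( t1, NULL );
}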
CUDA threaded example
The __global__ qualifier tells the nvcc compiler that the add function should be compiled to run on the device. The <<<N,1>>> in the call from main launches N parallel copies (blocks) of the kernel with one thread each; each copy finds its own element through blockIdx.x.
#include <stdio.h>

#define N 10

__global__ void add( int *a, int *b, int *c ) {
    int tid = blockIdx.x;   // this thread handles the data at its block index
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

int main( void ) {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the GPU
    cudaMalloc( (void**)&dev_a, N * sizeof(int) );
    cudaMalloc( (void**)&dev_b, N * sizeof(int) );
    cudaMalloc( (void**)&dev_c, N * sizeof(int) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice );

    // launch N blocks of one thread each
    add<<<N,1>>>( dev_a, dev_b, dev_c );

    // copy the array 'c' back from the GPU to the CPU
    cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost );

    // display the results
    for (int i=0; i<N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    // free the memory allocated on the GPU
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );

    return 0;
}
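Assuming the program is saved as add.cu (the filename is only an example), it can be compiled and run with nvcc:

nvcc add.cu -o add
./add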
Memory on the host and memory on the device

As a general rule, host pointers can access memory from host code, and device pointers can access memory from device code. More precisely (see the sketch after this list):

- You can pass pointers allocated with cudaMalloc() to functions that execute on the device.
- You can use pointers allocated with cudaMalloc() to read or write memory from code that executes on the device.
- You can pass pointers allocated with cudaMalloc() to functions that execute on the host.
- You cannot use pointers allocated with cudaMalloc() to read or write memory from code that executes on the host.
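A minimal sketch of these rules in practice (the inc kernel is a hypothetical example, not from the book): the host may pass the device pointer to host functions such as cudaMemcpy(), and device code may dereference it, but host code must not.

#include <stdio.h>

// device code may read and write through a cudaMalloc'ed pointer
__global__ void inc( int *p ) {
    (*p)++;
}

int main( void ) {
    int *dev_p;
    int host_val = 41;

    cudaMalloc( (void**)&dev_p, sizeof(int) );

    // OK: passing the device pointer to a function that executes on the host
    cudaMemcpy( dev_p, &host_val, sizeof(int), cudaMemcpyHostToDevice );

    // OK: passing the device pointer to a function that executes on the device
    inc<<<1,1>>>( dev_p );

    // WRONG: host code must not dereference a device pointer
    // *dev_p = 0;   // undefined behavior

    // correct way to read the result back on the host
    cudaMemcpy( &host_val, dev_p, sizeof(int), cudaMemcpyDeviceToHost );
    printf( "%d\n", host_val );   // prints 42

    cudaFree( dev_p );
    return 0;
}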