Difference between revisions of "CUDA by example/chapter 4"

From Teknologisk videncenter
Jump to: navigation, search
m
m (Memory on the host and memory on the device)
 
(3 intermediate revisions by the same user not shown)
Line 30: Line 30:
 
}
 
}
 
</source>
 
</source>
 +
== Basic multithreaded application ==
 +
{|class="wikitable"
 +
|-bgcolor="#DDDDDD"
 +
! CPU 1 !! CPU 2
 +
|-
 +
|<source lang=c>
 +
void add( int *a, int *b, int *c )
 +
{
 +
    int tid = 0;
 +
    while (tid < N) {
 +
        c[tid] = a[tid] + b[tid] ;
 +
        tid += 2;
 +
    }
 +
}
 +
</source>
 +
|<source lang=c>
 +
void add( int *a, int *b, int *c )
 +
{
 +
    int tid = 1;
 +
    while (tid < N) {
 +
        c[tid] = a[tid] + b[tid] ;
 +
        tid += 2;
 +
    }
 +
}
 +
</source>
 +
|-
 +
|}
 +
 
=CUDA threaded example=
 
=CUDA threaded example=
 +
The '''__global__''' qualifier tells the [[CUDA nvcc|nvcc]] compiler that the '''add''' subroutine should run on the '''device'''.
 
<source lang=c>
 
<source lang=c>
 
#define N  10
 
#define N  10
Line 45: Line 74:
  
 
     // allocate the memory on the GPU
 
     // allocate the memory on the GPU
     HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
+
     cudaMalloc( (void**)&dev_a, N * sizeof(int) );
     HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
+
     cudaMalloc( (void**)&dev_b, N * sizeof(int) );
     HANDLE_ERROR( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );
+
     cudaMalloc( (void**)&dev_c, N * sizeof(int) );
  
 
     // fill the arrays 'a' and 'b' on the CPU
 
     // fill the arrays 'a' and 'b' on the CPU
Line 56: Line 85:
  
 
     // copy the arrays 'a' and 'b' to the GPU
 
     // copy the arrays 'a' and 'b' to the GPU
     HANDLE_ERROR( cudaMemcpy( dev_a, a, N * sizeof(int),
+
     cudaMemcpy( dev_a, a, N * sizeof(int),
                               cudaMemcpyHostToDevice ) );
+
                               cudaMemcpyHostToDevice );
     HANDLE_ERROR( cudaMemcpy( dev_b, b, N * sizeof(int),
+
     cudaMemcpy( dev_b, b, N * sizeof(int),
                               cudaMemcpyHostToDevice ) );
+
                               cudaMemcpyHostToDevice );
  
 
     add<<<N,1>>>( dev_a, dev_b, dev_c );
 
     add<<<N,1>>>( dev_a, dev_b, dev_c );
  
 
     // copy the array 'c' back from the GPU to the CPU
 
     // copy the array 'c' back from the GPU to the CPU
     HANDLE_ERROR( cudaMemcpy( c, dev_c, N * sizeof(int),
+
     cudaMemcpy( c, dev_c, N * sizeof(int),
                               cudaMemcpyDeviceToHost ) );
+
                               cudaMemcpyDeviceToHost );
  
 
     // display the results
 
     // display the results
Line 73: Line 102:
  
 
     // free the memory allocated on the GPU
 
     // free the memory allocated on the GPU
     HANDLE_ERROR( cudaFree( dev_a ) );
+
     cudaFree( dev_a );
     HANDLE_ERROR( cudaFree( dev_b ) );
+
     cudaFree( dev_b );
     HANDLE_ERROR( cudaFree( dev_c ) );
+
     cudaFree( dev_c );
  
 
     return 0;
 
     return 0;
 
}
 
}
 
</source>
 
</source>
 +
== Memory on the host and memory on the device==
 +
As a general rule: Host pointers can access memory from host code, and device pointers can access memory from device code.
 +
*You '''can''' pass pointers allocated with cudaMalloc( ) to functions that execute on the device.
 +
*You '''can''' use pointers allocated with cudaMalloc() to read or write memory from code that executes on the device.
 +
*You '''can''' pass pointers allocated with cudaMalloc( ) to functions that execute on the host.
 +
*You '''cannot''' use pointers allocated with cudaMalloc() to read or write memory from code that executes on the host.
 +
[[Category:CUDA]][[Category:CoE]]

Latest revision as of 09:55, 5 December 2010

Normal single threaded programming

#define N   10

// Element-wise sum of two N-element int arrays: c[i] = a[i] + b[i].
// Runs sequentially, visiting every index from 0 up to N-1.
void add( int *a, int *b, int *c ) {
    // a single CPU walks the whole range, advancing one element per step
    for (int i = 0; i < N; ++i) {
        c[i] = a[i] + b[i];
    }
}

// Host-only driver: builds two N-element inputs, sums them with add(),
// and prints one "a + b = c" line per element.
int main( void ) {
    int a[N], b[N], c[N];

    // inputs: 'a' holds the negated index, 'b' holds the index squared
    int i = 0;
    while (i < N) {
        a[i] = -i;
        b[i] = i * i;
        ++i;
    }

    add( a, b, c );

    // display the results, one line per element
    for (int k = 0; k < N; ++k)
        printf( "%d + %d = %d\n", a[k], b[k], c[k] );

    return 0;
}

Basic multithreaded application

CPU 1 CPU 2
// CPU 1's share of the work: sums the elements at indices 0, 2, 4, ... below N.
void add( int *a, int *b, int *c )
{
    // start at index 0 and step by 2, leaving the odd indices untouched
    for (int i = 0; i < N; i += 2)
        c[i] = a[i] + b[i];
}
// CPU 2's share of the work: sums the elements at indices 1, 3, 5, ... below N.
void add( int *a, int *b, int *c )
{
    // start at index 1 and step by 2, leaving the even indices untouched
    for (int i = 1; i < N; i += 2)
        c[i] = a[i] + b[i];
}

CUDA threaded example

The __global__ qualifier tells the nvcc compiler that the add subroutine should run on the device.

#define N   10

// Kernel: each launched thread adds one pair of elements, c[i] = a[i] + b[i].
//
// The element index is the thread's flat global index in a 1D launch, so
// every thread maps to a distinct element whether the grid is launched as
// <<<N,1>>> (one thread per block) or with several threads per block.
// Threads whose index falls at or beyond N do nothing.
//
// a, b and c are dereferenced on the device, so they must be device pointers
// (e.g. obtained from cudaMalloc) to at least N ints each.
__global__ void add( int *a, int *b, int *c ) {
    // flat global index; with one thread per block this equals blockIdx.x
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)    // guard: the launch may supply more threads than elements
        c[tid] = a[tid] + b[tid];
}

// Prints a diagnostic on stderr for a failed CUDA runtime call.
// 'what' names the call for the message.
// Returns 1 if 'status' reports an error, 0 if it is cudaSuccess.
static int cudaFailed( cudaError_t status, const char *what ) {
    if (status == cudaSuccess)
        return 0;
    fprintf( stderr, "%s failed: %s\n", what, cudaGetErrorString( status ) );
    return 1;
}

// Host driver for the GPU vector add: fills 'a' and 'b' on the CPU, copies
// them to the GPU, launches the add kernel with N blocks of one thread,
// copies 'c' back and prints one "a + b = c" line per element.
//
// Every CUDA runtime call is checked. After the first failure the remaining
// GPU steps and the printing are skipped, the device buffers are still
// released, and the program exits with status 1. Returns 0 on success.
int main( void ) {
    int a[N], b[N], c[N];
    // start as NULL so the cleanup below is safe even if an allocation fails
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // allocate the memory on the GPU, then copy the arrays 'a' and 'b' to it;
    // || short-circuits, so the first failing call ends the chain
    int failed =
        cudaFailed( cudaMalloc( (void**)&dev_a, N * sizeof(int) ),
                    "cudaMalloc(dev_a)" ) ||
        cudaFailed( cudaMalloc( (void**)&dev_b, N * sizeof(int) ),
                    "cudaMalloc(dev_b)" ) ||
        cudaFailed( cudaMalloc( (void**)&dev_c, N * sizeof(int) ),
                    "cudaMalloc(dev_c)" ) ||
        cudaFailed( cudaMemcpy( dev_a, a, N * sizeof(int),
                                cudaMemcpyHostToDevice ),
                    "cudaMemcpy(dev_a)" ) ||
        cudaFailed( cudaMemcpy( dev_b, b, N * sizeof(int),
                                cudaMemcpyHostToDevice ),
                    "cudaMemcpy(dev_b)" );

    if (!failed) {
        add<<<N,1>>>( dev_a, dev_b, dev_c );

        // a launch returns no status itself: cudaGetLastError() reports a bad
        // launch, and the blocking copy below reports errors raised while the
        // kernel was running
        failed =
            cudaFailed( cudaGetLastError(), "add kernel launch" ) ||
            // copy the array 'c' back from the GPU to the CPU
            cudaFailed( cudaMemcpy( c, dev_c, N * sizeof(int),
                                    cudaMemcpyDeviceToHost ),
                        "cudaMemcpy(c)" );
    }

    // display the results (only when 'c' really holds the GPU output)
    if (!failed) {
        for (int i=0; i<N; i++) {
            printf( "%d + %d = %d\n", a[i], b[i], c[i] );
        }
    }

    // free the memory allocated on the GPU; cudaFree on a NULL pointer is a
    // no-op, and |= keeps going so every buffer is released
    failed |= cudaFailed( cudaFree( dev_a ), "cudaFree(dev_a)" );
    failed |= cudaFailed( cudaFree( dev_b ), "cudaFree(dev_b)" );
    failed |= cudaFailed( cudaFree( dev_c ), "cudaFree(dev_c)" );

    return failed ? 1 : 0;
}

Memory on the host and memory on the device

As a general rule: Host pointers can access memory from host code, and device pointers can access memory from device code.

  • You can pass pointers allocated with cudaMalloc( ) to functions that execute on the device.
  • You can use pointers allocated with cudaMalloc() to read or write memory from code that executes on the device.
  • You can pass pointers allocated with cudaMalloc( ) to functions that execute on the host.
  • You cannot use pointers allocated with cudaMalloc() to read or write memory from code that executes on the host.