Revision as of 09:52, 5 December 2010

Normal single threaded programming

#define N   10

// Sequential element-wise sum: writes a[i] + b[i] into c[i] for every
// index i in [0, N). A single CPU walks the whole range one element at
// a time, so the index starts at 0 and advances by 1.
void add( int *a, int *b, int *c ) {
    for (int i = 0; i < N; ++i) {
        c[i] = a[i] + b[i];
    }
}

// Driver for the single-threaded example: builds two N-element input
// vectors, sums them with add(), and prints one line per element.
int main( void ) {
    int a[N], b[N], c[N];
    int idx;

    // populate the inputs on the CPU: a[idx] = -idx, b[idx] = idx squared
    for (idx = 0; idx < N; ++idx) {
        a[idx] = -idx;
        b[idx] = idx * idx;
    }

    // c = a + b, computed sequentially
    add( a, b, c );

    // report every element of the sum
    for (idx = 0; idx < N; ++idx)
        printf( "%d + %d = %d\n", a[idx], b[idx], c[idx] );

    return 0;
}

Basic multithreaded application

CPU 1

CPU 2

// Version run by the first of two CPUs: sums the even-indexed elements
// (0, 2, 4, ...) below N. The stride of 2 leaves the odd indices to the
// other CPU's copy of this routine.
void add( int *a, int *b, int *c )
{
    for (int i = 0; i < N; i += 2) {
        c[i] = a[i] + b[i];
    }
}

// Version run by the second of two CPUs: sums the odd-indexed elements
// (1, 3, 5, ...) below N. The stride of 2 leaves the even indices to the
// other CPU's copy of this routine.
void add( int *a, int *b, int *c )
{
    for (int i = 1; i < N; i += 2) {
        c[i] = a[i] + b[i];
    }
}

CUDA threaded example

The __global__ qualifier tells the nvcc compiler that the add subroutine should run on the device.

#define N   10

// Element-wise vector sum kernel: c[i] = a[i] + b[i].
// Each block handles exactly one element, selected by its block index
// (blockIdx.x); threadIdx is not consulted. Blocks whose index is N or
// greater do nothing, so launching more than N blocks is harmless.
// a, b and c are dereferenced in device code, so they must point to
// memory the device can access (e.g. buffers from cudaMalloc).
__global__ void add( int *a, int *b, int *c ) {
    const int idx = blockIdx.x;
    if (idx >= N)
        return;    // out-of-range block: nothing to compute
    c[idx] = a[idx] + b[idx];
}

// Reports a failed CUDA runtime call.
// Returns 1 when 'status' is cudaSuccess; otherwise prints the step name
// and the runtime's description of the error and returns 0.
static int checkCuda( cudaError_t status, const char *step ) {
    if (status == cudaSuccess)
        return 1;
    printf( "CUDA error during %s: %s\n", step, cudaGetErrorString( status ) );
    return 0;
}

// Driver for the CUDA example: allocates three N-element device buffers,
// uploads the inputs, launches add() with N blocks of one thread each,
// downloads the result and prints it.
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed;
// on failure the (meaningless) results are not printed.
int main( void ) {
    int a[N], b[N], c[N];
    // start as null so the cudaFree calls below are safe no-ops for any
    // buffer that was never allocated
    int *dev_a = 0, *dev_b = 0, *dev_c = 0;

    // allocate the memory on the GPU; && stops at the first failure
    int ok =
        checkCuda( cudaMalloc( (void**)&dev_a, N * sizeof(int) ), "cudaMalloc(dev_a)" ) &&
        checkCuda( cudaMalloc( (void**)&dev_b, N * sizeof(int) ), "cudaMalloc(dev_b)" ) &&
        checkCuda( cudaMalloc( (void**)&dev_c, N * sizeof(int) ), "cudaMalloc(dev_c)" );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    ok = ok &&
        checkCuda( cudaMemcpy( dev_a, a, N * sizeof(int),
                               cudaMemcpyHostToDevice ), "copy of 'a' to the GPU" ) &&
        checkCuda( cudaMemcpy( dev_b, b, N * sizeof(int),
                               cudaMemcpyHostToDevice ), "copy of 'b' to the GPU" );

    if (ok) {
        add<<<N,1>>>( dev_a, dev_b, dev_c );

        // A launch does not return a status itself: cudaGetLastError()
        // reports a bad launch configuration, and the blocking cudaMemcpy
        // that follows reports any fault raised while the kernel ran.
        ok = checkCuda( cudaGetLastError(), "launch of add" ) &&
             // copy the array 'c' back from the GPU to the CPU
             checkCuda( cudaMemcpy( c, dev_c, N * sizeof(int),
                                    cudaMemcpyDeviceToHost ), "copy of 'c' from the GPU" );
    }

    // display the results (only when 'c' actually holds them)
    if (ok) {
        for (int i=0; i<N; i++) {
            printf( "%d + %d = %d\n", a[i], b[i], c[i] );
        }
    }

    // free the memory allocated on the GPU
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );

    return ok ? 0 : 1;
}

Memory on the host and memory on the device

You can pass pointers allocated with cudaMalloc( ) to functions that execute on the device.
You can use pointers allocated with cudaMalloc() to read or write memory from code that executes on the device.
You can pass pointers allocated with cudaMalloc( ) to functions that execute on the host.
You cannot use pointers allocated with cudaMalloc() to read or write memory from code that executes on the host.

@@ Line 59: / Line 59: @@
 =CUDA threaded example=
+The '''__global__''' qualifier tells the [[CUDA nvcc|nvcc]] compiler that the '''add''' subroutine should run on the '''device'''.
 <source lang=c>
 #define N   10
@@ Line 108: / Line 109: @@
 }
 </source>
+== Memory on the host and memory on the device==
+*You '''can''' pass pointers allocated with cudaMalloc( ) to functions that execute on the device.
+*You '''can''' use pointers allocated with cudaMalloc() to read or write memory from code that executes on the device.
+*You '''can''' pass pointers allocated with cudaMalloc( ) to functions that execute on the host.
+*You '''cannot''' use pointers allocated with cudaMalloc() to read or write memory from code that executes on the host.

Navigation menu

Difference between revisions of "CUDA by example/chapter 4"

Revision as of 09:52, 5 December 2010

Contents

Normal single threaded programming

Basic multithreaded application

CUDA threaded example

Memory on the host and memory on the device