CUDA by example/chapter 4
Normal single-threaded programming
#include <stdio.h>

#define N 10

void add( int *a, int *b, int *c ) {
    int tid = 0;    // this is CPU zero, so we start at zero
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += 1;   // we have one CPU, so we increment by one
    }
}

int main( void ) {
    int a[N], b[N], c[N];

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    add( a, b, c );

    // display the results
    for (int i=0; i<N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    return 0;
}
Basic multithreaded application

The same loop can be split across two CPUs: each CPU starts at a different offset and strides by the number of CPUs, so CPU 1 handles the even elements and CPU 2 the odd elements.

CPU 1:

void add( int *a, int *b, int *c )
{
    int tid = 0;    // CPU 1 starts at element 0
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += 2;   // two CPUs, so increment by two
    }
}

CPU 2:

void add( int *a, int *b, int *c )
{
    int tid = 1;    // CPU 2 starts at element 1
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += 2;   // two CPUs, so increment by two
    }
}
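The chapter does not show how the two copies are actually started. As a rough sketch (using POSIX threads, which the book does not cover; the add_args struct and add_worker function are hypothetical names introduced here), a host program could launch them like this:

#include <pthread.h>

#define N 10

/* hypothetical helper: arguments for one worker thread */
typedef struct {
    int start;          /* starting index: 0 for "CPU 1", 1 for "CPU 2" */
    int *a, *b, *c;
} add_args;

/* runs the same strided loop as above, starting at args->start */
static void *add_worker( void *p ) {
    add_args *args = (add_args*)p;
    int tid = args->start;
    while (tid < N) {
        args->c[tid] = args->a[tid] + args->b[tid];
        tid += 2;       /* two threads, so increment by two */
    }
    return NULL;
}

void add( int *a, int *b, int *c ) {
    pthread_t t0, t1;
    add_args args0 = { 0, a, b, c };   /* "CPU 1": even elements */
    add_args args1 = { 1, a, b, c };   /* "CPU 2": odd elements  */
    pthread_create( &t0, NULL, add_worker, &args0 );
    pthread_create( &t1, NULL, add_worker, &args1 );
    pthread_join( t0, NULL );
    pthread_join( t1, NULL );
}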
CUDA threaded example
The __global__ qualifier tells the nvcc compiler that the add function should be compiled to run on the device. The <<<N,1>>> in the call from main launches N parallel copies (blocks) of the kernel with one thread each; each copy finds its own element through blockIdx.x.
#include <stdio.h>

#define N 10

__global__ void add( int *a, int *b, int *c ) {
    int tid = blockIdx.x;   // this thread handles the data at its block index
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

int main( void ) {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the GPU
    cudaMalloc( (void**)&dev_a, N * sizeof(int) );
    cudaMalloc( (void**)&dev_b, N * sizeof(int) );
    cudaMalloc( (void**)&dev_c, N * sizeof(int) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice );

    // launch N blocks of one thread each
    add<<<N,1>>>( dev_a, dev_b, dev_c );

    // copy the array 'c' back from the GPU to the CPU
    cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost );

    // display the results
    for (int i=0; i<N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    // free the memory allocated on the GPU
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );

    return 0;
}
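Assuming the program is saved as add.cu (the filename is only an example), it can be compiled and run with nvcc:

nvcc add.cu -o add
./add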
Memory on the host and memory on the device

As a general rule, host pointers can access memory from host code, and device pointers can access memory from device code. More precisely (see the sketch after this list):

- You can pass pointers allocated with cudaMalloc() to functions that execute on the device.
- You can use pointers allocated with cudaMalloc() to read or write memory from code that executes on the device.
- You can pass pointers allocated with cudaMalloc() to functions that execute on the host.
- You cannot use pointers allocated with cudaMalloc() to read or write memory from code that executes on the host.
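A minimal sketch of these rules in practice (the inc kernel is a hypothetical example, not from the book): the host may pass the device pointer to host functions such as cudaMemcpy(), and device code may dereference it, but host code must not.

#include <stdio.h>

// device code may read and write through a cudaMalloc'ed pointer
__global__ void inc( int *p ) {
    (*p)++;
}

int main( void ) {
    int *dev_p;
    int host_val = 41;

    cudaMalloc( (void**)&dev_p, sizeof(int) );

    // OK: passing the device pointer to a function that executes on the host
    cudaMemcpy( dev_p, &host_val, sizeof(int), cudaMemcpyHostToDevice );

    // OK: passing the device pointer to a function that executes on the device
    inc<<<1,1>>>( dev_p );

    // WRONG: host code must not dereference a device pointer
    // *dev_p = 0;   // undefined behavior

    // correct way to read the result back on the host
    cudaMemcpy( &host_val, dev_p, sizeof(int), cudaMemcpyDeviceToHost );
    printf( "%d\n", host_val );   // prints 42

    cudaFree( dev_p );
    return 0;
}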