58 lines
1.5 KiB
Plaintext
58 lines
1.5 KiB
Plaintext
#include <cmath>
|
||
#include <cuda_runtime.h>
|
||
#include <iostream>
|
||
|
||
// Element-wise vector add: y[i] = x[i] + y[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global
// thread index and then advances by the total number of launched threads
// (blockDim.x * gridDim.x), so any grid/block size covers all n elements,
// including a single-thread <<<1, 1>>> launch. No shared memory is used.
//
// Parameters:
//   n - number of elements to process; n <= 0 makes the kernel a no-op.
//   x - input array, read only by this kernel; must hold at least n floats.
//   y - input/output array, updated in place; must hold at least n floats.
__global__ void add(int n, float *x, float *y) {
  // Index arithmetic is done in 64 bits. With 32-bit ints, a launch of 2^31
  // or more threads makes the stride unrepresentable, and for n close to
  // INT_MAX the increment `i += stride` overflows a signed int (undefined
  // behaviour that can show up as out-of-bounds accesses or an endless loop).
  const long long count = n;
  const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
  long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

  for (; i < count; i += stride)
    y[i] = x[i] + y[i];
}
|
||
|
||
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status)
            << std::endl;
  return false;
}

// Asks the runtime to migrate a managed allocation to `device` ahead of use.
// Prefetching is only a performance hint, so a failure is reported as a
// warning and otherwise ignored.
static void prefetchBestEffort(const void *ptr, size_t bytes, int device) {
  const cudaError_t status = cudaMemPrefetchAsync(ptr, bytes, device);
  if (status != cudaSuccess) {
    std::cerr << "Warning: cudaMemPrefetchAsync failed ("
              << cudaGetErrorString(status) << "); continuing without prefetch"
              << std::endl;
    // Reset the recorded error so the post-launch cudaGetLastError() check
    // reports only problems with the kernel launch itself.
    cudaGetLastError();
  }
}

// Adds two vectors of 2^29 floats on the GPU using unified memory and prints
// the largest deviation from the expected value 3.0f.
// Returns 0 on success and 1 if any required CUDA call fails.
int main(void) {
  const int N = 1 << 29;
  const size_t bytes = static_cast<size_t>(N) * sizeof(float);

  float *x = nullptr;
  float *y = nullptr;

  // Allocate Unified Memory - accessible from CPU or GPU
  bool ok = cudaOk(cudaMallocManaged(&x, bytes), "cudaMallocManaged(x)") &&
            cudaOk(cudaMallocManaged(&y, bytes), "cudaMallocManaged(y)");

  // Prefetch to the device the kernel will actually run on.
  int deviceID = 0;
  ok = ok && cudaOk(cudaGetDevice(&deviceID), "cudaGetDevice");

  if (ok) {
    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
      x[i] = 1.0f;
      y[i] = 2.0f;
    }

    // Prefetch memory to the GPU
    prefetchBestEffort(x, bytes, deviceID);
    prefetchBestEffort(y, bytes, deviceID);

    // Number of threads per block
    const int threadsPerBlock = 256;
    // Number of blocks in the grid (ceiling division: one thread per element)
    const int numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    std::cout << "Number of thread blocks: " << numberOfBlocks << std::endl;

    // Run kernel on the elements on the GPU with multiple blocks and threads
    add<<<numberOfBlocks, threadsPerBlock>>>(N, x, y);

    // A launch returns no status: configuration errors are reported by
    // cudaGetLastError(), execution errors by the following synchronize.
    // The synchronize also makes the results safe to read on the host.
    ok = cudaOk(cudaGetLastError(), "add kernel launch") &&
         cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  }

  if (ok) {
    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = std::fmax(maxError, std::fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;
  }

  // Free memory (cudaFree accepts a null pointer, so this is safe on every path)
  cudaFree(x);
  cudaFree(y);

  return ok ? 0 : 1;
}
|