#include #include #include // function to add the elements of two arrays __global__ void add(int n, float *x, float *y) { int index = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; for (int i = index; i < n; i += stride) y[i] = x[i] + y[i]; } int main(void) { int N = 1 << 29; float *x, *y; // Allocate Unified Memory – accessible from CPU or GPU cudaMallocManaged(&x, N * sizeof(float)); cudaMallocManaged(&y, N * sizeof(float)); // initialize x and y arrays on the host for (int i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; } // Number of threads per block int threadsPerBlock = 256; // Number of blocks in the grid int numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock; std::cout << "Number of thread blocks: " << numberOfBlocks << std::endl; // Run kernel on the elements on the GPU with multiple blocks and threads add<<>>(N, x, y); // Wait for GPU to finish before accessing on host cudaDeviceSynchronize(); // Check for errors (all values should be 3.0f) float maxError = 0.0f; for (int i = 0; i < N; i++) maxError = fmax(maxError, fabs(y[i] - 3.0f)); std::cout << "Max error: " << maxError << std::endl; // Free memory cudaFree(x); cudaFree(y); return 0; }