csc-656-coding-project-4/vecadd_gpu_256t_mb_prefetch.cu
2024-12-12 01:31:08 -08:00

58 lines
1.5 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <cmath>
#include <cstdlib>
#include <cuda_runtime.h>
#include <iostream>
// Kernel: element-wise vector addition, y[i] = x[i] + y[i] for i in [0, n).
//
// Parameters:
//   n - number of elements to process; nothing is written when n <= 0.
//   x - input vector, read only by this kernel.
//   y - input/output vector, updated in place.
//
// Launch layout: only the x dimension of the grid and block is used. Each
// thread starts at its flat global index and advances by the total number
// of launched threads, so any 1-D grid/block size covers all n elements.
//
// The index and stride are held in 64-bit integers: with 32-bit ints,
// blockDim.x * gridDim.x stops fitting once 2^31 or more threads are
// launched, and i += stride can overflow when n is close to INT_MAX.
__global__ void add(int n, float *x, float *y) {
  const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
  long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
  for (; i < n; i += stride)
    y[i] = x[i] + y[i];
}
// Reports a failed CUDA runtime call on stderr and terminates the program.
// `what` names the call being checked so the message identifies its origin.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    std::exit(EXIT_FAILURE);
  }
}

// Reports a failed CUDA runtime call on stderr but lets the program go on.
// Used for the prefetch calls, whose results were never required for the
// program to run (the original ignored them entirely).
static void warnCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    std::cerr << "Warning: " << what << " failed: "
              << cudaGetErrorString(status) << std::endl;
    // Reset the runtime's recorded error so the cudaGetLastError() check
    // after the kernel launch does not report this failure as a launch error.
    cudaGetLastError();
  }
}

// Adds two vectors of 2^29 floats on the GPU using managed memory:
// allocates x and y, fills them on the host (x = 1, y = 2), prefetches both
// to the current device, launches `add` with 256 threads per block, then
// checks on the host that every element of y equals 3 and prints the
// maximum absolute error. Returns 0 on completion; any failing required
// CUDA call ends the program with EXIT_FAILURE.
int main(void) {
  const int N = 1 << 29;
  // Size of one vector in bytes, computed once in size_t.
  const size_t bytes = static_cast<size_t>(N) * sizeof(float);

  // Allocate Unified Memory accessible from CPU or GPU
  float *x = NULL;
  float *y = NULL;
  checkCuda(cudaMallocManaged(&x, bytes), "cudaMallocManaged(x)");
  checkCuda(cudaMallocManaged(&y, bytes), "cudaMallocManaged(y)");

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Prefetch memory to the GPU that will run the kernel (the current
  // device) instead of assuming it is device 0.
  int deviceID = 0;
  checkCuda(cudaGetDevice(&deviceID), "cudaGetDevice");
  warnCuda(cudaMemPrefetchAsync((void *)x, bytes, deviceID),
           "cudaMemPrefetchAsync(x)");
  warnCuda(cudaMemPrefetchAsync((void *)y, bytes, deviceID),
           "cudaMemPrefetchAsync(y)");

  // Number of threads per block
  const int threadsPerBlock = 256;
  // Number of blocks in the grid (ceiling division so every element is covered)
  const int numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
  std::cout << "Number of thread blocks: " << numberOfBlocks << std::endl;

  // Run kernel on the elements on the GPU with multiple blocks and threads
  add<<<numberOfBlocks, threadsPerBlock>>>(N, x, y);
  // A kernel launch returns no status itself; query it explicitly.
  checkCuda(cudaGetLastError(), "add kernel launch");
  // Wait for GPU to finish before accessing on host; this call also
  // surfaces errors raised while the kernel was executing.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = std::fmax(maxError, std::fabs(y[i] - 3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  checkCuda(cudaFree(x), "cudaFree(x)");
  checkCuda(cudaFree(y), "cudaFree(y)");
  return 0;
}