Improved code, added output, performance table, and analysis questions

uzy lol 2024-12-11 22:28:58 -08:00
parent f257da0120
commit 3a15cd67b0
9 changed files with 186 additions and 147 deletions

Performance Table.xlsx (new binary file, contents not shown)

analysis.md (new file, 42 lines)

@@ -0,0 +1,42 @@
# Performance Table
| Implementation                             | Execution Time (ms) | MFLOP/s     | Memory Bandwidth (B/s) | Memory Bandwidth (GB/s) |
|---------------------------------------------|---------------------|-------------|------------------------|-------------------------|
| CPU                                         | 985.325             | 519.6255043 | 6538401993             | 6.5384                  |
| CUDA: 1 thread, 1 block                     | 50918.5392          | 10.05527668 | 126524661.7            | 0.1265                  |
| CUDA: 256 threads, 1 block                  | 1689.7723           | 302.9994041 | 3812614838             | 3.8126                  |
| CUDA: 256 threads, many blocks              | 1219.1305           | 419.9714469 | 5284463758             | 5.2845                  |
| CUDA: 256 threads, many blocks + prefetch   | 329.9085            | 1551.945464 | 19527993198            | 19.528                  |
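The MFLOP/s and bandwidth columns can be reproduced from N and the measured execution time alone. The sketch below assumes one floating-point add per element and 12 bytes of memory traffic per element (read `x`, read `y`, write `y`), with the FLOP count taken in binary mega-units (2^20); these assumptions are inferred from the numbers above rather than stated in the run logs.

```cpp
// Sketch: reproduce the table columns from N and an elapsed time.
// Assumptions (not taken from the profiler output): 1 FLOP and 12 bytes
// of traffic per element; MFLOP uses 2^20, GB/s uses 10^9.
#include <cstdio>

int main() {
    const long long N = 1LL << 29;      // 536,870,912 elements
    const double elapsed_ms = 985.325;  // e.g., the CPU-only run
    const double t = elapsed_ms / 1000.0;

    double mflops = static_cast<double>(N) / (1 << 20) / t;        // ~519.6 MFLOP/s
    double bytes  = 3.0 * sizeof(float) * static_cast<double>(N);  // 2 reads + 1 write
    double bps    = bytes / t;                                     // ~6.54e9 B/s
    std::printf("MFLOP/s: %.4f   Bandwidth: %.0f B/s (%.4f GB/s)\n",
                mflops, bps, bps / 1e9);
    return 0;
}
```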
# Analysis Questions
1. What is the MFLOP/s performance gain going from the CPU-only code to the final version of your CUDA code (the one with the cudaMemPrefetchAsync() call)? Please report this gain in terms of a multiplier, e.g., 2.5x, rather than with an absolute number or a percentage. Show your work on how you compute this result.
*Answer*:
$$
\text{Performance gain} = \frac{1551.945464}{519.6255043} = 2.98666145 \approx \boxed{\text{3x gain}}
$$
2. What is the memory bandwidth performance gain (or loss) going from the CPU-only code to the final version of your CUDA code (the one with the cudaMemPrefetchAsync() call)? Please report this gain in terms of a multiplier, e.g., 2.5x, rather than with an absolute number or a percentage. Show your work on how you compute this result.
*Answer*:
$$
\text{Mem bandwidth gain/loss} = \frac{19527993198}{6538401993} = 2.98666145 \approx \boxed{\text{3x gain}}
$$
3. For the final version of your CUDA code (the one with the cudaMemPrefetchAsync() call), what is the total number of concurrent threads being run? Show your work on how you arrive at this result.
*Answer*:
$$
\begin{aligned}
\text{Given: } & N = 536{,}870{,}912, \quad \text{threads per block} = 256 \\
\text{Thread blocks} &= \frac{N}{256} = \frac{536{,}870{,}912}{256} = 2{,}097{,}152 \\
\text{Total concurrent threads} &= 2{,}097{,}152 \times 256 = \boxed{536{,}870{,}912 \text{ threads}}
\end{aligned}
$$
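The block count used above can also be checked directly from N and the 256-thread block size used in the source files; a minimal sketch:

```cpp
// Sketch: derive the launch configuration referenced in question 3.
#include <iostream>

int main() {
    const long long N = 1LL << 29;   // 536,870,912 elements
    const int threadsPerBlock = 256;
    const long long numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    std::cout << "Thread blocks: " << numBlocks << "\n"                     // 2,097,152
              << "Total threads: " << numBlocks * threadsPerBlock << "\n";  // 536,870,912
    return 0;
}
```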

Profiling output (modified file)

CPU-Only
---
985.325 ms

CUDA: 1 Thread, 1 Block
---
```
uzylol@nid001133:/pscratch/sd/u/uzylol/cuda_vecadd> ./helper.sh vecadd_gpu_1t
Choose an option:
1. Build and profile
2. Clean
Enter your choice (1 or 2): 1
Built executable: vecadd_gpu_1t
WARNING: vecadd_gpu_1t and any of its children processes will be profiled.
Max error: 0
Generating '/tmp/nsys-report-135f.qdstrm'
[1/7] [========================100%] report1.nsys-rep
[2/7] [========================100%] report1.sqlite
[3/7] Executing 'nvtx_sum' stats report
SKIPPED: /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite does not contain NV Tools Extension (NVTX) data.
[4/7] Executing 'cuda_api_sum' stats report
 Time (%)  Total Time (ns)  Num Calls          Avg (ns)          Med (ns)        Min (ns)        Max (ns)    StdDev (ns)  Name
 --------  ---------------  ---------  ----------------  ----------------  --------------  --------------  -------------  ----------------------
     98.5   50,918,539,217          1  50,918,539,217.0  50,918,539,217.0  50,918,539,217  50,918,539,217            0.0  cudaDeviceSynchronize
      0.6      335,502,692          2     167,751,346.0     167,751,346.0          70,647     335,432,045  237,136,318.7  cudaMallocManaged
      0.5      247,516,551          1     247,516,551.0     247,516,551.0     247,516,551     247,516,551            0.0  cudaLaunchKernel
      0.4      198,199,002          2      99,099,501.0      99,099,501.0      68,308,537     129,890,465   43,544,998.9  cudaFree
      0.0            1,293          1           1,293.0           1,293.0           1,293           1,293            0.0  cuModuleGetLoadingMode
[5/7] Executing 'cuda_gpu_kern_sum' stats report
 Time (%)  Total Time (ns)  Instances          Avg (ns)          Med (ns)        Min (ns)        Max (ns)  StdDev (ns)  Name
 --------  ---------------  ---------  ----------------  ----------------  --------------  --------------  -----------  --------------------------
    100.0   50,918,525,966          1  50,918,525,966.0  50,918,525,966.0  50,918,525,966  50,918,525,966          0.0  add(int, float *, float *)
[6/7] Executing 'cuda_gpu_mem_time_sum' stats report
 Time (%)  Total Time (ns)   Count  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Operation
 --------  ---------------  ------  --------  --------  --------  --------  -----------  ------------------------------------
     65.2      206,887,363  24,576   8,418.3   3,647.5     1,982    41,312     11,229.0  [CUDA memcpy Unified Host-to-Device]
     34.8      110,515,732  12,288   8,993.8   3,135.5     1,726    48,416     12,346.7  [CUDA memcpy Unified Device-to-Host]
[7/7] Executing 'cuda_gpu_mem_size_sum' stats report
 Total (MB)   Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)  Operation
 ----------  ------  --------  --------  --------  --------  -----------  ------------------------------------
  4,294.967  24,576     0.175     0.033     0.004     1.044        0.301  [CUDA memcpy Unified Host-to-Device]
  2,147.484  12,288     0.175     0.033     0.004     1.044        0.301  [CUDA memcpy Unified Device-to-Host]
Generated:
    /pscratch/sd/u/uzylol/cuda_vecadd/report1.nsys-rep
    /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite
```
CUDA: 256 Threads, One Block
---
```
uzylol@nid001133:/pscratch/sd/u/uzylol/cuda_vecadd> ./helper.sh vecadd_gpu_256t
Choose an option:
1. Build and profile
2. Clean
Enter your choice (1 or 2): 1
Built executable: vecadd_gpu_256t
WARNING: vecadd_gpu_256t and any of its children processes will be profiled.
Max error: 0
Generating '/tmp/nsys-report-a2a6.qdstrm'
[1/7] [========================100%] report1.nsys-rep
[2/7] [========================100%] report1.sqlite
[3/7] Executing 'nvtx_sum' stats report
SKIPPED: /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite does not contain NV Tools Extension (NVTX) data.
[4/7] Executing 'cuda_api_sum' stats report
 Time (%)  Total Time (ns)  Num Calls         Avg (ns)         Med (ns)       Min (ns)       Max (ns)    StdDev (ns)  Name
 --------  ---------------  ---------  ---------------  ---------------  -------------  -------------  -------------  ----------------------
     71.3    1,689,772,282          1  1,689,772,282.0  1,689,772,282.0  1,689,772,282  1,689,772,282            0.0  cudaDeviceSynchronize
     13.2      312,348,647          2    156,174,323.5    156,174,323.5         66,999    312,281,648  220,769,095.5  cudaMallocManaged
      8.4      200,018,555          2    100,009,277.5    100,009,277.5     70,954,629    129,063,926   41,089,478.0  cudaFree
      7.0      166,925,228          1    166,925,228.0    166,925,228.0    166,925,228    166,925,228            0.0  cudaLaunchKernel
      0.0            1,031          1          1,031.0          1,031.0          1,031          1,031            0.0  cuModuleGetLoadingMode
[5/7] Executing 'cuda_gpu_kern_sum' stats report
 Time (%)  Total Time (ns)  Instances         Avg (ns)         Med (ns)       Min (ns)       Max (ns)  StdDev (ns)  Name
 --------  ---------------  ---------  ---------------  ---------------  -------------  -------------  -----------  --------------------------
    100.0    1,689,768,383          1  1,689,768,383.0  1,689,768,383.0  1,689,768,383  1,689,768,383          0.0  add(int, float *, float *)
[6/7] Executing 'cuda_gpu_mem_time_sum' stats report
 Time (%)  Total Time (ns)   Count  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Operation
 --------  ---------------  ------  --------  --------  --------  --------  -----------  ------------------------------------
     65.4      208,609,295  24,576   8,488.3   3,920.5     1,982    41,568     11,229.4  [CUDA memcpy Unified Host-to-Device]
     34.6      110,590,888  12,288   8,999.9   3,167.5     1,726    48,384     12,351.0  [CUDA memcpy Unified Device-to-Host]
[7/7] Executing 'cuda_gpu_mem_size_sum' stats report
 Total (MB)   Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)  Operation
 ----------  ------  --------  --------  --------  --------  -----------  ------------------------------------
  4,294.967  24,576     0.175     0.033     0.004     1.044        0.301  [CUDA memcpy Unified Host-to-Device]
  2,147.484  12,288     0.175     0.033     0.004     1.044        0.301  [CUDA memcpy Unified Device-to-Host]
Generated:
    /pscratch/sd/u/uzylol/cuda_vecadd/report1.nsys-rep
    /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite
```
CUDA: 256 Threads, Many Blocks
---
```
uzylol@nid001133:/pscratch/sd/u/uzylol/cuda_vecadd> ./helper.sh vecadd_gpu_256t_mb
Choose an option:
1. Build and profile
2. Clean
Enter your choice (1 or 2): 1
Built executable: vecadd_gpu_256t_mb
WARNING: vecadd_gpu_256t_mb and any of its children processes will be profiled.
Number of thread blocks: 2097152
Max error: 0
Generating '/tmp/nsys-report-cbad.qdstrm'
[1/7] [========================100%] report1.nsys-rep
[2/7] [========================100%] report1.sqlite
[3/7] Executing 'nvtx_sum' stats report
SKIPPED: /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite does not contain NV Tools Extension (NVTX) data.
[4/7] Executing 'cuda_api_sum' stats report
 Time (%)  Total Time (ns)  Num Calls         Avg (ns)         Med (ns)       Min (ns)       Max (ns)    StdDev (ns)  Name
 --------  ---------------  ---------  ---------------  ---------------  -------------  -------------  -------------  ----------------------
     68.5    1,219,130,474          1  1,219,130,474.0  1,219,130,474.0  1,219,130,474  1,219,130,474            0.0  cudaDeviceSynchronize
     17.8      316,925,924          2    158,462,962.0    158,462,962.0         55,287    316,870,637  224,022,282.4  cudaMallocManaged
     11.4      203,677,016          2    101,838,508.0    101,838,508.0     68,914,217    134,762,799   46,561,978.9  cudaFree
      2.3       40,488,959          1     40,488,959.0     40,488,959.0     40,488,959     40,488,959            0.0  cudaLaunchKernel
      0.0            1,072          1          1,072.0          1,072.0          1,072          1,072            0.0  cuModuleGetLoadingMode
[5/7] Executing 'cuda_gpu_kern_sum' stats report
 Time (%)  Total Time (ns)  Instances         Avg (ns)         Med (ns)       Min (ns)       Max (ns)  StdDev (ns)  Name
 --------  ---------------  ---------  ---------------  ---------------  -------------  -------------  -----------  --------------------------
    100.0    1,219,123,989          1  1,219,123,989.0  1,219,123,989.0  1,219,123,989  1,219,123,989          0.0  add(int, float *, float *)
[6/7] Executing 'cuda_gpu_mem_time_sum' stats report
 Time (%)  Total Time (ns)    Count  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Operation
 --------  ---------------  -------  --------  --------  --------  --------  -----------  ------------------------------------
     80.3      448,633,690  155,241   2,889.9   2,175.0     1,663    63,231      3,651.9  [CUDA memcpy Unified Host-to-Device]
     19.7      109,875,418   12,286   8,943.1   3,214.5     1,726    48,352     12,278.7  [CUDA memcpy Unified Device-to-Host]
[7/7] Executing 'cuda_gpu_mem_size_sum' stats report
 Total (MB)    Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)  Operation
 ----------  -------  --------  --------  --------  --------  -----------  ------------------------------------
  3,996.164  155,241     0.026     0.008     0.004     1.044        0.097  [CUDA memcpy Unified Host-to-Device]
  2,147.418   12,286     0.175     0.033     0.004     1.044        0.301  [CUDA memcpy Unified Device-to-Host]
Generated:
    /pscratch/sd/u/uzylol/cuda_vecadd/report1.nsys-rep
    /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite
```
CUDA: 256 Threads, Many Blocks with Prefetch
---
```
uzylol@nid001133:/pscratch/sd/u/uzylol/cuda_vecadd> ./helper.sh vecadd_gpu_256t_mb_prefetch
Choose an option:
1. Build and profile
2. Clean
Enter your choice (1 or 2): 1
Built executable: vecadd_gpu_256t_mb_prefetch
WARNING: vecadd_gpu_256t_mb_prefetch and any of its children processes will be profiled.
Number of thread blocks: 2097152
Max error: 0
Generating '/tmp/nsys-report-3752.qdstrm'
[1/7] [========================100%] report1.nsys-rep
[2/7] [========================100%] report1.sqlite
[3/7] Executing 'nvtx_sum' stats report
SKIPPED: /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite does not contain NV Tools Extension (NVTX) data.
[4/7] Executing 'cuda_api_sum' stats report
 Time (%)  Total Time (ns)  Num Calls       Avg (ns)       Med (ns)     Min (ns)     Max (ns)    StdDev (ns)  Name
 --------  ---------------  ---------  -------------  -------------  -----------  -----------  -------------  ----------------------
     43.5      329,908,462          2  164,954,231.0  164,954,231.0       50,698  329,857,764  233,208,812.9  cudaMallocManaged
     25.1      190,030,072          2   95,015,036.0   95,015,036.0   62,035,958  127,994,114   46,639,459.4  cudaFree
     17.2      130,653,654          1  130,653,654.0  130,653,654.0  130,653,654  130,653,654            0.0  cudaLaunchKernel
     13.5      102,455,160          2   51,227,580.0   51,227,580.0      377,189  102,077,971   71,913,312.6  cudaMemPrefetchAsync
      0.6        4,773,525          1    4,773,525.0    4,773,525.0    4,773,525    4,773,525            0.0  cudaDeviceSynchronize
      0.0            1,312          1        1,312.0        1,312.0        1,312        1,312            0.0  cuModuleGetLoadingMode
[5/7] Executing 'cuda_gpu_kern_sum' stats report
 Time (%)  Total Time (ns)  Instances     Avg (ns)     Med (ns)   Min (ns)   Max (ns)  StdDev (ns)  Name
 --------  ---------------  ---------  -----------  -----------  ---------  ---------  -----------  --------------------------
    100.0        4,771,134          1  4,771,134.0  4,771,134.0  4,771,134  4,771,134          0.0  add(int, float *, float *)
[6/7] Executing 'cuda_gpu_mem_time_sum' stats report
 Time (%)  Total Time (ns)   Count  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Operation
 --------  ---------------  ------  --------  --------  --------  --------  -----------  ------------------------------------
     59.9      165,077,448   2,048  80,604.2  80,576.0    80,511    81,055         68.8  [CUDA memcpy Unified Host-to-Device]
     40.1      110,650,214  12,288   9,004.7   3,135.5     1,726    48,352     12,355.0  [CUDA memcpy Unified Device-to-Host]
[7/7] Executing 'cuda_gpu_mem_size_sum' stats report
Generated:
    /pscratch/sd/u/uzylol/cuda_vecadd/report1.nsys-rep
    /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite
```
## Prefetch Thread Count
```
uzylol@nid001013:/pscratch/sd/u/uzylol/cuda_vecadd> compute-sanitizer vecadd_gpu_256t_mb_prefetch
========= COMPUTE-SANITIZER
Number of thread blocks: 2097152
Max error: 0
========= ERROR SUMMARY: 0 errors
```

performanceTable.md (new file, 9 lines)

@@ -0,0 +1,9 @@
# Performance Table
| Implementation                             | Execution Time (ms) | MFLOP/s     | Memory Bandwidth (B/s) | Memory Bandwidth (GB/s) |
|---------------------------------------------|---------------------|-------------|------------------------|-------------------------|
| CPU                                         | 985.325             | 519.6255043 | 6538401993             | 6.5384                  |
| CUDA: 1 thread, 1 block                     | 50918.5392          | 10.05527668 | 126524661.7            | 0.1265                  |
| CUDA: 256 threads, 1 block                  | 1689.7723           | 302.9994041 | 3812614838             | 3.8126                  |
| CUDA: 256 threads, many blocks              | 1219.1305           | 419.9714469 | 5284463758             | 5.2845                  |
| CUDA: 256 threads, many blocks + prefetch   | 329.9085            | 1551.945464 | 19527993198            | 19.528                  |

Old report table (deleted file)

@@ -1,10 +0,0 @@
# Report
| Implementation | Elapsed Time (ms) | MFLOP/s | Memory Bandwidth (GB/s) |
| --------------- | ----------------- | ------- | ----------------------- |
| CPU Vector Addition | xx | xx | xx |
| CUDA 1 thread, 1 thread block | 1,203.31 | 425.59 | 5.35 |
| CUDA 256 threads, 1 thread block | 1,212.36 | 422.76 | 5.31 |
| CUDA 256 threads/block, many thread blocks | 1,232.73 | 415.16 | 5.24 |
| CUDA 256 threads/block, many blocks, prefetching | 4.77 | 112,591.01 | 1,349.96 |

Source: 1 thread, 1 block version (modified)

@@ -1,15 +1,11 @@
 #include <iostream>
-#include <math.h>
+#include <cmath>
 #include <cuda_runtime.h>
 
-// CUDA kernel to add the elements of two arrays
-__global__
-void add(int n, float *x, float *y) {
-  int index = threadIdx.x + blockIdx.x * blockDim.x;
-  int stride = blockDim.x * gridDim.x;
-  for (int i = index; i < n; i += stride) {
-    y[i] = x[i] + y[i];
-  }
+// function to add the elements of two arrays
+__global__ void add(int n, float *x, float *y) {
+  for (int i = 0; i < n; i++)
+    y[i] = x[i] + y[i];
 }
 
 int main(void) {
@@ -27,22 +23,16 @@ int main(void) {
     y[i] = 2.0f;
   }
 
-  // Number of threads per block
-  int blockSize = 256;
-  // Number of blocks in the grid
-  int numBlocks = (N + blockSize - 1) / blockSize;
-
   // Run kernel on the elements on the GPU
-  add<<<numBlocks, blockSize>>>(N, x, y);
+  add<<<1, 1>>>(N, x, y);
 
   // Wait for GPU to finish before accessing on host
   cudaDeviceSynchronize();
 
   // Check for errors (all values should be 3.0f)
   float maxError = 0.0f;
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; i++)
     maxError = fmax(maxError, fabs(y[i] - 3.0f));
-  }
   std::cout << "Max error: " << maxError << std::endl;
 
   // Free memory
@@ -50,4 +40,4 @@ int main(void) {
   cudaFree(y);
 
   return 0;
 }

Source: 256 threads, 1 block version (modified)

@@ -1,15 +1,13 @@
 #include <iostream>
-#include <math.h>
+#include <cmath>
 #include <cuda_runtime.h>
 
-// CUDA kernel to add the elements of two arrays
-__global__
-void add(int n, float *x, float *y) {
-  int index = threadIdx.x + blockIdx.x * blockDim.x;
-  int stride = blockDim.x * gridDim.x;
-  for (int i = index; i < n; i += stride) {
-    y[i] = x[i] + y[i];
-  }
+// function to add the elements of two arrays
+__global__ void add(int n, float *x, float *y) {
+  int index = threadIdx.x;
+  int stride = blockDim.x;
+  for (int i = index; i < n; i += stride)
+    y[i] = x[i] + y[i];
 }
 
 int main(void) {
@@ -27,22 +25,16 @@ int main(void) {
     y[i] = 2.0f;
   }
 
-  // Number of threads per block
-  int blockSize = 256;
-  // Number of blocks in the grid
-  int numBlocks = (N + blockSize - 1) / blockSize;
-
-  // Run kernel on the elements on the GPU
-  add<<<numBlocks, blockSize>>>(N, x, y);
+  // Run kernel on the elements on the GPU with 256 threads
+  add<<<1, 256>>>(N, x, y);
 
   // Wait for GPU to finish before accessing on host
   cudaDeviceSynchronize();
 
   // Check for errors (all values should be 3.0f)
   float maxError = 0.0f;
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; i++)
     maxError = fmax(maxError, fabs(y[i] - 3.0f));
-  }
   std::cout << "Max error: " << maxError << std::endl;
 
   // Free memory

Source: 256 threads, many blocks version (modified)

@@ -1,15 +1,13 @@
 #include <iostream>
-#include <math.h>
+#include <cmath>
 #include <cuda_runtime.h>
 
-// CUDA kernel to add the elements of two arrays
-__global__
-void add(int n, float *x, float *y) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  for (int i = index; i < n; i += stride) {
-    y[i] = x[i] + y[i];
-  }
+// function to add the elements of two arrays
+__global__ void add(int n, float *x, float *y) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = index; i < n; i += stride)
+    y[i] = x[i] + y[i];
 }
 
 int main(void) {
@@ -21,31 +19,30 @@ int main(void) {
   cudaMallocManaged(&x, N * sizeof(float));
   cudaMallocManaged(&y, N * sizeof(float));
 
-  // Initialize x and y arrays on the host
+  // initialize x and y arrays on the host
   for (int i = 0; i < N; i++) {
     x[i] = 1.0f;
     y[i] = 2.0f;
   }
 
   // Number of threads per block
-  int blockSize = 256;
+  int threadsPerBlock = 256;
   // Number of blocks in the grid
-  int numBlocks = (N + blockSize - 1) / blockSize;
+  int numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
 
-  // Print the number of thread blocks
-  std::cout << "Number of thread blocks: " << numBlocks << std::endl;
+  // Print out the number of thread blocks
+  std::cout << "Number of thread blocks: " << numberOfBlocks << std::endl;
 
-  // Run kernel on the elements on the GPU
-  add<<<numBlocks, blockSize>>>(N, x, y);
+  // Run kernel on the elements on the GPU with multiple blocks and threads
+  add<<<numberOfBlocks, threadsPerBlock>>>(N, x, y);
 
   // Wait for GPU to finish before accessing on host
   cudaDeviceSynchronize();
 
   // Check for errors (all values should be 3.0f)
   float maxError = 0.0f;
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; i++)
     maxError = fmax(maxError, fabs(y[i] - 3.0f));
-  }
   std::cout << "Max error: " << maxError << std::endl;
 
   // Free memory

Source: 256 threads, many blocks with prefetch version (modified)

@@ -1,15 +1,13 @@
 #include <iostream>
-#include <math.h>
+#include <cmath>
 #include <cuda_runtime.h>
 
-// CUDA kernel to add the elements of two arrays
-__global__
-void add(int n, float *x, float *y) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  for (int i = index; i < n; i += stride) {
-    y[i] = x[i] + y[i];
-  }
+// function to add the elements of two arrays
+__global__ void add(int n, float *x, float *y) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = index; i < n; i += stride)
+    y[i] = x[i] + y[i];
 }
 
 int main(void) {
@@ -21,35 +19,35 @@ int main(void) {
   cudaMallocManaged(&x, N * sizeof(float));
   cudaMallocManaged(&y, N * sizeof(float));
 
-  // Initialize x and y arrays on the host
+  // initialize x and y arrays on the host
   for (int i = 0; i < N; i++) {
     x[i] = 1.0f;
     y[i] = 2.0f;
   }
 
+  // Prefetch memory to the GPU
   int deviceID = 0;
-  cudaMemPrefetchAsync(x, N * sizeof(float), deviceID);
-  cudaMemPrefetchAsync(y, N * sizeof(float), deviceID);
+  cudaMemPrefetchAsync((void *)x, N * sizeof(float), deviceID);
+  cudaMemPrefetchAsync((void *)y, N * sizeof(float), deviceID);
 
   // Number of threads per block
-  int blockSize = 256;
+  int threadsPerBlock = 256;
   // Number of blocks in the grid
-  int numBlocks = (N + blockSize - 1) / blockSize;
+  int numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
 
-  // Print the number of thread blocks
-  std::cout << "Number of thread blocks: " << numBlocks << std::endl;
+  // Print out the number of thread blocks
+  std::cout << "Number of thread blocks: " << numberOfBlocks << std::endl;
 
-  // Run kernel on the elements on the GPU
-  add<<<numBlocks, blockSize>>>(N, x, y);
+  // Run kernel on the elements on the GPU with multiple blocks and threads
+  add<<<numberOfBlocks, threadsPerBlock>>>(N, x, y);
 
   // Wait for GPU to finish before accessing on host
   cudaDeviceSynchronize();
 
   // Check for errors (all values should be 3.0f)
   float maxError = 0.0f;
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; i++)
     maxError = fmax(maxError, fabs(y[i] - 3.0f));
-  }
   std::cout << "Max error: " << maxError << std::endl;
 
   // Free memory