Added raw outputs of profiler runs
This commit is contained in:
parent
5e444713ca
commit
9471fdae86
210
outputs.md
Normal file
210
outputs.md
Normal file
@ -0,0 +1,210 @@
|
||||
# Profiler Outputs
|
||||
|
||||
CPU-Only
|
||||
---
|
||||
|
||||
> [!WARNING]
|
||||
> Output not yet recorded
|
||||
|
||||
CUDA: 1 Thread, 1 Block
|
||||
---
|
||||
|
||||
```
|
||||
uzylol@nid001076:/pscratch/sd/u/uzylol/cuda_vecadd> nsys nvprof vecadd_gpu_1t
|
||||
WARNING: vecadd_gpu_1t and any of its children processes will be profiled.
|
||||
|
||||
Max error: 0
|
||||
Generating '/tmp/nsys-report-383f.qdstrm'
|
||||
[1/7] [========================100%] report1.nsys-rep
|
||||
[2/7] [========================100%] report1.sqlite
|
||||
[3/7] Executing 'nvtx_sum' stats report
|
||||
SKIPPED: /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite does not contain NV Tools Extension (NVTX) data.
|
||||
[4/7] Executing 'cuda_api_sum' stats report
|
||||
|
||||
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
|
||||
-------- --------------- --------- --------------- --------------- ------------- ------------- ------------- ----------------------
|
||||
68.1 1,203,306,047 1 1,203,306,047.0 1,203,306,047.0 1,203,306,047 1,203,306,047 0.0 cudaDeviceSynchronize
|
||||
19.6 346,677,337 2 173,338,668.5 173,338,668.5 65,046 346,612,291 245,045,906.9 cudaMallocManaged
|
||||
11.1 195,868,284 2 97,934,142.0 97,934,142.0 68,440,871 127,427,413 41,709,783.8 cudaFree
|
||||
1.2 21,925,779 1 21,925,779.0 21,925,779.0 21,925,779 21,925,779 0.0 cudaLaunchKernel
|
||||
0.0 1,463 1 1,463.0 1,463.0 1,463 1,463 0.0 cuModuleGetLoadingMode
|
||||
|
||||
[5/7] Executing 'cuda_gpu_kern_sum' stats report
|
||||
|
||||
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
|
||||
-------- --------------- --------- --------------- --------------- ------------- ------------- ----------- --------------------------
|
||||
100.0 1,203,302,431 1 1,203,302,431.0 1,203,302,431.0 1,203,302,431 1,203,302,431 0.0 add(int, float *, float *)
|
||||
|
||||
[6/7] Executing 'cuda_gpu_mem_time_sum' stats report
|
||||
|
||||
Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation
|
||||
-------- --------------- ------- -------- -------- -------- -------- ----------- ------------------------------------
|
||||
80.0 446,872,156 152,098 2,938.1 2,175.0 1,663 41,471 3,720.9 [CUDA memcpy Unified Host-to-Device]
|
||||
20.0 111,554,845 12,282 9,082.8 3,215.5 1,726 49,504 12,422.3 [CUDA memcpy Unified Device-to-Host]
|
||||
|
||||
[7/7] Executing 'cuda_gpu_mem_size_sum' stats report
|
||||
|
||||
Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation
|
||||
---------- ------- -------- -------- -------- -------- ----------- ------------------------------------
|
||||
4,075.237 152,098 0.027 0.008 0.004 1.040 0.099 [CUDA memcpy Unified Host-to-Device]
|
||||
2,147.222 12,282 0.175 0.033 0.004 1.044 0.301 [CUDA memcpy Unified Device-to-Host]
|
||||
|
||||
Generated:
|
||||
/pscratch/sd/u/uzylol/cuda_vecadd/report1.nsys-rep
|
||||
/pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite
|
||||
==============================================================================
|
||||
Runtime for 1 thread block, 1 thread: 1,203,302,431 ns or 1.20 s
|
||||
```
|
||||
|
||||
CUDA: 256 Threads, 1 Block
|
||||
---
|
||||
|
||||
```
|
||||
uzylol@nid001220:/pscratch/sd/u/uzylol/cuda_vecadd> nsys nvprof ./vecadd_gpu_256t
|
||||
WARNING: vecadd_gpu_256t and any of its children processes will be profiled.
|
||||
|
||||
Max error: 0
|
||||
Generating '/tmp/nsys-report-e1e4.qdstrm'
|
||||
[1/7] [========================100%] report1.nsys-rep
|
||||
[2/7] [========================100%] report1.sqlite
|
||||
[3/7] Executing 'nvtx_sum' stats report
|
||||
SKIPPED: /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite does not contain NV Tools Extension (NVTX) data.
|
||||
[4/7] Executing 'cuda_api_sum' stats report
|
||||
|
||||
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
|
||||
-------- --------------- --------- --------------- --------------- ------------- ------------- ------------- ----------------------
|
||||
64.8 1,212,368,064 1 1,212,368,064.0 1,212,368,064.0 1,212,368,064 1,212,368,064 0.0 cudaDeviceSynchronize
|
||||
21.7 406,008,370 2 203,004,185.0 203,004,185.0 66,418 405,941,952 286,997,342.4 cudaMallocManaged
|
||||
10.6 197,844,224 2 98,922,112.0 98,922,112.0 69,452,260 128,391,964 41,676,664.4 cudaFree
|
||||
3.0 55,299,012 1 55,299,012.0 55,299,012.0 55,299,012 55,299,012 0.0 cudaLaunchKernel
|
||||
0.0 1,152 1 1,152.0 1,152.0 1,152 1,152 0.0 cuModuleGetLoadingMode
|
||||
|
||||
[5/7] Executing 'cuda_gpu_kern_sum' stats report
|
||||
|
||||
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
|
||||
-------- --------------- --------- --------------- --------------- ------------- ------------- ----------- --------------------------
|
||||
100.0 1,212,362,968 1 1,212,362,968.0 1,212,362,968.0 1,212,362,968 1,212,362,968 0.0 add(int, float *, float *)
|
||||
|
||||
[6/7] Executing 'cuda_gpu_mem_time_sum' stats report
|
||||
|
||||
Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation
|
||||
-------- --------------- ------- -------- -------- -------- -------- ----------- ------------------------------------
|
||||
80.7 466,018,043 157,467 2,959.5 2,239.0 1,663 49,534 3,710.5 [CUDA memcpy Unified Host-to-Device]
|
||||
19.3 111,109,764 12,288 9,042.1 3,199.5 1,727 48,384 12,378.9 [CUDA memcpy Unified Device-to-Host]
|
||||
|
||||
[7/7] Executing 'cuda_gpu_mem_size_sum' stats report
|
||||
|
||||
Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation
|
||||
---------- ------- -------- -------- -------- -------- ----------- ------------------------------------
|
||||
4,174.426 157,467 0.027 0.008 0.004 1.044 0.098 [CUDA memcpy Unified Host-to-Device]
|
||||
2,147.484 12,288 0.175 0.033 0.004 1.044 0.301 [CUDA memcpy Unified Device-to-Host]
|
||||
|
||||
Generated:
|
||||
/pscratch/sd/u/uzylol/cuda_vecadd/report1.nsys-rep
|
||||
/pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite
|
||||
==============================================================================
|
||||
Runtime for 1 thread block, 256 threads: 1,212,362,968 ns or 1.21 s
|
||||
```
|
||||
|
||||
CUDA: 256 Threads, Many Blocks
|
||||
---
|
||||
|
||||
```
|
||||
uzylol@nid001220:/pscratch/sd/u/uzylol/cuda_vecadd> nsys nvprof vecadd_gpu_256t_mb
|
||||
WARNING: vecadd_gpu_256t_mb and any of its children processes will be profiled.
|
||||
|
||||
Number of thread blocks: 2097152
|
||||
Max error: 0
|
||||
Generating '/tmp/nsys-report-b2ed.qdstrm'
|
||||
[1/7] [========================100%] report1.nsys-rep
|
||||
[2/7] [========================100%] report1.sqlite
|
||||
[3/7] Executing 'nvtx_sum' stats report
|
||||
SKIPPED: /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite does not contain NV Tools Extension (NVTX) data.
|
||||
[4/7] Executing 'cuda_api_sum' stats report
|
||||
|
||||
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
|
||||
-------- --------------- --------- --------------- --------------- ------------- ------------- ------------- ----------------------
|
||||
66.6 1,232,738,921 1 1,232,738,921.0 1,232,738,921.0 1,232,738,921 1,232,738,921 0.0 cudaDeviceSynchronize
|
||||
21.6 399,715,681 2 199,857,840.5 199,857,840.5 59,043 399,656,638 282,558,169.2 cudaMallocManaged
|
||||
10.6 196,866,027 2 98,433,013.5 98,433,013.5 68,696,501 128,169,526 42,053,779.3 cudaFree
|
||||
1.2 22,183,639 1 22,183,639.0 22,183,639.0 22,183,639 22,183,639 0.0 cudaLaunchKernel
|
||||
0.0 1,203 1 1,203.0 1,203.0 1,203 1,203 0.0 cuModuleGetLoadingMode
|
||||
|
||||
[5/7] Executing 'cuda_gpu_kern_sum' stats report
|
||||
|
||||
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
|
||||
-------- --------------- --------- --------------- --------------- ------------- ------------- ----------- --------------------------
|
||||
100.0 1,232,731,745 1 1,232,731,745.0 1,232,731,745.0 1,232,731,745 1,232,731,745 0.0 add(int, float *, float *)
|
||||
|
||||
[6/7] Executing 'cuda_gpu_mem_time_sum' stats report
|
||||
|
||||
Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation
|
||||
-------- --------------- ------- -------- -------- -------- -------- ----------- ------------------------------------
|
||||
80.9 465,307,045 159,535 2,916.6 2,175.0 1,663 41,312 3,669.0 [CUDA memcpy Unified Host-to-Device]
|
||||
19.1 110,116,353 12,288 8,961.3 3,167.5 1,727 48,415 12,275.7 [CUDA memcpy Unified Device-to-Host]
|
||||
|
||||
[7/7] Executing 'cuda_gpu_mem_size_sum' stats report
|
||||
|
||||
Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation
|
||||
---------- ------- -------- -------- -------- -------- ----------- ------------------------------------
|
||||
4,195.918 159,535 0.026 0.008 0.004 1.044 0.098 [CUDA memcpy Unified Host-to-Device]
|
||||
2,147.484 12,288 0.175 0.033 0.004 1.044 0.301 [CUDA memcpy Unified Device-to-Host]
|
||||
|
||||
Generated:
|
||||
/pscratch/sd/u/uzylol/cuda_vecadd/report1.nsys-rep
|
||||
/pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite
|
||||
==============================================================================
|
||||
Runtime for many thread blocks, 256 threads: 1,232,731,745 ns or 1.23 s
|
||||
```
|
||||
|
||||
CUDA: 256 Threads, Many Blocks with Prefetch
|
||||
---
|
||||
|
||||
```
|
||||
uzylol@nid001132:/pscratch/sd/u/uzylol/cuda_vecadd> nsys nvprof vecadd_gpu_256t_mb_prefetch
|
||||
WARNING: vecadd_gpu_256t_mb_prefetch and any of its children processes will be profiled.
|
||||
|
||||
Number of thread blocks: 2097152
|
||||
Max error: 0
|
||||
Generating '/tmp/nsys-report-3a53.qdstrm'
|
||||
[1/7] [========================100%] report1.nsys-rep
|
||||
[2/7] [========================100%] report1.sqlite
|
||||
[3/7] Executing 'nvtx_sum' stats report
|
||||
SKIPPED: /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite does not contain NV Tools Extension (NVTX) data.
|
||||
[4/7] Executing 'cuda_api_sum' stats report
|
||||
|
||||
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
|
||||
-------- --------------- --------- ------------- ------------- ----------- ----------- ------------- ----------------------
|
||||
51.2 429,287,905 2 214,643,952.5 214,643,952.5 37,182 429,250,723 303,499,805.4 cudaMallocManaged
|
||||
22.5 188,776,511 2 94,388,255.5 94,388,255.5 60,886,335 127,890,176 47,378,870.3 cudaFree
|
||||
13.3 111,100,135 1 111,100,135.0 111,100,135.0 111,100,135 111,100,135 0.0 cudaLaunchKernel
|
||||
12.4 103,925,795 2 51,962,897.5 51,962,897.5 410,784 103,515,011 72,905,698.1 cudaMemPrefetchAsync
|
||||
0.6 4,775,402 1 4,775,402.0 4,775,402.0 4,775,402 4,775,402 0.0 cudaDeviceSynchronize
|
||||
0.0 1,212 1 1,212.0 1,212.0 1,212 1,212 0.0 cuModuleGetLoadingMode
|
||||
|
||||
[5/7] Executing 'cuda_gpu_kern_sum' stats report
|
||||
|
||||
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
|
||||
-------- --------------- --------- ----------- ----------- --------- --------- ----------- --------------------------
|
||||
100.0 4,773,208 1 4,773,208.0 4,773,208.0 4,773,208 4,773,208 0.0 add(int, float *, float *)
|
||||
|
||||
[6/7] Executing 'cuda_gpu_mem_time_sum' stats report
|
||||
|
||||
Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation
|
||||
-------- --------------- ------ -------- -------- -------- -------- ----------- ------------------------------------
|
||||
59.9 165,061,185 2,048 80,596.3 80,576.0 80,511 81,088 68.1 [CUDA memcpy Unified Host-to-Device]
|
||||
40.1 110,568,223 12,288 8,998.1 3,471.5 1,726 48,448 12,335.9 [CUDA memcpy Unified Device-to-Host]
|
||||
|
||||
[7/7] Executing 'cuda_gpu_mem_size_sum' stats report
|
||||
|
||||
Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation
|
||||
---------- ------ -------- -------- -------- -------- ----------- ------------------------------------
|
||||
4,294.967 2,048 2.097 2.097 2.097 2.097 0.000 [CUDA memcpy Unified Host-to-Device]
|
||||
2,147.484 12,288 0.175 0.033 0.004 1.044 0.301 [CUDA memcpy Unified Device-to-Host]
|
||||
|
||||
Generated:
|
||||
/pscratch/sd/u/uzylol/cuda_vecadd/report1.nsys-rep
|
||||
/pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite
|
||||
==============================================================================
|
||||
Runtime for many thread blocks, 256 threads, with prefetch: 4,773,208 ns or 0.0048 s
|
||||
```
|
||||
Loading…
Reference in New Issue
Block a user