From 3a15cd67b07c6ec396e88b972d808f33e7abc372 Mon Sep 17 00:00:00 2001
From: Uzair Mohammed
Date: Wed, 11 Dec 2024 22:28:58 -0800
Subject: [PATCH] Improved code, added output, performance table, and analysis
 questions

---
 Performance Table.xlsx         | Bin 0 -> 10790 bytes
 analysis.md                    |  42 +++++++++
 outputs.md                     | 159 +++++++++++++++++++--------
 performanceTable.md            |   9 ++
 report.md                      |  10 ---
 vecadd_gpu_1t.cu               |  24 ++---
 vecadd_gpu_256t.cu             |  26 ++----
 vecadd_gpu_256t_mb.cu          |  29 +++---
 vecadd_gpu_256t_mb_prefetch.cu |  34 ++++---
 9 files changed, 186 insertions(+), 147 deletions(-)
 create mode 100644 Performance Table.xlsx
 create mode 100644 analysis.md
 create mode 100644 performanceTable.md
 delete mode 100644 report.md

diff --git a/Performance Table.xlsx b/Performance Table.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..3d9fdf5cf0eaaa3b3a46eeb20b34d4e05ca156c9
GIT binary patch
literal 10790
-> [!WARNING]
-> Output not yet recorded
+985.325 ms
 ```
 
 CUDA: 1 Thread, 1 Block
 ---
 
 ```
-uzylol@nid001076:/pscratch/sd/u/uzylol/cuda_vecadd> nsys nvprof vecadd_gpu_1t
+uzylol@nid001133:/pscratch/sd/u/uzylol/cuda_vecadd> ./helper.sh vecadd_gpu_1t
+Choose an option:
+1. Build and profile
+2. Clean
+Enter your choice (1 or 2): 1
+Built executable: vecadd_gpu_1t
 WARNING: vecadd_gpu_1t and any of its children processes will be profiled.
 Max error: 0
-Generating '/tmp/nsys-report-383f.qdstrm'
+Generating '/tmp/nsys-report-135f.qdstrm'
 [1/7] [========================100%] report1.nsys-rep
 [2/7] [========================100%] report1.sqlite
 [3/7] Executing 'nvtx_sum' stats report
 SKIPPED: /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite does not contain NV Tools Extension (NVTX) data.
 [4/7] Executing 'cuda_api_sum' stats report
- Time (%)  Total Time (ns)  Num Calls      Avg (ns)         Med (ns)        Min (ns)       Max (ns)      StdDev (ns)            Name
- --------  ---------------  ---------  ---------------  ---------------  -------------  -------------  -------------  ----------------------
-     68.1    1,203,306,047          1  1,203,306,047.0  1,203,306,047.0  1,203,306,047  1,203,306,047            0.0  cudaDeviceSynchronize
-     19.6      346,677,337          2    173,338,668.5    173,338,668.5         65,046    346,612,291  245,045,906.9  cudaMallocManaged
-     11.1      195,868,284          2     97,934,142.0     97,934,142.0     68,440,871    127,427,413   41,709,783.8  cudaFree
-      1.2       21,925,779          1     21,925,779.0     21,925,779.0     21,925,779     21,925,779            0.0  cudaLaunchKernel
-      0.0            1,463          1          1,463.0          1,463.0          1,463          1,463            0.0  cuModuleGetLoadingMode
+ Time (%)  Total Time (ns)  Num Calls       Avg (ns)          Med (ns)         Min (ns)        Max (ns)      StdDev (ns)            Name
+ --------  ---------------  ---------  ----------------  ----------------  --------------  --------------  -------------  ----------------------
+     98.5   50,918,539,217          1  50,918,539,217.0  50,918,539,217.0  50,918,539,217  50,918,539,217            0.0  cudaDeviceSynchronize
+      0.6      335,502,692          2     167,751,346.0     167,751,346.0          70,647     335,432,045  237,136,318.7  cudaMallocManaged
+      0.5      247,516,551          1     247,516,551.0     247,516,551.0     247,516,551     247,516,551            0.0  cudaLaunchKernel
+      0.4      198,199,002          2      99,099,501.0      99,099,501.0      68,308,537     129,890,465   43,544,998.9  cudaFree
+      0.0            1,293          1           1,293.0           1,293.0           1,293           1,293            0.0  cuModuleGetLoadingMode
 [5/7] Executing 'cuda_gpu_kern_sum' stats report
- Time (%)  Total Time (ns)  Instances      Avg (ns)         Med (ns)        Min (ns)       Max (ns)     StdDev (ns)             Name
- --------  ---------------  ---------  ---------------  ---------------  -------------  -------------  -----------  --------------------------
-    100.0    1,203,302,431          1  1,203,302,431.0  1,203,302,431.0  1,203,302,431  1,203,302,431          0.0  add(int, float *, float *)
+ Time (%)  Total Time (ns)  Instances       Avg (ns)          Med (ns)         Min (ns)        Max (ns)     StdDev (ns)             Name
+ --------  ---------------  ---------  ----------------  ----------------  --------------  --------------  -----------  --------------------------
+    100.0   50,918,525,966          1  50,918,525,966.0  50,918,525,966.0  50,918,525,966  50,918,525,966          0.0  add(int, float *, float *)
 [6/7] Executing 'cuda_gpu_mem_time_sum' stats report
- Time (%)  Total Time (ns)   Count   Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)               Operation
- --------  ---------------  -------  --------  --------  --------  --------  -----------  ------------------------------------
-     80.0      446,872,156  152,098   2,938.1   2,175.0     1,663    41,471      3,720.9  [CUDA memcpy Unified 
Host-to-Device] - 20.0 111,554,845 12,282 9,082.8 3,215.5 1,726 49,504 12,422.3 [CUDA memcpy Unified Device-to-Host] + Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation + -------- --------------- ------ -------- -------- -------- -------- ----------- ------------------------------------ + 65.2 206,887,363 24,576 8,418.3 3,647.5 1,982 41,312 11,229.0 [CUDA memcpy Unified Host-to-Device] + 34.8 110,515,732 12,288 8,993.8 3,135.5 1,726 48,416 12,346.7 [CUDA memcpy Unified Device-to-Host] [7/7] Executing 'cuda_gpu_mem_size_sum' stats report - Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation - ---------- ------- -------- -------- -------- -------- ----------- ------------------------------------ - 4,075.237 152,098 0.027 0.008 0.004 1.040 0.099 [CUDA memcpy Unified Host-to-Device] - 2,147.222 12,282 0.175 0.033 0.004 1.044 0.301 [CUDA memcpy Unified Device-to-Host] + Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation + ---------- ------ -------- -------- -------- -------- ----------- ------------------------------------ + 4,294.967 24,576 0.175 0.033 0.004 1.044 0.301 [CUDA memcpy Unified Host-to-Device] + 2,147.484 12,288 0.175 0.033 0.004 1.044 0.301 [CUDA memcpy Unified Device-to-Host] Generated: /pscratch/sd/u/uzylol/cuda_vecadd/report1.nsys-rep /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite -============================================================================== -Runtime for 1 thread block, 1 thread: 1,203,302,431 ns or 1.2 s ``` CUDA: 256 Threads, One Block --- ``` -uzylol@nid001220:/pscratch/sd/u/uzylol/cuda_vecadd> nsys nvprof ./vecadd_gpu_256t +uzylol@nid001133:/pscratch/sd/u/uzylol/cuda_vecadd> ./helper.sh vecadd_gpu_256t +Choose an option: +1. Build and profile +2. Clean +Enter your choice (1 or 2): 1 +Built executable: vecadd_gpu_256t WARNING: vecadd_gpu_256t and any of its children processes will be profiled. 
Max error: 0 -Generating '/tmp/nsys-report-e1e4.qdstrm' +Generating '/tmp/nsys-report-a2a6.qdstrm' [1/7] [========================100%] report1.nsys-rep [2/7] [========================100%] report1.sqlite [3/7] Executing 'nvtx_sum' stats report @@ -73,49 +80,52 @@ SKIPPED: /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite does not contain NV To Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name -------- --------------- --------- --------------- --------------- ------------- ------------- ------------- ---------------------- - 64.8 1,212,368,064 1 1,212,368,064.0 1,212,368,064.0 1,212,368,064 1,212,368,064 0.0 cudaDeviceSynchronize - 21.7 406,008,370 2 203,004,185.0 203,004,185.0 66,418 405,941,952 286,997,342.4 cudaMallocManaged - 10.6 197,844,224 2 98,922,112.0 98,922,112.0 69,452,260 128,391,964 41,676,664.4 cudaFree - 3.0 55,299,012 1 55,299,012.0 55,299,012.0 55,299,012 55,299,012 0.0 cudaLaunchKernel - 0.0 1,152 1 1,152.0 1,152.0 1,152 1,152 0.0 cuModuleGetLoadingMode + 71.3 1,689,772,282 1 1,689,772,282.0 1,689,772,282.0 1,689,772,282 1,689,772,282 0.0 cudaDeviceSynchronize + 13.2 312,348,647 2 156,174,323.5 156,174,323.5 66,999 312,281,648 220,769,095.5 cudaMallocManaged + 8.4 200,018,555 2 100,009,277.5 100,009,277.5 70,954,629 129,063,926 41,089,478.0 cudaFree + 7.0 166,925,228 1 166,925,228.0 166,925,228.0 166,925,228 166,925,228 0.0 cudaLaunchKernel + 0.0 1,031 1 1,031.0 1,031.0 1,031 1,031 0.0 cuModuleGetLoadingMode [5/7] Executing 'cuda_gpu_kern_sum' stats report Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name -------- --------------- --------- --------------- --------------- ------------- ------------- ----------- -------------------------- - 100.0 1,212,362,968 1 1,212,362,968.0 1,212,362,968.0 1,212,362,968 1,212,362,968 0.0 add(int, float *, float *) + 100.0 1,689,768,383 1 1,689,768,383.0 1,689,768,383.0 1,689,768,383 1,689,768,383 0.0 add(int, float *, float *) [6/7] Executing 'cuda_gpu_mem_time_sum' stats report - Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation - -------- --------------- ------- -------- -------- -------- -------- ----------- ------------------------------------ - 80.7 466,018,043 157,467 2,959.5 2,239.0 1,663 49,534 3,710.5 [CUDA memcpy Unified Host-to-Device] - 19.3 111,109,764 12,288 9,042.1 3,199.5 1,727 48,384 12,378.9 [CUDA memcpy Unified Device-to-Host] + Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation + -------- --------------- ------ -------- -------- -------- -------- ----------- ------------------------------------ + 65.4 208,609,295 24,576 8,488.3 3,920.5 1,982 41,568 11,229.4 [CUDA memcpy Unified Host-to-Device] + 34.6 110,590,888 12,288 8,999.9 3,167.5 1,726 48,384 12,351.0 [CUDA memcpy Unified Device-to-Host] [7/7] Executing 'cuda_gpu_mem_size_sum' stats report - Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation - ---------- ------- -------- -------- -------- -------- ----------- ------------------------------------ - 4,174.426 157,467 0.027 0.008 0.004 1.044 0.098 [CUDA memcpy Unified Host-to-Device] - 2,147.484 12,288 0.175 0.033 0.004 1.044 0.301 [CUDA memcpy Unified Device-to-Host] + Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation + ---------- ------ -------- -------- -------- -------- ----------- ------------------------------------ + 4,294.967 24,576 0.175 0.033 0.004 1.044 0.301 [CUDA memcpy Unified Host-to-Device] + 
2,147.484 12,288 0.175 0.033 0.004 1.044 0.301 [CUDA memcpy Unified Device-to-Host] Generated: /pscratch/sd/u/uzylol/cuda_vecadd/report1.nsys-rep /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite -============================================================================== -Runtime for 1 thread block, 256 threads: 1,212,362,968 ns or 1.21 s ``` CUDA: 256 Threads, Many Blocks --- ``` -uzylol@nid001220:/pscratch/sd/u/uzylol/cuda_vecadd> nsys nvprof vecadd_gpu_256t_mb +uzylol@nid001133:/pscratch/sd/u/uzylol/cuda_vecadd> ./helper.sh vecadd_gpu_256t_mb +Choose an option: +1. Build and profile +2. Clean +Enter your choice (1 or 2): 1 +Built executable: vecadd_gpu_256t_mb WARNING: vecadd_gpu_256t_mb and any of its children processes will be profiled. Number of thread blocks: 2097152 Max error: 0 -Generating '/tmp/nsys-report-b2ed.qdstrm' +Generating '/tmp/nsys-report-cbad.qdstrm' [1/7] [========================100%] report1.nsys-rep [2/7] [========================100%] report1.sqlite [3/7] Executing 'nvtx_sum' stats report @@ -124,49 +134,52 @@ SKIPPED: /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite does not contain NV To Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name -------- --------------- --------- --------------- --------------- ------------- ------------- ------------- ---------------------- - 66.6 1,232,738,921 1 1,232,738,921.0 1,232,738,921.0 1,232,738,921 1,232,738,921 0.0 cudaDeviceSynchronize - 21.6 399,715,681 2 199,857,840.5 199,857,840.5 59,043 399,656,638 282,558,169.2 cudaMallocManaged - 10.6 196,866,027 2 98,433,013.5 98,433,013.5 68,696,501 128,169,526 42,053,779.3 cudaFree - 1.2 22,183,639 1 22,183,639.0 22,183,639.0 22,183,639 22,183,639 0.0 cudaLaunchKernel - 0.0 1,203 1 1,203.0 1,203.0 1,203 1,203 0.0 cuModuleGetLoadingMode + 68.5 1,219,130,474 1 1,219,130,474.0 1,219,130,474.0 1,219,130,474 1,219,130,474 0.0 cudaDeviceSynchronize + 17.8 316,925,924 2 158,462,962.0 158,462,962.0 55,287 316,870,637 224,022,282.4 cudaMallocManaged + 11.4 203,677,016 2 101,838,508.0 101,838,508.0 68,914,217 134,762,799 46,561,978.9 cudaFree + 2.3 40,488,959 1 40,488,959.0 40,488,959.0 40,488,959 40,488,959 0.0 cudaLaunchKernel + 0.0 1,072 1 1,072.0 1,072.0 1,072 1,072 0.0 cuModuleGetLoadingMode [5/7] Executing 'cuda_gpu_kern_sum' stats report Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name -------- --------------- --------- --------------- --------------- ------------- ------------- ----------- -------------------------- - 100.0 1,232,731,745 1 1,232,731,745.0 1,232,731,745.0 1,232,731,745 1,232,731,745 0.0 add(int, float *, float *) + 100.0 1,219,123,989 1 1,219,123,989.0 1,219,123,989.0 1,219,123,989 1,219,123,989 0.0 add(int, float *, float *) [6/7] Executing 'cuda_gpu_mem_time_sum' stats report Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation -------- --------------- ------- -------- -------- -------- -------- ----------- ------------------------------------ - 80.9 465,307,045 159,535 2,916.6 2,175.0 1,663 41,312 3,669.0 [CUDA memcpy Unified Host-to-Device] - 19.1 110,116,353 12,288 8,961.3 3,167.5 1,727 48,415 12,275.7 [CUDA memcpy Unified Device-to-Host] + 80.3 448,633,690 155,241 2,889.9 2,175.0 1,663 63,231 3,651.9 [CUDA memcpy Unified Host-to-Device] + 19.7 109,875,418 12,286 8,943.1 3,214.5 1,726 48,352 12,278.7 [CUDA memcpy Unified Device-to-Host] [7/7] Executing 'cuda_gpu_mem_size_sum' stats report Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev 
(MB) Operation ---------- ------- -------- -------- -------- -------- ----------- ------------------------------------ - 4,195.918 159,535 0.026 0.008 0.004 1.044 0.098 [CUDA memcpy Unified Host-to-Device] - 2,147.484 12,288 0.175 0.033 0.004 1.044 0.301 [CUDA memcpy Unified Device-to-Host] + 3,996.164 155,241 0.026 0.008 0.004 1.044 0.097 [CUDA memcpy Unified Host-to-Device] + 2,147.418 12,286 0.175 0.033 0.004 1.044 0.301 [CUDA memcpy Unified Device-to-Host] Generated: /pscratch/sd/u/uzylol/cuda_vecadd/report1.nsys-rep /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite -============================================================================== -Runtime for many thread blocks, 256 threads: 1,232,731,745 ns or 1.23 s ``` CUDA: 256 Threads, Many Blocks with Prefetch --- ``` -uzylol@nid001132:/pscratch/sd/u/uzylol/cuda_vecadd> nsys nvprof vecadd_gpu_256t_mb_prefetch +uzylol@nid001133:/pscratch/sd/u/uzylol/cuda_vecadd> ./helper.sh vecadd_gpu_256t_mb_prefetch +Choose an option: +1. Build and profile +2. Clean +Enter your choice (1 or 2): 1 +Built executable: vecadd_gpu_256t_mb_prefetch WARNING: vecadd_gpu_256t_mb_prefetch and any of its children processes will be profiled. Number of thread blocks: 2097152 Max error: 0 -Generating '/tmp/nsys-report-3a53.qdstrm' +Generating '/tmp/nsys-report-3752.qdstrm' [1/7] [========================100%] report1.nsys-rep [2/7] [========================100%] report1.sqlite [3/7] Executing 'nvtx_sum' stats report @@ -175,25 +188,25 @@ SKIPPED: /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite does not contain NV To Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name -------- --------------- --------- ------------- ------------- ----------- ----------- ------------- ---------------------- - 51.2 429,287,905 2 214,643,952.5 214,643,952.5 37,182 429,250,723 303,499,805.4 cudaMallocManaged - 22.5 188,776,511 2 94,388,255.5 94,388,255.5 60,886,335 127,890,176 47,378,870.3 cudaFree - 13.3 111,100,135 1 111,100,135.0 111,100,135.0 111,100,135 111,100,135 0.0 cudaLaunchKernel - 12.4 103,925,795 2 51,962,897.5 51,962,897.5 410,784 103,515,011 72,905,698.1 cudaMemPrefetchAsync - 0.6 4,775,402 1 4,775,402.0 4,775,402.0 4,775,402 4,775,402 0.0 cudaDeviceSynchronize - 0.0 1,212 1 1,212.0 1,212.0 1,212 1,212 0.0 cuModuleGetLoadingMode + 43.5 329,908,462 2 164,954,231.0 164,954,231.0 50,698 329,857,764 233,208,812.9 cudaMallocManaged + 25.1 190,030,072 2 95,015,036.0 95,015,036.0 62,035,958 127,994,114 46,639,459.4 cudaFree + 17.2 130,653,654 1 130,653,654.0 130,653,654.0 130,653,654 130,653,654 0.0 cudaLaunchKernel + 13.5 102,455,160 2 51,227,580.0 51,227,580.0 377,189 102,077,971 71,913,312.6 cudaMemPrefetchAsync + 0.6 4,773,525 1 4,773,525.0 4,773,525.0 4,773,525 4,773,525 0.0 cudaDeviceSynchronize + 0.0 1,312 1 1,312.0 1,312.0 1,312 1,312 0.0 cuModuleGetLoadingMode [5/7] Executing 'cuda_gpu_kern_sum' stats report Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name -------- --------------- --------- ----------- ----------- --------- --------- ----------- -------------------------- - 100.0 4,773,208 1 4,773,208.0 4,773,208.0 4,773,208 4,773,208 0.0 add(int, float *, float *) + 100.0 4,771,134 1 4,771,134.0 4,771,134.0 4,771,134 4,771,134 0.0 add(int, float *, float *) [6/7] Executing 'cuda_gpu_mem_time_sum' stats report Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation -------- --------------- ------ -------- -------- -------- -------- ----------- 
------------------------------------ - 59.9 165,061,185 2,048 80,596.3 80,576.0 80,511 81,088 68.1 [CUDA memcpy Unified Host-to-Device] - 40.1 110,568,223 12,288 8,998.1 3,471.5 1,726 48,448 12,335.9 [CUDA memcpy Unified Device-to-Host] + 59.9 165,077,448 2,048 80,604.2 80,576.0 80,511 81,055 68.8 [CUDA memcpy Unified Host-to-Device] + 40.1 110,650,214 12,288 9,004.7 3,135.5 1,726 48,352 12,355.0 [CUDA memcpy Unified Device-to-Host] [7/7] Executing 'cuda_gpu_mem_size_sum' stats report @@ -205,6 +218,14 @@ SKIPPED: /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite does not contain NV To Generated: /pscratch/sd/u/uzylol/cuda_vecadd/report1.nsys-rep /pscratch/sd/u/uzylol/cuda_vecadd/report1.sqlite -============================================================================== -Runtime for ???: 4,773,208 ns or 0.0048 seconds ``` + +## Prefetch Thread Count + +``` +uzylol@nid001013:/pscratch/sd/u/uzylol/cuda_vecadd> compute-sanitizer vecadd_gpu_256t_mb_prefetch +========= COMPUTE-SANITIZER +Number of thread blocks: 2097152 +Max error: 0 +========= ERROR SUMMARY: 0 errors +``` \ No newline at end of file diff --git a/performanceTable.md b/performanceTable.md new file mode 100644 index 0000000..b913ccb --- /dev/null +++ b/performanceTable.md @@ -0,0 +1,9 @@ +# Performance Table + +| Implementation | Execution Time (ms) | MFLOPS/s | Mem Bandwidth B/s | Mem Bandwidth GB/s | +|---------------------|---------------------|--------------|-------------------|--------------------| +| CPU | 985.325 | 519.6255043 | 6538401993 | 6.5384 | +| CUDA: 1t 1b | 50918.5392 | 10.05527668 | 126524661.7 | 0.1265 | +| CUDA: 256t 1b | 1689.7723 | 302.9994041 | 3812614838 | 3.8126 | +| CUDA: 256t many b | 1219.1305 | 419.9714469 | 5284463758 | 5.2845 | +| CUDA prefetch | 329.9085 | 1551.945464 | 19527993198 | 19.528 | \ No newline at end of file diff --git a/report.md b/report.md deleted file mode 100644 index fe236ad..0000000 --- a/report.md +++ /dev/null @@ -1,10 +0,0 @@ -# Report - -| Implementation | Elapsed Time (ms) | MFLOP/s | Memory Bandwidth (GB/s) | -| --------------- | ----------------- | ------- | ----------------------- | -| CPU Vector Addition | xx | xx | xx | -| CUDA 1 thread, 1 thread block | 1,203.31 | 425.59 | 5.35 | -| CUDA 256 threads, 1 thread block | 1,212.36 | 422.76 | 5.31 | -| CUDA 256 threads/block, many thread blocks | 1,232.73 | 415.16 | 5.24 | -| CUDA 256 threads/block, many blocks, prefetching | 4.77 | 112,591.01 | 1,349.96 | - diff --git a/vecadd_gpu_1t.cu b/vecadd_gpu_1t.cu index 79feb25..5e83d59 100644 --- a/vecadd_gpu_1t.cu +++ b/vecadd_gpu_1t.cu @@ -1,15 +1,11 @@ #include -#include +#include #include -// CUDA kernel to add the elements of two arrays -__global__ -void add(int n, float *x, float *y) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < n; i += stride) { +// function to add the elements of two arrays +__global__ void add(int n, float *x, float *y) { + for (int i = 0; i < n; i++) y[i] = x[i] + y[i]; - } } int main(void) { @@ -27,22 +23,16 @@ int main(void) { y[i] = 2.0f; } - // Number of threads per block - int blockSize = 256; - // Number of blocks in the grid - int numBlocks = (N + blockSize - 1) / blockSize; - // Run kernel on the elements on the GPU - add<<>>(N, x, y); + add<<<1, 1>>>(N, x, y); // Wait for GPU to finish before accessing on host cudaDeviceSynchronize(); // Check for errors (all values should be 3.0f) float maxError = 0.0f; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) 
maxError = fmax(maxError, fabs(y[i] - 3.0f)); - } std::cout << "Max error: " << maxError << std::endl; // Free memory @@ -50,4 +40,4 @@ int main(void) { cudaFree(y); return 0; -} +} \ No newline at end of file diff --git a/vecadd_gpu_256t.cu b/vecadd_gpu_256t.cu index 845a682..090be88 100644 --- a/vecadd_gpu_256t.cu +++ b/vecadd_gpu_256t.cu @@ -1,15 +1,13 @@ #include -#include +#include #include -// CUDA kernel to add the elements of two arrays -__global__ -void add(int n, float *x, float *y) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < n; i += stride) { +// function to add the elements of two arrays +__global__ void add(int n, float *x, float *y) { + int index = threadIdx.x; + int stride = blockDim.x; + for (int i = index; i < n; i += stride) y[i] = x[i] + y[i]; - } } int main(void) { @@ -27,22 +25,16 @@ int main(void) { y[i] = 2.0f; } - // Number of threads per block - int blockSize = 256; - // Number of blocks in the grid - int numBlocks = (N + blockSize - 1) / blockSize; - - // Run kernel on the elements on the GPU - add<<>>(N, x, y); + // Run kernel on the elements on the GPU with 256 threads + add<<<1, 256>>>(N, x, y); // Wait for GPU to finish before accessing on host cudaDeviceSynchronize(); // Check for errors (all values should be 3.0f) float maxError = 0.0f; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) maxError = fmax(maxError, fabs(y[i] - 3.0f)); - } std::cout << "Max error: " << maxError << std::endl; // Free memory diff --git a/vecadd_gpu_256t_mb.cu b/vecadd_gpu_256t_mb.cu index c7fe687..c34e260 100644 --- a/vecadd_gpu_256t_mb.cu +++ b/vecadd_gpu_256t_mb.cu @@ -1,15 +1,13 @@ #include -#include +#include #include -// CUDA kernel to add the elements of two arrays -__global__ -void add(int n, float *x, float *y) { - int index = blockIdx.x * blockDim.x + threadIdx.x; +// function to add the elements of two arrays +__global__ void add(int n, float *x, float *y) { + int index = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; - for (int i = index; i < n; i += stride) { + for (int i = index; i < n; i += stride) y[i] = x[i] + y[i]; - } } int main(void) { @@ -21,31 +19,30 @@ int main(void) { cudaMallocManaged(&x, N * sizeof(float)); cudaMallocManaged(&y, N * sizeof(float)); - // Initialize x and y arrays on the host + // initialize x and y arrays on the host for (int i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; } // Number of threads per block - int blockSize = 256; + int threadsPerBlock = 256; // Number of blocks in the grid - int numBlocks = (N + blockSize - 1) / blockSize; + int numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock; - // Print the number of thread blocks - std::cout << "Number of thread blocks: " << numBlocks << std::endl; + // Print out the number of thread blocks + std::cout << "Number of thread blocks: " << numberOfBlocks << std::endl; - // Run kernel on the elements on the GPU - add<<>>(N, x, y); + // Run kernel on the elements on the GPU with multiple blocks and threads + add<<>>(N, x, y); // Wait for GPU to finish before accessing on host cudaDeviceSynchronize(); // Check for errors (all values should be 3.0f) float maxError = 0.0f; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) maxError = fmax(maxError, fabs(y[i] - 3.0f)); - } std::cout << "Max error: " << maxError << std::endl; // Free memory diff --git a/vecadd_gpu_256t_mb_prefetch.cu b/vecadd_gpu_256t_mb_prefetch.cu index 89f9aa8..ef6721c 100644 --- 
a/vecadd_gpu_256t_mb_prefetch.cu +++ b/vecadd_gpu_256t_mb_prefetch.cu @@ -1,15 +1,13 @@ #include -#include +#include #include -// CUDA kernel to add the elements of two arrays -__global__ -void add(int n, float *x, float *y) { - int index = blockIdx.x * blockDim.x + threadIdx.x; +// function to add the elements of two arrays +__global__ void add(int n, float *x, float *y) { + int index = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; - for (int i = index; i < n; i += stride) { + for (int i = index; i < n; i += stride) y[i] = x[i] + y[i]; - } } int main(void) { @@ -21,35 +19,35 @@ int main(void) { cudaMallocManaged(&x, N * sizeof(float)); cudaMallocManaged(&y, N * sizeof(float)); - // Initialize x and y arrays on the host + // initialize x and y arrays on the host for (int i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; } + // Prefetch memory to the GPU int deviceID = 0; - cudaMemPrefetchAsync(x, N * sizeof(float), deviceID); - cudaMemPrefetchAsync(y, N * sizeof(float), deviceID); + cudaMemPrefetchAsync((void *)x, N * sizeof(float), deviceID); + cudaMemPrefetchAsync((void *)y, N * sizeof(float), deviceID); // Number of threads per block - int blockSize = 256; + int threadsPerBlock = 256; // Number of blocks in the grid - int numBlocks = (N + blockSize - 1) / blockSize; + int numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock; - // Print the number of thread blocks - std::cout << "Number of thread blocks: " << numBlocks << std::endl; + // Print out the number of thread blocks + std::cout << "Number of thread blocks: " << numberOfBlocks << std::endl; - // Run kernel on the elements on the GPU - add<<>>(N, x, y); + // Run kernel on the elements on the GPU with multiple blocks and threads + add<<>>(N, x, y); // Wait for GPU to finish before accessing on host cudaDeviceSynchronize(); // Check for errors (all values should be 3.0f) float maxError = 0.0f; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) maxError = fmax(maxError, fabs(y[i] - 3.0f)); - } std::cout << "Max error: " << maxError << std::endl; // Free memory
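The MFLOP/s and bandwidth columns in performanceTable.md can be reproduced from the execution times alone. The sketch below is a hedged reconstruction, not part of the patch: the problem size is never printed, but 2,097,152 blocks of 256 threads and the ~2,147 MB per-array unified-memory transfers reported by nsys are consistent with N = 1 << 29 floats, one addition per element, and three 4-byte accesses (read x, read y, write y) per element.

```
#include <cstdint>
#include <iostream>

int main() {
    // Assumed problem size (not stated explicitly in the patch): N = 1 << 29 floats,
    // matching 2,097,152 blocks x 256 threads and the ~2,147 MB per-array transfers.
    const std::int64_t N = 1LL << 29;
    const double flop  = static_cast<double>(N);         // one addition per element
    const double bytes = 12.0 * static_cast<double>(N);  // read x, read y, write y: 3 x 4 bytes

    // Execution times in ms, copied from performanceTable.md.
    const double times_ms[] = {985.325, 50918.5392, 1689.7723, 1219.1305, 329.9085};
    const char  *labels[]   = {"CPU", "CUDA 1t 1b", "CUDA 256t 1b",
                               "CUDA 256t many b", "CUDA prefetch"};

    for (int i = 0; i < 5; ++i) {
        const double s      = times_ms[i] / 1e3;
        const double mflops = (flop / (1 << 20)) / s;  // table appears to count N / 2^20 = 512 "MFLOPs"
        const double gbps   = bytes / s / 1e9;
        std::cout << labels[i] << ": " << mflops << " MFLOP/s, " << gbps << " GB/s\n";
    }
    return 0;
}
```

Plugging the table's times into these formulas reproduces its MFLOP/s and GB/s values to within rounding, which supports reading the table as 512 "MFLOPs" of work and 12*N bytes of traffic per run.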