From e0750da7fd5f2d0fb799a0d0929f2386557918c3 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Sat, 27 Jan 2024 11:01:02 +0000 Subject: [PATCH] Add changes for 4b74402543d1d8aa7420a7e7087767c0dee8ff05 --- latest/design.html | 7 +++++++ latest/genindex.html | 2 ++ latest/objects.inv | Bin 7751 -> 7767 bytes latest/searchindex.js | 2 +- 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/latest/design.html b/latest/design.html index 04e116596..b807d1a73 100644 --- a/latest/design.html +++ b/latest/design.html @@ -148,6 +148,7 @@
  • compile_restrictions()
  • config_valid()
  • convert_constraint_restriction()
  • +
  • correct_open_cache()
  • cuda_error_check()
  • delete_temp_file()
  • detect_language()
  • @@ -1721,6 +1722,12 @@

    Util Functions +
    +kernel_tuner.util.correct_open_cache(cache, open_cache=True)
    +

    if cache file was not properly closed, pretend it was properly closed

    +
    +
    kernel_tuner.util.cuda_error_check(error)
    diff --git a/latest/genindex.html b/latest/genindex.html index 1d4f8a8b8..f03ebf4e5 100644 --- a/latest/genindex.html +++ b/latest/genindex.html @@ -274,6 +274,8 @@

    C

  • (kernel_tuner.core.DeviceInterface method)
  • +
  • correct_open_cache() (in module kernel_tuner.util) +
  • create_device_targets() (in module kernel_tuner)
  • create_kernel_instance() (kernel_tuner.core.DeviceInterface method) diff --git a/latest/objects.inv b/latest/objects.inv index 75adeb564af91c37229e2d1b3ec28e6caad8c765..c9b394140baa07377d6d6259724900a3b0ee2a0a 100644 GIT binary patch delta 3677 zcmV-j4x;hLJl8z1`38R(0Ngl|lD#P--MM5#Hy&cCiZfvz=gfOK8lWN#p;_`w+Tr{l z=d?uv%*=+S=pL8i5um@&mSM)TmxK4Pw@#bB91rpD6*q*(v92(82g6urFkpDdQUn5v zDjGr~BN@9#157Q|UZ;0NYj0z)D_U6lsT>x^>gR$M(kP_^*wcUX{WEiO&*WUdZMytS zLfuaN@Os{~W09!eosgcBO+zV0f*fbV$lof!Zdg@WnVJ=eA*Ry6T+XUZgCyFK@tw%W zC#-3%3Ubl(!Zk%HAHL_lEEoFRq24B*zW4z_++XbVJoA0=ApDnESSNg8Ax%gxe7DWd z9BHUA&DBp$aZi6UiTwNQvYe3$XTQjAy2jP|#i4qUv0$)=a~=Yu3toS|`B{PF9mP#H z*rHZ)bFO)}@`8iv75|>|?`{}PQitr>`}EmFWv$^nK2eo_iRA4QyL?&Impm}$*_DoS zHPg1lwa)pq&NX32Gd|4_7Gr%+hZap!jTUXSqIb6v3jTk%)D<~mT-`5qQ_(LzD>P`E z)-STy$NZ{n7B)jE=eMY@FD}_1v zDqj9MF|U5O$*&DcKisITWwO}Rs#4-F#2&CV7(F(WuMh zp)ZwkRBZ*>-z;XX%fg6rrF_p{6iIgjb~Ss;GBi1mS3(x*YS`z!lfP(QiHWCqy-v2C ztXKw7F;P3)W@=9w^uZgdV8Rc2$Z1dDHIB*GeZ00058Ki{Yje>^v0I3R@Lq;j{vVw0 z5g~tcFQ5Q^PxZk5f!Q7O!#4W}U?!H!=j=Z)J0nK;>ihtfVhyYO1LkQ#iCH!qz*Q_R zlYhWmEhsVT+3Z}|QjlEuoXzC~5;UrM*O}O<$VtTcW!|*%QB(#p3qgvVvK_~JoBWLC zy+`u$-p#u<@V5HfXjdPxGi(A^AI|$Sg*|_n0T)QHCe9>v zB2*D)&wj0YI=lABSOA=z>N3f6Hmax?@P%(eX zA-1thKI)}G!0zOTmj-@*@Cetj1$wjwj5;*hvv0Q^s{rL~+q@%%B{pn41j1UwmLiF4 zH6UXF5O$h2IZ0%w2^x(6GZmYb_&4H*D#Gl+q#y46{~<~!8$fA>`^W)^64(YvI$=HF zAW|MiIz`$LcB}$M8E6|kJ#ZfA0MLJd8GuOx+($@2l%O_%!+qzW6#yNe0f?~PeM|;K zDZmEyqgTvtYu>Xsz`w2en1RE0`(YoE@(|L%z}v6}ICBT3hj11g^j!}Put3s_p1o7Sz++b=*OCSNg_ux2~8p0AlaPK;*E?NUh zJwc!!d`KQn9c&>u==UAb2hxI?fk*(n2Lr;?;Fds>0pD?iAT6*N@WjA-2qH`^z!JG^ zB66ih;E9XKl^S7O!HI(Jct?;H+zep4;5kwfqJ*&_QPL27u4Nh+q~(A3)FJY)%@iO= zn-e4uu?K)AL4li%Od|phBgH7gZ9^mz0cW8?718!UQi_0MSfPq2djN@r`|wwQ4%~nQ z=|#lRvseIxo!}(HfA}p>4{i)1)d)Ct7pjP{2bysB5Bvq{A&h~i9RWuTLlpz;k*lC1 z7hVURY>ZrZ9mW-$diZ~iQ3h$j%>bqzo}-%~N*Egwr5@4eN2h^7T8>XWA`i7r0fMwS zLFy5E&~_3OxY@|mBk<5~j56FdMCuW6<~URlZ4V^%2srK>s)(`&kb1ZeSqJFA4M>oB zL>%#s1whydPCfjG%mek{#voFUfaC32lOs|QI1(bA4xjAo^2 zhX@66+8q!a&1$C~CKklZc8F&*MH^70d>HBd;l}Z7_XweaK7N-o%^&zV-BBLYc`SOc zWDrFHzfVm1VPalp%4yM2j1)b0wV#3O$muvvE~HomFYm+I)=^v!+95(dPW!`uvSS$S zb^Rh9Hiwh9BiI~t{UTmA`Z3;7WOP-pgpbPQMDQ3E7cEz%r)_@u;yvWAOwH;Qm0C~p z`xDJ0DC||e5&=~7!_gzj=*j^y0fhEvtw&PYs|QI1(App09!YDj9wZe&OFyYQl9a9- zAQM1HJ77MNkEZJv31D+Mtv-@}%0A<^{xGs_H~X?hPDS!RGhIS;w6A6sh4Kjh30 zPklXaN4Q52w1Y&*`iQbcM12+f+X+|tbNE#%IqE_iKE=(Z(i=G$XHS`bWjusvsyV@P zGlO=BkdMx!QGKoTus zgKdJ)SXI3e9xD8q7u^|(ma8(P?bx*QqU5OaHas*%Dc@k7`?6eU7T-A!w2lFZko6H= zI{MTn(3fT(g}Ud3nL)~bLreUiA|fUB`krMgF}#fWVPZaJ&-E~{BjZ_@Eo0Qkx?6SM zNnO8)kImc<+lyy9ILq)QX`(r*f4%B|l=@*}erEKE5`EvRYR+0u^IsU-=v!aWa#d<= zw|%uVBzON7WhW7)?ymOoVf^DLaE(KW){vZpVaHts#Domp6J2b7ulLQ z)~)a4J~QLgwdeK2!@_d0Q$4(P`(uN*N(PnPtX~gmSH&{-?1X9>Jmp{N`6-{D)?O6{ zSG$0(erDRAcG~|E+Ml?#B9SqPj7nreBI6QAO6=DFYCMJPI}Z3bEKU~JdxhU_KWAHE zMu#G6lolS_+P*t~8`%2K)suDgy%U7hJJg5Q`H9l6+o2)sN6cdOvAQ*Ch|)|iJvPU@ z66-P6T{OI>$;RYt?9L!B%Xsg~vSUMZ*fKqO4Re;;I2-vSWa*X4ubHg)o{7s@ks~<$ zOy$Mg`CQC0cQ){u%ijt44?>=oWuhz-WlBS5xo3szCR=5H_`UXNVZBl`KV|R6zRiZG z*NH`>bDy!>sp7_n`)jf_1HLI$B8l%!onXW*s&Rtcsd~%48&+%1ec$T9G54R=2<$f0 z8xOx5wWi~5O1;^zdsAyN{1)9cwcZvvoA#B<+aed^oZgI#gS;(*Xl!urjBt$K3YX%J z)Ev!!)7ZOzX#U6Ta5TD_-&xg7{zujs{LnHHHcN}=N^WnYNn(7`u}QdRI*>~3pA2Nn z{5Yl8LFk%`UuR+xt`j)I{TbSUI6(souPZz@?Qw{PMxxx}u|c47v=fQCqPG_v*`)v4 z2KvNfV-cRw)GXdV8Yq2xFKI9(JSa0z{%lYd9#WKlWo>|GloZ&##eP=Lc@>#yK9|>B zUgX0_vqzpt#f_)WQLutGyO)`Cc9G8NE-SK&v>27-Yr*b^i6x0TrLBfLbJYQ6zsf3I z{fdp;{E<+Jf?ibz+Fp{6+~3x(2h{GRZp%J*Ud+iO1?KO=Pl^t#BTJ|be%lx>f zlVcn$0Zo&U9Fu?OUOn{Qa|sl+TvfXwwdaFuQ#I-cymbeMnvW|a9h4< zTfs=|kUfV&wvc&i(WTisU(mkWltGrRE70l3H)5j(m0Q%l9_xMvDu8EOh6^z%AHHXS zx-Y<{Y+F=s*+f8DGyK-5VB8$QtHo^eEZ0{Sa~qfI8<&58shcA+jgo7x+`v~VKDSnk$*iZUU8C+ct6 v)*otiCgCL=>D`uisPf)4OxLnyF>ei;>yuM$&X=z2E|}B`iYxyEbLst3eNY5$ delta 3638 zcmV-64$1M?JjXn+`38TPc(|o1&V;#(Gw-5kfQmGPrnoa{hx3D+kQNCrlah}2qI+D5 zM}Yo9TQ1p67p=AIELsE7mu&S~$uPPEl%!7fsoK>4fIJ6_1nInRArF5^h3w~41O-u{dGi@jTCzAqkx|Evn@gfA=}3F(FJw#k(vjT5H1`l%`I zX&R7!pIz1>QsIB>7x_)sxLUtBR4+0X4EAu&Lx6O_>(4hoE0DaSxXA`v)Jks7HSbnl za8SMC-*f)m4WmiwkUe`DKAWhlHJryMs`4+9d3|D+FRS{JN3}dP(s8b4+LpN1IltDq zChTa&ry0Uxtnca2qB*9~qODf+?p8v7WFlz3tx+G8UJR2Rtj>K?0Ml23fD>r{bKC5Qt8~BcW!t)n?}$Ts;lpP&`jip zypX0J_)Jb^R;K}wI>e$0>v~o65BAyG?bL1eBk#g`nwG;Ydq6(Oe_RW)_q@Q=#?p06 zW2TXTzX5+u_@nri@oyaZrEhq@pX%YYyFZ|9)$6Ua78sPTeyDP#FehKdn>#1w)ekrM zwL$5J8@07e7MogCN<2-efF4t7)*2f)dBMBRgZzFkz7=Ez?`%^WEq$ai56E7G}vLwWoU9BuY@er)v(WdCx6kr60=70dYvgfS+NYFVwQEb zO|zaf=z}*@!Gs_5kkg*PYaElWn{aI(9=3IP*5;y-V)y?F;k^v6{69F~BSPp#J^}om z>Vbd#1G78mhwao6z)Y-l&)I)qc1DcwrS$Hgg%xdynMhy_+;` z;BED{(XKvXTh|1xKAiVu{&_M3E|6eNoJoJ`uCwW^Qc*F=NZaIk`h<2#M5rRpp8Z1g zbaw5Lu>d$b)isXkY*bM(%1LbF%lDJp#!)fKXxrE>0!htcfIvN*F{X8_=`1s$rvO3P zd|CRI(9{bkLKkP4y1hd>%T!d1GS;?jmx+|7ZJ-EUq+!Zd7YQv>P%+9Owy{h;>ZN}{ zz}Di3mj-@*@Cetje|fY9j5;*hv+tE2s{rL~+j%2}B{pn41j1Uwejtf#H6UXF5O$h& zC`n|f2^x(6GZmYb_;=BVD#Gl+q#y3h`yomw8$fA>`@8^%64(YvI$=GXAW|MiIz`$L zb|wNw8E6|kJ#e1J0MLOMfJp<~=Rbcyl%O_%!+qz86965c0f?~PeZ~buDZmEyqgTvt zYuv3m~ zCBT3hj11g^&On2!Lajms@!o&q*I;T$OCSNg_tZC-8p0AlaPK;2E?NUhJwc!!e8L@0 z9c&>u==Ytk2hxI?fk*(nr~1Lv;Fds>0pD4IAT6*N@WjA-f+0*Tz!JG^B66ih;Bkq_ zl^S7O!HI(JY(|h4+zep4;5iQxqJ*&_QPL27C}kQLq~-Y3A@Zcm6d->{n-e4uv8Q(? zL4li%Od|qM62&OPZ9^mz0SB2v718!UQi^~xRH2F}djN@r`{Y)D4%~nQ=|#l3u~-0v zo!}(He{wBQ4{i)1)d)B<7pjP{2bysBPwNHhA&h~i9RcSALlpz;k*lC17hVS*ON?B2 z9mW-$dic&j25G^~0H$ajo^zKWN*Egwr5@3TKc|5~T8>XWB2T1F0fMwSLFy5EN_G+y zxY@|mBk;s-j56FdMCuW6U^r9}Z4V^%2so=8s)(`&kb1aJNC)V^4M>oBM4Zo#1whyd zPCfi5!~^x<#voFUfV1h7+zl;4cyDtK;S`NzdA}Y;jG=h}iI9a7WdZkGcT2xaNjR%` zFg36x>k%MH%viwyI*|s5!zHAg+*TZ6f(Z_!6>Ek(+#-`p4q*XgldTSH0zsUU8xK4X zgB!ti9FbxbqImO@uEas6lV=Y$Bp!1cW`I>aNGgsNKRFjpi>n7o1<}%uU5sX>X@>{} zaoU~k8H>?kt2s#n5C<#PIV z42z4FtJ2dpzkKoD=vSs@^*~Ckr}_Qq;Sm(}s$PizDu4Px2q_tNMk_w=upNbtxN>>h$2_U2$<{rsM)Afr4u(_NJA4%n+=&27Np&u|G zNkmr;kO?7#4|QEIQPLrF`?;WnbxTKJJV{MCK*mF;?s)hV@4Vj&Me7qptM`y-`u>?k z;mEw$tya^?q>zMi+^(jy4kK_X;*MA?F!z6$=Gf-C(w{3?|k zb)gNP;$|-Cjhu`#mCQ08LNu+L;CYZiJ4DFGX(UvKz@KntfIvAfIhK~hI{{74T^+$7 zZt|E_JxIz+OH2cZj_q|tPkkVXmaxG#L1?V1UVjM>75>bN?hHlCRT& zFQa~#n2*_WJq+x4cGhLf7&Wr)R^4|}*DvB@Gxx*xl9vw7GJN@$XpZV%ulgUQewdh_ z8Gn7EMBn$SnzPo^{1?VH`qo#pT$NheZC@=7$=$#E*hz$`yQ{r?82>m5T;oupH6$lI z*l||@G5JCFL>C+E`ijim_EAG&obrG*;yby|%s6$Ocm43Nuw3j^53k++h~BM|L1j1V z*Mr(svCKU?4Vnf|`ImZr%IBxGSH&UGE`Q*wpPBZjo%X+k_9w2bNMuYRqY{~r$hgFj z5_|Jbji-=(#{nOQ#mVA&ukhRL=WHv?=ul*h(!yg~+jnOJTmQLwvaY^&g0Omr`tUkG zQTlZ|G=%+NSj;|Fw?++7n(3v-=9pJvJ;u6=hW9ktn4FE>8RTUd?_F7TY={n9rhiAT zVa{?JXCt44EWJ|sHIo(JGjTa9as;QJsl1q{o{JgX&IUen`8y&1LCEv6Oq6A!OljyW z_lI!ZWUCCn*FG(*SBmDR?A_S6+3@r_v50i;Gj=;w+!%3xO}1vhH>FA>@x7@NjJQQL zPLMlQZ`pUlYR$RtTOBy&{?i(P-G7F9=1% zTjXM#)0>fTkhet;jScRd5svX&;Zod@nxpw|8haPb|F|8FMpyGYtGdbm$U1``S|-9~ zMDbk7?Ts`^j88f?3HM9~QmOrufoz!{r_VYFU32m4OiXxn0!O$%Lpu;BXn$bgb%n>K zJr2>(NR(SVHVAZ%b|O(%^!B17oAh7XK%aPQEW$IIn#KD^1Ep{8B@L#82W1AzpAE{w zLyEGj4e*ST0=u`^&+0j^A~VhB^1925d>Co=$P=l!@$@+gR?uemGLz0O(plYQMRt)E zqmq0r*!?iEBvGfd)o^F7Ix@iQS6QX2U$K#!KN2cY(5vb|+e`A1``h~UfZDzE&2sbr zd&rB}_2si2jNWMDNJ`qFlZF&90_>-gxD@RII|Gxa6(9+0R)f7f_SxEzlh75w0s||P zk`{~+n>WjAA-?RIpajO8kH&TPHIpwFFaa5pU>CX>hI(s}vjq#tFDuB}dKM}OWqB$d zX!tI7dy_L5Py+5|lZ6(9y85uhced!?v+o2JKTl#2R}6G*e~8`Oji{<(ZC4I%X$VdjZ;h5f>xdDY_0Cwj*=Z({b$$itROx z6O|op>RR?HSIhiprIT|UEdc|Qp&XNcRNi`4{qYhgYPqU*V}^U9Z>hERm2ETC{a$ZI zCGh5u=8cHVZQ!Eao;Y*EcSI0aa~{ zUaM@}>t9q_B*$FPn$5R{HJaNDe3hkjai%V9 zo5ufE)zGt~OB2EK?^Q`}vi8;rvrRe9FQJ4x#CU5CFq8}7xk7UD0_{RwayGR$d}-lC zim}|S`4nYB{!Y~2vaLVV>`cNXJJP!?@lfTxX_&5M%VORdG}kAm+MF+4* diff --git a/latest/searchindex.js b/latest/searchindex.js index 80c447d54..6d63fb11d 100644 --- a/latest/searchindex.js +++ b/latest/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["backends", "cache_files", "contents", "contributing", "convolution", "correctness", "design", "diffusion", "diffusion_opencl", "diffusion_use_optparam", "examples", "grid3d", "hostcode", "index", "install", "matrix_multiplication", "metrics", "observers", "optimization", "quickstart", "structs", "templates", "user-api", "vocabulary"], "filenames": ["backends.rst", "cache_files.rst", "contents.rst", "contributing.rst", "convolution.ipynb", "correctness.rst", "design.rst", "diffusion.ipynb", "diffusion_opencl.ipynb", "diffusion_use_optparam.ipynb", "examples.rst", "grid3d.ipynb", "hostcode.rst", "index.rst", "install.rst", "matrix_multiplication.ipynb", "metrics.rst", "observers.rst", "optimization.rst", "quickstart.rst", "structs.rst", "templates.rst", "user-api.rst", "vocabulary.rst"], "titles": ["Backends", "Cache files", "The Kernel Tuner documentation", "Contribution guide", "Convolution", "Correctness Verification", "Design documentation", "Diffusion", "Tutorial: From physics to tuned GPU kernels", "Tutorial: From physics to tuned GPU kernels", "Kernel Tuner Examples", "3D Grid on GPU with Kernel Tuner", "Tuning Host Code", "The Kernel Tuner documentation", "Installation", "Matrix multiplication", "Metrics and Objectives", "Observers", "Optimization strategies", "Getting Started", "Using structs", "Templated kernels", "API Documentation", "Parameter Vocabulary"], "terms": {"kernel": [0, 1, 3, 4, 5, 6, 12, 14, 16, 17, 18, 19, 20, 22, 23], "tuner": [0, 1, 3, 4, 5, 6, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "implement": [0, 5, 6, 10, 11, 16, 17, 18, 22], "multipl": [0, 2, 6, 12, 17, 21, 22], "one": [0, 3, 4, 6, 7, 8, 9, 11, 14, 15, 17, 18, 22], "opencl": [0, 3, 4, 7, 8, 9, 10, 12, 13, 15, 22], "hip": [0, 3, 13, 22], "gener": [0, 3, 4, 6, 7, 8, 9, 13, 15, 17, 18, 20, 22, 23], "select": [0, 3, 4, 6, 7, 8, 9, 11, 14, 15, 17, 18, 22], "i": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "most": [0, 3, 6, 7, 8, 9, 10, 12, 13, 15, 17, 18, 19, 20, 22], "case": [0, 3, 4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 19, 20, 22], "automat": [0, 3, 4, 7, 8, 9, 11, 12, 15, 21, 22], "done": [0, 4, 14, 16, 17], "base": [0, 3, 6, 16, 17, 21, 22], "": [0, 3, 4, 6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 19, 20, 21, 22], "program": [0, 3, 5, 7, 8, 9, 12, 15, 20, 21], "languag": [0, 6, 9, 12, 15, 20, 22], "sometim": [0, 3, 7, 8, 9, 20], "you": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23], "ll": [0, 4, 7, 8, 9, 14, 15], "want": [0, 5, 9, 11, 12, 14, 15, 17, 19, 22, 23], "specif": [0, 4, 6, 7, 8, 9, 11, 16, 17, 18, 22], "choos": [0, 7, 8, 9, 15, 18, 22], "pycuda": [0, 3, 7, 9, 11, 12, 17, 21], "default": [0, 3, 4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 18, 21, 22], "It": [0, 3, 4, 6, 7, 8, 9, 12, 14, 15, 17, 21, 22], "compar": [0, 4, 5, 7, 8, 9, 11, 15, 16, 17], "complet": [0, 1, 4], "cupi": [0, 3, 12, 14, 17, 21, 22], "becaus": [0, 4, 5, 7, 8, 9, 12, 14, 15, 16, 21, 23], "ident": 0, "includ": [0, 3, 4, 5, 7, 8, 9, 11, 12, 14, 15, 17, 21, 22], "here": [0, 4, 11, 12, 14, 15, 17, 22], "well": [0, 7, 8, 9, 11, 15, 17, 22], "To": [0, 3, 5, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22], "us": [0, 1, 2, 3, 4, 5, 6, 10, 12, 13, 14, 16, 17, 18, 19, 21, 22, 23], "nvidia": [0, 3, 6, 14, 15, 17, 21], "gpu": [0, 3, 4, 5, 6, 10, 12, 13, 15, 17, 19, 20, 22, 23], "see": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 19, 21, 22], "http": [0, 3, 13, 14, 17], "github": [0, 3, 4, 7, 8, 9, 11, 14, 15], "com": [0, 3, 13, 14], "jatinx": [0, 14], "nv": 0, "while": [0, 1, 4, 6, 7, 8, 9, 10, 15, 17, 18], "expect": [0, 3, 4, 5, 6, 7, 8, 9, 15, 17, 22], "all": [0, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 22], "input": [0, 4, 5, 7, 8, 9, 10, 12, 15, 16, 19, 20, 22], "output": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 19, 22, 23], "numpi": [0, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 19, 20, 21, 22], "arrai": [0, 4, 5, 6, 7, 8, 9, 11, 12, 19, 20, 22], "also": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "argument": [0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 20, 21, 22], "thi": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "give": [0, 7, 8, 9, 18], "user": [0, 3, 4, 5, 6, 8, 10, 14, 15, 16, 17, 18, 21, 22], "more": [0, 3, 5, 6, 7, 8, 9, 13, 14, 15, 16, 17, 19, 21, 22], "control": [0, 7, 8, 9, 17, 18, 22], "over": [0, 6, 7, 8, 9, 14, 15, 17, 18], "how": [0, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 19, 20, 21, 22], "memori": [0, 4, 6, 10, 12, 17, 20, 22, 23], "handl": [0, 12, 22], "check": [0, 3, 5, 6, 7, 8, 9, 12, 15], "dure": [0, 1, 6, 7, 8, 9, 11, 17, 22], "verif": [0, 2, 10, 22], "can": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "happen": [0, 1, 3, 4, 15, 19], "entir": [0, 6, 7, 8, 9, 15, 18, 22], "when": [0, 1, 3, 4, 6, 7, 8, 9, 12, 14, 15, 16, 17, 18, 20, 21, 22, 23], "onli": [0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 18, 20, 22], "textur": [0, 6, 22], "c": [0, 3, 4, 6, 10, 12, 13, 14, 15, 19, 21, 22], "signatur": [0, 4, 6], "With": [0, 11, 12], "other": [0, 1, 3, 4, 6, 7, 8, 9, 12, 15, 16, 17, 18, 22, 23], "requir": [0, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 21], "ha": [0, 3, 4, 6, 7, 8, 9, 12, 15, 17, 18, 22], "extern": [0, 17, 21], "linkag": [0, 21], "If": [0, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 20, 22], "code": [0, 2, 4, 6, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "wrap": [0, 6, 19, 21, 22], "an": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "block": [0, 4, 6, 7, 8, 9, 10, 11, 14, 15, 16, 19, 22, 23], "which": [0, 3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23], "mai": [0, 3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 16, 17, 18, 19, 20, 22], "caus": [0, 7, 8, 9], "issu": [0, 20], "contain": [0, 1, 4, 6, 7, 8, 9, 11, 12, 15, 17, 18, 21, 22], "cannot": [0, 3, 7, 8, 9, 17], "have": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19, 21, 22, 23], "present": [0, 3, 15], "header": [0, 22], "file": [0, 2, 3, 4, 6, 7, 8, 10, 12, 15, 18, 19, 21, 22], "As": [0, 1, 4, 7, 8, 9, 11, 14, 15, 17], "detail": [0, 6, 14, 22], "further": [0, 7, 8, 9, 14, 15], "templat": [0, 2, 11], "ar": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "fulli": [0, 3, 14], "limit": [0, 4, 6, 7, 8, 9, 10, 15, 17, 18, 21, 22, 23], "python": [0, 3, 4, 6, 10, 11, 12, 15, 17, 19, 20, 21, 22], "benchmark": [0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 22, 23], "observ": [0, 2, 6, 16, 22, 23], "constant": [0, 4, 6, 7, 8, 9, 10, 12, 15, 18, 22], "dynam": [0, 6, 22], "share": [0, 4, 6, 22], "anoth": [0, 7, 8, 9, 12, 15, 16, 18, 22], "import": [0, 4, 5, 7, 8, 9, 11, 14, 15, 16, 19, 20, 21], "differ": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 22], "between": [0, 7, 8, 9, 12, 14, 15, 16, 18, 22], "The": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22], "tabl": 0, "below": [0, 3, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20], "list": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22], "packag": [0, 3], "pyhip": [0, 6], "interfac": [0, 4, 5, 12, 14, 17, 18, 20, 22], "lang": [0, 6, 10, 12, 21, 22], "nvcuda": 0, "nvcc": [0, 6], "nvrtc": [0, 21], "hiprtc": 0, "A": [1, 3, 4, 6, 13, 14, 15, 17, 18, 22], "veri": [1, 5, 7, 8, 9, 12, 14, 15, 17, 20, 21], "featur": [1, 4, 5, 10, 14, 16, 17, 19, 21, 22], "abil": 1, "store": [1, 3, 4, 6, 9, 15, 17, 19, 22], "result": [1, 3, 4, 5, 6, 9, 11, 15, 16, 17, 18, 19, 22, 23], "tune": [1, 2, 5, 6, 10, 13, 14, 18, 19, 21, 22, 23], "enabl": [1, 17, 18, 20, 21], "pass": [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 21, 22], "ani": [1, 3, 4, 6, 7, 8, 9, 12, 15, 16, 17, 18, 20, 21, 22, 23], "filenam": [1, 4, 6, 10, 15, 19, 22], "option": [1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 18, 21, 22, 23], "tune_kernel": [1, 4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 16, 18, 19, 20, 21, 22], "individu": [1, 17, 18], "configur": [1, 4, 6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 22], "append": [1, 6, 14, 22], "run": [1, 4, 5, 6, 7, 8, 11, 12, 14, 15, 17, 18, 22], "allow": [1, 3, 4, 5, 6, 7, 8, 9, 15, 16, 17, 18, 21, 22], "restart": [1, 3, 7, 8, 9, 18], "session": [1, 3, 6, 18], "from": [1, 3, 4, 5, 6, 7, 10, 11, 12, 14, 15, 17, 18, 20, 21, 22], "exist": [1, 6, 22], "should": [1, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 19, 22], "someth": [1, 4, 7, 8, 9, 15], "termin": [1, 14], "previou": [1, 3, 7, 8, 9, 18, 22], "befor": [1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, 22], "had": [1, 4], "quit": [1, 7, 8, 9, 11, 15, 21], "often": [1, 7, 8, 9, 17], "hpc": 1, "environ": [1, 4, 6, 14, 18, 22], "job": 1, "reserv": [1, 8, 23], "out": [1, 3, 4, 5, 11, 14, 15], "number": [1, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19, 20, 22, 23], "simul": [1, 6, 9, 13, 18, 20, 22], "visual": [1, 3, 15], "optim": [1, 2, 4, 5, 6, 7, 8, 9, 12, 13, 15, 16, 17, 22], "strategi": [1, 2, 4, 16, 22], "start": [1, 2, 4, 5, 6, 7, 8, 9, 12, 14, 15, 17, 18, 22], "call": [1, 4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 18, 19, 20, 21, 22], "full": [1, 3, 6, 17, 19], "search": [1, 4, 6, 10, 13, 15, 16, 18, 22], "space": [1, 3, 4, 5, 6, 11, 12, 15, 16, 18, 22], "true": [1, 4, 5, 6, 7, 8, 9, 12, 15, 17, 18, 22], "creat": [1, 3, 4, 6, 7, 8, 9, 11, 15, 17, 19, 20, 22], "even": [1, 3, 7, 8, 9, 12, 15, 18], "work": [1, 3, 4, 6, 7, 8, 9, 14, 16, 18, 21, 22], "still": [1, 3, 5, 15], "new": [1, 3, 6, 7, 8, 9, 18, 22], "come": [1, 6, 7, 8, 9, 15, 17, 21], "thei": [1, 3, 6, 7, 8, 9, 10, 15, 16], "stream": [1, 6, 7, 8, 9], "pleas": [1, 3, 4, 13, 14, 17, 19, 20, 22], "dashboard": [1, 13], "introduct": 2, "instal": [2, 3, 4, 7, 8, 9, 11, 12, 15, 17, 19], "get": [2, 4, 6, 7, 8, 9, 11, 14, 15], "convolut": [2, 5, 12, 15], "diffus": 2, "matrix": 2, "exampl": [2, 3, 5, 6, 7, 8, 9, 12, 14, 15, 16, 17, 18, 19, 20, 22], "backend": [2, 3, 12, 17], "cach": [2, 3, 6, 7, 8, 9, 14, 15, 18, 22], "correct": [2, 3, 12, 20, 22], "host": [2, 3, 6, 8, 9, 10, 17, 20, 21, 22], "struct": 2, "metric": [2, 4, 6, 10, 15, 22], "object": [2, 4, 5, 6, 7, 8, 9, 18, 22], "api": [2, 4, 6], "paramet": [2, 5, 6, 7, 8, 10, 12, 15, 16, 18, 19, 20, 21, 22], "vocabulari": [2, 17, 19], "design": [2, 3, 7, 8, 9, 17], "contribut": 2, "thank": 3, "consid": [3, 11, 13, 15, 22], "Not": [3, 6], "help": [3, 21], "u": [3, 4, 7, 8, 9], "improv": [3, 6, 7, 8, 9, 15, 18, 22], "about": [3, 4, 6, 7, 8, 9, 13, 15, 17, 18, 19, 22], "problem": [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 22], "ensur": [3, 5, 7, 8, 9, 12, 14, 17, 20], "follow": [3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 21, 22], "describ": [3, 4, 6, 12, 17, 20], "what": [3, 4, 5, 6, 7, 8, 9, 12, 15, 17, 19, 20, 21, 22, 23], "possibl": [3, 4, 5, 7, 8, 9, 11, 12, 15, 17, 18, 19, 20, 22], "minim": [3, 16, 21, 22], "reproduc": 3, "actual": [3, 4, 5, 6, 7, 8, 9, 11, 15, 21], "error": [3, 4, 5, 6, 12, 15, 21], "print": [3, 4, 6, 7, 8, 9, 11, 15, 22], "version": [3, 4, 15, 17, 22], "cuda": [3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 17, 19, 20, 21, 22], "compil": [3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 21, 22, 23], "applic": [3, 4, 7, 8, 9, 10, 11, 12, 13, 16, 17, 20, 21, 22], "For": [3, 4, 5, 6, 7, 8, 9, 11, 14, 17, 19, 20, 22], "propos": 3, "chang": [3, 11, 17, 22], "addit": [3, 4, 7, 8, 9, 14, 16, 19], "signific": 3, "first": [3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 18, 20, 21, 22], "discuss": [3, 6], "Then": [3, 7, 8, 9, 11, 13, 14, 21], "fork": 3, "repositori": [3, 4, 7, 8, 9, 11, 13, 14, 15], "branch": 3, "per": [3, 4, 7, 8, 9, 11, 16, 17, 22], "pull": 3, "request": [3, 17, 22], "googl": 3, "style": 3, "sphinxdoc": 3, "docstr": [3, 6], "modul": [3, 6, 12, 17], "public": [3, 13], "function": [3, 4, 5, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 20, 21, 22], "up": [3, 4, 6, 7, 8, 9, 14, 15, 19, 22], "date": 3, "written": [3, 21], "unit": [3, 6], "your": [3, 4, 7, 8, 9, 11, 12, 13, 14, 17, 20, 22], "nox": 3, "do": [3, 4, 6, 7, 8, 9, 11, 12, 15, 22], "hardwar": [3, 7, 8, 9, 11, 17, 18, 19], "skip": [3, 4, 7, 8, 9, 22], "produc": [3, 5], "same": [3, 4, 5, 6, 7, 8, 9, 11, 12, 17, 19, 22], "better": [3, 7, 8, 9], "entri": [3, 6, 7, 8], "changelog": 3, "md": 3, "match": [3, 4, 5, 6], "roadmap": 3, "updat": [3, 6], "remov": [3, 18], "doubt": 3, "where": [3, 4, 5, 6, 7, 8, 9, 15, 16, 17, 20, 21, 22], "put": [3, 6, 7, 8, 9], "look": [3, 4, 6, 7, 8, 9, 11, 14, 15, 21], "regard": [3, 6, 18], "step": [3, 7, 8, 9, 14, 15, 16, 18, 21], "set": [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17, 18, 19, 21, 22, 23], "sudo": [3, 14], "access": [3, 4, 7, 8, 9, 11, 17, 20], "e": [3, 14, 16, 17, 18, 22], "g": [3, 14, 16, 17], "devic": [3, 4, 5, 7, 8, 9, 10, 12, 17, 21, 22], "clone": [3, 4, 7, 8, 9, 11, 14, 15], "git": [3, 17], "desir": 3, "locat": [3, 5, 11, 17], "kerneltun": [3, 13], "kernel_tun": [3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 17, 19, 20, 21, 22, 23], "cd": [3, 14], "prepar": [3, 6, 7, 8, 9], "system": [3, 13, 14, 17], "On": [3, 7, 8, 9, 22], "ubuntu": 3, "apt": 3, "upgrad": 3, "y": [3, 4, 6, 7, 8, 9, 11, 12, 15, 22], "make": [3, 4, 7, 8, 9, 11, 13, 14, 15, 17, 20, 21], "essenti": [3, 4], "libssl": 3, "dev": [3, 14, 17], "zlib1g": 3, "libbz2": 3, "libreadlin": 3, "libsqlite3": 3, "wget": [3, 14], "curl": [3, 14], "llvm": 3, "libncurses5": 3, "libncursesw5": 3, "xz": 3, "util": [3, 15], "tk": 3, "libffi": 3, "liblzma": 3, "openssl": 3, "pyenv": 3, "linux": [3, 14], "bash": [3, 14], "rememb": [3, 4, 7, 8, 9, 15], "add": [3, 4, 6, 7, 8, 9, 12, 15, 17, 18], "bash_profil": 3, "bashrc": 3, "specifi": [3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23], "maco": 3, "brew": 3, "after": [3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 17, 22], "shell": 3, "some": [3, 4, 6, 7, 8, 9, 14, 15, 16, 17, 18, 19, 20, 21, 22], "need": [3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 20, 21, 22], "libgdbm": 3, "libnss3": 3, "lzma": 3, "3": [3, 5, 7, 8, 9, 11, 12, 14, 15, 18, 22], "8": [3, 4, 6, 7, 8, 9, 11, 14, 15, 17], "9": [3, 4, 5, 7, 8, 9, 12], "10": [3, 7, 8, 9, 13, 18], "11": [3, 7, 8, 9], "virtual": [3, 14], "folder": 3, "virtualenv": 3, "whatev": [3, 6, 12, 18], "name": [3, 4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 18, 19, 22, 23], "prefer": [3, 4, 6, 7, 9, 17, 22], "so": [3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, 19, 21, 22], "found": [3, 4, 6, 13, 17, 18], "replac": [3, 4, 5, 6, 7, 8, 9, 11, 15, 22], "global": [3, 6, 7, 8, 9, 18], "poetri": [3, 14], "ssl": [3, 14], "org": [3, 13, 14], "python3": [3, 14], "sure": [3, 4, 7, 8, 9, 13, 14, 15], "path": [3, 4, 17], "instruct": [3, 7, 8, 9, 10, 14, 15], "end": [3, 4, 6, 7, 8, 9, 11, 15, 17, 18, 20], "non": [3, 5], "depend": [3, 4, 5, 9, 10, 11, 13, 16, 22], "re": [3, 4, 7, 8, 9, 11, 15], "open": [3, 5, 7, 8, 12, 15], "take": [3, 4, 6, 7, 8, 9, 11, 15, 17, 18, 19, 21, 22], "effect": [3, 4, 7, 8, 9, 22], "activ": 3, "project": [3, 14], "extra": [3, 14, 21], "doc": [3, 4, 7, 8, 9, 11, 14, 15], "leav": 3, "doe": [3, 5, 6, 7, 8, 9, 11, 12, 15, 17, 21, 22], "appli": [3, 7, 8, 9], "go": [3, 4, 7, 8, 9, 11, 13, 14, 15, 19], "necessari": [3, 5, 6, 7, 8, 9, 22], "conveni": [3, 7, 8, 9, 12, 22], "cuda11x": 3, "cuda12x": 3, "These": [3, 7, 8, 9, 11, 14, 15, 17, 21, 22], "current": [3, 4, 5, 6, 7, 8, 9, 14, 15, 17, 18, 22], "defin": [3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 16, 17, 21, 22], "part": [3, 7, 8, 9, 13, 14, 15, 16, 20, 22], "forget": [3, 11], "correctli": [3, 15], "ld_libary_path": 3, "cpath": 3, "pytest": 3, "except": [3, 6, 10], "been": [3, 4, 6, 7, 8, 9, 12, 15, 18], "left": [3, 6, 7, 8, 9, 11, 16], "gracefulli": 3, "note": [3, 4, 6, 7, 8, 9, 11, 14, 15, 17, 20, 22], "driver": [3, 6, 7, 9, 11], "privileg": [3, 17], "read": [3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 22], "counter": [3, 17], "energi": [3, 13, 17, 18, 23], "measur": [3, 6, 7, 8, 9, 11, 12, 15, 16, 17, 22, 23], "cat": 3, "proc": 3, "param": [3, 4, 5, 6, 17, 18, 22], "grep": 3, "rmprofilingadminonli": 3, "1": [3, 4, 5, 7, 8, 9, 11, 12, 15, 17, 18, 22], "without": [3, 7, 8, 9, 11, 12, 17, 18], "conda": 3, "mamba": 3, "perform": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 22], "miniconda": [3, 14], "tradit": 3, "under": [3, 4, 13, 22], "quota": 3, "otherwis": [3, 6, 15, 22], "restrict": [3, 6, 10, 15, 21, 22], "disk": 3, "directori": [3, 4, 7, 8, 9, 11, 14, 15], "save": [3, 7, 8], "ad": [3, 7, 8, 9, 12, 22], "condarc": 3, "envs_dir": 3, "both": [3, 7, 8, 9, 10, 15], "via": [3, 18], "usual": [3, 17], "provid": [3, 5, 6, 7, 8, 9, 12, 21, 22], "exit": 3, "enter": [3, 4, 7, 8, 9, 11, 15], "avail": [3, 4, 7, 8, 9, 10, 11, 14, 17], "continu": [3, 4, 6, 7, 8, 9, 14, 17, 18, 22], "n": [3, 5, 7, 8, 9, 11, 12, 13, 15, 18, 19, 21], "forg": 3, "execut": [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 18, 22], "config": [3, 6], "auto_activate_bas": 3, "fals": [3, 6, 17, 18, 22], "load": [3, 6], "unload": [3, 6], "rocm": [3, 14, 17], "inform": [3, 4, 6, 7, 8, 9, 13, 17, 18, 19, 22, 23], "like": [3, 4, 6, 7, 8, 9, 10, 11, 15, 18, 19, 20, 21, 22], "keyr": 3, "seemingli": 3, "weird": 3, "known": [3, 15], "pip": [3, 4, 7, 8, 13, 14, 15], "m": [3, 7, 8, 9, 11], "disabl": 3, "verifi": [3, 5, 6, 10, 22], "miss": [3, 6, 22], "sync": [3, 20], "dry": 3, "node": [3, 18], "In": [3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 20, 22, 23], "noxset": 3, "toml": 3, "venvbackend": 3, "2": [3, 4, 5, 7, 8, 9, 10, 11, 12, 15, 17, 18, 22], "anaconda": 3, "venv": 3, "alreadi": [3, 4, 6, 7, 8, 9, 14, 15, 22], "Be": [3, 7, 8, 9], "adjust": [3, 4], "envdir": 3, "particularli": [3, 4, 16], "diskquota": 3, "against": [3, 5, 6], "support": [3, 4, 6, 7, 8, 9, 12, 14, 17, 18, 21, 22, 23], "isol": [3, 21], "top": [3, 6, 11, 17, 22], "level": [3, 6, 17], "coverag": 3, "gigabyt": 3, "size": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 18, 19, 21, 22], "tight": 3, "diskspac": 3, "small": [3, 4, 7, 8, 9, 15], "each": [3, 4, 5, 6, 7, 8, 11, 15, 17, 18, 22], "ran": 3, "longer": [3, 4, 6, 16], "would": [3, 4, 7, 8, 9, 21], "command": [3, 14], "line": [3, 4, 7, 8, 9], "combin": [3, 4, 6, 7, 8, 9, 10, 11, 15, 17, 18, 19, 22], "compat": [3, 6, 14], "involv": 3, "especi": 3, "don": [3, 6, 7, 9, 11, 12, 22], "t": [3, 4, 6, 7, 8, 9, 11, 12, 14, 18, 21, 22], "break": [3, 21], "them": [3, 4, 9, 11, 12, 15], "capabl": [3, 6, 7, 8, 15, 22], "hold": [3, 7, 8, 15, 19, 20, 22], "pyopencl": [3, 6, 8, 17], "invok": 3, "tab": 3, "studio": 3, "id": [3, 6, 17], "seen": [3, 4, 6, 15], "integr": [3, 21], "type": [3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19, 20, 21, 22], "html": 3, "page": [3, 4, 7, 8, 9, 10, 11, 13, 15, 16], "sourc": [3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 21, 22], "inspect": [3, 6, 17], "commit": 3, "brows": 3, "through": [3, 6, 7, 8, 9, 11, 13, 16, 17, 18, 22], "least": [3, 6], "those": [3, 4, 10, 14, 17], "pandoc": 3, "mac": 3, "onlin": 3, "built": [3, 17, 18, 20, 22], "action": 3, "correspond": [3, 4, 7, 8, 9, 11, 17, 18, 19], "master": 3, "latest": [3, 14], "last": [3, 6, 20], "releas": [3, 6], "stabl": 3, "publish": [3, 13], "point": [3, 4, 6, 7, 8, 9, 11, 12, 15, 16, 17, 19, 22], "process": [3, 4, 6, 7, 8, 9, 15, 16, 17, 18, 21], "again": [3, 4, 7, 8, 9, 11, 15], "autom": 3, "guid": [4, 7, 15, 16, 19], "meant": 4, "write": [4, 10, 11, 15, 21, 22], "script": [4, 6, 15, 20, 21], "we": [4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 19, 20, 21], "simpl": [4, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 20], "find": [4, 12, 15, 18, 22], "shortli": 4, "much": [4, 7, 8, 9, 11, 17, 21, 22], "reus": [4, 7, 8, 9, 15], "document": [4, 5, 7, 8, 9, 11, 14, 15, 20, 23], "jupyt": [4, 7, 8, 9, 11, 14, 15], "notebook": [4, 7, 8, 9, 11, 14, 15], "just": [4, 5, 6, 7, 8, 9, 11, 12, 14, 15], "tutori": [4, 7, 11, 13, 14, 15], "readi": [4, 6, 7, 8, 9, 11, 15], "oper": [4, 7, 8, 9, 11, 12, 15, 16], "signal": [4, 23], "imag": [4, 7, 8, 9], "main": [4, 6, 11, 17, 19], "neural": 4, "network": 4, "deep": 4, "learn": 4, "comput": [4, 5, 6, 10, 11, 12, 13, 15, 18, 22], "linear": [4, 15, 22], "weight": [4, 18], "filter": [4, 5, 10, 12], "rang": [4, 5, 7, 8, 9, 11, 12, 21], "pixel": 4, "w": [4, 7, 8, 16, 18], "time": [4, 6, 7, 8, 9, 11, 12, 15, 16, 17, 18, 21, 22, 23], "h": [4, 11, 22], "f": [4, 5, 11, 12, 20], "f_w": 4, "f_h": 4, "o": [4, 6], "begin": [4, 7, 8, 9, 11], "equat": [4, 7, 8, 9, 11, 18], "nonumb": [4, 11], "x": [4, 5, 6, 7, 8, 9, 11, 13, 15, 19, 21, 22], "sum": [4, 5, 6, 15], "limits_": 4, "j": [4, 7, 8, 9, 13, 15], "0": [4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 18, 20, 22], "naiv": [4, 5, 7, 8, 9], "parallel": [4, 7, 8, 9], "thread": [4, 6, 7, 8, 9, 10, 11, 16, 17, 19, 22, 23], "avoid": [4, 15, 23], "confus": 4, "around": [4, 10], "term": 4, "refer": [4, 5, 6, 7, 8, 9, 10, 12, 14, 17, 22], "shown": [4, 6, 17], "press": [4, 7, 8, 9, 11, 15], "shift": [4, 7, 8, 9, 11, 15], "writefil": [4, 15], "convolution_na": [4, 5], "cu": [4, 5, 12, 15, 19, 21], "__global__": [4, 7, 9, 11, 13, 15, 19, 21], "void": [4, 7, 8, 9, 11, 13, 15, 19, 20, 21], "convolution_kernel": [4, 5], "float": [4, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22], "int": [4, 6, 7, 8, 9, 11, 13, 15, 19, 21, 22], "blockidx": [4, 7, 8, 9, 11, 13, 15, 19, 21], "blockdim": [4, 19, 22], "threadidx": [4, 7, 8, 9, 11, 13, 15, 19, 21], "image_height": 4, "image_width": 4, "filter_height": 4, "filter_width": 4, "input_width": 4, "run_kernel": [4, 5, 6, 10, 22], "our": [4, 7, 8, 9, 11, 15, 19, 20], "But": [4, 7, 8, 9, 11, 19], "data": [4, 6, 7, 8, 9, 11, 12, 15, 16, 17, 19, 20, 22], "np": [4, 6, 11, 15, 19, 20], "filter_s": 4, "17": [4, 5, 7, 8, 9, 12], "output_s": 4, "4096": [4, 5, 7, 8, 9, 12, 15], "prod": [4, 5, 12], "border_s": 4, "input_s": [4, 5, 12], "output_imag": 4, "zero": [4, 5, 11, 12, 15], "astyp": [4, 5, 7, 8, 9, 11, 12, 13, 15, 19, 21], "float32": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 19, 21, 22], "input_imag": 4, "random": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 18, 19, 21, 22], "randn": [4, 5, 12, 13, 15, 19, 21], "conv_filt": 4, "now": [4, 6, 7, 8, 9, 11, 12, 15, 19], "structur": [4, 6, 7, 8, 15, 19], "kernel_nam": [4, 6, 12, 20, 21, 22], "kernel_sourc": [4, 6, 20, 22], "problem_s": [4, 5, 6, 7, 8, 9, 11, 12, 15, 19, 20, 22, 23], "ellipsi": 4, "indic": [4, 18, 23], "mani": [4, 6, 7, 8, 9, 15, 16, 17, 18, 22], "won": 4, "right": [4, 7, 8, 9, 11, 14], "interest": [4, 20], "five": [4, 6, 19], "string": [4, 6, 7, 8, 9, 10, 15, 16, 17, 19, 20, 22], "domain": [4, 7, 8, 9, 10, 11, 22], "three": [4, 5, 15], "dimens": [4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 18, 19, 22, 23], "dictionari": [4, 6, 7, 8, 9, 11, 15, 17, 18, 19, 22], "simpli": [4, 5, 6, 7, 8, 9, 11, 18, 19, 22], "cell": [4, 7, 8, 9, 11, 15], "wrote": 4, "determin": [4, 7, 8, 9, 11, 17, 18], "grid": [4, 6, 7, 8, 9, 10, 12, 15, 22, 23], "abov": [4, 6, 7, 8, 9, 11, 14, 15, 19, 20], "divid": [4, 7, 8, 9, 11, 12, 15, 22], "divisor": [4, 6, 7, 8, 9, 15, 22], "scalar": [4, 7, 8, 9, 11, 22], "therefor": [4, 5, 7, 8, 9, 11, 12, 15], "exactli": [4, 6, 7, 8, 9, 15, 17], "order": [4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 18, 19, 22], "32": [4, 6, 7, 8, 9, 11, 13, 15, 19, 22], "bit": [4, 6, 7, 8, 9, 11, 12, 15], "final": [4, 5, 7, 8, 9, 11], "anyth": 4, "insert": [4, 5, 6, 9, 11, 12, 15, 19, 21, 22, 23], "preprocessor": [4, 6, 22], "statement": [4, 9, 11, 15, 21], "valu": [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 22], "were": [4, 7, 8, 9, 11, 15, 22], "i_like_convolut": 4, "42": 4, "definit": [4, 11, 22], "unless": 4, "cours": [4, 7, 8, 9, 14, 15], "somewher": 4, "token": 4, "freeli": 4, "few": [4, 7, 8, 9, 11, 12, 21], "special": [4, 7, 8, 9, 17, 19, 23], "notic": [4, 7, 8, 9], "haven": [4, 14], "yet": [4, 6, 11, 12, 19], "basic": [4, 6, 7, 8, 9, 19], "block_size_x": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 19, 21, 22], "block_size_i": [4, 5, 7, 8, 9, 11, 12, 15, 22], "block_size_z": [4, 7, 8, 9, 11, 22], "interpret": 4, "z": [4, 6, 11, 22], "block_size_nam": [4, 6, 22], "let": [4, 6, 7, 8, 9, 19, 21], "creation": [4, 13, 18], "trusti": 4, "old": 4, "16": [4, 5, 7, 8, 9, 11, 12, 15], "dict": [4, 5, 6, 9, 12, 13, 17, 18, 19, 21, 22], "undefin": [4, 6, 7, 8, 9, 15], "filter_heigth": 4, "could": [4, 5, 6, 7, 8, 9, 12, 14, 15, 17, 18, 21, 22], "runtim": [4, 6, 7, 8, 9, 13, 14, 17, 21], "setup": [4, 7, 8, 9, 12, 14, 17, 20], "everyth": [4, 6, 7, 8, 9], "answer": [4, 5, 6, 7, 8, 9, 10, 22], "alloc": [4, 6, 7, 8, 9, 10, 12, 22], "move": [4, 6, 7, 12, 15, 18, 22], "content": [4, 6, 22], "deriv": [4, 6, 7, 8, 9, 16], "retriev": [4, 6, 22], "free": [4, 7, 8, 9, 12, 14, 15], "return": [4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 18, 19, 20, 22], "contrast": 4, "wa": [4, 6, 7, 8, 9, 17, 22], "finish": [4, 6, 8, 11, 12, 17], "than": [4, 7, 8, 9, 11, 16, 17, 18, 22, 23], "highli": [4, 13, 15], "parametr": 4, "long": [4, 7, 8, 9, 11, 12, 15, 20], "instead": [4, 6, 10, 15, 22], "littl": [4, 7, 8, 9, 15], "ve": [4, 7, 8, 9, 14, 15], "familiar": [4, 15], "kernel_str": [4, 5, 6, 7, 8, 9, 12, 13, 18, 22], "tune_param": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 18, 19, 20, 21, 22], "similarli": 4, "singl": [4, 5, 6, 7, 8, 9, 12, 15, 17, 21, 22], "wai": [4, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 22], "64": [4, 7, 8, 9, 13, 15, 19, 21], "128": [4, 7, 8, 9, 13, 19, 21], "try": [4, 6, 7, 8, 9, 14, 15, 18, 22], "env": [4, 6, 18, 19, 22], "cartesian": [4, 11], "product": [4, 7, 8, 22], "realli": [4, 7, 8, 9, 14], "howev": [4, 5, 7, 8, 9, 12, 14, 15, 17, 20, 21, 22], "lot": [4, 7, 8, 9, 15, 17, 19, 20, 22], "problemat": 4, "explain": [4, 6, 7, 8, 9, 12, 14, 15, 16, 19, 21, 22], "illeg": 4, "2048": 4, "1024": [4, 7, 8, 9, 19], "fail": [4, 6, 14, 22], "reason": [4, 6, 20, 22], "too": [4, 7, 8, 9, 11, 12, 15, 22], "regist": [4, 7, 8, 9, 15, 17], "silent": 4, "verbos": [4, 5, 6, 7, 8, 9, 12, 22], "bound": [4, 6, 15, 18], "ignor": [4, 6, 7, 8, 9, 22], "two": [4, 6, 7, 8, 9, 10, 15, 16, 18, 22], "thing": [4, 12, 15], "record": [4, 6, 7, 17, 22], "show": [4, 7, 8, 9, 10, 13, 16, 20], "secondli": [4, 15], "experi": 4, "took": [4, 7, 9, 18, 19, 22], "place": [4, 7, 8, 9, 17, 18, 19, 22], "That": [4, 7, 8, 9, 12, 15, 16, 19], "mean": [4, 12, 15, 16, 18, 20, 21, 23], "softwar": [4, 7, 8, 9, 13, 14, 17, 18, 19], "along": [4, 6, 14, 19, 23], "second": [4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 18, 22], "alwai": [4, 6, 7, 8, 9], "circumst": 4, "obtain": [4, 7, 8, 9, 11, 17], "promis": 4, "tile": [4, 10, 15], "factor": [4, 7, 8, 9, 10, 11, 15, 23], "amount": [4, 7, 8, 9, 15, 16, 22], "particular": [4, 6, 7, 8, 10, 12, 15, 17, 20], "increas": [4, 7, 8, 9, 17], "certain": [4, 6, 7, 8, 9, 17, 23], "tile_size_x": [4, 5, 7, 8, 9, 12, 15], "4": [4, 7, 8, 9, 11, 15, 17], "tile_size_i": [4, 5, 7, 8, 9, 12, 15, 22], "understand": 4, "everi": [4, 5, 7, 8, 9, 10, 17, 19], "fewer": [4, 7, 8, 9], "total": [4, 6, 7, 8, 9, 15, 16, 19], "stai": 4, "tell": [4, 7, 8, 9, 10, 12, 15, 19, 20], "influenc": 4, "did": [4, 7, 8, 9, 15], "mimick": 4, "behavior": [4, 15, 17, 22], "assum": [4, 6, 7, 8, 9, 15, 22], "far": [4, 7, 8, 9, 15, 19], "grid_div_x": [4, 5, 7, 8, 9, 12, 15, 22], "grid_div_i": [4, 5, 7, 8, 9, 12, 15, 22], "decreas": [4, 15], "correspondingli": 4, "displai": 4, "commonli": [4, 7, 8, 9, 14, 15], "gflop": [4, 6, 10, 15, 16], "giga": [4, 15], "compos": [4, 6, 15, 16], "lambda": [4, 6, 7, 8, 15, 16, 22], "collect": [4, 6, 7, 8, 9, 11, 15, 17, 20], "ordereddict": [4, 7, 8, 9, 11, 15, 16], "p": [4, 6, 15, 16, 20, 22], "1e9": [4, 15], "1e3": [4, 7, 8, 9, 15, 16], "expand": [4, 15, 17], "sinc": [4, 9, 11, 15, 21], "And": [4, 7, 8, 9, 18, 21, 22], "know": [4, 7, 8, 9, 15, 16], "enough": [4, 5, 15], "abl": [4, 6, 7, 8, 9], "own": [4, 9, 12, 14, 16, 17], "whenev": 5, "good": [5, 7, 8, 9, 23], "fast": [5, 7, 8, 9], "instanc": [5, 6, 7, 8, 9, 12, 17, 22], "none": [5, 6, 17, 18, 22], "onc": [5, 6, 7, 8, 9, 11, 17, 22], "comparison": 5, "allclos": [5, 22], "maximum": [5, 6, 11, 18, 22], "absolut": [5, 22], "1e": [5, 22], "6": [5, 7, 8, 9, 11, 12, 22], "toler": 5, "atol": [5, 6, 22], "convolution_correct": 5, "py": [5, 12, 14], "demonstr": [5, 9, 10, 15], "r": [5, 12], "cmem_arg": [5, 6, 22], "d_filter": 5, "arg": [5, 6, 7, 8, 9, 11, 12, 13, 15, 18, 19, 20, 21], "field": [5, 7, 8, 9], "its": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 17, 22], "almost": [5, 7, 8, 9, 17], "whose": [5, 22], "trust": [5, 18], "construct": [5, 15], "There": [5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 19, 22, 23], "precomput": 5, "flexibl": [5, 7, 8, 15], "callabl": [5, 6, 22], "accept": [5, 6, 18, 22], "cpu_result": 5, "gpu_result": [5, 7, 9], "although": 5, "semant": 5, "posit": [5, 6, 11, 18, 21, 22], "reflect": [5, 17], "reduct": [5, 16, 22], "snippet": 5, "sum_x": 5, "custom": [5, 10, 16, 17, 20], "def": [5, 6, 7, 8, 9, 11, 17, 20], "verify_partial_reduc": 5, "isclos": 5, "first_kernel": 5, "_": [5, 7, 8, 9], "sum_float": 5, "map": [5, 10, 11], "third": [5, 15], "partial": [5, 7, 8, 9, 10], "cpu": [5, 8, 9, 12], "achiev": [5, 9], "element": [5, 7, 8, 9, 15, 16, 19, 20, 22], "necessarili": [5, 12], "section": [6, 7, 8, 9], "intern": [6, 13, 18, 21], "mostli": [6, 13, 22], "relev": [6, 13, 17], "develop": [6, 13, 14], "extens": 6, "architectur": [6, 17], "At": [6, 11, 22], "expos": 6, "respons": 6, "iter": [6, 7, 8, 9, 11, 15, 17, 18, 19, 22], "brute_forc": [6, 22], "valid": [6, 10, 15, 22], "random_sampl": [6, 22], "sampl": [6, 18, 22], "advanc": [6, 21, 22], "being": [6, 7, 8, 9, 15, 17, 18, 22], "strategy_opt": [6, 18, 22], "sai": [6, 7, 8, 9, 19, 21], "foreseen": 6, "futur": [6, 13, 22, 23], "high": [6, 7, 8, 9, 13, 15, 17], "low": [6, 7, 8, 9, 15], "abstract": [6, 17], "ready_argument_list": 6, "build": [6, 7, 8, 9], "bottom": 6, "either": [6, 11, 18, 21, 22], "typic": [6, 14, 15, 22], "gcc": 6, "fortran": [6, 10, 21], "turn": 6, "launch": [6, 7, 8, 9, 12, 17, 22], "rest": [6, 7, 8, 9], "helper": [6, 17], "get_opt": 6, "suppli": [6, 12, 15, 18, 21, 22], "get_strategy_docstr": 6, "method": [6, 7, 8, 9, 12, 15, 17, 18], "make_strategy_options_doc": 6, "scale_from_param": 6, "ep": [6, 18], "func": [6, 17, 22], "invers": 6, "unscal": 6, "setup_method_argu": 6, "setup_method_opt": 6, "tuning_opt": [6, 18], "snap_to_nearest_config": 6, "closest": 6, "unscale_and_snap_to_nearest": 6, "snap": 6, "scale": 6, "variabl": [6, 11, 14, 18, 22], "nearest": [6, 22], "class": [6, 17, 18], "kernel_opt": 6, "device_opt": 6, "__init__": 6, "instanti": [6, 21], "kernelsourc": 6, "parameter_spac": [6, 18], "iterfac": 6, "platform": [6, 13, 14, 17, 22], "quiet": [6, 22], "compiler_opt": [6, 22], "7": [6, 7, 8, 9, 11, 22], "offer": 6, "bool": [6, 20, 22], "gpu_arg": 6, "benchmark_continu": 6, "durat": [6, 17], "benchmark_default": 6, "check_kernel_output": 6, "compile_kernel": 6, "copy_constant_memory_arg": 6, "recent": [6, 14, 17], "copy_shared_memory_arg": 6, "smem_arg": [6, 22], "copy_texture_memory_arg": 6, "texmem_arg": [6, 22], "create_kernel_inst": 6, "get_environ": 6, "memcpy_dtoh": [6, 7], "dest": 6, "src": 6, "copi": [6, 7, 8, 9, 12, 19, 22], "static": 6, "preprocess_gpu_argu": 6, "old_argu": 6, "flat": 6, "given": [6, 7, 8, 9, 11, 17, 18, 22], "mem": 6, "group": [6, 7, 8, 9, 22], "maintain": 6, "state": [6, 7, 8, 9, 17, 22], "interact": [6, 17], "properti": [6, 15, 22], "context": [6, 7, 9, 11], "kernel_inst": 6, "lookup": 6, "directli": [6, 7, 8, 9, 12, 15, 17, 21, 22], "ndarrai": [6, 11], "format": [6, 7, 8, 20], "kei": [6, 7, 8, 9, 15, 18, 19, 22], "symbol": [6, 22], "similar": [6, 12, 15, 22], "regular": [6, 9, 17], "int32": [6, 13, 19, 21, 22], "kernel_finish": 6, "devicealloc": 6, "memcpy_htod": [6, 7], "memset": 6, "unsign": [6, 8], "byte": [6, 20, 22], "tupl": [6, 9, 11, 18, 22], "start_ev": 6, "event": [6, 7, 12, 17], "mark": 6, "stop_ev": 6, "synchron": [6, 7, 9, 11, 15, 16], "halt": [6, 12], "until": [6, 12], "task": 6, "rawkernel": 6, "cudeviceptr": 6, "cufunct": 6, "must": [6, 16, 22], "buffer": [6, 8, 20], "fill": [6, 15], "item": [6, 7, 8, 9, 11], "ndrang": 6, "cfunction": 6, "cleanup_lib": 6, "previous": [6, 7, 8, 9, 15], "librari": [6, 10, 17, 20], "kernelinst": 6, "repres": [6, 7, 8, 9], "tunabl": [6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 19, 21, 22, 23], "ctype": 6, "_funcptr": 6, "asynchron": 6, "memcpi": [6, 12], "c_arg": 6, "robust": 6, "averag": [6, 7, 8, 9, 12, 17], "ptr": 6, "pionter": 6, "compilationfailedconfig": 6, "errorconfig": 6, "invalidconfig": 6, "npencod": 6, "skipkei": 6, "ensure_ascii": 6, "check_circular": 6, "allow_nan": 6, "sort_kei": 6, "indent": 6, "separ": [6, 10, 12, 21], "dump": [6, 7, 8], "json": [6, 7, 8, 10, 22], "obj": 6, "subclass": 6, "serializ": 6, "rais": 6, "typeerror": 6, "arbitrari": 6, "self": [6, 17, 18], "els": 6, "jsonencod": 6, "runtimefailedconfig": 6, "skippablefailur": 6, "stopcriterionreach": 6, "thrown": 6, "stop": [6, 18], "criterion": [6, 18], "reach": 6, "check_argument_list": 6, "check_argument_typ": 6, "dtype": [6, 20], "kernel_argu": 6, "check_restrict": 6, "whether": [6, 16, 18, 22], "meet": 6, "check_stop_criterion": 6, "max_fev": [6, 18, 22], "exceed": 6, "check_thread_block_dimens": 6, "max_thread": 6, "check_tune_params_list": 6, "simulation_mod": [6, 22], "forbidden": 6, "compile_restrict": 6, "monolith": 6, "try_to_constraint": 6, "union": 6, "str": [6, 7, 8, 9, 11], "constraint": 6, "pars": [6, 7, 8], "config_valid": 6, "max": 6, "convert_constraint_restrict": 6, "convert": [6, 7, 8], "backward": 6, "cuda_error_check": 6, "statu": 6, "delete_temp_fil": 6, "delet": 6, "temporari": 6, "complain": 6, "detect_languag": 6, "attempt": [6, 21], "detect": [6, 18, 21, 22], "dump_cach": 6, "omit": 6, "sever": [6, 7, 8, 9, 10, 11, 14, 15, 21, 22], "store_cach": 6, "speed": 6, "great": [6, 7, 8, 9, 19], "power": [6, 15, 17, 23], "get_best_config": 6, "objective_higher_is_bett": [6, 16, 22], "best": [6, 7, 8, 11, 15, 18, 21, 22, 23], "accord": [6, 22], "get_config_str": 6, "compact": 6, "represent": [6, 20], "get_grid_dimens": 6, "current_problem_s": 6, "grid_div": 6, "dim": 6, "get_instance_str": 6, "debug": 6, "advis": 6, "get_kernel_str": [6, 7, 8, 9], "One": [6, 7, 8, 9, 17, 20], "get_problem_s": 6, "get_smem_arg": 6, "get_temp_filenam": 6, "suffix": [6, 22], "form": [6, 15, 17, 18], "temp_x": 6, "larg": [6, 7, 8, 9, 11, 22], "integ": [6, 17, 20, 22], "get_thread_block_dimens": 6, "convent": [6, 12, 22], "get_total_tim": 6, "overhead_tim": 6, "looks_like_a_filenam": 6, "normalize_verify_funct": 6, "v": [6, 7, 8, 9, 11], "normal": [6, 18, 22], "result_host": 6, "keyword": 6, "behaviour": 6, "parse_restrict": 6, "prepare_kernel_str": 6, "prepend": [6, 9], "seri": [6, 11], "By": [6, 12, 15, 18, 22], "macro": 6, "made": 6, "print_config": 6, "print_config_output": 6, "process_cach": 6, "device_nam": [6, 22], "tune_params_kei": 6, "x1": 6, "x2": 6, "xn": 6, "234342": 6, "y1": 6, "y2": 6, "yn": 6, "134233": 6, "close": [6, 7, 8, 9], "bracket": 6, "earlier": [6, 7, 8, 9, 11], "abruptli": 6, "process_metr": 6, "calcul": [6, 11], "express": [6, 7, 8, 9, 10, 12, 15, 22], "10000": 6, "read_cach": 6, "open_cach": 6, "cachefil": [6, 22], "read_fil": 6, "replace_param_occurr": 6, "occurr": 6, "setup_block_and_grid": 6, "write_fil": 6, "whole": [7, 8, 9, 15, 18], "model": [7, 8, 9, 13], "physic": 7, "numer": [7, 8, 9], "introduc": [7, 8, 9, 15, 17], "redistribut": [7, 8, 9], "region": [7, 8, 9], "concentr": [7, 8, 9], "bulk": [7, 8, 9], "motion": [7, 8, 9], "concept": [7, 8, 9], "wide": [7, 8, 9, 14, 15], "chemistri": [7, 8, 9], "biologi": [7, 8, 9], "suppos": [7, 8, 9], "metal": [7, 8, 9], "sheet": [7, 8, 9], "temperatur": [7, 8, 9, 17, 18, 23], "equal": [7, 8, 9, 15, 22], "degre": [7, 8, 9], "everywher": [7, 8, 9], "heat": [7, 8, 9], "thousand": [7, 8, 9], "instant": [7, 8, 9, 11], "hotspot": [7, 8, 9], "cooler": [7, 8, 9], "area": [7, 8, 9, 15], "melt": [7, 8, 9], "loss": [7, 8, 9], "radiat": [7, 8, 9], "frac": [7, 8, 9], "d": [7, 8, 9, 11, 18, 19], "spatial": [7, 8, 9], "descret": [7, 8, 9], "2d": [7, 8, 9, 10], "quantiti": [7, 8, 9, 16, 17, 22], "nx": [7, 8, 9, 11], "equi": [7, 8, 9], "distant": [7, 8, 9], "direct": [7, 8, 9, 12, 15, 16, 22], "ny": [7, 8, 9, 11], "distanc": [7, 8, 9, 18], "delta": [7, 8, 9], "central": [7, 8, 9], "approxim": [7, 8, 9], "x_i": [7, 8, 9, 11], "x_": [7, 8, 9], "approx": [7, 8, 9], "u_": [7, 8, 9], "2u_": [7, 8, 9], "y_": [7, 8, 9], "estim": [7, 8, 9], "next": [7, 8, 9, 15, 20], "simplifi": [7, 8, 9], "formula": [7, 8, 9], "4u_": [7, 8, 9], "simplic": [7, 8, 9, 11], "assumpt": [7, 8, 9], "boundari": [7, 8, 9], "condit": [7, 8, 9, 15], "dt": [7, 8, 9], "225": [7, 8, 9], "test": [7, 8, 9, 10, 14, 15, 17, 22], "initi": [7, 8, 9, 20], "hot": [7, 8, 9], "plot": [7, 8, 9], "color": [7, 8, 9], "matplotlib": [7, 8, 9, 14], "pyplot": [7, 8, 9], "inlin": [7, 8, 9], "get_initial_condit": [7, 8, 9], "ones": [7, 8, 9, 23], "randint": [7, 8, 9], "1000": [7, 8, 9, 11], "2000": [7, 8, 9], "fig": [7, 8, 9], "ax1": [7, 8, 9], "ax2": [7, 8, 9], "subplot": [7, 8, 9], "imshow": [7, 8, 9], "lt": [7, 8, 9], "axesimag": [7, 8, 9], "0x2aaab952f240": 7, "gt": [7, 8, 9], "quick": [7, 8, 9], "later": [7, 8, 9, 11, 22], "field_copi": [7, 8], "4164": 7, "018869400024": 7, "0x2aab1c98b3c8": 7, "worri": [7, 9], "terminologi": [7, 9], "text": [7, 9, 15], "5": [7, 8, 9, 11, 18], "225f": [7, 8, 9], "diffuse_kernel": [7, 8, 9], "u_new": [7, 8, 9], "0f": [7, 8, 9], "togeth": [7, 8, 9, 14, 22], "impact": [7, 8, 9, 12], "fix": [7, 8, 9, 18, 22], "unrol": [7, 8, 9, 10, 15, 23], "loop": [7, 8, 9, 10, 15, 23], "drv": 7, "sourcemodul": [7, 9, 11], "init": 7, "make_context": 7, "devprop": 7, "k": [7, 8, 9, 11, 13, 15, 19], "get_devic": 7, "get_attribut": 7, "cc": 7, "compute_capability_major": 7, "compute_capability_minor": 7, "u_old": [7, 9], "mem_alloc": 7, "nbyte": 7, "block_size_str": [7, 9], "arch": 7, "sm_": 7, "get_funct": [7, 9, 11], "boilerpl": [7, 8, 9], "moment": [7, 8, 9, 22], "serv": [7, 8, 9, 16, 18], "guess": [7, 8, 9], "pair": [7, 8, 9], "500": [7, 8, 9], "time_sinc": 7, "zeros_lik": [7, 11, 13, 15, 19, 21], "set_titl": [7, 8, 9], "53": [7, 8, 9], "423038482666016": 7, "0x2aaabbdcb2e8": 7, "faster": [7, 8, 9, 15], "cleanup": 7, "pop": 7, "think": [7, 8, 9], "messi": [7, 8, 9], "got": [7, 8, 9], "cleaner": [7, 8, 9], "plai": [7, 8, 9], "difficult": [7, 8, 9, 20, 21], "rather": [7, 8, 9, 22], "underutil": [7, 8, 9], "purpos": [7, 8, 9, 12, 15, 22, 23], "feel": [7, 8, 9], "48": [7, 8, 9], "care": [7, 8, 9], "appropi": [7, 8, 9], "fly": [7, 8, 9], "12": [7, 8, 9], "13": [7, 8, 9], "geforc": [7, 8, 9, 11], "gtx": [7, 8, 9, 11], "titan": [7, 8, 9], "22305920124": 7, "779033613205": 7, "824838399887": 7, "900499212742": 7, "999763202667": 7, "727967989445": 7, "752479994297": 7, "797900807858": 7, "876627194881": 7, "93347837925": 7, "766662418842": 7, "803033602238": 7, "853574407101": 7, "971545600891": 7, "763775992393": 7, "791257584095": 7, "848044800758": 7, "922745585442": 7, "792595207691": 7, "822137594223": 7, "893279993534": 7, "millisecond": [7, 8, 9], "matter": [7, 8, 9, 12], "analyz": [7, 8, 9], "seem": [7, 8, 9], "vari": [7, 8, 9, 11, 15, 16], "addtion": [7, 8, 9], "among": [7, 8, 9, 13, 18], "128x32": [7, 8, 9], "likewis": [7, 8, 9], "becom": [7, 8, 9, 17, 18], "affect": [7, 8, 9, 15], "within": [7, 8, 9, 11, 15, 18, 22], "exchang": [7, 8, 9], "fact": [7, 8, 9, 12], "commun": [7, 8, 9], "idea": [7, 8, 9, 12, 15, 23], "l2": [7, 8, 9], "closer": [7, 8, 9], "multiprocessor": [7, 8, 9], "l1": [7, 8, 9], "fine": [7, 8, 9], "grain": [7, 8, 9], "manag": [7, 8, 9, 15, 17], "cost": [7, 8, 9, 18], "overhead": [7, 8, 9, 15], "degrad": [7, 8, 9], "intermedi": [7, 8, 9], "mind": [7, 8, 9], "14": [7, 8, 9], "tx": [7, 8, 9, 15], "ty": [7, 8, 9, 15], "bx": [7, 8, 9, 11], "__shared__": [7, 9, 15], "sh_u": [7, 8, 9], "pragma": [7, 8, 9, 15], "__syncthread": [7, 8, 9, 15], "75041918755": 7, "18713598251": 7, "09015038013": 7, "06844799519": 7, "09730558395": 7, "14420480728": 7, "05957758427": 7, "07508480549": 7, "0731967926": 7, "14729599953": 7, "08389122486": 7, "10700161457": 7, "10125439167": 7, "31661438942": 7, "0629119873": 7, "04807043076": 7, "054880023": 7, "12033278942": 7, "06672639847": 7, "05816960335": 7, "12000002861": 7, "merg": [7, 8, 9, 15], "half": [7, 8, 9], "doubl": [7, 8, 9, 20, 21], "cover": [7, 8, 9, 18], "beyond": [7, 8, 9, 22], "reduc": [7, 8, 9, 15], "condens": [7, 8, 9], "keep": [7, 8, 9, 15, 20], "importantli": [7, 8, 9], "worst": [7, 8, 9], "15": [7, 8, 9, 21], "tj": [7, 8, 9], "ti": [7, 8, 9, 11], "somehow": [7, 8, 9], "larger": [7, 8, 9, 12, 18, 21], "insid": [7, 8, 9, 12, 15, 21, 22], "round": [7, 8, 9, 22], "arithmet": [7, 8, 9, 22], "evalu": [7, 8, 9, 15, 18, 22], "759308815": 7, "29789438248": 7, "06983039379": 7, "2634239912": 7, "997139203548": 7, "843692803383": 7, "05549435616": 7, "862348806858": 7, "750636804104": 7, "19084160328": 7, "876377594471": 7, "714169609547": 7, "875001597404": 7, "691116797924": 7, "575859189034": 7, "759679996967": 7, "622867202759": 7, "650336003304": 7, "09794559479": 7, "826515209675": 7, "692665600777": 7, "78363519907": 7, "646092808247": 7, "554745602608": 7, "716115188599": 7, "581280004978": 7, "662566399574": 7, "07386879921": 7, "833420813084": 7, "705055999756": 7, "840755212307": 7, "652575993538": 7, "569388794899": 7, "689356791973": 7, "597267186642": 7, "675232005119": 7, "10033922195": 7, "860332798958": 7, "731891202927": 7, "867276787758": 7, "68781440258": 7, "595276796818": 7, "735436797142": 7, "60216319561": 7, "852166390419": 7, "15089921951": 7, "852575981617": 7, "705932807922": 7, "888671982288": 7, "673248004913": 7, "563417613506": 7, "761139214039": 7, "621254396439": 7, "676595199108": 7, "06709122658": 7, "804953610897": 7, "685670387745": 7, "801798415184": 7, "632006394863": 7, "542387211323": 7, "722668802738": 7, "578745603561": 7, "618598401546": 7, "08220798969": 7, "821881604195": 7, "687955200672": 7, "77759360075": 7, "618003201485": 7, "539891195297": 7, "705900788307": 7, "568556785583": 7, "624492788315": 7, "0799423933": 7, "832300806046": 7, "70140799284": 7, "835481595993": 7, "638348805904": 7, "550105595589": 7, "667251205444": 7, "576044797897": 7, "732409596443": 7, "15916161537": 7, "869497597218": 7, "733248019218": 7, "890803205967": 7, "677363204956": 7, "577215993404": 7, "730982398987": 7, "58035838604": 7, "10066559315": 7, "837804794312": 7, "691385602951": 7, "851040017605": 7, "666656005383": 7, "560505592823": 7, "771103990078": 7, "626163220406": 7, "694451200962": 7, "11514236927": 7, "837299215794": 7, "703302407265": 7, "806828796864": 7, "648620784283": 7, "562521612644": 7, "760915207863": 7, "605760002136": 7, "690009605885": 7, "10740480423": 7, "841631996632": 7, "700883197784": 7, "838195204735": 7, "649779188633": 7, "56585599184": 7, "7168192029": 7, "59088640213": 7, "69627519846": 7, "3269824028": 7, "02665598392": 7, "840908801556": 7, "03752319813": 7, "788345599174": 7, "662041604519": 7, "85437438488": 7, "680422389507": 7, "0759360075": 7, "801996803284": 7, "666003203392": 7, "808000004292": 7, "643359994888": 7, "544691193104": 7, "741964805126": 7, "60942081213": 7, "681350398064": 7, "05262081623": 7, "792108798027": 7, "66344319582": 7, "768064010143": 7, "625260794163": 7, "540352010727": 7, "721862399578": 7, "579411196709": 7, "626976013184": 7, "06332798004": 7, "808211183548": 7, "679372787476": 7, "803718411922": 7, "627136015892": 7, "538227200508": 7, "682188808918": 7, "573836791515": 7, "725548803806": 7, "13023357391": 7, "843411195278": 7, "713843202591": 7, "85886080265": 7, "657920002937": 7, "565254402161": 7, "697094392776": 7, "579904007912": 7, "07484800816": 7, "801119995117": 7, "667347204685": 7, "799059200287": 7, "643820810318": 7, "542937588692": 7, "740518403053": 7, "615148806572": 7, "731334400177": 7, "07002239227": 7, "805299210548": 7, "675923216343": 7, "782060790062": 7, "631142401695": 7, "540383994579": 7, "723999989033": 7, "578681600094": 7, "726335990429": 7, "13297917843": 7, "844428789616": 7, "710278391838": 7, "835494399071": 7, "637958395481": 7, "567417597771": 7, "699366402626": 7, "588492810726": 7, "tri": [7, 8, 9, 18], "grow": [7, 8, 9], "quickli": [7, 8, 9], "went": [7, 8, 9, 11], "72": [7, 8, 9], "26": [7, 8, 9], "32x2": [7, 8, 9], "64x4": [7, 8, 9], "four": [7, 8, 9], "best_tim": [7, 8], "min": [7, 8], "05": [7, 8], "join": [7, 8], "nice": [7, 8], "stdout": [7, 8], "why": [7, 8, 12, 16], "easili": [7, 8, 17], "easi": [7, 8, 16, 17, 22], "csv": [7, 8, 10], "analysi": [7, 8], "panda": [7, 8, 10, 14], "18": [7, 8, 9], "fp": [7, 8], "datafram": [7, 8], "df": [7, 8], "to_csv": [7, 8], "0x2aab1de088d0": 8, "01": 8, "sy": 8, "140": 8, "wall": 8, "98": 8, "__kernel": 8, "get_group_id": 8, "get_local_id": 8, "cl": 8, "ctx": 8, "create_some_context": 8, "mf": 8, "mem_flag": 8, "a_h": 8, "a_d": 8, "read_writ": 8, "copy_host_ptr": 8, "hostbuf": 8, "b_d": 8, "kernel_src": 8, "prg": 8, "queue": 8, "commandqueu": 8, "run_gpu": 8, "444": 8, "154": 8, "598": 8, "985": 8, "enqueue_copi": 8, "1748096": 8, "7284544": 8, "7707904": 8, "8573184": 8, "8380288": 8, "686528": 8, "69648": 8, "7461632": 8, "818304": 8, "771072": 8, "7190464": 8, "7522432": 8, "7982208": 8, "9624512": 8, "7214464": 8, "7453312": 8, "8028416": 8, "8922624": 8, "747328": 8, "7860736": 8, "8637184": 8, "__local": 8, "barrier": 8, "clk_local_mem_f": 8, "8449472": 8, "1912576": 8, "1035136": 8, "0927808": 8, "1140736": 8, "1790336": 8, "0808192": 8, "0809792": 8, "0836928": 8, "1545856": 8, "1249984": 8, "1264": 8, "1230336": 8, "4015104": 8, "0873216": 8, "0626496": 8, "0692224": 8, "140192": 8, "0801344": 8, "0688128": 8, "1428928": 8, "8844544": 8, "3245952": 8, "0911808": 8, "3039616": 8, "0079296": 8, "84848": 8, "0708288": 8, "857728": 8, "7561792": 8, "231072": 8, "8774336": 8, "7087296": 8, "8772672": 8, "6911872": 8, "5715968": 8, "7584896": 8, "6292032": 8, "6498688": 8, "1145664": 8, "8252928": 8, "6757568": 8, "7881152": 8, "6237696": 8, "544224": 8, "6951168": 8, "5648128": 8, "6452736": 8, "1065792": 8, "8313792": 8, "6905984": 8, "8302656": 8, "6367488": 8, "5478592": 8, "6660672": 8, "5719744": 8, "6551744": 8, "1384064": 8, "8531072": 8, "7078976": 8, "8516672": 8, "6677696": 8, "5685632": 8, "7074048": 8, "5753152": 8, "8228864": 8, "2124736": 8, "8633344": 8, "6921216": 8, "8896384": 8, "6659904": 8, "5582144": 8, "7522624": 8, "6081536": 8, "6664448": 8, "1095936": 8, "8063424": 8, "6717888": 8, "7982848": 8, "6263552": 8, "5289728": 8, "7008832": 8, "567456": 8, "5968704": 8, "1018432": 8, "8117248": 8, "6724736": 8, "7728576": 8, "6038336": 8, "5172352": 8, "6796352": 8, "5470016": 8, "5968448": 8, "1107712": 8, "8237248": 8, "6810944": 8, "821952": 8, "620352": 8, "5230208": 8, "6415552": 8, "5476864": 8, "7168192": 8, "1942016": 8, "8626304": 8, "7099712": 8, "9123328": 8, "6608448": 8, "5631168": 8, "7113024": 8, "556576": 8, "1583104": 8, "8384832": 8, "67856": 8, "845856": 8, "6581248": 8, "54944": 8, "7520064": 8, "6076224": 8, "6842112": 8, "1547072": 8, "8422016": 8, "6895552": 8, "8037312": 8, "6387072": 8, "5383296": 8, "7326656": 8, "5863488": 8, "6813376": 8, "1493952": 8, "8444928": 8, "6929216": 8, "832768": 8, "6389312": 8, "5412672": 8, "698336": 8, "5717568": 8, "676096": 8, "4303104": 8, "0341696": 8, "8365184": 8, "0398656": 8, "7786496": 8, "648928": 8, "8479232": 8, "6508544": 8, "1219392": 8, "7994048": 8, "6492288": 8, "8068416": 8, "6343168": 8, "5235328": 8, "7268928": 8, "5898432": 8, "6633536": 8, "0849664": 8, "7869632": 8, "6458624": 8, "7611968": 8, "613088": 8, "50912": 8, "6972928": 8, "5620608": 8, "601856": 8, "095232": 8, "7967488": 8, "6601472": 8, "7952896": 8, "6047296": 8, "5108224": 8, "6607744": 8, "5492416": 8, "7091136": 8, "171552": 8, "8473408": 8, "6962112": 8, "8663936": 8, "6466816": 8, "5475584": 8, "6754048": 8, "5591744": 8, "108896": 8, "7907264": 8, "6459328": 8, "7965888": 8, "6250816": 8, "5188416": 8, "721408": 8, "5920832": 8, "7068608": 8, "0909248": 8, "7930752": 8, "6524544": 8, "7745216": 8, "6146176": 8, "5116928": 8, "6975872": 8, "5548416": 8, "7075136": 8, "174624": 8, "8384512": 8, "69104": 8, "8335488": 8, "6264192": 8, "5445248": 8, "6719104": 8, "5592064": 8, "19": [8, 9], "solv": 9, "0x7f888f8cd7b8": 9, "4152": 9, "086019515991": 9, "0x7f8865b51f28": 9, "gpuarrai": [9, 11], "tool": [9, 11, 13], "autoinit": [9, 11], "to_gpu": [9, 11], "mod": [9, 11], "t0": [9, 11], "ona": 9, "33": 9, "46109390258789": 9, "0x7f8858b873c8": 9, "1080": [9, 11], "916985595226": 9, "489004802704": 9, "500524806976": 9, "513356792927": 9, "545715200901": 9, "486515200138": 9, "449055999517": 9, "44974719882": 9, "457427197695": 9, "492915201187": 9, "464863997698": 9, "466118401289": 9, "475264000893": 9, "513632011414": 9, "458412796259": 9, "457715201378": 9, "461017608643": 9, "475987195969": 9, "460032004118": 9, "457779198885": 9, "462649595737": 9, "kernel_string_shar": 9, "22673916817": 9, "826361596584": 9, "793516802788": 9, "782112002373": 9, "776639997959": 9, "795135998726": 9, "722777605057": 9, "762777590752": 9, "75422719717": 9, "804876792431": 9, "778656005859": 9, "769734406471": 9, "782495999336": 9, "932281601429": 9, "734028804302": 9, "721625590324": 9, "736511993408": 9, "800019192696": 9, "724966406822": 9, "722969603539": 9, "759430396557": 9, "kernel_string_til": 9, "22200961113": 9, "91601279974": 9, "752838408947": 9, "873651194572": 9, "69833599329": 9, "586931192875": 9, "516473591328": 9, "411392003298": 9, "384262400866": 9, "82159358263": 9, "632607996464": 9, "506457602978": 9, "618758392334": 9, "500288009644": 9, "429862397909": 9, "44995200038": 9, "366150397062": 9, "342201602459": 9, "793542397022": 9, "58026239872": 9, "494163197279": 9, "546316814423": 9, "467059195042": 9, "404249596596": 9, "440895992517": 9, "341376006603": 9, "339692795277": 9, "783923208714": 9, "597920000553": 9, "50277120471": 9, "615475213528": 9, "470937597752": 9, "418393599987": 9, "443519997597": 9, "343961596489": 9, "342540800571": 9, "780352008343": 9, "611705589294": 9, "515667212009": 9, "622534394264": 9, "502195191383": 9, "437388807535": 9, "45568639636": 9, "359289598465": 9, "426995199919": 9, "788947200775": 9, "616556799412": 9, "496121603251": 9, "629164803028": 9, "474841600657": 9, "407667201757": 9, "47406719923": 9, "371507203579": 9, "352531200647": 9, "72023679018": 9, "574816000462": 9, "481817597151": 9, "580928003788": 9, "455724793673": 9, "394975996017": 9, "464659202099": 9, "357107198238": 9, "324083191156": 9, "759910392761": 9, "569177603722": 9, "481279999018": 9, "528115200996": 9, "441734397411": 9, "393126398325": 9, "455404800177": 9, "350457596779": 9, "322547197342": 9, "754201591015": 9, "579827189445": 9, "491852802038": 9, "582751989365": 9, "451283198595": 9, "391807991266": 9, "456275194883": 9, "356716805696": 9, "362937599421": 9, "809894394875": 9, "60433280468": 9, "507142400742": 9, "655827200413": 9, "474092799425": 9, "408166396618": 9, "480531209707": 9, "346707201004": 9, "780134403706": 9, "601049602032": 9, "493900799751": 9, "620384001732": 9, "494553589821": 9, "425414395332": 9, "467033600807": 9, "375468802452": 9, "346079999208": 9, "771052801609": 9, "593977594376": 9, "49723520875": 9, "583270406723": 9, "478079998493": 9, "416320002079": 9, "443942397833": 9, "359744000435": 9, "343545603752": 9, "780960011482": 9, "598758399487": 9, "498617601395": 9, "57678719759": 9, "46561280489": 9, "41324160099": 9, "431225597858": 9, "351263999939": 9, "34440960288": 9, "933260798454": 9, "715257608891": 9, "586604809761": 9, "711615991592": 9, "558771193027": 9, "466284793615": 9, "44043520093": 9, "361823999882": 9, "731839990616": 9, "57044479847": 9, "470220798254": 9, "608800005913": 9, "472665601969": 9, "416352003813": 9, "481376004219": 9, "380812799931": 9, "351923197508": 9, "719257593155": 9, "55171200037": 9, "466758400202": 9, "568435204029": 9, "459654402733": 9, "394380801916": 9, "463052803278": 9, "36409599781": 9, "328998398781": 9, "73579518795": 9, "564575994015": 9, "472236800194": 9, "549024009705": 9, "438406395912": 9, "389945602417": 9, "455193603039": 9, "364051198959": 9, "375519996881": 9, "798195195198": 9, "588998401165": 9, "49552000761": 9, "595462405682": 9, "460972803831": 9, "400672000647": 9, "465132802725": 9, "364627194405": 9, "729363203049": 9, "558815991879": 9, "466655993462": 9, "600819194317": 9, "460281592607": 9, "404908800125": 9, "478739196062": 9, "386668801308": 9, "385510402918": 9, "720915210247": 9, "550668799877": 9, "466937589645": 9, "564921605587": 9, "447974395752": 9, "394271999598": 9, "46233600378": 9, "365190398693": 9, "387827193737": 9, "762003195286": 9, "579007995129": 9, "486649608612": 9, "557331204414": 9, "443033593893": 9, "396070402861": 9, "457075202465": 9, "369555193186": 9, "wish": 9, "modifi": [9, 17], "tile_size_j": 9, "fixed_param": [9, 11], "ceil": [9, 11], "zip": [9, 11], "transfer": [9, 10, 12], "20": [9, 18], "21": 9, "618": 9, "2231903076172": 9, "0x7f887c3d2358": 9, "incorpor": 9, "ifndef": 9, "kerenel": 9, "psedo": 9, "endif": 9, "bypass": 9, "usecas": 10, "test_vector_add": 10, "test_vector_add_parameter": 10, "illustr": 10, "dimension": [10, 11, 22], "clean": [10, 15], "center": [10, 11], "lock": [10, 17], "overlap": [10, 12], "shuffl": 10, "pipelin": 10, "consist": [10, 15, 22], "scipi": 10, "algorithm": [10, 13, 18, 22], "cub": 10, "gaussian": 11, "delv": 11, "hand": [11, 15], "sum_": 11, "exp": 11, "beta": [11, 18], "sqrt": 11, "y_i": 11, "z_i": 11, "vector": [11, 12, 19], "coordin": 11, "linalg": 11, "la": 11, "compute_grid": 11, "xgrid": 11, "ygrid": 11, "zgrid": 11, "x0": 11, "y0": 11, "z0": 11, "themselv": 11, "meshgrid": 11, "send": 11, "interv": 11, "256": [11, 13, 19], "suffici": [11, 16], "100": [11, 18, 22], "randomli": [11, 18], "distribut": [11, 15], "linspac": 11, "cpu_grid": 11, "npt": 11, "rand": 11, "xyz": [11, 22], "52320": 11, "160627": 11, "might": [11, 16], "nz": 11, "bz": 11, "kernel_cod": 11, "math": 11, "__host__": 11, "__device__": [11, 21], "b": [11, 13, 15, 18, 19, 21], "addgrid": 11, "xvect": 11, "yvect": 11, "zvect": 11, "dx": 11, "dy": 11, "dz": 11, "assign": 11, "explor": 11, "middl": 11, "henc": [11, 20], "mention": 11, "56833920479": 11, "80796158314": 11, "940044796467": 11, "855628800392": 11, "855359995365": 11, "16174077988": 11, "11877760887": 11, "01592960358": 11, "849273598194": 11, "849235200882": 11, "19029750824": 11, "16199679375": 11, "40401918888": 11, "39618558884": 11, "39508478642": 11, "31647996902": 11, "31470079422": 11, "50787198544": 11, "53760001659": 11, "56709756851": 11, "34500494003": 11, "25130877495": 11, "50662400723": 11, "55267841816": 11, "17987194061": 11, "12309756279": 11, "01125121117": 11, "849631989002": 11, "853708791733": 11, "17051515579": 11, "15584001541": 11, "40074241161": 11, "39547519684": 11, "39331197739": 11, "30295038223": 11, "28725762367": 11, "39589118958": 11, "38867840767": 11, "37724158764": 11, "34344320297": 11, "26213116646": 11, "38793599606": 11, "3775359869": 11, "74003200531": 11, "13276162148": 11, "37233917713": 11, "18835201263": 11, "15777277946": 11, "40247042179": 11, "39366400242": 11, "39439997673": 11, "23719043732": 11, "28542718887": 11, "39207677841": 11, "38956804276": 11, "3778496027": 11, "29814395905": 11, "26398081779": 11, "38625922203": 11, "3754431963": 11, "72981758118": 11, "12483196259": 11, "37322881222": 11, "61618566513": 11, "2194111824": 11, "17600002289": 11, "27082881927": 11, "38787200451": 11, "3835711956": 11, "37543039322": 11, "30227203369": 11, "23127679825": 11, "38627202511": 11, "37677440643": 11, "64358406067": 11, "12255358696": 11, "37474560738": 11, "61655673981": 11, "19179515839": 11, "99912958145": 11, "213971138": 11, "16430072784": 11, "38772480488": 11, "3735104084": 11, "54432649612": 11, "05524477959": 11, "36935677528": 11, "42449922562": 11, "10455036163": 11, "67516155243": 11, "programmat": 11, "30": 11, "minimum": 11, "84": 11, "suit": [11, 22], "grid_dim": 11, "associ": 11, "substitut": 11, "ourselv": 11, "extract": 11, "manual": [11, 14], "exlicitli": 11, "accur": [11, 17], "xgpu": 11, "ygpu": 11, "zgpu": 11, "grid_gpu": 11, "80": 11, "133200": 11, "lower": [11, 17, 18], "roughli": [11, 15], "40000": 11, "across": [12, 15], "qualiti": 12, "itself": [12, 13, 22], "precis": 12, "plain": 12, "omp_get_wtim": 12, "openmp": 12, "convolution_stream": 12, "complex": [12, 15], "behind": 12, "spread": 12, "back": [12, 22], "split": 12, "chunk": 12, "slightli": [12, 15, 21], "account": [12, 15], "border": [12, 22], "latter": 12, "cudastreamwaitev": 12, "num_stream": 12, "clarifi": 12, "fit": [12, 18], "choic": [12, 14], "grid_size_x": 12, "grid_size_i": 12, "cudamemcpytosymbol": 12, "upload": 12, "yourself": [12, 22], "spent": [12, 22], "relat": [13, 16, 23], "famili": 13, "launcher": 13, "kt": [13, 20], "easiest": 13, "toolkit": [13, 14], "intend": 13, "Or": [13, 14], "vector_add": [13, 18, 19, 21], "10000000": 13, "512": [13, 19], "research": 13, "cite": 13, "articl": [13, 19], "author": 13, "ben": 13, "van": 13, "werkhoven": 13, "titl": 13, "auto": [13, 15, 17, 18, 21, 22, 23], "journal": 13, "year": 13, "2019": 13, "volum": 13, "90": 13, "347": 13, "358": 13, "url": 13, "www": 13, "sciencedirect": 13, "scienc": 13, "pii": 13, "s0167739x18313359": 13, "doi": 13, "1016": 13, "2018": 13, "08": 13, "004": 13, "willemsen2021bayesian": 13, "willemsen": [13, 18], "flori": 13, "jan": 13, "nieuwpoort": 13, "rob": 13, "bayesian": [13, 18, 22], "workshop": 13, "pmb": 13, "supercomput": 13, "sc21": 13, "2021": 13, "arxiv": 13, "ab": 13, "2111": 13, "14991": 13, "schoonhoven2022benchmark": 13, "schoonhoven": 13, "richard": 13, "batenburg": 13, "joost": 13, "ieee": 13, "transact": 13, "evolutionari": 13, "2022": 13, "schoonhoven2022go": 13, "veenboer": 13, "bram": 13, "green": 13, "effici": [13, 15, 17], "steer": 13, "sc22": 13, "2211": 13, "07260": 13, "comprehens": 14, "recommend": [14, 20], "download": 14, "repo": 14, "continuum": 14, "io": 14, "miniconda3": 14, "x86_64": 14, "sh": 14, "newer": [14, 17], "nativ": 14, "prefix": 14, "home": 14, "pythonpath": 14, "bind": [14, 17], "older": 14, "troubl": 14, "retri": 14, "dir": 14, "wiki": 14, "tiker": 14, "net": 14, "amd": [14, 17], "app": 14, "sdk": 14, "intel": 14, "appl": 14, "beignet": 14, "stack": 14, "altern": [14, 22], "navig": 14, "benvanwerkhoven": 14, "differenti": [14, 18, 22], "chanc": [14, 18, 21], "algebra": 15, "frequent": 15, "programm": [15, 17], "row": 15, "column": 15, "squar": 15, "matric": 15, "matmul_na": 15, "width": 15, "matmul_kernel": 15, "height": 15, "Of": 15, "solut": [15, 17], "realiti": 15, "contant": 15, "denot": [15, 19, 22], "sensibl": 15, "pick": 15, "word": 15, "warpsiz": 15, "namelijk": 15, "stand": 15, "briefli": 15, "figur": 15, "fifth": 15, "fourth": 15, "dramat": 15, "profil": 15, "pretti": 15, "opportun": 15, "realiz": 15, "collabor": 15, "bandwidth": 15, "techniqu": 15, "submatric": 15, "proce": 15, "matmul_shar": 15, "sa": 15, "sb": 15, "kb": 15, "outer": 15, "inner": 15, "race": 15, "drastic": 15, "consumpt": [15, 17], "due": [15, 21, 22], "significantli": [15, 17], "fortun": 15, "benefit": 15, "redund": 15, "distinct": 15, "1xn": 15, "usag": [15, 17], "occup": 15, "goe": 15, "down": 15, "matmul": 15, "newli": 15, "coupl": 15, "respect": [15, 17], "independ": 15, "yield": 15, "discontinu": 15, "room": 15, "impos": 15, "report": [16, 17, 22, 23], "possibli": [16, 22], "_flop": 16, "total_flop": 16, "ps_energi": [16, 17, 23], "occur": [16, 22], "exhaust": 16, "brute": [16, 18, 19], "forc": [16, 18, 19, 21], "maxim": [16, 22], "boolean": [16, 17, 22], "facilit": 17, "layer": 17, "act": 17, "hook": 17, "pattern": 17, "subscrib": 17, "benchmarkobserv": 17, "overwritten": [17, 22], "extend": 17, "mandatori": 17, "get_result": 17, "aggreg": 17, "after_finish": 17, "after_start": 17, "before_start": 17, "register_configur": 17, "register_devic": 17, "variou": [17, 19], "registerobserv": 17, "track": 17, "num_reg": 17, "current_modul": 17, "powersensor2": 17, "pcie": 17, "intercept": 17, "sensor": 17, "transmit": 17, "usb": 17, "connect": 17, "advantag": 17, "instantan": 17, "frequenc": 17, "khz": 17, "pybind11": 17, "powersensor": [17, 23], "ps_power": [17, 23], "joul": [17, 23], "watt": [17, 23], "ttyacm0": 17, "core": 17, "voltag": 17, "thin": 17, "wrapper": [17, 21], "intricaci": 17, "friendli": 17, "mode": 17, "repeatedli": 17, "downsid": 17, "approach": 17, "save_al": 17, "nvidia_smi_fallback": 17, "use_locked_clock": 17, "continous_dur": 17, "monitor": 17, "clock": [17, 23], "power_read": [17, 23], "nvml_power": [17, 23], "nvml_energi": [17, 23], "core_freq": [17, 23], "mem_freq": [17, 23], "gr_voltag": 17, "ordin": 17, "identifi": 17, "smi": 17, "root": 17, "opt": 17, "amper": 17, "continuous_dur": 17, "common": [17, 21], "cap": 17, "popular": 17, "nvml_gr_clock": [17, 23], "nvml_mem_clock": [17, 23], "nvml_pwr_limit": [17, 23], "graphic": [17, 23], "jetson": 17, "rapl": 17, "xilinx": 17, "pmt": 17, "astron": 17, "nl": 17, "rd": 17, "meter": 17, "arduino": 17, "_energi": 17, "_power": 17, "acceler": 18, "prohibit": 18, "slow": 18, "wast": 18, "basin": [18, 22], "hop": [18, 22], "dual": [18, 22], "anneal": [18, 22], "evolut": [18, 22], "firefli": [18, 22], "genet": [18, 22], "greedi": [18, 22], "local": [18, 22], "multi": [18, 22], "particl": [18, 22], "swarm": [18, 22], "mechan": 18, "overrid": 18, "time_limit": [18, 22], "uniqu": [18, 22], "count": 18, "searchspac": 18, "runner": 18, "nelder": 18, "mead": 18, "powel": 18, "cg": 18, "bfg": 18, "l": 18, "tnc": 18, "cobyla": 18, "slsqp": 18, "reject": 18, "thesi": 18, "generate_normalized_param_dict": 18, "denorm": 18, "normalize_parameter_spac": 18, "param_spac": 18, "prune_parameter_spac": 18, "normalize_dict": 18, "prune": 18, "hyperparamet": 18, "popul": 18, "best1bin": 18, "best1exp": 18, "rand1exp": 18, "randtobest1exp": 18, "best2exp": 18, "rand2exp": 18, "randtobest1bin": 18, "best2bin": 18, "rand2bin": 18, "rand1bin": 18, "popsiz": 18, "maxit": 18, "constr": 18, "compute_intens": 18, "fun": 18, "intens": 18, "distance_to": 18, "euclidian": 18, "move_toward": 18, "alpha": 18, "toward": 18, "b0": 18, "attract": 18, "gamma": 18, "light": 18, "absorpt": 18, "coeffici": 18, "disruptive_uniform_crossov": 18, "dna1": 18, "dna2": 18, "disrupt": 18, "uniform": 18, "crossov": 18, "uniformli": 18, "gene": 18, "children": 18, "guarante": 18, "parent": 18, "mutat": 18, "dna": 18, "mutation_ch": 18, "single_point_crossov": 18, "index": 18, "single_point": 18, "two_point": 18, "disruptive_uniform": 18, "two_point_crossov": 18, "uniform_crossov": 18, "weighted_choic": 18, "probabl": [18, 22], "il": 18, "neighbor": 18, "ham": 18, "adjac": 18, "greedy": 18, "soon": 18, "no_improv": 18, "exce": 18, "50": 18, "random_walk": 18, "hillclimb": 18, "travers": 18, "inertia": 18, "c1": 18, "cognit": 18, "c2": 18, "social": 18, "fraction": 18, "acceptance_prob": 18, "old_cost": 18, "new_cost": 18, "modif": [18, 20], "po": 18, "t_min": 18, "001": 18, "995": 18, "vector_add_kernel": 19, "wise": 19, "1000000": [19, 21], "recogn": 19, "alright": 19, "portabl": 20, "stick": 20, "pointer": 20, "primit": 20, "lead": 20, "ineffici": 20, "situat": 20, "scientif": 20, "sens": 20, "experiment": 20, "pack": 20, "consult": 20, "create_receive_spec_struct": 20, "0l": 20, "pad": 20, "8byte": 20, "packstr": 20, "iiiiiiiiiiippi": 20, "fffi": 20, "nsampl": 20, "nsamplesiq": 20, "nslowtimesampl": 20, "nchannel": 20, "ntx": 20, "nrepeat": 20, "nfasttimesampl": 20, "rfsize": 20, "mnrow": 20, "mnrowsiq": 20, "nactivechannel": 20, "isiq": 20, "fsiq": 20, "fc": 20, "nbuffer": 20, "frombuff": 20, "len": 20, "receive_spec": 20, "bf": 20, "rf": 20, "recon": 20, "length": 20, "slight": 20, "matlab": 21, "typenam": 21, "my_typ": 21, "regardless": 21, "demot": 21, "rewrit": 21, "real": 21, "risk": 21, "seper": 21, "grid_div_z": 22, "06": 22, "log": 22, "auxilliari": 22, "safer": 22, "notat": 22, "divison": 22, "treat": 22, "warp": 22, "empti": 22, "kepler": 22, "plu": 22, "filter_mod": 22, "address_mod": 22, "clamp": 22, "mirror": 22, "axi": 22, "normalized_coordin": 22, "emtpi": 22, "get_local_s": 22, "satisfi": 22, "000001": 22, "ref": 22, "basinhop": 22, "bayes_opt": 22, "diff_evo": 22, "firefly_algorithm": 22, "genetic_algorithm": 22, "greedy_il": 22, "greedy_ml": 22, "ml": 22, "ordered_greedy_ml": 22, "pso": 22, "simulated_ann": 22, "sort": 22, "resourc": 22, "persist": 22, "consol": 22, "info": 22, "summar": 22, "store_result": 22, "results_filenam": 22, "typicali": 22, "percentag": 22, "create_device_target": 22, "header_filenam": 22, "target": 22, "dtarget_gpu": 22, "name_of_gpu": 22, "chosen": 22, "block_size_": 23, "grid_size_": 23, "compiler_opt_": 23, "loop_unroll_factor_": 23, "nvml_": 23, "nvml": 23, "nvmlobserv": 23}, "objects": {"kernel_tuner.backends.compiler": [[6, 0, 1, "", "CompilerFunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "cleanup_lib"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.cupy": [[6, 0, 1, "", "CupyFunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.hip": [[6, 0, 1, "", "HipFunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.nvcuda": [[6, 0, 1, "", "CudaFunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.opencl": [[6, 0, 1, "", "OpenCLFunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.pycuda": [[6, 0, 1, "", "PyCudaFunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.core": [[6, 0, 1, "", "DeviceInterface"]], "kernel_tuner.core.DeviceInterface": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "benchmark"], [6, 1, 1, "", "benchmark_continuous"], [6, 1, 1, "", "benchmark_default"], [6, 1, 1, "", "check_kernel_output"], [6, 1, 1, "", "compile_kernel"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "create_kernel_instance"], [6, 1, 1, "", "get_environment"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "preprocess_gpu_arguments"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"]], "kernel_tuner": [[22, 2, 1, "", "create_device_targets"], [22, 2, 1, "", "run_kernel"], [22, 2, 1, "", "store_results"], [22, 2, 1, "", "tune_kernel"], [6, 3, 0, "-", "util"]], "kernel_tuner.observers": [[17, 0, 1, "", "BenchmarkObserver"]], "kernel_tuner.observers.BenchmarkObserver": [[17, 1, 1, "", "after_finish"], [17, 1, 1, "", "after_start"], [17, 1, 1, "", "before_start"], [17, 1, 1, "", "during"], [17, 1, 1, "", "get_results"], [17, 1, 1, "", "register_configuration"], [17, 1, 1, "", "register_device"]], "kernel_tuner.observers.nvml": [[17, 0, 1, "", "NVMLObserver"]], "kernel_tuner.observers.pmt": [[17, 0, 1, "", "PMTObserver"]], "kernel_tuner.observers.powersensor": [[17, 0, 1, "", "PowerSensorObserver"]], "kernel_tuner.runners.sequential": [[6, 0, 1, "", "SequentialRunner"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.runners.simulation": [[6, 0, 1, "", "SimulationRunner"]], "kernel_tuner.runners.simulation.SimulationRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.strategies": [[18, 3, 0, "-", "basinhopping"], [18, 3, 0, "-", "bayes_opt"], [18, 3, 0, "-", "brute_force"], [6, 3, 0, "-", "common"], [18, 3, 0, "-", "diff_evo"], [18, 3, 0, "-", "dual_annealing"], [18, 3, 0, "-", "firefly_algorithm"], [18, 3, 0, "-", "genetic_algorithm"], [18, 3, 0, "-", "greedy_ils"], [18, 3, 0, "-", "greedy_mls"], [18, 3, 0, "-", "minimize"], [18, 3, 0, "-", "mls"], [18, 3, 0, "-", "ordered_greedy_mls"], [18, 3, 0, "-", "pso"], [18, 3, 0, "-", "random_sample"], [18, 3, 0, "-", "simulated_annealing"]], "kernel_tuner.strategies.basinhopping": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.bayes_opt": [[18, 2, 1, "", "generate_normalized_param_dicts"], [18, 2, 1, "", "normalize_parameter_space"], [18, 2, 1, "", "prune_parameter_space"], [18, 2, 1, "", "tune"]], "kernel_tuner.strategies.brute_force": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.common": [[6, 2, 1, "", "get_options"], [6, 2, 1, "", "get_strategy_docstring"], [6, 2, 1, "", "make_strategy_options_doc"], [6, 2, 1, "", "scale_from_params"], [6, 2, 1, "", "setup_method_arguments"], [6, 2, 1, "", "setup_method_options"], [6, 2, 1, "", "snap_to_nearest_config"], [6, 2, 1, "", "unscale_and_snap_to_nearest"]], "kernel_tuner.strategies.diff_evo": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.dual_annealing": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm": [[18, 0, 1, "", "Firefly"], [18, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm.Firefly": [[18, 1, 1, "", "compute_intensity"], [18, 1, 1, "", "distance_to"], [18, 1, 1, "", "move_towards"]], "kernel_tuner.strategies.genetic_algorithm": [[18, 2, 1, "", "disruptive_uniform_crossover"], [18, 2, 1, "", "mutate"], [18, 2, 1, "", "single_point_crossover"], [18, 2, 1, "", "tune"], [18, 2, 1, "", "two_point_crossover"], [18, 2, 1, "", "uniform_crossover"], [18, 2, 1, "", "weighted_choice"]], "kernel_tuner.strategies.greedy_ils": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.greedy_mls": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.minimize": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.mls": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.ordered_greedy_mls": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.pso": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.random_sample": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.simulated_annealing": [[18, 2, 1, "", "acceptance_prob"], [18, 2, 1, "", "neighbor"], [18, 2, 1, "", "tune"]], "kernel_tuner.util": [[6, 0, 1, "", "CompilationFailedConfig"], [6, 0, 1, "", "ErrorConfig"], [6, 0, 1, "", "InvalidConfig"], [6, 0, 1, "", "NpEncoder"], [6, 0, 1, "", "RuntimeFailedConfig"], [6, 4, 1, "", "SkippableFailure"], [6, 4, 1, "", "StopCriterionReached"], [6, 2, 1, "", "check_argument_list"], [6, 2, 1, "", "check_argument_type"], [6, 2, 1, "", "check_restrictions"], [6, 2, 1, "", "check_stop_criterion"], [6, 2, 1, "", "check_thread_block_dimensions"], [6, 2, 1, "", "check_tune_params_list"], [6, 2, 1, "", "compile_restrictions"], [6, 2, 1, "", "config_valid"], [6, 2, 1, "", "convert_constraint_restriction"], [6, 2, 1, "", "cuda_error_check"], [6, 2, 1, "", "delete_temp_file"], [6, 2, 1, "", "detect_language"], [6, 2, 1, "", "dump_cache"], [6, 2, 1, "", "get_best_config"], [6, 2, 1, "", "get_config_string"], [6, 2, 1, "", "get_grid_dimensions"], [6, 2, 1, "", "get_instance_string"], [6, 2, 1, "", "get_kernel_string"], [6, 2, 1, "", "get_problem_size"], [6, 2, 1, "", "get_smem_args"], [6, 2, 1, "", "get_temp_filename"], [6, 2, 1, "", "get_thread_block_dimensions"], [6, 2, 1, "", "get_total_timings"], [6, 2, 1, "", "looks_like_a_filename"], [6, 2, 1, "", "normalize_verify_function"], [6, 2, 1, "", "parse_restrictions"], [6, 2, 1, "", "prepare_kernel_string"], [6, 2, 1, "", "print_config"], [6, 2, 1, "", "print_config_output"], [6, 2, 1, "", "process_cache"], [6, 2, 1, "", "process_metrics"], [6, 2, 1, "", "read_cache"], [6, 2, 1, "", "read_file"], [6, 2, 1, "", "replace_param_occurrences"], [6, 2, 1, "", "setup_block_and_grid"], [6, 2, 1, "", "store_cache"], [6, 2, 1, "", "write_file"]], "kernel_tuner.util.NpEncoder": [[6, 1, 1, "", "default"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function", "3": "py:module", "4": "py:exception"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"], "3": ["py", "module", "Python module"], "4": ["py", "exception", "Python exception"]}, "titleterms": {"backend": [0, 6, 14, 21], "cuda": [0, 14, 15], "featur": [0, 2], "support": 0, "usag": [0, 13], "compil": [0, 6], "cach": 1, "file": 1, "The": [2, 13], "kernel": [2, 7, 8, 9, 10, 11, 13, 15, 21], "tuner": [2, 7, 8, 9, 10, 11, 13], "document": [2, 3, 6, 13, 22], "guid": [2, 3, 14], "refer": 2, "contribut": 3, "report": 3, "issu": 3, "code": [3, 7, 8, 9, 10, 12], "develop": 3, "environ": 3, "local": [3, 8], "setup": 3, "cluster": 3, "run": [3, 9], "test": [3, 4], "build": 3, "convolut": [4, 10], "2d": 4, "exampl": [4, 10, 13, 21], "implement": [4, 7, 8, 9], "tune": [4, 7, 8, 9, 11, 12, 15, 16, 17], "more": 4, "tunabl": 4, "paramet": [4, 9, 11, 17, 23], "correct": 5, "verif": 5, "design": 6, "strategi": [6, 18], "kernel_tun": [6, 18], "common": 6, "runner": 6, "sequenti": 6, "sequentialrunn": 6, "simulationrunn": 6, "devic": 6, "interfac": 6, "core": 6, "deviceinterfac": 6, "pycuda": [6, 14], "pycudafunct": 6, "cupi": 6, "cupyfunct": 6, "nvcuda": 6, "cudafunct": 6, "opencl": [6, 14], "openclfunct": 6, "compilerfunct": 6, "hip": [6, 14], "hipfunct": 6, "util": 6, "function": 6, "diffus": [7, 8, 9], "python": [7, 8, 9, 14], "comput": [7, 8, 9], "gpu": [7, 8, 9, 11], "auto": [7, 8, 9], "us": [7, 8, 9, 11, 15, 20], "share": [7, 8, 9, 15], "memori": [7, 8, 9, 15], "tile": [7, 8, 9], "store": [7, 8], "result": [7, 8], "tutori": [8, 9], "from": [8, 9], "physic": [8, 9], "best": 9, "product": 9, "c": 9, "vector": 10, "add": 10, "stencil": 10, "matrix": [10, 15], "multipl": [10, 15], "py": 10, "sepconv": 10, "convolution_correct": 10, "convolution_stream": 10, "reduct": 10, "spars": 10, "point": 10, "polygon": 10, "expdist": 10, "gener": 10, "3d": 11, "grid": 11, "let": 11, "": 11, "start": [11, 19], "cpu": 11, "move": 11, "optim": [11, 18], "host": 12, "number": 12, "stream": 12, "quick": 13, "instal": [13, 14], "citat": 13, "packag": 14, "other": 14, "pyopencl": 14, "pyhip": 14, "git": 14, "version": 14, "depend": 14, "naiv": 15, "increas": 15, "work": 15, "per": 15, "thread": 15, "metric": 16, "object": 16, "observ": 17, "powersensorobserv": 17, "nvmlobserv": 17, "execut": 17, "nvml": 17, "pmtobserv": 17, "basinhop": 18, "bayes_opt": 18, "brute_forc": 18, "diff_evo": 18, "dual_ann": 18, "firefly_algorithm": 18, "genetic_algorithm": 18, "greedy_il": 18, "greedy_ml": 18, "minim": 18, "ml": 18, "ordered_greedy_ml": 18, "pso": 18, "random_sampl": 18, "simulated_ann": 18, "get": 19, "struct": 20, "templat": 21, "select": 21, "api": 22, "vocabulari": 23}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "nbsphinx": 4, "sphinx": 58}, "alltitles": {"Backends": [[0, "backends"]], "CUDA Backends": [[0, "cuda-backends"]], "Backend feature support": [[0, "id1"]], "Backend usage and compiler": [[0, "id2"]], "Cache files": [[1, "cache-files"]], "The Kernel Tuner documentation": [[2, "the-kernel-tuner-documentation"], [13, "the-kernel-tuner-documentation"]], "Kernel Tuner": [[2, null]], "Guides": [[2, null]], "Features": [[2, null]], "Reference": [[2, null]], "Contribution guide": [[3, "contribution-guide"]], "Reporting Issues": [[3, "reporting-issues"]], "Contributing Code": [[3, "contributing-code"]], "Development environment": [[3, "development-environment"]], "Local setup": [[3, "local-setup"]], "Cluster setup": [[3, "cluster-setup"]], "Running tests": [[3, "running-tests"]], "Building documentation": [[3, "building-documentation"]], "Convolution": [[4, "Convolution"], [10, "convolution"]], "2D Convolution example": [[4, "2D-Convolution-example"]], "Implement a test": [[4, "Implement-a-test"]], "Tuning 2D Convolution": [[4, "Tuning-2D-Convolution"]], "More tunable parameters": [[4, "More-tunable-parameters"]], "Correctness Verification": [[5, "correctness-verification"]], "Design documentation": [[6, "design-documentation"]], "Strategies": [[6, "strategies"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "Runners": [[6, "runners"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, "kernel-tuner-runners-sequential-sequentialrunner"]], "kernel_tuner.runners.sequential.SimulationRunner": [[6, "kernel-tuner-runners-sequential-simulationrunner"]], "Device Interfaces": [[6, "device-interfaces"]], "kernel_tuner.core.DeviceInterface": [[6, "kernel-tuner-core-deviceinterface"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, "kernel-tuner-backends-pycuda-pycudafunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, "kernel-tuner-backends-cupy-cupyfunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, "kernel-tuner-backends-nvcuda-cudafunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, "kernel-tuner-backends-opencl-openclfunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, "kernel-tuner-backends-compiler-compilerfunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, "kernel-tuner-backends-hip-hipfunctions"]], "Util Functions": [[6, "util-functions"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "Diffusion": [[7, "Diffusion"], [7, "id1"], [8, "Diffusion"], [9, "Diffusion"]], "Python implementation": [[7, "Python-implementation"], [8, "Python-implementation"], [9, "Python-implementation"]], "Computing on the GPU": [[7, "Computing-on-the-GPU"], [8, "Computing-on-the-GPU"], [9, "Computing-on-the-GPU"]], "Auto-Tuning with the Kernel Tuner": [[7, "Auto-Tuning-with-the-Kernel-Tuner"], [8, "Auto-Tuning-with-the-Kernel-Tuner"], [9, "Auto-Tuning-with-the-Kernel-Tuner"]], "Using Shared Memory": [[7, "Using-Shared-Memory"]], "Tiling GPU Code": [[7, "Tiling-GPU-Code"], [8, "Tiling-GPU-Code"], [9, "Tiling-GPU-Code"]], "Storing the results": [[7, "Storing-the-results"], [8, "Storing-the-results"]], "Tutorial: From physics to tuned GPU kernels": [[8, "Tutorial:-From-physics-to-tuned-GPU-kernels"], [9, "Tutorial:-From-physics-to-tuned-GPU-kernels"]], "Using Shared (local) Memory": [[8, "Using-Shared-(local)-Memory"]], "Using shared memory": [[9, "Using-shared-memory"], [15, "Using-shared-memory"]], "Using the best parameters in a production run": [[9, "Using-the-best-parameters-in-a-production-run"]], "Python run": [[9, "Python-run"]], "C run": [[9, "C-run"]], "Kernel Tuner Examples": [[10, "kernel-tuner-examples"]], "Vector Add": [[10, "vector-add"]], "Stencil": [[10, "stencil"]], "Matrix Multiplication": [[10, "matrix-multiplication"]], "convolution.py": [[10, "convolution-py"]], "sepconv.py": [[10, "sepconv-py"]], "convolution_correct.py": [[10, "convolution-correct-py"]], "convolution_streams.py": [[10, "convolution-streams-py"]], "Reduction": [[10, "reduction"]], "Sparse Matrix Vector Multiplication": [[10, "sparse-matrix-vector-multiplication"]], "Point-in-Polygon": [[10, "point-in-polygon"]], "ExpDist": [[10, "expdist"]], "Code Generator": [[10, "code-generator"]], "3D Grid on GPU with Kernel Tuner": [[11, "3D-Grid-on-GPU-with-Kernel-Tuner"]], "Let\u2019s start on the CPU": [[11, "Let's-start-on-the-CPU"]], "Let\u2019s move to the GPU": [[11, "Let's-move-to-the-GPU"]], "Tune the kernel": [[11, "Tune-the-kernel"]], "Using the optimized parameters": [[11, "Using-the-optimized-parameters"]], "Tuning Host Code": [[12, "tuning-host-code"]], "Tuning the number of streams": [[12, "tuning-the-number-of-streams"]], "Quick install": [[13, "quick-install"]], "Example usage": [[13, "example-usage"]], "Citation": [[13, "citation"]], "Installation": [[14, "installation"]], "Python": [[14, "python"]], "Installing Python Packages": [[14, "installing-python-packages"]], "CUDA and PyCUDA": [[14, "cuda-and-pycuda"]], "Other CUDA Backends": [[14, "other-cuda-backends"]], "OpenCL and PyOpenCL": [[14, "opencl-and-pyopencl"]], "HIP and PyHIP": [[14, "hip-and-pyhip"]], "Installing the git version": [[14, "installing-the-git-version"]], "Dependencies for the guides": [[14, "dependencies-for-the-guides"]], "Matrix multiplication": [[15, "Matrix-multiplication"]], "Naive CUDA kernel": [[15, "Naive-CUDA-kernel"]], "Tuning a naive kernel": [[15, "Tuning-a-naive-kernel"]], "Increase work per thread": [[15, "Increase-work-per-thread"]], "Metrics and Objectives": [[16, "metrics-and-objectives"]], "Metrics": [[16, "metrics"]], "Tuning Objectives": [[16, "tuning-objectives"]], "Observers": [[17, "observers"]], "PowerSensorObserver": [[17, "powersensorobserver"]], "NVMLObserver": [[17, "nvmlobserver"]], "Tuning execution parameters with NVML": [[17, "tuning-execution-parameters-with-nvml"]], "PMTObserver": [[17, "pmtobserver"]], "Optimization strategies": [[18, "optimization-strategies"]], "kernel_tuner.strategies.basinhopping": [[18, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[18, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[18, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[18, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[18, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[18, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[18, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[18, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[18, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[18, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[18, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[18, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[18, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[18, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[18, "module-kernel_tuner.strategies.simulated_annealing"]], "Getting Started": [[19, "getting-started"]], "Using structs": [[20, "using-structs"]], "Templated kernels": [[21, "templated-kernels"]], "Example": [[21, "example"]], "Selecting a backend": [[21, "selecting-a-backend"]], "API Documentation": [[22, "api-documentation"]], "Parameter Vocabulary": [[23, "parameter-vocabulary"]]}, "indexentries": {"compilationfailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.CompilationFailedConfig"]], "compilerfunctions (class in kernel_tuner.backends.compiler)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions"]], "cudafunctions (class in kernel_tuner.backends.nvcuda)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions"]], "cupyfunctions (class in kernel_tuner.backends.cupy)": [[6, "kernel_tuner.backends.cupy.CupyFunctions"]], "deviceinterface (class in kernel_tuner.core)": [[6, "kernel_tuner.core.DeviceInterface"]], "errorconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.ErrorConfig"]], "hipfunctions (class in kernel_tuner.backends.hip)": [[6, "kernel_tuner.backends.hip.HipFunctions"]], "invalidconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.InvalidConfig"]], "npencoder (class in kernel_tuner.util)": [[6, "kernel_tuner.util.NpEncoder"]], "openclfunctions (class in kernel_tuner.backends.opencl)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions"]], "pycudafunctions (class in kernel_tuner.backends.pycuda)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions"]], "runtimefailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.RuntimeFailedConfig"]], "sequentialrunner (class in kernel_tuner.runners.sequential)": [[6, "kernel_tuner.runners.sequential.SequentialRunner"]], "simulationrunner (class in kernel_tuner.runners.simulation)": [[6, "kernel_tuner.runners.simulation.SimulationRunner"]], "skippablefailure": [[6, "kernel_tuner.util.SkippableFailure"]], "stopcriterionreached": [[6, "kernel_tuner.util.StopCriterionReached"]], "__init__() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.__init__"]], "__init__() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.__init__"]], "__init__() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.__init__"]], "__init__() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.__init__"]], "__init__() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.__init__"]], "__init__() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.__init__"]], "__init__() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.__init__"]], "__init__() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.__init__"]], "__init__() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.__init__"]], "benchmark() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark"]], "benchmark_continuous() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_continuous"]], "benchmark_default() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_default"]], "check_argument_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_list"]], "check_argument_type() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_type"]], "check_kernel_output() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.check_kernel_output"]], "check_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_restrictions"]], "check_stop_criterion() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_stop_criterion"]], "check_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_thread_block_dimensions"]], "check_tune_params_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_tune_params_list"]], "cleanup_lib() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.cleanup_lib"]], "compile() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.compile"]], "compile() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.compile"]], "compile() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.compile"]], "compile() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.compile"]], "compile() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.compile"]], "compile() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.compile"]], "compile_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.compile_kernel"]], "compile_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.compile_restrictions"]], "config_valid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.config_valid"]], "convert_constraint_restriction() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.convert_constraint_restriction"]], "copy_constant_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_constant_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_shared_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_texture_memory_args"]], "create_kernel_instance() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.create_kernel_instance"]], "cuda_error_check() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.cuda_error_check"]], "default() (kernel_tuner.util.npencoder method)": [[6, "kernel_tuner.util.NpEncoder.default"]], "delete_temp_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.delete_temp_file"]], "detect_language() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.detect_language"]], "dump_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.dump_cache"]], "get_best_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_best_config"]], "get_config_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_config_string"]], "get_environment() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.get_environment"]], "get_grid_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_grid_dimensions"]], "get_instance_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_instance_string"]], "get_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_kernel_string"]], "get_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_options"]], "get_problem_size() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_problem_size"]], "get_smem_args() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_smem_args"]], "get_strategy_docstring() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_strategy_docstring"]], "get_temp_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_temp_filename"]], "get_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_thread_block_dimensions"]], "get_total_timings() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_total_timings"]], "kernel_finished() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.kernel_finished"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "looks_like_a_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.looks_like_a_filename"]], "make_strategy_options_doc() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.make_strategy_options_doc"]], "memcpy_dtoh() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.memcpy_dtoh"]], "memcpy_htod() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_htod"]], "memset() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memset"]], "memset() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memset"]], "memset() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memset"]], "memset() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memset"]], "memset() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memset"]], "memset() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memset"]], "module": [[6, "module-kernel_tuner.strategies.common"], [6, "module-kernel_tuner.util"], [18, "module-kernel_tuner.strategies.basinhopping"], [18, "module-kernel_tuner.strategies.bayes_opt"], [18, "module-kernel_tuner.strategies.brute_force"], [18, "module-kernel_tuner.strategies.diff_evo"], [18, "module-kernel_tuner.strategies.dual_annealing"], [18, "module-kernel_tuner.strategies.firefly_algorithm"], [18, "module-kernel_tuner.strategies.genetic_algorithm"], [18, "module-kernel_tuner.strategies.greedy_ils"], [18, "module-kernel_tuner.strategies.greedy_mls"], [18, "module-kernel_tuner.strategies.minimize"], [18, "module-kernel_tuner.strategies.mls"], [18, "module-kernel_tuner.strategies.ordered_greedy_mls"], [18, "module-kernel_tuner.strategies.pso"], [18, "module-kernel_tuner.strategies.random_sample"], [18, "module-kernel_tuner.strategies.simulated_annealing"]], "normalize_verify_function() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.normalize_verify_function"]], "parse_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.parse_restrictions"]], "prepare_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.prepare_kernel_string"]], "preprocess_gpu_arguments() (kernel_tuner.core.deviceinterface static method)": [[6, "kernel_tuner.core.DeviceInterface.preprocess_gpu_arguments"]], "print_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config"]], "print_config_output() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config_output"]], "process_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_cache"]], "process_metrics() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_metrics"]], "read_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_cache"]], "read_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_file"]], "ready_argument_list() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.ready_argument_list"]], "replace_param_occurrences() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.replace_param_occurrences"]], "run() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.run"]], "run() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.run"]], "run_kernel() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.run_kernel"]], "scale_from_params() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.scale_from_params"]], "setup_block_and_grid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.setup_block_and_grid"]], "setup_method_arguments() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_arguments"]], "setup_method_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_options"]], "snap_to_nearest_config() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.snap_to_nearest_config"]], "start_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.start_event"]], "start_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.start_event"]], "start_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.start_event"]], "start_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.start_event"]], "start_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.start_event"]], "start_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.start_event"]], "stop_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.stop_event"]], "store_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.store_cache"]], "synchronize() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.synchronize"]], "unscale_and_snap_to_nearest() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.unscale_and_snap_to_nearest"]], "write_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.write_file"]], "benchmarkobserver (class in kernel_tuner.observers)": [[17, "kernel_tuner.observers.BenchmarkObserver"]], "nvmlobserver (class in kernel_tuner.observers.nvml)": [[17, "kernel_tuner.observers.nvml.NVMLObserver"]], "pmtobserver (class in kernel_tuner.observers.pmt)": [[17, "kernel_tuner.observers.pmt.PMTObserver"]], "powersensorobserver (class in kernel_tuner.observers.powersensor)": [[17, "kernel_tuner.observers.powersensor.PowerSensorObserver"]], "after_finish() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.after_finish"]], "after_start() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.after_start"]], "before_start() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.before_start"]], "during() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.during"]], "get_results() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.get_results"]], "register_configuration() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.register_configuration"]], "register_device() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.register_device"]], "firefly (class in kernel_tuner.strategies.firefly_algorithm)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly"]], "acceptance_prob() (in module kernel_tuner.strategies.simulated_annealing)": [[18, "kernel_tuner.strategies.simulated_annealing.acceptance_prob"]], "compute_intensity() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly.compute_intensity"]], "disruptive_uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover"]], "distance_to() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly.distance_to"]], "generate_normalized_param_dicts() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts"]], "kernel_tuner.strategies.basinhopping": [[18, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[18, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[18, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[18, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[18, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[18, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[18, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[18, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[18, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[18, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[18, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[18, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[18, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[18, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[18, "module-kernel_tuner.strategies.simulated_annealing"]], "move_towards() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly.move_towards"]], "mutate() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.mutate"]], "neighbor() (in module kernel_tuner.strategies.simulated_annealing)": [[18, "kernel_tuner.strategies.simulated_annealing.neighbor"]], "normalize_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.normalize_parameter_space"]], "prune_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.prune_parameter_space"]], "single_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.single_point_crossover"]], "tune() (in module kernel_tuner.strategies.basinhopping)": [[18, "kernel_tuner.strategies.basinhopping.tune"]], "tune() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.tune"]], "tune() (in module kernel_tuner.strategies.brute_force)": [[18, "kernel_tuner.strategies.brute_force.tune"]], "tune() (in module kernel_tuner.strategies.diff_evo)": [[18, "kernel_tuner.strategies.diff_evo.tune"]], "tune() (in module kernel_tuner.strategies.dual_annealing)": [[18, "kernel_tuner.strategies.dual_annealing.tune"]], "tune() (in module kernel_tuner.strategies.firefly_algorithm)": [[18, "kernel_tuner.strategies.firefly_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.greedy_ils)": [[18, "kernel_tuner.strategies.greedy_ils.tune"]], "tune() (in module kernel_tuner.strategies.greedy_mls)": [[18, "kernel_tuner.strategies.greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.minimize)": [[18, "kernel_tuner.strategies.minimize.tune"]], "tune() (in module kernel_tuner.strategies.mls)": [[18, "kernel_tuner.strategies.mls.tune"]], "tune() (in module kernel_tuner.strategies.ordered_greedy_mls)": [[18, "kernel_tuner.strategies.ordered_greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.pso)": [[18, "kernel_tuner.strategies.pso.tune"]], "tune() (in module kernel_tuner.strategies.random_sample)": [[18, "kernel_tuner.strategies.random_sample.tune"]], "tune() (in module kernel_tuner.strategies.simulated_annealing)": [[18, "kernel_tuner.strategies.simulated_annealing.tune"]], "two_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.two_point_crossover"]], "uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.uniform_crossover"]], "weighted_choice() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.weighted_choice"]], "create_device_targets() (in module kernel_tuner)": [[22, "kernel_tuner.create_device_targets"]], "run_kernel() (in module kernel_tuner)": [[22, "kernel_tuner.run_kernel"]], "store_results() (in module kernel_tuner)": [[22, "kernel_tuner.store_results"]], "tune_kernel() (in module kernel_tuner)": [[22, "kernel_tuner.tune_kernel"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["backends", "cache_files", "contents", "contributing", "convolution", "correctness", "design", "diffusion", "diffusion_opencl", "diffusion_use_optparam", "examples", "grid3d", "hostcode", "index", "install", "matrix_multiplication", "metrics", "observers", "optimization", "quickstart", "structs", "templates", "user-api", "vocabulary"], "filenames": ["backends.rst", "cache_files.rst", "contents.rst", "contributing.rst", "convolution.ipynb", "correctness.rst", "design.rst", "diffusion.ipynb", "diffusion_opencl.ipynb", "diffusion_use_optparam.ipynb", "examples.rst", "grid3d.ipynb", "hostcode.rst", "index.rst", "install.rst", "matrix_multiplication.ipynb", "metrics.rst", "observers.rst", "optimization.rst", "quickstart.rst", "structs.rst", "templates.rst", "user-api.rst", "vocabulary.rst"], "titles": ["Backends", "Cache files", "The Kernel Tuner documentation", "Contribution guide", "Convolution", "Correctness Verification", "Design documentation", "Diffusion", "Tutorial: From physics to tuned GPU kernels", "Tutorial: From physics to tuned GPU kernels", "Kernel Tuner Examples", "3D Grid on GPU with Kernel Tuner", "Tuning Host Code", "The Kernel Tuner documentation", "Installation", "Matrix multiplication", "Metrics and Objectives", "Observers", "Optimization strategies", "Getting Started", "Using structs", "Templated kernels", "API Documentation", "Parameter Vocabulary"], "terms": {"kernel": [0, 1, 3, 4, 5, 6, 12, 14, 16, 17, 18, 19, 20, 22, 23], "tuner": [0, 1, 3, 4, 5, 6, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "implement": [0, 5, 6, 10, 11, 16, 17, 18, 22], "multipl": [0, 2, 6, 12, 17, 21, 22], "one": [0, 3, 4, 6, 7, 8, 9, 11, 14, 15, 17, 18, 22], "opencl": [0, 3, 4, 7, 8, 9, 10, 12, 13, 15, 22], "hip": [0, 3, 13, 22], "gener": [0, 3, 4, 6, 7, 8, 9, 13, 15, 17, 18, 20, 22, 23], "select": [0, 3, 4, 6, 7, 8, 9, 11, 14, 15, 17, 18, 22], "i": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "most": [0, 3, 6, 7, 8, 9, 10, 12, 13, 15, 17, 18, 19, 20, 22], "case": [0, 3, 4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 19, 20, 22], "automat": [0, 3, 4, 7, 8, 9, 11, 12, 15, 21, 22], "done": [0, 4, 14, 16, 17], "base": [0, 3, 6, 16, 17, 21, 22], "": [0, 3, 4, 6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 19, 20, 21, 22], "program": [0, 3, 5, 7, 8, 9, 12, 15, 20, 21], "languag": [0, 6, 9, 12, 15, 20, 22], "sometim": [0, 3, 7, 8, 9, 20], "you": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23], "ll": [0, 4, 7, 8, 9, 14, 15], "want": [0, 5, 9, 11, 12, 14, 15, 17, 19, 22, 23], "specif": [0, 4, 6, 7, 8, 9, 11, 16, 17, 18, 22], "choos": [0, 7, 8, 9, 15, 18, 22], "pycuda": [0, 3, 7, 9, 11, 12, 17, 21], "default": [0, 3, 4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 18, 21, 22], "It": [0, 3, 4, 6, 7, 8, 9, 12, 14, 15, 17, 21, 22], "compar": [0, 4, 5, 7, 8, 9, 11, 15, 16, 17], "complet": [0, 1, 4], "cupi": [0, 3, 12, 14, 17, 21, 22], "becaus": [0, 4, 5, 7, 8, 9, 12, 14, 15, 16, 21, 23], "ident": 0, "includ": [0, 3, 4, 5, 7, 8, 9, 11, 12, 14, 15, 17, 21, 22], "here": [0, 4, 11, 12, 14, 15, 17, 22], "well": [0, 7, 8, 9, 11, 15, 17, 22], "To": [0, 3, 5, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22], "us": [0, 1, 2, 3, 4, 5, 6, 10, 12, 13, 14, 16, 17, 18, 19, 21, 22, 23], "nvidia": [0, 3, 6, 14, 15, 17, 21], "gpu": [0, 3, 4, 5, 6, 10, 12, 13, 15, 17, 19, 20, 22, 23], "see": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 19, 21, 22], "http": [0, 3, 13, 14, 17], "github": [0, 3, 4, 7, 8, 9, 11, 14, 15], "com": [0, 3, 13, 14], "jatinx": [0, 14], "nv": 0, "while": [0, 1, 4, 6, 7, 8, 9, 10, 15, 17, 18], "expect": [0, 3, 4, 5, 6, 7, 8, 9, 15, 17, 22], "all": [0, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 22], "input": [0, 4, 5, 7, 8, 9, 10, 12, 15, 16, 19, 20, 22], "output": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 19, 22, 23], "numpi": [0, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 19, 20, 21, 22], "arrai": [0, 4, 5, 6, 7, 8, 9, 11, 12, 19, 20, 22], "also": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "argument": [0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 20, 21, 22], "thi": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "give": [0, 7, 8, 9, 18], "user": [0, 3, 4, 5, 6, 8, 10, 14, 15, 16, 17, 18, 21, 22], "more": [0, 3, 5, 6, 7, 8, 9, 13, 14, 15, 16, 17, 19, 21, 22], "control": [0, 7, 8, 9, 17, 18, 22], "over": [0, 6, 7, 8, 9, 14, 15, 17, 18], "how": [0, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 19, 20, 21, 22], "memori": [0, 4, 6, 10, 12, 17, 20, 22, 23], "handl": [0, 12, 22], "check": [0, 3, 5, 6, 7, 8, 9, 12, 15], "dure": [0, 1, 6, 7, 8, 9, 11, 17, 22], "verif": [0, 2, 10, 22], "can": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "happen": [0, 1, 3, 4, 15, 19], "entir": [0, 6, 7, 8, 9, 15, 18, 22], "when": [0, 1, 3, 4, 6, 7, 8, 9, 12, 14, 15, 16, 17, 18, 20, 21, 22, 23], "onli": [0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 18, 20, 22], "textur": [0, 6, 22], "c": [0, 3, 4, 6, 10, 12, 13, 14, 15, 19, 21, 22], "signatur": [0, 4, 6], "With": [0, 11, 12], "other": [0, 1, 3, 4, 6, 7, 8, 9, 12, 15, 16, 17, 18, 22, 23], "requir": [0, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 21], "ha": [0, 3, 4, 6, 7, 8, 9, 12, 15, 17, 18, 22], "extern": [0, 17, 21], "linkag": [0, 21], "If": [0, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 20, 22], "code": [0, 2, 4, 6, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "wrap": [0, 6, 19, 21, 22], "an": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "block": [0, 4, 6, 7, 8, 9, 10, 11, 14, 15, 16, 19, 22, 23], "which": [0, 3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23], "mai": [0, 3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 16, 17, 18, 19, 20, 22], "caus": [0, 7, 8, 9], "issu": [0, 20], "contain": [0, 1, 4, 6, 7, 8, 9, 11, 12, 15, 17, 18, 21, 22], "cannot": [0, 3, 7, 8, 9, 17], "have": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19, 21, 22, 23], "present": [0, 3, 15], "header": [0, 22], "file": [0, 2, 3, 4, 6, 7, 8, 10, 12, 15, 18, 19, 21, 22], "As": [0, 1, 4, 7, 8, 9, 11, 14, 15, 17], "detail": [0, 6, 14, 22], "further": [0, 7, 8, 9, 14, 15], "templat": [0, 2, 11], "ar": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "fulli": [0, 3, 14], "limit": [0, 4, 6, 7, 8, 9, 10, 15, 17, 18, 21, 22, 23], "python": [0, 3, 4, 6, 10, 11, 12, 15, 17, 19, 20, 21, 22], "benchmark": [0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 22, 23], "observ": [0, 2, 6, 16, 22, 23], "constant": [0, 4, 6, 7, 8, 9, 10, 12, 15, 18, 22], "dynam": [0, 6, 22], "share": [0, 4, 6, 22], "anoth": [0, 7, 8, 9, 12, 15, 16, 18, 22], "import": [0, 4, 5, 7, 8, 9, 11, 14, 15, 16, 19, 20, 21], "differ": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 22], "between": [0, 7, 8, 9, 12, 14, 15, 16, 18, 22], "The": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22], "tabl": 0, "below": [0, 3, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20], "list": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22], "packag": [0, 3], "pyhip": [0, 6], "interfac": [0, 4, 5, 12, 14, 17, 18, 20, 22], "lang": [0, 6, 10, 12, 21, 22], "nvcuda": 0, "nvcc": [0, 6], "nvrtc": [0, 21], "hiprtc": 0, "A": [1, 3, 4, 6, 13, 14, 15, 17, 18, 22], "veri": [1, 5, 7, 8, 9, 12, 14, 15, 17, 20, 21], "featur": [1, 4, 5, 10, 14, 16, 17, 19, 21, 22], "abil": 1, "store": [1, 3, 4, 6, 9, 15, 17, 19, 22], "result": [1, 3, 4, 5, 6, 9, 11, 15, 16, 17, 18, 19, 22, 23], "tune": [1, 2, 5, 6, 10, 13, 14, 18, 19, 21, 22, 23], "enabl": [1, 17, 18, 20, 21], "pass": [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 21, 22], "ani": [1, 3, 4, 6, 7, 8, 9, 12, 15, 16, 17, 18, 20, 21, 22, 23], "filenam": [1, 4, 6, 10, 15, 19, 22], "option": [1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 18, 21, 22, 23], "tune_kernel": [1, 4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 16, 18, 19, 20, 21, 22], "individu": [1, 17, 18], "configur": [1, 4, 6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 22], "append": [1, 6, 14, 22], "run": [1, 4, 5, 6, 7, 8, 11, 12, 14, 15, 17, 18, 22], "allow": [1, 3, 4, 5, 6, 7, 8, 9, 15, 16, 17, 18, 21, 22], "restart": [1, 3, 7, 8, 9, 18], "session": [1, 3, 6, 18], "from": [1, 3, 4, 5, 6, 7, 10, 11, 12, 14, 15, 17, 18, 20, 21, 22], "exist": [1, 6, 22], "should": [1, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 19, 22], "someth": [1, 4, 7, 8, 9, 15], "termin": [1, 14], "previou": [1, 3, 7, 8, 9, 18, 22], "befor": [1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, 22], "had": [1, 4], "quit": [1, 7, 8, 9, 11, 15, 21], "often": [1, 7, 8, 9, 17], "hpc": 1, "environ": [1, 4, 6, 14, 18, 22], "job": 1, "reserv": [1, 8, 23], "out": [1, 3, 4, 5, 11, 14, 15], "number": [1, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19, 20, 22, 23], "simul": [1, 6, 9, 13, 18, 20, 22], "visual": [1, 3, 15], "optim": [1, 2, 4, 5, 6, 7, 8, 9, 12, 13, 15, 16, 17, 22], "strategi": [1, 2, 4, 16, 22], "start": [1, 2, 4, 5, 6, 7, 8, 9, 12, 14, 15, 17, 18, 22], "call": [1, 4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 18, 19, 20, 21, 22], "full": [1, 3, 6, 17, 19], "search": [1, 4, 6, 10, 13, 15, 16, 18, 22], "space": [1, 3, 4, 5, 6, 11, 12, 15, 16, 18, 22], "true": [1, 4, 5, 6, 7, 8, 9, 12, 15, 17, 18, 22], "creat": [1, 3, 4, 6, 7, 8, 9, 11, 15, 17, 19, 20, 22], "even": [1, 3, 7, 8, 9, 12, 15, 18], "work": [1, 3, 4, 6, 7, 8, 9, 14, 16, 18, 21, 22], "still": [1, 3, 5, 15], "new": [1, 3, 6, 7, 8, 9, 18, 22], "come": [1, 6, 7, 8, 9, 15, 17, 21], "thei": [1, 3, 6, 7, 8, 9, 10, 15, 16], "stream": [1, 6, 7, 8, 9], "pleas": [1, 3, 4, 13, 14, 17, 19, 20, 22], "dashboard": [1, 13], "introduct": 2, "instal": [2, 3, 4, 7, 8, 9, 11, 12, 15, 17, 19], "get": [2, 4, 6, 7, 8, 9, 11, 14, 15], "convolut": [2, 5, 12, 15], "diffus": 2, "matrix": 2, "exampl": [2, 3, 5, 6, 7, 8, 9, 12, 14, 15, 16, 17, 18, 19, 20, 22], "backend": [2, 3, 12, 17], "cach": [2, 3, 6, 7, 8, 9, 14, 15, 18, 22], "correct": [2, 3, 12, 20, 22], "host": [2, 3, 6, 8, 9, 10, 17, 20, 21, 22], "struct": 2, "metric": [2, 4, 6, 10, 15, 22], "object": [2, 4, 5, 6, 7, 8, 9, 18, 22], "api": [2, 4, 6], "paramet": [2, 5, 6, 7, 8, 10, 12, 15, 16, 18, 19, 20, 21, 22], "vocabulari": [2, 17, 19], "design": [2, 3, 7, 8, 9, 17], "contribut": 2, "thank": 3, "consid": [3, 11, 13, 15, 22], "Not": [3, 6], "help": [3, 21], "u": [3, 4, 7, 8, 9], "improv": [3, 6, 7, 8, 9, 15, 18, 22], "about": [3, 4, 6, 7, 8, 9, 13, 15, 17, 18, 19, 22], "problem": [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 22], "ensur": [3, 5, 7, 8, 9, 12, 14, 17, 20], "follow": [3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 21, 22], "describ": [3, 4, 6, 12, 17, 20], "what": [3, 4, 5, 6, 7, 8, 9, 12, 15, 17, 19, 20, 21, 22, 23], "possibl": [3, 4, 5, 7, 8, 9, 11, 12, 15, 17, 18, 19, 20, 22], "minim": [3, 16, 21, 22], "reproduc": 3, "actual": [3, 4, 5, 6, 7, 8, 9, 11, 15, 21], "error": [3, 4, 5, 6, 12, 15, 21], "print": [3, 4, 6, 7, 8, 9, 11, 15, 22], "version": [3, 4, 15, 17, 22], "cuda": [3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 17, 19, 20, 21, 22], "compil": [3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 21, 22, 23], "applic": [3, 4, 7, 8, 9, 10, 11, 12, 13, 16, 17, 20, 21, 22], "For": [3, 4, 5, 6, 7, 8, 9, 11, 14, 17, 19, 20, 22], "propos": 3, "chang": [3, 11, 17, 22], "addit": [3, 4, 7, 8, 9, 14, 16, 19], "signific": 3, "first": [3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 18, 20, 21, 22], "discuss": [3, 6], "Then": [3, 7, 8, 9, 11, 13, 14, 21], "fork": 3, "repositori": [3, 4, 7, 8, 9, 11, 13, 14, 15], "branch": 3, "per": [3, 4, 7, 8, 9, 11, 16, 17, 22], "pull": 3, "request": [3, 17, 22], "googl": 3, "style": 3, "sphinxdoc": 3, "docstr": [3, 6], "modul": [3, 6, 12, 17], "public": [3, 13], "function": [3, 4, 5, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 20, 21, 22], "up": [3, 4, 6, 7, 8, 9, 14, 15, 19, 22], "date": 3, "written": [3, 21], "unit": [3, 6], "your": [3, 4, 7, 8, 9, 11, 12, 13, 14, 17, 20, 22], "nox": 3, "do": [3, 4, 6, 7, 8, 9, 11, 12, 15, 22], "hardwar": [3, 7, 8, 9, 11, 17, 18, 19], "skip": [3, 4, 7, 8, 9, 22], "produc": [3, 5], "same": [3, 4, 5, 6, 7, 8, 9, 11, 12, 17, 19, 22], "better": [3, 7, 8, 9], "entri": [3, 6, 7, 8], "changelog": 3, "md": 3, "match": [3, 4, 5, 6], "roadmap": 3, "updat": [3, 6], "remov": [3, 18], "doubt": 3, "where": [3, 4, 5, 6, 7, 8, 9, 15, 16, 17, 20, 21, 22], "put": [3, 6, 7, 8, 9], "look": [3, 4, 6, 7, 8, 9, 11, 14, 15, 21], "regard": [3, 6, 18], "step": [3, 7, 8, 9, 14, 15, 16, 18, 21], "set": [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17, 18, 19, 21, 22, 23], "sudo": [3, 14], "access": [3, 4, 7, 8, 9, 11, 17, 20], "e": [3, 14, 16, 17, 18, 22], "g": [3, 14, 16, 17], "devic": [3, 4, 5, 7, 8, 9, 10, 12, 17, 21, 22], "clone": [3, 4, 7, 8, 9, 11, 14, 15], "git": [3, 17], "desir": 3, "locat": [3, 5, 11, 17], "kerneltun": [3, 13], "kernel_tun": [3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 17, 19, 20, 21, 22, 23], "cd": [3, 14], "prepar": [3, 6, 7, 8, 9], "system": [3, 13, 14, 17], "On": [3, 7, 8, 9, 22], "ubuntu": 3, "apt": 3, "upgrad": 3, "y": [3, 4, 6, 7, 8, 9, 11, 12, 15, 22], "make": [3, 4, 7, 8, 9, 11, 13, 14, 15, 17, 20, 21], "essenti": [3, 4], "libssl": 3, "dev": [3, 14, 17], "zlib1g": 3, "libbz2": 3, "libreadlin": 3, "libsqlite3": 3, "wget": [3, 14], "curl": [3, 14], "llvm": 3, "libncurses5": 3, "libncursesw5": 3, "xz": 3, "util": [3, 15], "tk": 3, "libffi": 3, "liblzma": 3, "openssl": 3, "pyenv": 3, "linux": [3, 14], "bash": [3, 14], "rememb": [3, 4, 7, 8, 9, 15], "add": [3, 4, 6, 7, 8, 9, 12, 15, 17, 18], "bash_profil": 3, "bashrc": 3, "specifi": [3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23], "maco": 3, "brew": 3, "after": [3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 17, 22], "shell": 3, "some": [3, 4, 6, 7, 8, 9, 14, 15, 16, 17, 18, 19, 20, 21, 22], "need": [3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 20, 21, 22], "libgdbm": 3, "libnss3": 3, "lzma": 3, "3": [3, 5, 7, 8, 9, 11, 12, 14, 15, 18, 22], "8": [3, 4, 6, 7, 8, 9, 11, 14, 15, 17], "9": [3, 4, 5, 7, 8, 9, 12], "10": [3, 7, 8, 9, 13, 18], "11": [3, 7, 8, 9], "virtual": [3, 14], "folder": 3, "virtualenv": 3, "whatev": [3, 6, 12, 18], "name": [3, 4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 18, 19, 22, 23], "prefer": [3, 4, 6, 7, 9, 17, 22], "so": [3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, 19, 21, 22], "found": [3, 4, 6, 13, 17, 18], "replac": [3, 4, 5, 6, 7, 8, 9, 11, 15, 22], "global": [3, 6, 7, 8, 9, 18], "poetri": [3, 14], "ssl": [3, 14], "org": [3, 13, 14], "python3": [3, 14], "sure": [3, 4, 7, 8, 9, 13, 14, 15], "path": [3, 4, 17], "instruct": [3, 7, 8, 9, 10, 14, 15], "end": [3, 4, 6, 7, 8, 9, 11, 15, 17, 18, 20], "non": [3, 5], "depend": [3, 4, 5, 9, 10, 11, 13, 16, 22], "re": [3, 4, 7, 8, 9, 11, 15], "open": [3, 5, 7, 8, 12, 15], "take": [3, 4, 6, 7, 8, 9, 11, 15, 17, 18, 19, 21, 22], "effect": [3, 4, 7, 8, 9, 22], "activ": 3, "project": [3, 14], "extra": [3, 14, 21], "doc": [3, 4, 7, 8, 9, 11, 14, 15], "leav": 3, "doe": [3, 5, 6, 7, 8, 9, 11, 12, 15, 17, 21, 22], "appli": [3, 7, 8, 9], "go": [3, 4, 7, 8, 9, 11, 13, 14, 15, 19], "necessari": [3, 5, 6, 7, 8, 9, 22], "conveni": [3, 7, 8, 9, 12, 22], "cuda11x": 3, "cuda12x": 3, "These": [3, 7, 8, 9, 11, 14, 15, 17, 21, 22], "current": [3, 4, 5, 6, 7, 8, 9, 14, 15, 17, 18, 22], "defin": [3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 16, 17, 21, 22], "part": [3, 7, 8, 9, 13, 14, 15, 16, 20, 22], "forget": [3, 11], "correctli": [3, 15], "ld_libary_path": 3, "cpath": 3, "pytest": 3, "except": [3, 6, 10], "been": [3, 4, 6, 7, 8, 9, 12, 15, 18], "left": [3, 6, 7, 8, 9, 11, 16], "gracefulli": 3, "note": [3, 4, 6, 7, 8, 9, 11, 14, 15, 17, 20, 22], "driver": [3, 6, 7, 9, 11], "privileg": [3, 17], "read": [3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 22], "counter": [3, 17], "energi": [3, 13, 17, 18, 23], "measur": [3, 6, 7, 8, 9, 11, 12, 15, 16, 17, 22, 23], "cat": 3, "proc": 3, "param": [3, 4, 5, 6, 17, 18, 22], "grep": 3, "rmprofilingadminonli": 3, "1": [3, 4, 5, 7, 8, 9, 11, 12, 15, 17, 18, 22], "without": [3, 7, 8, 9, 11, 12, 17, 18], "conda": 3, "mamba": 3, "perform": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 22], "miniconda": [3, 14], "tradit": 3, "under": [3, 4, 13, 22], "quota": 3, "otherwis": [3, 6, 15, 22], "restrict": [3, 6, 10, 15, 21, 22], "disk": 3, "directori": [3, 4, 7, 8, 9, 11, 14, 15], "save": [3, 7, 8], "ad": [3, 7, 8, 9, 12, 22], "condarc": 3, "envs_dir": 3, "both": [3, 7, 8, 9, 10, 15], "via": [3, 18], "usual": [3, 17], "provid": [3, 5, 6, 7, 8, 9, 12, 21, 22], "exit": 3, "enter": [3, 4, 7, 8, 9, 11, 15], "avail": [3, 4, 7, 8, 9, 10, 11, 14, 17], "continu": [3, 4, 6, 7, 8, 9, 14, 17, 18, 22], "n": [3, 5, 7, 8, 9, 11, 12, 13, 15, 18, 19, 21], "forg": 3, "execut": [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 18, 22], "config": [3, 6], "auto_activate_bas": 3, "fals": [3, 6, 17, 18, 22], "load": [3, 6], "unload": [3, 6], "rocm": [3, 14, 17], "inform": [3, 4, 6, 7, 8, 9, 13, 17, 18, 19, 22, 23], "like": [3, 4, 6, 7, 8, 9, 10, 11, 15, 18, 19, 20, 21, 22], "keyr": 3, "seemingli": 3, "weird": 3, "known": [3, 15], "pip": [3, 4, 7, 8, 13, 14, 15], "m": [3, 7, 8, 9, 11], "disabl": 3, "verifi": [3, 5, 6, 10, 22], "miss": [3, 6, 22], "sync": [3, 20], "dry": 3, "node": [3, 18], "In": [3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 20, 22, 23], "noxset": 3, "toml": 3, "venvbackend": 3, "2": [3, 4, 5, 7, 8, 9, 10, 11, 12, 15, 17, 18, 22], "anaconda": 3, "venv": 3, "alreadi": [3, 4, 6, 7, 8, 9, 14, 15, 22], "Be": [3, 7, 8, 9], "adjust": [3, 4], "envdir": 3, "particularli": [3, 4, 16], "diskquota": 3, "against": [3, 5, 6], "support": [3, 4, 6, 7, 8, 9, 12, 14, 17, 18, 21, 22, 23], "isol": [3, 21], "top": [3, 6, 11, 17, 22], "level": [3, 6, 17], "coverag": 3, "gigabyt": 3, "size": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 18, 19, 21, 22], "tight": 3, "diskspac": 3, "small": [3, 4, 7, 8, 9, 15], "each": [3, 4, 5, 6, 7, 8, 11, 15, 17, 18, 22], "ran": 3, "longer": [3, 4, 6, 16], "would": [3, 4, 7, 8, 9, 21], "command": [3, 14], "line": [3, 4, 7, 8, 9], "combin": [3, 4, 6, 7, 8, 9, 10, 11, 15, 17, 18, 19, 22], "compat": [3, 6, 14], "involv": 3, "especi": 3, "don": [3, 6, 7, 9, 11, 12, 22], "t": [3, 4, 6, 7, 8, 9, 11, 12, 14, 18, 21, 22], "break": [3, 21], "them": [3, 4, 9, 11, 12, 15], "capabl": [3, 6, 7, 8, 15, 22], "hold": [3, 7, 8, 15, 19, 20, 22], "pyopencl": [3, 6, 8, 17], "invok": 3, "tab": 3, "studio": 3, "id": [3, 6, 17], "seen": [3, 4, 6, 15], "integr": [3, 21], "type": [3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19, 20, 21, 22], "html": 3, "page": [3, 4, 7, 8, 9, 10, 11, 13, 15, 16], "sourc": [3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 21, 22], "inspect": [3, 6, 17], "commit": 3, "brows": 3, "through": [3, 6, 7, 8, 9, 11, 13, 16, 17, 18, 22], "least": [3, 6], "those": [3, 4, 10, 14, 17], "pandoc": 3, "mac": 3, "onlin": 3, "built": [3, 17, 18, 20, 22], "action": 3, "correspond": [3, 4, 7, 8, 9, 11, 17, 18, 19], "master": 3, "latest": [3, 14], "last": [3, 6, 20], "releas": [3, 6], "stabl": 3, "publish": [3, 13], "point": [3, 4, 6, 7, 8, 9, 11, 12, 15, 16, 17, 19, 22], "process": [3, 4, 6, 7, 8, 9, 15, 16, 17, 18, 21], "again": [3, 4, 7, 8, 9, 11, 15], "autom": 3, "guid": [4, 7, 15, 16, 19], "meant": 4, "write": [4, 10, 11, 15, 21, 22], "script": [4, 6, 15, 20, 21], "we": [4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 19, 20, 21], "simpl": [4, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 20], "find": [4, 12, 15, 18, 22], "shortli": 4, "much": [4, 7, 8, 9, 11, 17, 21, 22], "reus": [4, 7, 8, 9, 15], "document": [4, 5, 7, 8, 9, 11, 14, 15, 20, 23], "jupyt": [4, 7, 8, 9, 11, 14, 15], "notebook": [4, 7, 8, 9, 11, 14, 15], "just": [4, 5, 6, 7, 8, 9, 11, 12, 14, 15], "tutori": [4, 7, 11, 13, 14, 15], "readi": [4, 6, 7, 8, 9, 11, 15], "oper": [4, 7, 8, 9, 11, 12, 15, 16], "signal": [4, 23], "imag": [4, 7, 8, 9], "main": [4, 6, 11, 17, 19], "neural": 4, "network": 4, "deep": 4, "learn": 4, "comput": [4, 5, 6, 10, 11, 12, 13, 15, 18, 22], "linear": [4, 15, 22], "weight": [4, 18], "filter": [4, 5, 10, 12], "rang": [4, 5, 7, 8, 9, 11, 12, 21], "pixel": 4, "w": [4, 7, 8, 16, 18], "time": [4, 6, 7, 8, 9, 11, 12, 15, 16, 17, 18, 21, 22, 23], "h": [4, 11, 22], "f": [4, 5, 11, 12, 20], "f_w": 4, "f_h": 4, "o": [4, 6], "begin": [4, 7, 8, 9, 11], "equat": [4, 7, 8, 9, 11, 18], "nonumb": [4, 11], "x": [4, 5, 6, 7, 8, 9, 11, 13, 15, 19, 21, 22], "sum": [4, 5, 6, 15], "limits_": 4, "j": [4, 7, 8, 9, 13, 15], "0": [4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 18, 20, 22], "naiv": [4, 5, 7, 8, 9], "parallel": [4, 7, 8, 9], "thread": [4, 6, 7, 8, 9, 10, 11, 16, 17, 19, 22, 23], "avoid": [4, 15, 23], "confus": 4, "around": [4, 10], "term": 4, "refer": [4, 5, 6, 7, 8, 9, 10, 12, 14, 17, 22], "shown": [4, 6, 17], "press": [4, 7, 8, 9, 11, 15], "shift": [4, 7, 8, 9, 11, 15], "writefil": [4, 15], "convolution_na": [4, 5], "cu": [4, 5, 12, 15, 19, 21], "__global__": [4, 7, 9, 11, 13, 15, 19, 21], "void": [4, 7, 8, 9, 11, 13, 15, 19, 20, 21], "convolution_kernel": [4, 5], "float": [4, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22], "int": [4, 6, 7, 8, 9, 11, 13, 15, 19, 21, 22], "blockidx": [4, 7, 8, 9, 11, 13, 15, 19, 21], "blockdim": [4, 19, 22], "threadidx": [4, 7, 8, 9, 11, 13, 15, 19, 21], "image_height": 4, "image_width": 4, "filter_height": 4, "filter_width": 4, "input_width": 4, "run_kernel": [4, 5, 6, 10, 22], "our": [4, 7, 8, 9, 11, 15, 19, 20], "But": [4, 7, 8, 9, 11, 19], "data": [4, 6, 7, 8, 9, 11, 12, 15, 16, 17, 19, 20, 22], "np": [4, 6, 11, 15, 19, 20], "filter_s": 4, "17": [4, 5, 7, 8, 9, 12], "output_s": 4, "4096": [4, 5, 7, 8, 9, 12, 15], "prod": [4, 5, 12], "border_s": 4, "input_s": [4, 5, 12], "output_imag": 4, "zero": [4, 5, 11, 12, 15], "astyp": [4, 5, 7, 8, 9, 11, 12, 13, 15, 19, 21], "float32": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 19, 21, 22], "input_imag": 4, "random": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 18, 19, 21, 22], "randn": [4, 5, 12, 13, 15, 19, 21], "conv_filt": 4, "now": [4, 6, 7, 8, 9, 11, 12, 15, 19], "structur": [4, 6, 7, 8, 15, 19], "kernel_nam": [4, 6, 12, 20, 21, 22], "kernel_sourc": [4, 6, 20, 22], "problem_s": [4, 5, 6, 7, 8, 9, 11, 12, 15, 19, 20, 22, 23], "ellipsi": 4, "indic": [4, 18, 23], "mani": [4, 6, 7, 8, 9, 15, 16, 17, 18, 22], "won": 4, "right": [4, 7, 8, 9, 11, 14], "interest": [4, 20], "five": [4, 6, 19], "string": [4, 6, 7, 8, 9, 10, 15, 16, 17, 19, 20, 22], "domain": [4, 7, 8, 9, 10, 11, 22], "three": [4, 5, 15], "dimens": [4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 18, 19, 22, 23], "dictionari": [4, 6, 7, 8, 9, 11, 15, 17, 18, 19, 22], "simpli": [4, 5, 6, 7, 8, 9, 11, 18, 19, 22], "cell": [4, 7, 8, 9, 11, 15], "wrote": 4, "determin": [4, 7, 8, 9, 11, 17, 18], "grid": [4, 6, 7, 8, 9, 10, 12, 15, 22, 23], "abov": [4, 6, 7, 8, 9, 11, 14, 15, 19, 20], "divid": [4, 7, 8, 9, 11, 12, 15, 22], "divisor": [4, 6, 7, 8, 9, 15, 22], "scalar": [4, 7, 8, 9, 11, 22], "therefor": [4, 5, 7, 8, 9, 11, 12, 15], "exactli": [4, 6, 7, 8, 9, 15, 17], "order": [4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 18, 19, 22], "32": [4, 6, 7, 8, 9, 11, 13, 15, 19, 22], "bit": [4, 6, 7, 8, 9, 11, 12, 15], "final": [4, 5, 7, 8, 9, 11], "anyth": 4, "insert": [4, 5, 6, 9, 11, 12, 15, 19, 21, 22, 23], "preprocessor": [4, 6, 22], "statement": [4, 9, 11, 15, 21], "valu": [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 22], "were": [4, 7, 8, 9, 11, 15, 22], "i_like_convolut": 4, "42": 4, "definit": [4, 11, 22], "unless": 4, "cours": [4, 7, 8, 9, 14, 15], "somewher": 4, "token": 4, "freeli": 4, "few": [4, 7, 8, 9, 11, 12, 21], "special": [4, 7, 8, 9, 17, 19, 23], "notic": [4, 7, 8, 9], "haven": [4, 14], "yet": [4, 6, 11, 12, 19], "basic": [4, 6, 7, 8, 9, 19], "block_size_x": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 19, 21, 22], "block_size_i": [4, 5, 7, 8, 9, 11, 12, 15, 22], "block_size_z": [4, 7, 8, 9, 11, 22], "interpret": 4, "z": [4, 6, 11, 22], "block_size_nam": [4, 6, 22], "let": [4, 6, 7, 8, 9, 19, 21], "creation": [4, 13, 18], "trusti": 4, "old": 4, "16": [4, 5, 7, 8, 9, 11, 12, 15], "dict": [4, 5, 6, 9, 12, 13, 17, 18, 19, 21, 22], "undefin": [4, 6, 7, 8, 9, 15], "filter_heigth": 4, "could": [4, 5, 6, 7, 8, 9, 12, 14, 15, 17, 18, 21, 22], "runtim": [4, 6, 7, 8, 9, 13, 14, 17, 21], "setup": [4, 7, 8, 9, 12, 14, 17, 20], "everyth": [4, 6, 7, 8, 9], "answer": [4, 5, 6, 7, 8, 9, 10, 22], "alloc": [4, 6, 7, 8, 9, 10, 12, 22], "move": [4, 6, 7, 12, 15, 18, 22], "content": [4, 6, 22], "deriv": [4, 6, 7, 8, 9, 16], "retriev": [4, 6, 22], "free": [4, 7, 8, 9, 12, 14, 15], "return": [4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 18, 19, 20, 22], "contrast": 4, "wa": [4, 6, 7, 8, 9, 17, 22], "finish": [4, 6, 8, 11, 12, 17], "than": [4, 7, 8, 9, 11, 16, 17, 18, 22, 23], "highli": [4, 13, 15], "parametr": 4, "long": [4, 7, 8, 9, 11, 12, 15, 20], "instead": [4, 6, 10, 15, 22], "littl": [4, 7, 8, 9, 15], "ve": [4, 7, 8, 9, 14, 15], "familiar": [4, 15], "kernel_str": [4, 5, 6, 7, 8, 9, 12, 13, 18, 22], "tune_param": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 18, 19, 20, 21, 22], "similarli": 4, "singl": [4, 5, 6, 7, 8, 9, 12, 15, 17, 21, 22], "wai": [4, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 22], "64": [4, 7, 8, 9, 13, 15, 19, 21], "128": [4, 7, 8, 9, 13, 19, 21], "try": [4, 6, 7, 8, 9, 14, 15, 18, 22], "env": [4, 6, 18, 19, 22], "cartesian": [4, 11], "product": [4, 7, 8, 22], "realli": [4, 7, 8, 9, 14], "howev": [4, 5, 7, 8, 9, 12, 14, 15, 17, 20, 21, 22], "lot": [4, 7, 8, 9, 15, 17, 19, 20, 22], "problemat": 4, "explain": [4, 6, 7, 8, 9, 12, 14, 15, 16, 19, 21, 22], "illeg": 4, "2048": 4, "1024": [4, 7, 8, 9, 19], "fail": [4, 6, 14, 22], "reason": [4, 6, 20, 22], "too": [4, 7, 8, 9, 11, 12, 15, 22], "regist": [4, 7, 8, 9, 15, 17], "silent": 4, "verbos": [4, 5, 6, 7, 8, 9, 12, 22], "bound": [4, 6, 15, 18], "ignor": [4, 6, 7, 8, 9, 22], "two": [4, 6, 7, 8, 9, 10, 15, 16, 18, 22], "thing": [4, 12, 15], "record": [4, 6, 7, 17, 22], "show": [4, 7, 8, 9, 10, 13, 16, 20], "secondli": [4, 15], "experi": 4, "took": [4, 7, 9, 18, 19, 22], "place": [4, 7, 8, 9, 17, 18, 19, 22], "That": [4, 7, 8, 9, 12, 15, 16, 19], "mean": [4, 12, 15, 16, 18, 20, 21, 23], "softwar": [4, 7, 8, 9, 13, 14, 17, 18, 19], "along": [4, 6, 14, 19, 23], "second": [4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 18, 22], "alwai": [4, 6, 7, 8, 9], "circumst": 4, "obtain": [4, 7, 8, 9, 11, 17], "promis": 4, "tile": [4, 10, 15], "factor": [4, 7, 8, 9, 10, 11, 15, 23], "amount": [4, 7, 8, 9, 15, 16, 22], "particular": [4, 6, 7, 8, 10, 12, 15, 17, 20], "increas": [4, 7, 8, 9, 17], "certain": [4, 6, 7, 8, 9, 17, 23], "tile_size_x": [4, 5, 7, 8, 9, 12, 15], "4": [4, 7, 8, 9, 11, 15, 17], "tile_size_i": [4, 5, 7, 8, 9, 12, 15, 22], "understand": 4, "everi": [4, 5, 7, 8, 9, 10, 17, 19], "fewer": [4, 7, 8, 9], "total": [4, 6, 7, 8, 9, 15, 16, 19], "stai": 4, "tell": [4, 7, 8, 9, 10, 12, 15, 19, 20], "influenc": 4, "did": [4, 7, 8, 9, 15], "mimick": 4, "behavior": [4, 15, 17, 22], "assum": [4, 6, 7, 8, 9, 15, 22], "far": [4, 7, 8, 9, 15, 19], "grid_div_x": [4, 5, 7, 8, 9, 12, 15, 22], "grid_div_i": [4, 5, 7, 8, 9, 12, 15, 22], "decreas": [4, 15], "correspondingli": 4, "displai": 4, "commonli": [4, 7, 8, 9, 14, 15], "gflop": [4, 6, 10, 15, 16], "giga": [4, 15], "compos": [4, 6, 15, 16], "lambda": [4, 6, 7, 8, 15, 16, 22], "collect": [4, 6, 7, 8, 9, 11, 15, 17, 20], "ordereddict": [4, 7, 8, 9, 11, 15, 16], "p": [4, 6, 15, 16, 20, 22], "1e9": [4, 15], "1e3": [4, 7, 8, 9, 15, 16], "expand": [4, 15, 17], "sinc": [4, 9, 11, 15, 21], "And": [4, 7, 8, 9, 18, 21, 22], "know": [4, 7, 8, 9, 15, 16], "enough": [4, 5, 15], "abl": [4, 6, 7, 8, 9], "own": [4, 9, 12, 14, 16, 17], "whenev": 5, "good": [5, 7, 8, 9, 23], "fast": [5, 7, 8, 9], "instanc": [5, 6, 7, 8, 9, 12, 17, 22], "none": [5, 6, 17, 18, 22], "onc": [5, 6, 7, 8, 9, 11, 17, 22], "comparison": 5, "allclos": [5, 22], "maximum": [5, 6, 11, 18, 22], "absolut": [5, 22], "1e": [5, 22], "6": [5, 7, 8, 9, 11, 12, 22], "toler": 5, "atol": [5, 6, 22], "convolution_correct": 5, "py": [5, 12, 14], "demonstr": [5, 9, 10, 15], "r": [5, 12], "cmem_arg": [5, 6, 22], "d_filter": 5, "arg": [5, 6, 7, 8, 9, 11, 12, 13, 15, 18, 19, 20, 21], "field": [5, 7, 8, 9], "its": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 17, 22], "almost": [5, 7, 8, 9, 17], "whose": [5, 22], "trust": [5, 18], "construct": [5, 15], "There": [5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 19, 22, 23], "precomput": 5, "flexibl": [5, 7, 8, 15], "callabl": [5, 6, 22], "accept": [5, 6, 18, 22], "cpu_result": 5, "gpu_result": [5, 7, 9], "although": 5, "semant": 5, "posit": [5, 6, 11, 18, 21, 22], "reflect": [5, 17], "reduct": [5, 16, 22], "snippet": 5, "sum_x": 5, "custom": [5, 10, 16, 17, 20], "def": [5, 6, 7, 8, 9, 11, 17, 20], "verify_partial_reduc": 5, "isclos": 5, "first_kernel": 5, "_": [5, 7, 8, 9], "sum_float": 5, "map": [5, 10, 11], "third": [5, 15], "partial": [5, 7, 8, 9, 10], "cpu": [5, 8, 9, 12], "achiev": [5, 9], "element": [5, 7, 8, 9, 15, 16, 19, 20, 22], "necessarili": [5, 12], "section": [6, 7, 8, 9], "intern": [6, 13, 18, 21], "mostli": [6, 13, 22], "relev": [6, 13, 17], "develop": [6, 13, 14], "extens": 6, "architectur": [6, 17], "At": [6, 11, 22], "expos": 6, "respons": 6, "iter": [6, 7, 8, 9, 11, 15, 17, 18, 19, 22], "brute_forc": [6, 22], "valid": [6, 10, 15, 22], "random_sampl": [6, 22], "sampl": [6, 18, 22], "advanc": [6, 21, 22], "being": [6, 7, 8, 9, 15, 17, 18, 22], "strategy_opt": [6, 18, 22], "sai": [6, 7, 8, 9, 19, 21], "foreseen": 6, "futur": [6, 13, 22, 23], "high": [6, 7, 8, 9, 13, 15, 17], "low": [6, 7, 8, 9, 15], "abstract": [6, 17], "ready_argument_list": 6, "build": [6, 7, 8, 9], "bottom": 6, "either": [6, 11, 18, 21, 22], "typic": [6, 14, 15, 22], "gcc": 6, "fortran": [6, 10, 21], "turn": 6, "launch": [6, 7, 8, 9, 12, 17, 22], "rest": [6, 7, 8, 9], "helper": [6, 17], "get_opt": 6, "suppli": [6, 12, 15, 18, 21, 22], "get_strategy_docstr": 6, "method": [6, 7, 8, 9, 12, 15, 17, 18], "make_strategy_options_doc": 6, "scale_from_param": 6, "ep": [6, 18], "func": [6, 17, 22], "invers": 6, "unscal": 6, "setup_method_argu": 6, "setup_method_opt": 6, "tuning_opt": [6, 18], "snap_to_nearest_config": 6, "closest": 6, "unscale_and_snap_to_nearest": 6, "snap": 6, "scale": 6, "variabl": [6, 11, 14, 18, 22], "nearest": [6, 22], "class": [6, 17, 18], "kernel_opt": 6, "device_opt": 6, "__init__": 6, "instanti": [6, 21], "kernelsourc": 6, "parameter_spac": [6, 18], "iterfac": 6, "platform": [6, 13, 14, 17, 22], "quiet": [6, 22], "compiler_opt": [6, 22], "7": [6, 7, 8, 9, 11, 22], "offer": 6, "bool": [6, 20, 22], "gpu_arg": 6, "benchmark_continu": 6, "durat": [6, 17], "benchmark_default": 6, "check_kernel_output": 6, "compile_kernel": 6, "copy_constant_memory_arg": 6, "recent": [6, 14, 17], "copy_shared_memory_arg": 6, "smem_arg": [6, 22], "copy_texture_memory_arg": 6, "texmem_arg": [6, 22], "create_kernel_inst": 6, "get_environ": 6, "memcpy_dtoh": [6, 7], "dest": 6, "src": 6, "copi": [6, 7, 8, 9, 12, 19, 22], "static": 6, "preprocess_gpu_argu": 6, "old_argu": 6, "flat": 6, "given": [6, 7, 8, 9, 11, 17, 18, 22], "mem": 6, "group": [6, 7, 8, 9, 22], "maintain": 6, "state": [6, 7, 8, 9, 17, 22], "interact": [6, 17], "properti": [6, 15, 22], "context": [6, 7, 9, 11], "kernel_inst": 6, "lookup": 6, "directli": [6, 7, 8, 9, 12, 15, 17, 21, 22], "ndarrai": [6, 11], "format": [6, 7, 8, 20], "kei": [6, 7, 8, 9, 15, 18, 19, 22], "symbol": [6, 22], "similar": [6, 12, 15, 22], "regular": [6, 9, 17], "int32": [6, 13, 19, 21, 22], "kernel_finish": 6, "devicealloc": 6, "memcpy_htod": [6, 7], "memset": 6, "unsign": [6, 8], "byte": [6, 20, 22], "tupl": [6, 9, 11, 18, 22], "start_ev": 6, "event": [6, 7, 12, 17], "mark": 6, "stop_ev": 6, "synchron": [6, 7, 9, 11, 15, 16], "halt": [6, 12], "until": [6, 12], "task": 6, "rawkernel": 6, "cudeviceptr": 6, "cufunct": 6, "must": [6, 16, 22], "buffer": [6, 8, 20], "fill": [6, 15], "item": [6, 7, 8, 9, 11], "ndrang": 6, "cfunction": 6, "cleanup_lib": 6, "previous": [6, 7, 8, 9, 15], "librari": [6, 10, 17, 20], "kernelinst": 6, "repres": [6, 7, 8, 9], "tunabl": [6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 19, 21, 22, 23], "ctype": 6, "_funcptr": 6, "asynchron": 6, "memcpi": [6, 12], "c_arg": 6, "robust": 6, "averag": [6, 7, 8, 9, 12, 17], "ptr": 6, "pionter": 6, "compilationfailedconfig": 6, "errorconfig": 6, "invalidconfig": 6, "npencod": 6, "skipkei": 6, "ensure_ascii": 6, "check_circular": 6, "allow_nan": 6, "sort_kei": 6, "indent": 6, "separ": [6, 10, 12, 21], "dump": [6, 7, 8], "json": [6, 7, 8, 10, 22], "obj": 6, "subclass": 6, "serializ": 6, "rais": 6, "typeerror": 6, "arbitrari": 6, "self": [6, 17, 18], "els": 6, "jsonencod": 6, "runtimefailedconfig": 6, "skippablefailur": 6, "stopcriterionreach": 6, "thrown": 6, "stop": [6, 18], "criterion": [6, 18], "reach": 6, "check_argument_list": 6, "check_argument_typ": 6, "dtype": [6, 20], "kernel_argu": 6, "check_restrict": 6, "whether": [6, 16, 18, 22], "meet": 6, "check_stop_criterion": 6, "max_fev": [6, 18, 22], "exceed": 6, "check_thread_block_dimens": 6, "max_thread": 6, "check_tune_params_list": 6, "simulation_mod": [6, 22], "forbidden": 6, "compile_restrict": 6, "monolith": 6, "try_to_constraint": 6, "union": 6, "str": [6, 7, 8, 9, 11], "constraint": 6, "pars": [6, 7, 8], "config_valid": 6, "max": 6, "convert_constraint_restrict": 6, "convert": [6, 7, 8], "backward": 6, "correct_open_cach": 6, "open_cach": 6, "properli": 6, "close": [6, 7, 8, 9], "pretend": 6, "cuda_error_check": 6, "statu": 6, "delete_temp_fil": 6, "delet": 6, "temporari": 6, "complain": 6, "detect_languag": 6, "attempt": [6, 21], "detect": [6, 18, 21, 22], "dump_cach": 6, "omit": 6, "sever": [6, 7, 8, 9, 10, 11, 14, 15, 21, 22], "store_cach": 6, "speed": 6, "great": [6, 7, 8, 9, 19], "power": [6, 15, 17, 23], "get_best_config": 6, "objective_higher_is_bett": [6, 16, 22], "best": [6, 7, 8, 11, 15, 18, 21, 22, 23], "accord": [6, 22], "get_config_str": 6, "compact": 6, "represent": [6, 20], "get_grid_dimens": 6, "current_problem_s": 6, "grid_div": 6, "dim": 6, "get_instance_str": 6, "debug": 6, "advis": 6, "get_kernel_str": [6, 7, 8, 9], "One": [6, 7, 8, 9, 17, 20], "get_problem_s": 6, "get_smem_arg": 6, "get_temp_filenam": 6, "suffix": [6, 22], "form": [6, 15, 17, 18], "temp_x": 6, "larg": [6, 7, 8, 9, 11, 22], "integ": [6, 17, 20, 22], "get_thread_block_dimens": 6, "convent": [6, 12, 22], "get_total_tim": 6, "overhead_tim": 6, "looks_like_a_filenam": 6, "normalize_verify_funct": 6, "v": [6, 7, 8, 9, 11], "normal": [6, 18, 22], "result_host": 6, "keyword": 6, "behaviour": 6, "parse_restrict": 6, "prepare_kernel_str": 6, "prepend": [6, 9], "seri": [6, 11], "By": [6, 12, 15, 18, 22], "macro": 6, "made": 6, "print_config": 6, "print_config_output": 6, "process_cach": 6, "device_nam": [6, 22], "tune_params_kei": 6, "x1": 6, "x2": 6, "xn": 6, "234342": 6, "y1": 6, "y2": 6, "yn": 6, "134233": 6, "bracket": 6, "earlier": [6, 7, 8, 9, 11], "abruptli": 6, "process_metr": 6, "calcul": [6, 11], "express": [6, 7, 8, 9, 10, 12, 15, 22], "10000": 6, "read_cach": 6, "cachefil": [6, 22], "read_fil": 6, "replace_param_occurr": 6, "occurr": 6, "setup_block_and_grid": 6, "write_fil": 6, "whole": [7, 8, 9, 15, 18], "model": [7, 8, 9, 13], "physic": 7, "numer": [7, 8, 9], "introduc": [7, 8, 9, 15, 17], "redistribut": [7, 8, 9], "region": [7, 8, 9], "concentr": [7, 8, 9], "bulk": [7, 8, 9], "motion": [7, 8, 9], "concept": [7, 8, 9], "wide": [7, 8, 9, 14, 15], "chemistri": [7, 8, 9], "biologi": [7, 8, 9], "suppos": [7, 8, 9], "metal": [7, 8, 9], "sheet": [7, 8, 9], "temperatur": [7, 8, 9, 17, 18, 23], "equal": [7, 8, 9, 15, 22], "degre": [7, 8, 9], "everywher": [7, 8, 9], "heat": [7, 8, 9], "thousand": [7, 8, 9], "instant": [7, 8, 9, 11], "hotspot": [7, 8, 9], "cooler": [7, 8, 9], "area": [7, 8, 9, 15], "melt": [7, 8, 9], "loss": [7, 8, 9], "radiat": [7, 8, 9], "frac": [7, 8, 9], "d": [7, 8, 9, 11, 18, 19], "spatial": [7, 8, 9], "descret": [7, 8, 9], "2d": [7, 8, 9, 10], "quantiti": [7, 8, 9, 16, 17, 22], "nx": [7, 8, 9, 11], "equi": [7, 8, 9], "distant": [7, 8, 9], "direct": [7, 8, 9, 12, 15, 16, 22], "ny": [7, 8, 9, 11], "distanc": [7, 8, 9, 18], "delta": [7, 8, 9], "central": [7, 8, 9], "approxim": [7, 8, 9], "x_i": [7, 8, 9, 11], "x_": [7, 8, 9], "approx": [7, 8, 9], "u_": [7, 8, 9], "2u_": [7, 8, 9], "y_": [7, 8, 9], "estim": [7, 8, 9], "next": [7, 8, 9, 15, 20], "simplifi": [7, 8, 9], "formula": [7, 8, 9], "4u_": [7, 8, 9], "simplic": [7, 8, 9, 11], "assumpt": [7, 8, 9], "boundari": [7, 8, 9], "condit": [7, 8, 9, 15], "dt": [7, 8, 9], "225": [7, 8, 9], "test": [7, 8, 9, 10, 14, 15, 17, 22], "initi": [7, 8, 9, 20], "hot": [7, 8, 9], "plot": [7, 8, 9], "color": [7, 8, 9], "matplotlib": [7, 8, 9, 14], "pyplot": [7, 8, 9], "inlin": [7, 8, 9], "get_initial_condit": [7, 8, 9], "ones": [7, 8, 9, 23], "randint": [7, 8, 9], "1000": [7, 8, 9, 11], "2000": [7, 8, 9], "fig": [7, 8, 9], "ax1": [7, 8, 9], "ax2": [7, 8, 9], "subplot": [7, 8, 9], "imshow": [7, 8, 9], "lt": [7, 8, 9], "axesimag": [7, 8, 9], "0x2aaab952f240": 7, "gt": [7, 8, 9], "quick": [7, 8, 9], "later": [7, 8, 9, 11, 22], "field_copi": [7, 8], "4164": 7, "018869400024": 7, "0x2aab1c98b3c8": 7, "worri": [7, 9], "terminologi": [7, 9], "text": [7, 9, 15], "5": [7, 8, 9, 11, 18], "225f": [7, 8, 9], "diffuse_kernel": [7, 8, 9], "u_new": [7, 8, 9], "0f": [7, 8, 9], "togeth": [7, 8, 9, 14, 22], "impact": [7, 8, 9, 12], "fix": [7, 8, 9, 18, 22], "unrol": [7, 8, 9, 10, 15, 23], "loop": [7, 8, 9, 10, 15, 23], "drv": 7, "sourcemodul": [7, 9, 11], "init": 7, "make_context": 7, "devprop": 7, "k": [7, 8, 9, 11, 13, 15, 19], "get_devic": 7, "get_attribut": 7, "cc": 7, "compute_capability_major": 7, "compute_capability_minor": 7, "u_old": [7, 9], "mem_alloc": 7, "nbyte": 7, "block_size_str": [7, 9], "arch": 7, "sm_": 7, "get_funct": [7, 9, 11], "boilerpl": [7, 8, 9], "moment": [7, 8, 9, 22], "serv": [7, 8, 9, 16, 18], "guess": [7, 8, 9], "pair": [7, 8, 9], "500": [7, 8, 9], "time_sinc": 7, "zeros_lik": [7, 11, 13, 15, 19, 21], "set_titl": [7, 8, 9], "53": [7, 8, 9], "423038482666016": 7, "0x2aaabbdcb2e8": 7, "faster": [7, 8, 9, 15], "cleanup": 7, "pop": 7, "think": [7, 8, 9], "messi": [7, 8, 9], "got": [7, 8, 9], "cleaner": [7, 8, 9], "plai": [7, 8, 9], "difficult": [7, 8, 9, 20, 21], "rather": [7, 8, 9, 22], "underutil": [7, 8, 9], "purpos": [7, 8, 9, 12, 15, 22, 23], "feel": [7, 8, 9], "48": [7, 8, 9], "care": [7, 8, 9], "appropi": [7, 8, 9], "fly": [7, 8, 9], "12": [7, 8, 9], "13": [7, 8, 9], "geforc": [7, 8, 9, 11], "gtx": [7, 8, 9, 11], "titan": [7, 8, 9], "22305920124": 7, "779033613205": 7, "824838399887": 7, "900499212742": 7, "999763202667": 7, "727967989445": 7, "752479994297": 7, "797900807858": 7, "876627194881": 7, "93347837925": 7, "766662418842": 7, "803033602238": 7, "853574407101": 7, "971545600891": 7, "763775992393": 7, "791257584095": 7, "848044800758": 7, "922745585442": 7, "792595207691": 7, "822137594223": 7, "893279993534": 7, "millisecond": [7, 8, 9], "matter": [7, 8, 9, 12], "analyz": [7, 8, 9], "seem": [7, 8, 9], "vari": [7, 8, 9, 11, 15, 16], "addtion": [7, 8, 9], "among": [7, 8, 9, 13, 18], "128x32": [7, 8, 9], "likewis": [7, 8, 9], "becom": [7, 8, 9, 17, 18], "affect": [7, 8, 9, 15], "within": [7, 8, 9, 11, 15, 18, 22], "exchang": [7, 8, 9], "fact": [7, 8, 9, 12], "commun": [7, 8, 9], "idea": [7, 8, 9, 12, 15, 23], "l2": [7, 8, 9], "closer": [7, 8, 9], "multiprocessor": [7, 8, 9], "l1": [7, 8, 9], "fine": [7, 8, 9], "grain": [7, 8, 9], "manag": [7, 8, 9, 15, 17], "cost": [7, 8, 9, 18], "overhead": [7, 8, 9, 15], "degrad": [7, 8, 9], "intermedi": [7, 8, 9], "mind": [7, 8, 9], "14": [7, 8, 9], "tx": [7, 8, 9, 15], "ty": [7, 8, 9, 15], "bx": [7, 8, 9, 11], "__shared__": [7, 9, 15], "sh_u": [7, 8, 9], "pragma": [7, 8, 9, 15], "__syncthread": [7, 8, 9, 15], "75041918755": 7, "18713598251": 7, "09015038013": 7, "06844799519": 7, "09730558395": 7, "14420480728": 7, "05957758427": 7, "07508480549": 7, "0731967926": 7, "14729599953": 7, "08389122486": 7, "10700161457": 7, "10125439167": 7, "31661438942": 7, "0629119873": 7, "04807043076": 7, "054880023": 7, "12033278942": 7, "06672639847": 7, "05816960335": 7, "12000002861": 7, "merg": [7, 8, 9, 15], "half": [7, 8, 9], "doubl": [7, 8, 9, 20, 21], "cover": [7, 8, 9, 18], "beyond": [7, 8, 9, 22], "reduc": [7, 8, 9, 15], "condens": [7, 8, 9], "keep": [7, 8, 9, 15, 20], "importantli": [7, 8, 9], "worst": [7, 8, 9], "15": [7, 8, 9, 21], "tj": [7, 8, 9], "ti": [7, 8, 9, 11], "somehow": [7, 8, 9], "larger": [7, 8, 9, 12, 18, 21], "insid": [7, 8, 9, 12, 15, 21, 22], "round": [7, 8, 9, 22], "arithmet": [7, 8, 9, 22], "evalu": [7, 8, 9, 15, 18, 22], "759308815": 7, "29789438248": 7, "06983039379": 7, "2634239912": 7, "997139203548": 7, "843692803383": 7, "05549435616": 7, "862348806858": 7, "750636804104": 7, "19084160328": 7, "876377594471": 7, "714169609547": 7, "875001597404": 7, "691116797924": 7, "575859189034": 7, "759679996967": 7, "622867202759": 7, "650336003304": 7, "09794559479": 7, "826515209675": 7, "692665600777": 7, "78363519907": 7, "646092808247": 7, "554745602608": 7, "716115188599": 7, "581280004978": 7, "662566399574": 7, "07386879921": 7, "833420813084": 7, "705055999756": 7, "840755212307": 7, "652575993538": 7, "569388794899": 7, "689356791973": 7, "597267186642": 7, "675232005119": 7, "10033922195": 7, "860332798958": 7, "731891202927": 7, "867276787758": 7, "68781440258": 7, "595276796818": 7, "735436797142": 7, "60216319561": 7, "852166390419": 7, "15089921951": 7, "852575981617": 7, "705932807922": 7, "888671982288": 7, "673248004913": 7, "563417613506": 7, "761139214039": 7, "621254396439": 7, "676595199108": 7, "06709122658": 7, "804953610897": 7, "685670387745": 7, "801798415184": 7, "632006394863": 7, "542387211323": 7, "722668802738": 7, "578745603561": 7, "618598401546": 7, "08220798969": 7, "821881604195": 7, "687955200672": 7, "77759360075": 7, "618003201485": 7, "539891195297": 7, "705900788307": 7, "568556785583": 7, "624492788315": 7, "0799423933": 7, "832300806046": 7, "70140799284": 7, "835481595993": 7, "638348805904": 7, "550105595589": 7, "667251205444": 7, "576044797897": 7, "732409596443": 7, "15916161537": 7, "869497597218": 7, "733248019218": 7, "890803205967": 7, "677363204956": 7, "577215993404": 7, "730982398987": 7, "58035838604": 7, "10066559315": 7, "837804794312": 7, "691385602951": 7, "851040017605": 7, "666656005383": 7, "560505592823": 7, "771103990078": 7, "626163220406": 7, "694451200962": 7, "11514236927": 7, "837299215794": 7, "703302407265": 7, "806828796864": 7, "648620784283": 7, "562521612644": 7, "760915207863": 7, "605760002136": 7, "690009605885": 7, "10740480423": 7, "841631996632": 7, "700883197784": 7, "838195204735": 7, "649779188633": 7, "56585599184": 7, "7168192029": 7, "59088640213": 7, "69627519846": 7, "3269824028": 7, "02665598392": 7, "840908801556": 7, "03752319813": 7, "788345599174": 7, "662041604519": 7, "85437438488": 7, "680422389507": 7, "0759360075": 7, "801996803284": 7, "666003203392": 7, "808000004292": 7, "643359994888": 7, "544691193104": 7, "741964805126": 7, "60942081213": 7, "681350398064": 7, "05262081623": 7, "792108798027": 7, "66344319582": 7, "768064010143": 7, "625260794163": 7, "540352010727": 7, "721862399578": 7, "579411196709": 7, "626976013184": 7, "06332798004": 7, "808211183548": 7, "679372787476": 7, "803718411922": 7, "627136015892": 7, "538227200508": 7, "682188808918": 7, "573836791515": 7, "725548803806": 7, "13023357391": 7, "843411195278": 7, "713843202591": 7, "85886080265": 7, "657920002937": 7, "565254402161": 7, "697094392776": 7, "579904007912": 7, "07484800816": 7, "801119995117": 7, "667347204685": 7, "799059200287": 7, "643820810318": 7, "542937588692": 7, "740518403053": 7, "615148806572": 7, "731334400177": 7, "07002239227": 7, "805299210548": 7, "675923216343": 7, "782060790062": 7, "631142401695": 7, "540383994579": 7, "723999989033": 7, "578681600094": 7, "726335990429": 7, "13297917843": 7, "844428789616": 7, "710278391838": 7, "835494399071": 7, "637958395481": 7, "567417597771": 7, "699366402626": 7, "588492810726": 7, "tri": [7, 8, 9, 18], "grow": [7, 8, 9], "quickli": [7, 8, 9], "went": [7, 8, 9, 11], "72": [7, 8, 9], "26": [7, 8, 9], "32x2": [7, 8, 9], "64x4": [7, 8, 9], "four": [7, 8, 9], "best_tim": [7, 8], "min": [7, 8], "05": [7, 8], "join": [7, 8], "nice": [7, 8], "stdout": [7, 8], "why": [7, 8, 12, 16], "easili": [7, 8, 17], "easi": [7, 8, 16, 17, 22], "csv": [7, 8, 10], "analysi": [7, 8], "panda": [7, 8, 10, 14], "18": [7, 8, 9], "fp": [7, 8], "datafram": [7, 8], "df": [7, 8], "to_csv": [7, 8], "0x2aab1de088d0": 8, "01": 8, "sy": 8, "140": 8, "wall": 8, "98": 8, "__kernel": 8, "get_group_id": 8, "get_local_id": 8, "cl": 8, "ctx": 8, "create_some_context": 8, "mf": 8, "mem_flag": 8, "a_h": 8, "a_d": 8, "read_writ": 8, "copy_host_ptr": 8, "hostbuf": 8, "b_d": 8, "kernel_src": 8, "prg": 8, "queue": 8, "commandqueu": 8, "run_gpu": 8, "444": 8, "154": 8, "598": 8, "985": 8, "enqueue_copi": 8, "1748096": 8, "7284544": 8, "7707904": 8, "8573184": 8, "8380288": 8, "686528": 8, "69648": 8, "7461632": 8, "818304": 8, "771072": 8, "7190464": 8, "7522432": 8, "7982208": 8, "9624512": 8, "7214464": 8, "7453312": 8, "8028416": 8, "8922624": 8, "747328": 8, "7860736": 8, "8637184": 8, "__local": 8, "barrier": 8, "clk_local_mem_f": 8, "8449472": 8, "1912576": 8, "1035136": 8, "0927808": 8, "1140736": 8, "1790336": 8, "0808192": 8, "0809792": 8, "0836928": 8, "1545856": 8, "1249984": 8, "1264": 8, "1230336": 8, "4015104": 8, "0873216": 8, "0626496": 8, "0692224": 8, "140192": 8, "0801344": 8, "0688128": 8, "1428928": 8, "8844544": 8, "3245952": 8, "0911808": 8, "3039616": 8, "0079296": 8, "84848": 8, "0708288": 8, "857728": 8, "7561792": 8, "231072": 8, "8774336": 8, "7087296": 8, "8772672": 8, "6911872": 8, "5715968": 8, "7584896": 8, "6292032": 8, "6498688": 8, "1145664": 8, "8252928": 8, "6757568": 8, "7881152": 8, "6237696": 8, "544224": 8, "6951168": 8, "5648128": 8, "6452736": 8, "1065792": 8, "8313792": 8, "6905984": 8, "8302656": 8, "6367488": 8, "5478592": 8, "6660672": 8, "5719744": 8, "6551744": 8, "1384064": 8, "8531072": 8, "7078976": 8, "8516672": 8, "6677696": 8, "5685632": 8, "7074048": 8, "5753152": 8, "8228864": 8, "2124736": 8, "8633344": 8, "6921216": 8, "8896384": 8, "6659904": 8, "5582144": 8, "7522624": 8, "6081536": 8, "6664448": 8, "1095936": 8, "8063424": 8, "6717888": 8, "7982848": 8, "6263552": 8, "5289728": 8, "7008832": 8, "567456": 8, "5968704": 8, "1018432": 8, "8117248": 8, "6724736": 8, "7728576": 8, "6038336": 8, "5172352": 8, "6796352": 8, "5470016": 8, "5968448": 8, "1107712": 8, "8237248": 8, "6810944": 8, "821952": 8, "620352": 8, "5230208": 8, "6415552": 8, "5476864": 8, "7168192": 8, "1942016": 8, "8626304": 8, "7099712": 8, "9123328": 8, "6608448": 8, "5631168": 8, "7113024": 8, "556576": 8, "1583104": 8, "8384832": 8, "67856": 8, "845856": 8, "6581248": 8, "54944": 8, "7520064": 8, "6076224": 8, "6842112": 8, "1547072": 8, "8422016": 8, "6895552": 8, "8037312": 8, "6387072": 8, "5383296": 8, "7326656": 8, "5863488": 8, "6813376": 8, "1493952": 8, "8444928": 8, "6929216": 8, "832768": 8, "6389312": 8, "5412672": 8, "698336": 8, "5717568": 8, "676096": 8, "4303104": 8, "0341696": 8, "8365184": 8, "0398656": 8, "7786496": 8, "648928": 8, "8479232": 8, "6508544": 8, "1219392": 8, "7994048": 8, "6492288": 8, "8068416": 8, "6343168": 8, "5235328": 8, "7268928": 8, "5898432": 8, "6633536": 8, "0849664": 8, "7869632": 8, "6458624": 8, "7611968": 8, "613088": 8, "50912": 8, "6972928": 8, "5620608": 8, "601856": 8, "095232": 8, "7967488": 8, "6601472": 8, "7952896": 8, "6047296": 8, "5108224": 8, "6607744": 8, "5492416": 8, "7091136": 8, "171552": 8, "8473408": 8, "6962112": 8, "8663936": 8, "6466816": 8, "5475584": 8, "6754048": 8, "5591744": 8, "108896": 8, "7907264": 8, "6459328": 8, "7965888": 8, "6250816": 8, "5188416": 8, "721408": 8, "5920832": 8, "7068608": 8, "0909248": 8, "7930752": 8, "6524544": 8, "7745216": 8, "6146176": 8, "5116928": 8, "6975872": 8, "5548416": 8, "7075136": 8, "174624": 8, "8384512": 8, "69104": 8, "8335488": 8, "6264192": 8, "5445248": 8, "6719104": 8, "5592064": 8, "19": [8, 9], "solv": 9, "0x7f888f8cd7b8": 9, "4152": 9, "086019515991": 9, "0x7f8865b51f28": 9, "gpuarrai": [9, 11], "tool": [9, 11, 13], "autoinit": [9, 11], "to_gpu": [9, 11], "mod": [9, 11], "t0": [9, 11], "ona": 9, "33": 9, "46109390258789": 9, "0x7f8858b873c8": 9, "1080": [9, 11], "916985595226": 9, "489004802704": 9, "500524806976": 9, "513356792927": 9, "545715200901": 9, "486515200138": 9, "449055999517": 9, "44974719882": 9, "457427197695": 9, "492915201187": 9, "464863997698": 9, "466118401289": 9, "475264000893": 9, "513632011414": 9, "458412796259": 9, "457715201378": 9, "461017608643": 9, "475987195969": 9, "460032004118": 9, "457779198885": 9, "462649595737": 9, "kernel_string_shar": 9, "22673916817": 9, "826361596584": 9, "793516802788": 9, "782112002373": 9, "776639997959": 9, "795135998726": 9, "722777605057": 9, "762777590752": 9, "75422719717": 9, "804876792431": 9, "778656005859": 9, "769734406471": 9, "782495999336": 9, "932281601429": 9, "734028804302": 9, "721625590324": 9, "736511993408": 9, "800019192696": 9, "724966406822": 9, "722969603539": 9, "759430396557": 9, "kernel_string_til": 9, "22200961113": 9, "91601279974": 9, "752838408947": 9, "873651194572": 9, "69833599329": 9, "586931192875": 9, "516473591328": 9, "411392003298": 9, "384262400866": 9, "82159358263": 9, "632607996464": 9, "506457602978": 9, "618758392334": 9, "500288009644": 9, "429862397909": 9, "44995200038": 9, "366150397062": 9, "342201602459": 9, "793542397022": 9, "58026239872": 9, "494163197279": 9, "546316814423": 9, "467059195042": 9, "404249596596": 9, "440895992517": 9, "341376006603": 9, "339692795277": 9, "783923208714": 9, "597920000553": 9, "50277120471": 9, "615475213528": 9, "470937597752": 9, "418393599987": 9, "443519997597": 9, "343961596489": 9, "342540800571": 9, "780352008343": 9, "611705589294": 9, "515667212009": 9, "622534394264": 9, "502195191383": 9, "437388807535": 9, "45568639636": 9, "359289598465": 9, "426995199919": 9, "788947200775": 9, "616556799412": 9, "496121603251": 9, "629164803028": 9, "474841600657": 9, "407667201757": 9, "47406719923": 9, "371507203579": 9, "352531200647": 9, "72023679018": 9, "574816000462": 9, "481817597151": 9, "580928003788": 9, "455724793673": 9, "394975996017": 9, "464659202099": 9, "357107198238": 9, "324083191156": 9, "759910392761": 9, "569177603722": 9, "481279999018": 9, "528115200996": 9, "441734397411": 9, "393126398325": 9, "455404800177": 9, "350457596779": 9, "322547197342": 9, "754201591015": 9, "579827189445": 9, "491852802038": 9, "582751989365": 9, "451283198595": 9, "391807991266": 9, "456275194883": 9, "356716805696": 9, "362937599421": 9, "809894394875": 9, "60433280468": 9, "507142400742": 9, "655827200413": 9, "474092799425": 9, "408166396618": 9, "480531209707": 9, "346707201004": 9, "780134403706": 9, "601049602032": 9, "493900799751": 9, "620384001732": 9, "494553589821": 9, "425414395332": 9, "467033600807": 9, "375468802452": 9, "346079999208": 9, "771052801609": 9, "593977594376": 9, "49723520875": 9, "583270406723": 9, "478079998493": 9, "416320002079": 9, "443942397833": 9, "359744000435": 9, "343545603752": 9, "780960011482": 9, "598758399487": 9, "498617601395": 9, "57678719759": 9, "46561280489": 9, "41324160099": 9, "431225597858": 9, "351263999939": 9, "34440960288": 9, "933260798454": 9, "715257608891": 9, "586604809761": 9, "711615991592": 9, "558771193027": 9, "466284793615": 9, "44043520093": 9, "361823999882": 9, "731839990616": 9, "57044479847": 9, "470220798254": 9, "608800005913": 9, "472665601969": 9, "416352003813": 9, "481376004219": 9, "380812799931": 9, "351923197508": 9, "719257593155": 9, "55171200037": 9, "466758400202": 9, "568435204029": 9, "459654402733": 9, "394380801916": 9, "463052803278": 9, "36409599781": 9, "328998398781": 9, "73579518795": 9, "564575994015": 9, "472236800194": 9, "549024009705": 9, "438406395912": 9, "389945602417": 9, "455193603039": 9, "364051198959": 9, "375519996881": 9, "798195195198": 9, "588998401165": 9, "49552000761": 9, "595462405682": 9, "460972803831": 9, "400672000647": 9, "465132802725": 9, "364627194405": 9, "729363203049": 9, "558815991879": 9, "466655993462": 9, "600819194317": 9, "460281592607": 9, "404908800125": 9, "478739196062": 9, "386668801308": 9, "385510402918": 9, "720915210247": 9, "550668799877": 9, "466937589645": 9, "564921605587": 9, "447974395752": 9, "394271999598": 9, "46233600378": 9, "365190398693": 9, "387827193737": 9, "762003195286": 9, "579007995129": 9, "486649608612": 9, "557331204414": 9, "443033593893": 9, "396070402861": 9, "457075202465": 9, "369555193186": 9, "wish": 9, "modifi": [9, 17], "tile_size_j": 9, "fixed_param": [9, 11], "ceil": [9, 11], "zip": [9, 11], "transfer": [9, 10, 12], "20": [9, 18], "21": 9, "618": 9, "2231903076172": 9, "0x7f887c3d2358": 9, "incorpor": 9, "ifndef": 9, "kerenel": 9, "psedo": 9, "endif": 9, "bypass": 9, "usecas": 10, "test_vector_add": 10, "test_vector_add_parameter": 10, "illustr": 10, "dimension": [10, 11, 22], "clean": [10, 15], "center": [10, 11], "lock": [10, 17], "overlap": [10, 12], "shuffl": 10, "pipelin": 10, "consist": [10, 15, 22], "scipi": 10, "algorithm": [10, 13, 18, 22], "cub": 10, "gaussian": 11, "delv": 11, "hand": [11, 15], "sum_": 11, "exp": 11, "beta": [11, 18], "sqrt": 11, "y_i": 11, "z_i": 11, "vector": [11, 12, 19], "coordin": 11, "linalg": 11, "la": 11, "compute_grid": 11, "xgrid": 11, "ygrid": 11, "zgrid": 11, "x0": 11, "y0": 11, "z0": 11, "themselv": 11, "meshgrid": 11, "send": 11, "interv": 11, "256": [11, 13, 19], "suffici": [11, 16], "100": [11, 18, 22], "randomli": [11, 18], "distribut": [11, 15], "linspac": 11, "cpu_grid": 11, "npt": 11, "rand": 11, "xyz": [11, 22], "52320": 11, "160627": 11, "might": [11, 16], "nz": 11, "bz": 11, "kernel_cod": 11, "math": 11, "__host__": 11, "__device__": [11, 21], "b": [11, 13, 15, 18, 19, 21], "addgrid": 11, "xvect": 11, "yvect": 11, "zvect": 11, "dx": 11, "dy": 11, "dz": 11, "assign": 11, "explor": 11, "middl": 11, "henc": [11, 20], "mention": 11, "56833920479": 11, "80796158314": 11, "940044796467": 11, "855628800392": 11, "855359995365": 11, "16174077988": 11, "11877760887": 11, "01592960358": 11, "849273598194": 11, "849235200882": 11, "19029750824": 11, "16199679375": 11, "40401918888": 11, "39618558884": 11, "39508478642": 11, "31647996902": 11, "31470079422": 11, "50787198544": 11, "53760001659": 11, "56709756851": 11, "34500494003": 11, "25130877495": 11, "50662400723": 11, "55267841816": 11, "17987194061": 11, "12309756279": 11, "01125121117": 11, "849631989002": 11, "853708791733": 11, "17051515579": 11, "15584001541": 11, "40074241161": 11, "39547519684": 11, "39331197739": 11, "30295038223": 11, "28725762367": 11, "39589118958": 11, "38867840767": 11, "37724158764": 11, "34344320297": 11, "26213116646": 11, "38793599606": 11, "3775359869": 11, "74003200531": 11, "13276162148": 11, "37233917713": 11, "18835201263": 11, "15777277946": 11, "40247042179": 11, "39366400242": 11, "39439997673": 11, "23719043732": 11, "28542718887": 11, "39207677841": 11, "38956804276": 11, "3778496027": 11, "29814395905": 11, "26398081779": 11, "38625922203": 11, "3754431963": 11, "72981758118": 11, "12483196259": 11, "37322881222": 11, "61618566513": 11, "2194111824": 11, "17600002289": 11, "27082881927": 11, "38787200451": 11, "3835711956": 11, "37543039322": 11, "30227203369": 11, "23127679825": 11, "38627202511": 11, "37677440643": 11, "64358406067": 11, "12255358696": 11, "37474560738": 11, "61655673981": 11, "19179515839": 11, "99912958145": 11, "213971138": 11, "16430072784": 11, "38772480488": 11, "3735104084": 11, "54432649612": 11, "05524477959": 11, "36935677528": 11, "42449922562": 11, "10455036163": 11, "67516155243": 11, "programmat": 11, "30": 11, "minimum": 11, "84": 11, "suit": [11, 22], "grid_dim": 11, "associ": 11, "substitut": 11, "ourselv": 11, "extract": 11, "manual": [11, 14], "exlicitli": 11, "accur": [11, 17], "xgpu": 11, "ygpu": 11, "zgpu": 11, "grid_gpu": 11, "80": 11, "133200": 11, "lower": [11, 17, 18], "roughli": [11, 15], "40000": 11, "across": [12, 15], "qualiti": 12, "itself": [12, 13, 22], "precis": 12, "plain": 12, "omp_get_wtim": 12, "openmp": 12, "convolution_stream": 12, "complex": [12, 15], "behind": 12, "spread": 12, "back": [12, 22], "split": 12, "chunk": 12, "slightli": [12, 15, 21], "account": [12, 15], "border": [12, 22], "latter": 12, "cudastreamwaitev": 12, "num_stream": 12, "clarifi": 12, "fit": [12, 18], "choic": [12, 14], "grid_size_x": 12, "grid_size_i": 12, "cudamemcpytosymbol": 12, "upload": 12, "yourself": [12, 22], "spent": [12, 22], "relat": [13, 16, 23], "famili": 13, "launcher": 13, "kt": [13, 20], "easiest": 13, "toolkit": [13, 14], "intend": 13, "Or": [13, 14], "vector_add": [13, 18, 19, 21], "10000000": 13, "512": [13, 19], "research": 13, "cite": 13, "articl": [13, 19], "author": 13, "ben": 13, "van": 13, "werkhoven": 13, "titl": 13, "auto": [13, 15, 17, 18, 21, 22, 23], "journal": 13, "year": 13, "2019": 13, "volum": 13, "90": 13, "347": 13, "358": 13, "url": 13, "www": 13, "sciencedirect": 13, "scienc": 13, "pii": 13, "s0167739x18313359": 13, "doi": 13, "1016": 13, "2018": 13, "08": 13, "004": 13, "willemsen2021bayesian": 13, "willemsen": [13, 18], "flori": 13, "jan": 13, "nieuwpoort": 13, "rob": 13, "bayesian": [13, 18, 22], "workshop": 13, "pmb": 13, "supercomput": 13, "sc21": 13, "2021": 13, "arxiv": 13, "ab": 13, "2111": 13, "14991": 13, "schoonhoven2022benchmark": 13, "schoonhoven": 13, "richard": 13, "batenburg": 13, "joost": 13, "ieee": 13, "transact": 13, "evolutionari": 13, "2022": 13, "schoonhoven2022go": 13, "veenboer": 13, "bram": 13, "green": 13, "effici": [13, 15, 17], "steer": 13, "sc22": 13, "2211": 13, "07260": 13, "comprehens": 14, "recommend": [14, 20], "download": 14, "repo": 14, "continuum": 14, "io": 14, "miniconda3": 14, "x86_64": 14, "sh": 14, "newer": [14, 17], "nativ": 14, "prefix": 14, "home": 14, "pythonpath": 14, "bind": [14, 17], "older": 14, "troubl": 14, "retri": 14, "dir": 14, "wiki": 14, "tiker": 14, "net": 14, "amd": [14, 17], "app": 14, "sdk": 14, "intel": 14, "appl": 14, "beignet": 14, "stack": 14, "altern": [14, 22], "navig": 14, "benvanwerkhoven": 14, "differenti": [14, 18, 22], "chanc": [14, 18, 21], "algebra": 15, "frequent": 15, "programm": [15, 17], "row": 15, "column": 15, "squar": 15, "matric": 15, "matmul_na": 15, "width": 15, "matmul_kernel": 15, "height": 15, "Of": 15, "solut": [15, 17], "realiti": 15, "contant": 15, "denot": [15, 19, 22], "sensibl": 15, "pick": 15, "word": 15, "warpsiz": 15, "namelijk": 15, "stand": 15, "briefli": 15, "figur": 15, "fifth": 15, "fourth": 15, "dramat": 15, "profil": 15, "pretti": 15, "opportun": 15, "realiz": 15, "collabor": 15, "bandwidth": 15, "techniqu": 15, "submatric": 15, "proce": 15, "matmul_shar": 15, "sa": 15, "sb": 15, "kb": 15, "outer": 15, "inner": 15, "race": 15, "drastic": 15, "consumpt": [15, 17], "due": [15, 21, 22], "significantli": [15, 17], "fortun": 15, "benefit": 15, "redund": 15, "distinct": 15, "1xn": 15, "usag": [15, 17], "occup": 15, "goe": 15, "down": 15, "matmul": 15, "newli": 15, "coupl": 15, "respect": [15, 17], "independ": 15, "yield": 15, "discontinu": 15, "room": 15, "impos": 15, "report": [16, 17, 22, 23], "possibli": [16, 22], "_flop": 16, "total_flop": 16, "ps_energi": [16, 17, 23], "occur": [16, 22], "exhaust": 16, "brute": [16, 18, 19], "forc": [16, 18, 19, 21], "maxim": [16, 22], "boolean": [16, 17, 22], "facilit": 17, "layer": 17, "act": 17, "hook": 17, "pattern": 17, "subscrib": 17, "benchmarkobserv": 17, "overwritten": [17, 22], "extend": 17, "mandatori": 17, "get_result": 17, "aggreg": 17, "after_finish": 17, "after_start": 17, "before_start": 17, "register_configur": 17, "register_devic": 17, "variou": [17, 19], "registerobserv": 17, "track": 17, "num_reg": 17, "current_modul": 17, "powersensor2": 17, "pcie": 17, "intercept": 17, "sensor": 17, "transmit": 17, "usb": 17, "connect": 17, "advantag": 17, "instantan": 17, "frequenc": 17, "khz": 17, "pybind11": 17, "powersensor": [17, 23], "ps_power": [17, 23], "joul": [17, 23], "watt": [17, 23], "ttyacm0": 17, "core": 17, "voltag": 17, "thin": 17, "wrapper": [17, 21], "intricaci": 17, "friendli": 17, "mode": 17, "repeatedli": 17, "downsid": 17, "approach": 17, "save_al": 17, "nvidia_smi_fallback": 17, "use_locked_clock": 17, "continous_dur": 17, "monitor": 17, "clock": [17, 23], "power_read": [17, 23], "nvml_power": [17, 23], "nvml_energi": [17, 23], "core_freq": [17, 23], "mem_freq": [17, 23], "gr_voltag": 17, "ordin": 17, "identifi": 17, "smi": 17, "root": 17, "opt": 17, "amper": 17, "continuous_dur": 17, "common": [17, 21], "cap": 17, "popular": 17, "nvml_gr_clock": [17, 23], "nvml_mem_clock": [17, 23], "nvml_pwr_limit": [17, 23], "graphic": [17, 23], "jetson": 17, "rapl": 17, "xilinx": 17, "pmt": 17, "astron": 17, "nl": 17, "rd": 17, "meter": 17, "arduino": 17, "_energi": 17, "_power": 17, "acceler": 18, "prohibit": 18, "slow": 18, "wast": 18, "basin": [18, 22], "hop": [18, 22], "dual": [18, 22], "anneal": [18, 22], "evolut": [18, 22], "firefli": [18, 22], "genet": [18, 22], "greedi": [18, 22], "local": [18, 22], "multi": [18, 22], "particl": [18, 22], "swarm": [18, 22], "mechan": 18, "overrid": 18, "time_limit": [18, 22], "uniqu": [18, 22], "count": 18, "searchspac": 18, "runner": 18, "nelder": 18, "mead": 18, "powel": 18, "cg": 18, "bfg": 18, "l": 18, "tnc": 18, "cobyla": 18, "slsqp": 18, "reject": 18, "thesi": 18, "generate_normalized_param_dict": 18, "denorm": 18, "normalize_parameter_spac": 18, "param_spac": 18, "prune_parameter_spac": 18, "normalize_dict": 18, "prune": 18, "hyperparamet": 18, "popul": 18, "best1bin": 18, "best1exp": 18, "rand1exp": 18, "randtobest1exp": 18, "best2exp": 18, "rand2exp": 18, "randtobest1bin": 18, "best2bin": 18, "rand2bin": 18, "rand1bin": 18, "popsiz": 18, "maxit": 18, "constr": 18, "compute_intens": 18, "fun": 18, "intens": 18, "distance_to": 18, "euclidian": 18, "move_toward": 18, "alpha": 18, "toward": 18, "b0": 18, "attract": 18, "gamma": 18, "light": 18, "absorpt": 18, "coeffici": 18, "disruptive_uniform_crossov": 18, "dna1": 18, "dna2": 18, "disrupt": 18, "uniform": 18, "crossov": 18, "uniformli": 18, "gene": 18, "children": 18, "guarante": 18, "parent": 18, "mutat": 18, "dna": 18, "mutation_ch": 18, "single_point_crossov": 18, "index": 18, "single_point": 18, "two_point": 18, "disruptive_uniform": 18, "two_point_crossov": 18, "uniform_crossov": 18, "weighted_choic": 18, "probabl": [18, 22], "il": 18, "neighbor": 18, "ham": 18, "adjac": 18, "greedy": 18, "soon": 18, "no_improv": 18, "exce": 18, "50": 18, "random_walk": 18, "hillclimb": 18, "travers": 18, "inertia": 18, "c1": 18, "cognit": 18, "c2": 18, "social": 18, "fraction": 18, "acceptance_prob": 18, "old_cost": 18, "new_cost": 18, "modif": [18, 20], "po": 18, "t_min": 18, "001": 18, "995": 18, "vector_add_kernel": 19, "wise": 19, "1000000": [19, 21], "recogn": 19, "alright": 19, "portabl": 20, "stick": 20, "pointer": 20, "primit": 20, "lead": 20, "ineffici": 20, "situat": 20, "scientif": 20, "sens": 20, "experiment": 20, "pack": 20, "consult": 20, "create_receive_spec_struct": 20, "0l": 20, "pad": 20, "8byte": 20, "packstr": 20, "iiiiiiiiiiippi": 20, "fffi": 20, "nsampl": 20, "nsamplesiq": 20, "nslowtimesampl": 20, "nchannel": 20, "ntx": 20, "nrepeat": 20, "nfasttimesampl": 20, "rfsize": 20, "mnrow": 20, "mnrowsiq": 20, "nactivechannel": 20, "isiq": 20, "fsiq": 20, "fc": 20, "nbuffer": 20, "frombuff": 20, "len": 20, "receive_spec": 20, "bf": 20, "rf": 20, "recon": 20, "length": 20, "slight": 20, "matlab": 21, "typenam": 21, "my_typ": 21, "regardless": 21, "demot": 21, "rewrit": 21, "real": 21, "risk": 21, "seper": 21, "grid_div_z": 22, "06": 22, "log": 22, "auxilliari": 22, "safer": 22, "notat": 22, "divison": 22, "treat": 22, "warp": 22, "empti": 22, "kepler": 22, "plu": 22, "filter_mod": 22, "address_mod": 22, "clamp": 22, "mirror": 22, "axi": 22, "normalized_coordin": 22, "emtpi": 22, "get_local_s": 22, "satisfi": 22, "000001": 22, "ref": 22, "basinhop": 22, "bayes_opt": 22, "diff_evo": 22, "firefly_algorithm": 22, "genetic_algorithm": 22, "greedy_il": 22, "greedy_ml": 22, "ml": 22, "ordered_greedy_ml": 22, "pso": 22, "simulated_ann": 22, "sort": 22, "resourc": 22, "persist": 22, "consol": 22, "info": 22, "summar": 22, "store_result": 22, "results_filenam": 22, "typicali": 22, "percentag": 22, "create_device_target": 22, "header_filenam": 22, "target": 22, "dtarget_gpu": 22, "name_of_gpu": 22, "chosen": 22, "block_size_": 23, "grid_size_": 23, "compiler_opt_": 23, "loop_unroll_factor_": 23, "nvml_": 23, "nvml": 23, "nvmlobserv": 23}, "objects": {"kernel_tuner.backends.compiler": [[6, 0, 1, "", "CompilerFunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "cleanup_lib"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.cupy": [[6, 0, 1, "", "CupyFunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.hip": [[6, 0, 1, "", "HipFunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.nvcuda": [[6, 0, 1, "", "CudaFunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.opencl": [[6, 0, 1, "", "OpenCLFunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.pycuda": [[6, 0, 1, "", "PyCudaFunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.core": [[6, 0, 1, "", "DeviceInterface"]], "kernel_tuner.core.DeviceInterface": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "benchmark"], [6, 1, 1, "", "benchmark_continuous"], [6, 1, 1, "", "benchmark_default"], [6, 1, 1, "", "check_kernel_output"], [6, 1, 1, "", "compile_kernel"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "create_kernel_instance"], [6, 1, 1, "", "get_environment"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "preprocess_gpu_arguments"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"]], "kernel_tuner": [[22, 2, 1, "", "create_device_targets"], [22, 2, 1, "", "run_kernel"], [22, 2, 1, "", "store_results"], [22, 2, 1, "", "tune_kernel"], [6, 3, 0, "-", "util"]], "kernel_tuner.observers": [[17, 0, 1, "", "BenchmarkObserver"]], "kernel_tuner.observers.BenchmarkObserver": [[17, 1, 1, "", "after_finish"], [17, 1, 1, "", "after_start"], [17, 1, 1, "", "before_start"], [17, 1, 1, "", "during"], [17, 1, 1, "", "get_results"], [17, 1, 1, "", "register_configuration"], [17, 1, 1, "", "register_device"]], "kernel_tuner.observers.nvml": [[17, 0, 1, "", "NVMLObserver"]], "kernel_tuner.observers.pmt": [[17, 0, 1, "", "PMTObserver"]], "kernel_tuner.observers.powersensor": [[17, 0, 1, "", "PowerSensorObserver"]], "kernel_tuner.runners.sequential": [[6, 0, 1, "", "SequentialRunner"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.runners.simulation": [[6, 0, 1, "", "SimulationRunner"]], "kernel_tuner.runners.simulation.SimulationRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.strategies": [[18, 3, 0, "-", "basinhopping"], [18, 3, 0, "-", "bayes_opt"], [18, 3, 0, "-", "brute_force"], [6, 3, 0, "-", "common"], [18, 3, 0, "-", "diff_evo"], [18, 3, 0, "-", "dual_annealing"], [18, 3, 0, "-", "firefly_algorithm"], [18, 3, 0, "-", "genetic_algorithm"], [18, 3, 0, "-", "greedy_ils"], [18, 3, 0, "-", "greedy_mls"], [18, 3, 0, "-", "minimize"], [18, 3, 0, "-", "mls"], [18, 3, 0, "-", "ordered_greedy_mls"], [18, 3, 0, "-", "pso"], [18, 3, 0, "-", "random_sample"], [18, 3, 0, "-", "simulated_annealing"]], "kernel_tuner.strategies.basinhopping": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.bayes_opt": [[18, 2, 1, "", "generate_normalized_param_dicts"], [18, 2, 1, "", "normalize_parameter_space"], [18, 2, 1, "", "prune_parameter_space"], [18, 2, 1, "", "tune"]], "kernel_tuner.strategies.brute_force": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.common": [[6, 2, 1, "", "get_options"], [6, 2, 1, "", "get_strategy_docstring"], [6, 2, 1, "", "make_strategy_options_doc"], [6, 2, 1, "", "scale_from_params"], [6, 2, 1, "", "setup_method_arguments"], [6, 2, 1, "", "setup_method_options"], [6, 2, 1, "", "snap_to_nearest_config"], [6, 2, 1, "", "unscale_and_snap_to_nearest"]], "kernel_tuner.strategies.diff_evo": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.dual_annealing": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm": [[18, 0, 1, "", "Firefly"], [18, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm.Firefly": [[18, 1, 1, "", "compute_intensity"], [18, 1, 1, "", "distance_to"], [18, 1, 1, "", "move_towards"]], "kernel_tuner.strategies.genetic_algorithm": [[18, 2, 1, "", "disruptive_uniform_crossover"], [18, 2, 1, "", "mutate"], [18, 2, 1, "", "single_point_crossover"], [18, 2, 1, "", "tune"], [18, 2, 1, "", "two_point_crossover"], [18, 2, 1, "", "uniform_crossover"], [18, 2, 1, "", "weighted_choice"]], "kernel_tuner.strategies.greedy_ils": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.greedy_mls": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.minimize": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.mls": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.ordered_greedy_mls": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.pso": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.random_sample": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.simulated_annealing": [[18, 2, 1, "", "acceptance_prob"], [18, 2, 1, "", "neighbor"], [18, 2, 1, "", "tune"]], "kernel_tuner.util": [[6, 0, 1, "", "CompilationFailedConfig"], [6, 0, 1, "", "ErrorConfig"], [6, 0, 1, "", "InvalidConfig"], [6, 0, 1, "", "NpEncoder"], [6, 0, 1, "", "RuntimeFailedConfig"], [6, 4, 1, "", "SkippableFailure"], [6, 4, 1, "", "StopCriterionReached"], [6, 2, 1, "", "check_argument_list"], [6, 2, 1, "", "check_argument_type"], [6, 2, 1, "", "check_restrictions"], [6, 2, 1, "", "check_stop_criterion"], [6, 2, 1, "", "check_thread_block_dimensions"], [6, 2, 1, "", "check_tune_params_list"], [6, 2, 1, "", "compile_restrictions"], [6, 2, 1, "", "config_valid"], [6, 2, 1, "", "convert_constraint_restriction"], [6, 2, 1, "", "correct_open_cache"], [6, 2, 1, "", "cuda_error_check"], [6, 2, 1, "", "delete_temp_file"], [6, 2, 1, "", "detect_language"], [6, 2, 1, "", "dump_cache"], [6, 2, 1, "", "get_best_config"], [6, 2, 1, "", "get_config_string"], [6, 2, 1, "", "get_grid_dimensions"], [6, 2, 1, "", "get_instance_string"], [6, 2, 1, "", "get_kernel_string"], [6, 2, 1, "", "get_problem_size"], [6, 2, 1, "", "get_smem_args"], [6, 2, 1, "", "get_temp_filename"], [6, 2, 1, "", "get_thread_block_dimensions"], [6, 2, 1, "", "get_total_timings"], [6, 2, 1, "", "looks_like_a_filename"], [6, 2, 1, "", "normalize_verify_function"], [6, 2, 1, "", "parse_restrictions"], [6, 2, 1, "", "prepare_kernel_string"], [6, 2, 1, "", "print_config"], [6, 2, 1, "", "print_config_output"], [6, 2, 1, "", "process_cache"], [6, 2, 1, "", "process_metrics"], [6, 2, 1, "", "read_cache"], [6, 2, 1, "", "read_file"], [6, 2, 1, "", "replace_param_occurrences"], [6, 2, 1, "", "setup_block_and_grid"], [6, 2, 1, "", "store_cache"], [6, 2, 1, "", "write_file"]], "kernel_tuner.util.NpEncoder": [[6, 1, 1, "", "default"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function", "3": "py:module", "4": "py:exception"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"], "3": ["py", "module", "Python module"], "4": ["py", "exception", "Python exception"]}, "titleterms": {"backend": [0, 6, 14, 21], "cuda": [0, 14, 15], "featur": [0, 2], "support": 0, "usag": [0, 13], "compil": [0, 6], "cach": 1, "file": 1, "The": [2, 13], "kernel": [2, 7, 8, 9, 10, 11, 13, 15, 21], "tuner": [2, 7, 8, 9, 10, 11, 13], "document": [2, 3, 6, 13, 22], "guid": [2, 3, 14], "refer": 2, "contribut": 3, "report": 3, "issu": 3, "code": [3, 7, 8, 9, 10, 12], "develop": 3, "environ": 3, "local": [3, 8], "setup": 3, "cluster": 3, "run": [3, 9], "test": [3, 4], "build": 3, "convolut": [4, 10], "2d": 4, "exampl": [4, 10, 13, 21], "implement": [4, 7, 8, 9], "tune": [4, 7, 8, 9, 11, 12, 15, 16, 17], "more": 4, "tunabl": 4, "paramet": [4, 9, 11, 17, 23], "correct": 5, "verif": 5, "design": 6, "strategi": [6, 18], "kernel_tun": [6, 18], "common": 6, "runner": 6, "sequenti": 6, "sequentialrunn": 6, "simulationrunn": 6, "devic": 6, "interfac": 6, "core": 6, "deviceinterfac": 6, "pycuda": [6, 14], "pycudafunct": 6, "cupi": 6, "cupyfunct": 6, "nvcuda": 6, "cudafunct": 6, "opencl": [6, 14], "openclfunct": 6, "compilerfunct": 6, "hip": [6, 14], "hipfunct": 6, "util": 6, "function": 6, "diffus": [7, 8, 9], "python": [7, 8, 9, 14], "comput": [7, 8, 9], "gpu": [7, 8, 9, 11], "auto": [7, 8, 9], "us": [7, 8, 9, 11, 15, 20], "share": [7, 8, 9, 15], "memori": [7, 8, 9, 15], "tile": [7, 8, 9], "store": [7, 8], "result": [7, 8], "tutori": [8, 9], "from": [8, 9], "physic": [8, 9], "best": 9, "product": 9, "c": 9, "vector": 10, "add": 10, "stencil": 10, "matrix": [10, 15], "multipl": [10, 15], "py": 10, "sepconv": 10, "convolution_correct": 10, "convolution_stream": 10, "reduct": 10, "spars": 10, "point": 10, "polygon": 10, "expdist": 10, "gener": 10, "3d": 11, "grid": 11, "let": 11, "": 11, "start": [11, 19], "cpu": 11, "move": 11, "optim": [11, 18], "host": 12, "number": 12, "stream": 12, "quick": 13, "instal": [13, 14], "citat": 13, "packag": 14, "other": 14, "pyopencl": 14, "pyhip": 14, "git": 14, "version": 14, "depend": 14, "naiv": 15, "increas": 15, "work": 15, "per": 15, "thread": 15, "metric": 16, "object": 16, "observ": 17, "powersensorobserv": 17, "nvmlobserv": 17, "execut": 17, "nvml": 17, "pmtobserv": 17, "basinhop": 18, "bayes_opt": 18, "brute_forc": 18, "diff_evo": 18, "dual_ann": 18, "firefly_algorithm": 18, "genetic_algorithm": 18, "greedy_il": 18, "greedy_ml": 18, "minim": 18, "ml": 18, "ordered_greedy_ml": 18, "pso": 18, "random_sampl": 18, "simulated_ann": 18, "get": 19, "struct": 20, "templat": 21, "select": 21, "api": 22, "vocabulari": 23}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "nbsphinx": 4, "sphinx": 58}, "alltitles": {"Backends": [[0, "backends"]], "CUDA Backends": [[0, "cuda-backends"]], "Backend feature support": [[0, "id1"]], "Backend usage and compiler": [[0, "id2"]], "Cache files": [[1, "cache-files"]], "The Kernel Tuner documentation": [[2, "the-kernel-tuner-documentation"], [13, "the-kernel-tuner-documentation"]], "Kernel Tuner": [[2, null]], "Guides": [[2, null]], "Features": [[2, null]], "Reference": [[2, null]], "Contribution guide": [[3, "contribution-guide"]], "Reporting Issues": [[3, "reporting-issues"]], "Contributing Code": [[3, "contributing-code"]], "Development environment": [[3, "development-environment"]], "Local setup": [[3, "local-setup"]], "Cluster setup": [[3, "cluster-setup"]], "Running tests": [[3, "running-tests"]], "Building documentation": [[3, "building-documentation"]], "Convolution": [[4, "Convolution"], [10, "convolution"]], "2D Convolution example": [[4, "2D-Convolution-example"]], "Implement a test": [[4, "Implement-a-test"]], "Tuning 2D Convolution": [[4, "Tuning-2D-Convolution"]], "More tunable parameters": [[4, "More-tunable-parameters"]], "Correctness Verification": [[5, "correctness-verification"]], "Design documentation": [[6, "design-documentation"]], "Strategies": [[6, "strategies"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "Runners": [[6, "runners"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, "kernel-tuner-runners-sequential-sequentialrunner"]], "kernel_tuner.runners.sequential.SimulationRunner": [[6, "kernel-tuner-runners-sequential-simulationrunner"]], "Device Interfaces": [[6, "device-interfaces"]], "kernel_tuner.core.DeviceInterface": [[6, "kernel-tuner-core-deviceinterface"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, "kernel-tuner-backends-pycuda-pycudafunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, "kernel-tuner-backends-cupy-cupyfunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, "kernel-tuner-backends-nvcuda-cudafunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, "kernel-tuner-backends-opencl-openclfunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, "kernel-tuner-backends-compiler-compilerfunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, "kernel-tuner-backends-hip-hipfunctions"]], "Util Functions": [[6, "util-functions"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "Diffusion": [[7, "Diffusion"], [7, "id1"], [8, "Diffusion"], [9, "Diffusion"]], "Python implementation": [[7, "Python-implementation"], [8, "Python-implementation"], [9, "Python-implementation"]], "Computing on the GPU": [[7, "Computing-on-the-GPU"], [8, "Computing-on-the-GPU"], [9, "Computing-on-the-GPU"]], "Auto-Tuning with the Kernel Tuner": [[7, "Auto-Tuning-with-the-Kernel-Tuner"], [8, "Auto-Tuning-with-the-Kernel-Tuner"], [9, "Auto-Tuning-with-the-Kernel-Tuner"]], "Using Shared Memory": [[7, "Using-Shared-Memory"]], "Tiling GPU Code": [[7, "Tiling-GPU-Code"], [8, "Tiling-GPU-Code"], [9, "Tiling-GPU-Code"]], "Storing the results": [[7, "Storing-the-results"], [8, "Storing-the-results"]], "Tutorial: From physics to tuned GPU kernels": [[8, "Tutorial:-From-physics-to-tuned-GPU-kernels"], [9, "Tutorial:-From-physics-to-tuned-GPU-kernels"]], "Using Shared (local) Memory": [[8, "Using-Shared-(local)-Memory"]], "Using shared memory": [[9, "Using-shared-memory"], [15, "Using-shared-memory"]], "Using the best parameters in a production run": [[9, "Using-the-best-parameters-in-a-production-run"]], "Python run": [[9, "Python-run"]], "C run": [[9, "C-run"]], "Kernel Tuner Examples": [[10, "kernel-tuner-examples"]], "Vector Add": [[10, "vector-add"]], "Stencil": [[10, "stencil"]], "Matrix Multiplication": [[10, "matrix-multiplication"]], "convolution.py": [[10, "convolution-py"]], "sepconv.py": [[10, "sepconv-py"]], "convolution_correct.py": [[10, "convolution-correct-py"]], "convolution_streams.py": [[10, "convolution-streams-py"]], "Reduction": [[10, "reduction"]], "Sparse Matrix Vector Multiplication": [[10, "sparse-matrix-vector-multiplication"]], "Point-in-Polygon": [[10, "point-in-polygon"]], "ExpDist": [[10, "expdist"]], "Code Generator": [[10, "code-generator"]], "3D Grid on GPU with Kernel Tuner": [[11, "3D-Grid-on-GPU-with-Kernel-Tuner"]], "Let\u2019s start on the CPU": [[11, "Let's-start-on-the-CPU"]], "Let\u2019s move to the GPU": [[11, "Let's-move-to-the-GPU"]], "Tune the kernel": [[11, "Tune-the-kernel"]], "Using the optimized parameters": [[11, "Using-the-optimized-parameters"]], "Tuning Host Code": [[12, "tuning-host-code"]], "Tuning the number of streams": [[12, "tuning-the-number-of-streams"]], "Quick install": [[13, "quick-install"]], "Example usage": [[13, "example-usage"]], "Citation": [[13, "citation"]], "Installation": [[14, "installation"]], "Python": [[14, "python"]], "Installing Python Packages": [[14, "installing-python-packages"]], "CUDA and PyCUDA": [[14, "cuda-and-pycuda"]], "Other CUDA Backends": [[14, "other-cuda-backends"]], "OpenCL and PyOpenCL": [[14, "opencl-and-pyopencl"]], "HIP and PyHIP": [[14, "hip-and-pyhip"]], "Installing the git version": [[14, "installing-the-git-version"]], "Dependencies for the guides": [[14, "dependencies-for-the-guides"]], "Matrix multiplication": [[15, "Matrix-multiplication"]], "Naive CUDA kernel": [[15, "Naive-CUDA-kernel"]], "Tuning a naive kernel": [[15, "Tuning-a-naive-kernel"]], "Increase work per thread": [[15, "Increase-work-per-thread"]], "Metrics and Objectives": [[16, "metrics-and-objectives"]], "Metrics": [[16, "metrics"]], "Tuning Objectives": [[16, "tuning-objectives"]], "Observers": [[17, "observers"]], "PowerSensorObserver": [[17, "powersensorobserver"]], "NVMLObserver": [[17, "nvmlobserver"]], "Tuning execution parameters with NVML": [[17, "tuning-execution-parameters-with-nvml"]], "PMTObserver": [[17, "pmtobserver"]], "Optimization strategies": [[18, "optimization-strategies"]], "kernel_tuner.strategies.basinhopping": [[18, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[18, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[18, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[18, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[18, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[18, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[18, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[18, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[18, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[18, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[18, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[18, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[18, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[18, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[18, "module-kernel_tuner.strategies.simulated_annealing"]], "Getting Started": [[19, "getting-started"]], "Using structs": [[20, "using-structs"]], "Templated kernels": [[21, "templated-kernels"]], "Example": [[21, "example"]], "Selecting a backend": [[21, "selecting-a-backend"]], "API Documentation": [[22, "api-documentation"]], "Parameter Vocabulary": [[23, "parameter-vocabulary"]]}, "indexentries": {"compilationfailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.CompilationFailedConfig"]], "compilerfunctions (class in kernel_tuner.backends.compiler)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions"]], "cudafunctions (class in kernel_tuner.backends.nvcuda)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions"]], "cupyfunctions (class in kernel_tuner.backends.cupy)": [[6, "kernel_tuner.backends.cupy.CupyFunctions"]], "deviceinterface (class in kernel_tuner.core)": [[6, "kernel_tuner.core.DeviceInterface"]], "errorconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.ErrorConfig"]], "hipfunctions (class in kernel_tuner.backends.hip)": [[6, "kernel_tuner.backends.hip.HipFunctions"]], "invalidconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.InvalidConfig"]], "npencoder (class in kernel_tuner.util)": [[6, "kernel_tuner.util.NpEncoder"]], "openclfunctions (class in kernel_tuner.backends.opencl)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions"]], "pycudafunctions (class in kernel_tuner.backends.pycuda)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions"]], "runtimefailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.RuntimeFailedConfig"]], "sequentialrunner (class in kernel_tuner.runners.sequential)": [[6, "kernel_tuner.runners.sequential.SequentialRunner"]], "simulationrunner (class in kernel_tuner.runners.simulation)": [[6, "kernel_tuner.runners.simulation.SimulationRunner"]], "skippablefailure": [[6, "kernel_tuner.util.SkippableFailure"]], "stopcriterionreached": [[6, "kernel_tuner.util.StopCriterionReached"]], "__init__() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.__init__"]], "__init__() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.__init__"]], "__init__() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.__init__"]], "__init__() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.__init__"]], "__init__() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.__init__"]], "__init__() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.__init__"]], "__init__() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.__init__"]], "__init__() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.__init__"]], "__init__() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.__init__"]], "benchmark() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark"]], "benchmark_continuous() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_continuous"]], "benchmark_default() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_default"]], "check_argument_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_list"]], "check_argument_type() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_type"]], "check_kernel_output() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.check_kernel_output"]], "check_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_restrictions"]], "check_stop_criterion() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_stop_criterion"]], "check_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_thread_block_dimensions"]], "check_tune_params_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_tune_params_list"]], "cleanup_lib() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.cleanup_lib"]], "compile() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.compile"]], "compile() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.compile"]], "compile() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.compile"]], "compile() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.compile"]], "compile() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.compile"]], "compile() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.compile"]], "compile_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.compile_kernel"]], "compile_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.compile_restrictions"]], "config_valid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.config_valid"]], "convert_constraint_restriction() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.convert_constraint_restriction"]], "copy_constant_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_constant_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_shared_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_texture_memory_args"]], "correct_open_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.correct_open_cache"]], "create_kernel_instance() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.create_kernel_instance"]], "cuda_error_check() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.cuda_error_check"]], "default() (kernel_tuner.util.npencoder method)": [[6, "kernel_tuner.util.NpEncoder.default"]], "delete_temp_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.delete_temp_file"]], "detect_language() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.detect_language"]], "dump_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.dump_cache"]], "get_best_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_best_config"]], "get_config_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_config_string"]], "get_environment() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.get_environment"]], "get_grid_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_grid_dimensions"]], "get_instance_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_instance_string"]], "get_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_kernel_string"]], "get_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_options"]], "get_problem_size() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_problem_size"]], "get_smem_args() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_smem_args"]], "get_strategy_docstring() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_strategy_docstring"]], "get_temp_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_temp_filename"]], "get_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_thread_block_dimensions"]], "get_total_timings() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_total_timings"]], "kernel_finished() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.kernel_finished"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "looks_like_a_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.looks_like_a_filename"]], "make_strategy_options_doc() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.make_strategy_options_doc"]], "memcpy_dtoh() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.memcpy_dtoh"]], "memcpy_htod() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_htod"]], "memset() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memset"]], "memset() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memset"]], "memset() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memset"]], "memset() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memset"]], "memset() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memset"]], "memset() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memset"]], "module": [[6, "module-kernel_tuner.strategies.common"], [6, "module-kernel_tuner.util"], [18, "module-kernel_tuner.strategies.basinhopping"], [18, "module-kernel_tuner.strategies.bayes_opt"], [18, "module-kernel_tuner.strategies.brute_force"], [18, "module-kernel_tuner.strategies.diff_evo"], [18, "module-kernel_tuner.strategies.dual_annealing"], [18, "module-kernel_tuner.strategies.firefly_algorithm"], [18, "module-kernel_tuner.strategies.genetic_algorithm"], [18, "module-kernel_tuner.strategies.greedy_ils"], [18, "module-kernel_tuner.strategies.greedy_mls"], [18, "module-kernel_tuner.strategies.minimize"], [18, "module-kernel_tuner.strategies.mls"], [18, "module-kernel_tuner.strategies.ordered_greedy_mls"], [18, "module-kernel_tuner.strategies.pso"], [18, "module-kernel_tuner.strategies.random_sample"], [18, "module-kernel_tuner.strategies.simulated_annealing"]], "normalize_verify_function() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.normalize_verify_function"]], "parse_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.parse_restrictions"]], "prepare_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.prepare_kernel_string"]], "preprocess_gpu_arguments() (kernel_tuner.core.deviceinterface static method)": [[6, "kernel_tuner.core.DeviceInterface.preprocess_gpu_arguments"]], "print_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config"]], "print_config_output() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config_output"]], "process_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_cache"]], "process_metrics() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_metrics"]], "read_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_cache"]], "read_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_file"]], "ready_argument_list() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.ready_argument_list"]], "replace_param_occurrences() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.replace_param_occurrences"]], "run() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.run"]], "run() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.run"]], "run_kernel() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.run_kernel"]], "scale_from_params() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.scale_from_params"]], "setup_block_and_grid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.setup_block_and_grid"]], "setup_method_arguments() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_arguments"]], "setup_method_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_options"]], "snap_to_nearest_config() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.snap_to_nearest_config"]], "start_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.start_event"]], "start_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.start_event"]], "start_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.start_event"]], "start_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.start_event"]], "start_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.start_event"]], "start_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.start_event"]], "stop_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.stop_event"]], "store_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.store_cache"]], "synchronize() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.synchronize"]], "unscale_and_snap_to_nearest() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.unscale_and_snap_to_nearest"]], "write_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.write_file"]], "benchmarkobserver (class in kernel_tuner.observers)": [[17, "kernel_tuner.observers.BenchmarkObserver"]], "nvmlobserver (class in kernel_tuner.observers.nvml)": [[17, "kernel_tuner.observers.nvml.NVMLObserver"]], "pmtobserver (class in kernel_tuner.observers.pmt)": [[17, "kernel_tuner.observers.pmt.PMTObserver"]], "powersensorobserver (class in kernel_tuner.observers.powersensor)": [[17, "kernel_tuner.observers.powersensor.PowerSensorObserver"]], "after_finish() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.after_finish"]], "after_start() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.after_start"]], "before_start() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.before_start"]], "during() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.during"]], "get_results() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.get_results"]], "register_configuration() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.register_configuration"]], "register_device() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.register_device"]], "firefly (class in kernel_tuner.strategies.firefly_algorithm)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly"]], "acceptance_prob() (in module kernel_tuner.strategies.simulated_annealing)": [[18, "kernel_tuner.strategies.simulated_annealing.acceptance_prob"]], "compute_intensity() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly.compute_intensity"]], "disruptive_uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover"]], "distance_to() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly.distance_to"]], "generate_normalized_param_dicts() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts"]], "kernel_tuner.strategies.basinhopping": [[18, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[18, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[18, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[18, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[18, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[18, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[18, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[18, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[18, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[18, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[18, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[18, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[18, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[18, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[18, "module-kernel_tuner.strategies.simulated_annealing"]], "move_towards() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly.move_towards"]], "mutate() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.mutate"]], "neighbor() (in module kernel_tuner.strategies.simulated_annealing)": [[18, "kernel_tuner.strategies.simulated_annealing.neighbor"]], "normalize_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.normalize_parameter_space"]], "prune_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.prune_parameter_space"]], "single_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.single_point_crossover"]], "tune() (in module kernel_tuner.strategies.basinhopping)": [[18, "kernel_tuner.strategies.basinhopping.tune"]], "tune() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.tune"]], "tune() (in module kernel_tuner.strategies.brute_force)": [[18, "kernel_tuner.strategies.brute_force.tune"]], "tune() (in module kernel_tuner.strategies.diff_evo)": [[18, "kernel_tuner.strategies.diff_evo.tune"]], "tune() (in module kernel_tuner.strategies.dual_annealing)": [[18, "kernel_tuner.strategies.dual_annealing.tune"]], "tune() (in module kernel_tuner.strategies.firefly_algorithm)": [[18, "kernel_tuner.strategies.firefly_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.greedy_ils)": [[18, "kernel_tuner.strategies.greedy_ils.tune"]], "tune() (in module kernel_tuner.strategies.greedy_mls)": [[18, "kernel_tuner.strategies.greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.minimize)": [[18, "kernel_tuner.strategies.minimize.tune"]], "tune() (in module kernel_tuner.strategies.mls)": [[18, "kernel_tuner.strategies.mls.tune"]], "tune() (in module kernel_tuner.strategies.ordered_greedy_mls)": [[18, "kernel_tuner.strategies.ordered_greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.pso)": [[18, "kernel_tuner.strategies.pso.tune"]], "tune() (in module kernel_tuner.strategies.random_sample)": [[18, "kernel_tuner.strategies.random_sample.tune"]], "tune() (in module kernel_tuner.strategies.simulated_annealing)": [[18, "kernel_tuner.strategies.simulated_annealing.tune"]], "two_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.two_point_crossover"]], "uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.uniform_crossover"]], "weighted_choice() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.weighted_choice"]], "create_device_targets() (in module kernel_tuner)": [[22, "kernel_tuner.create_device_targets"]], "run_kernel() (in module kernel_tuner)": [[22, "kernel_tuner.run_kernel"]], "store_results() (in module kernel_tuner)": [[22, "kernel_tuner.store_results"]], "tune_kernel() (in module kernel_tuner)": [[22, "kernel_tuner.tune_kernel"]]}}) \ No newline at end of file