⚠️🚧 This site is currently under construction. Documentation is actively being written and expected to be released along with the next GoMLX release v0.28.0. 🚧⚠️

DotGeneral Performance

This document is generated by running the TestDotGeneral_PerformanceTable test. To run it, from the root of the repository, run (for the simplego backend):

```bash
GOMLX_BACKEND=go go test -tags=perf ./backends/simplego/ \
    -test.run=TestDotGeneral_PerformanceTable -test.v -test.count=1 \
    -markdown
```

Results

Backend: go / CPU AMD 9950X3D / 2026-01-17 (gotip, built just after go1.26rc2 was made available)

```bash
GOMLX_BACKEND=go go test -tags=perf ./backends/simplego/ \
    -test.run=TestDotGeneral_PerformanceTable -test.v -test.count=1 \
    -markdown
```
| Test Name | LHS Dims | RHS Dims | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
| --- | --- | --- | --- | ---: | ---: | ---: | ---: |
| NoBatch-Tiny | {128, 4} | {4, 1} | Float32 | 1 | 3.52µs | 1,024 | 0.3 |
| NoBatch-Tiny | {128, 4} | {4, 1} | Float64 | 1 | 3.50µs | 1,024 | 0.3 |
| NoBatch-Tiny | {128, 4} | {4, 1} | BFloat16 | 1 | 3.71µs | 1,024 | 0.3 |
| NoBatch-Tiny | {128, 4} | {4, 1} | Float16 | 1 | 5.17µs | 1,024 | 0.2 |
| NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float32 | 1 | 3.88µs | 1,024 | 0.3 |
| NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float64 | 1 | 3.92µs | 1,024 | 0.3 |
| NoBatch-Tiny-Norm | {128, 4} | {1, 4} | BFloat16 | 1 | 4.21µs | 1,024 | 0.2 |
| NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float16 | 1 | 5.81µs | 1,024 | 0.2 |
| NoBatch-Small | {16, 128} | {128, 32} | Float32 | 1 | 26.51µs | 131,072 | 4.9 |
| NoBatch-Small | {16, 128} | {128, 32} | Float64 | 1 | 27.08µs | 131,072 | 4.8 |
| NoBatch-Small | {16, 128} | {128, 32} | BFloat16 | 1 | 44.36µs | 131,072 | 3.0 |
| NoBatch-Small | {16, 128} | {128, 32} | Float16 | 1 | 141.16µs | 131,072 | 0.9 |
| NoBatch-Medium | {128, 128} | {128, 256} | Float32 | 1 | 130.12µs | 8,388,608 | 64.5 |
| NoBatch-Medium | {128, 128} | {128, 256} | Float64 | 1 | 296.33µs | 8,388,608 | 28.3 |
| NoBatch-Medium | {128, 128} | {128, 256} | BFloat16 | 1 | 656.75µs | 8,388,608 | 12.8 |
| NoBatch-Medium | {128, 128} | {128, 256} | Float16 | 1 | 1.58ms | 8,388,608 | 5.3 |
| NoBatch-Large | {1536, 1920} | {1920, 1024} | Float32 | 1 | 17.76ms | 6,039,797,760 | 340.2 |
| NoBatch-Large | {1536, 1920} | {1920, 1024} | Float64 | 1 | 50.29ms | 6,039,797,760 | 120.1 |
| NoBatch-Large | {1536, 1920} | {1920, 1024} | BFloat16 | 1 | 124.82ms | 6,039,797,760 | 48.4 |
| NoBatch-Large | {1536, 1920} | {1920, 1024} | Float16 | 1 | 413.90ms | 6,039,797,760 | 14.6 |
| R-Unbalanced-Cross | {128} | {128, 256} | Float32 | 1 | 22.95µs | 65,536 | 2.9 |
| R-Unbalanced-Cross | {128} | {128, 256} | Float64 | 1 | 29.16µs | 65,536 | 2.2 |
| R-Unbalanced-Cross | {128} | {128, 256} | BFloat16 | 1 | 26.57µs | 65,536 | 2.5 |
| R-Unbalanced-Cross | {128} | {128, 256} | Float16 | 1 | 73.04µs | 65,536 | 0.9 |
| L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float32 | 1 | 212.43µs | 4,194,304 | 19.7 |
| L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float64 | 1 | 367.21µs | 4,194,304 | 11.4 |
| L-Unbalanced-Cross | {4096, 32} | {32, 16} | BFloat16 | 1 | 598.47µs | 4,194,304 | 7.0 |
| L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float16 | 1 | 1.23ms | 4,194,304 | 3.4 |
| LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float32 | 1024 | 128.69µs | 1,048,576 | 8.1 |
| LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float64 | 1024 | 134.52µs | 1,048,576 | 7.8 |
| LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | BFloat16 | 1024 | 202.00µs | 1,048,576 | 5.2 |
| LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float16 | 1024 | 411.85µs | 1,048,576 | 2.5 |
| LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float32 | 256 | 504.32µs | 2,097,152 | 4.2 |
| LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float64 | 256 | 522.36µs | 2,097,152 | 4.0 |
| LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | BFloat16 | 256 | 555.87µs | 2,097,152 | 3.8 |
| LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float16 | 256 | 740.62µs | 2,097,152 | 2.8 |
| LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float32 | 64 | 947.11µs | 67,108,864 | 70.9 |
| LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float64 | 64 | 963.56µs | 67,108,864 | 69.6 |
| LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | BFloat16 | 64 | 1.92ms | 67,108,864 | 34.9 |
| LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float16 | 64 | 5.60ms | 67,108,864 | 12.0 |
| Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float32 | 16 | 240.61ms | 96,636,764,160 | 401.6 |
| Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float64 | 16 | 791.46ms | 96,636,764,160 | 122.1 |
| Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | BFloat16 | 16 | 1.87s | 96,636,764,160 | 51.6 |
| Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float16 | 16 | 6.32s | 96,636,764,160 | 15.3 |
| KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float32 | 192 | 314.50µs | 2,076,672 | 6.6 |
| KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float64 | 192 | 313.41µs | 2,076,672 | 6.6 |
| KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | BFloat16 | 192 | 389.75µs | 2,076,672 | 5.3 |
| KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float16 | 192 | 607.23µs | 2,076,672 | 3.4 |
| KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float32 | 192 | 323.64µs | 2,076,672 | 6.4 |
| KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float64 | 192 | 341.99µs | 2,076,672 | 6.1 |
| KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | BFloat16 | 192 | 386.01µs | 2,076,672 | 5.4 |
| KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float16 | 192 | 564.37µs | 2,076,672 | 3.7 |
| KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float32 | 1 | 2.62ms | 245,366,784 | 93.7 |
| KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float64 | 1 | 4.91ms | 245,366,784 | 49.9 |
| KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | BFloat16 | 1 | 9.18ms | 245,366,784 | 26.7 |
| KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float16 | 1 | 25.54ms | 245,366,784 | 9.6 |
| KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float32 | 1 | 2.18ms | 245,366,784 | 112.3 |
| KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float64 | 1 | 4.08ms | 245,366,784 | 60.1 |
| KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | BFloat16 | 1 | 7.41ms | 245,366,784 | 33.1 |
| KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float16 | 1 | 20.73ms | 245,366,784 | 11.8 |
| KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float32 | 1 | 698.36µs | 61,341,696 | 87.8 |
| KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float64 | 1 | 1.35ms | 61,341,696 | 45.5 |
| KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | BFloat16 | 1 | 2.40ms | 61,341,696 | 25.6 |
| KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float16 | 1 | 6.58ms | 61,341,696 | 9.3 |
| adult-#1 | {128, 4} | {4, 1} | Float32 | 1 | 3.73µs | 1,024 | 0.3 |
| adult-#1 | {128, 4} | {4, 1} | Float64 | 1 | 3.88µs | 1,024 | 0.3 |
| adult-#1 | {128, 4} | {4, 1} | BFloat16 | 1 | 3.89µs | 1,024 | 0.3 |
| adult-#1 | {128, 4} | {4, 1} | Float16 | 1 | 5.20µs | 1,024 | 0.2 |
| adult-#2 | {128, 69} | {69, 4} | Float32 | 1 | 16.52µs | 70,656 | 4.3 |
| adult-#2 | {128, 69} | {69, 4} | Float64 | 1 | 16.44µs | 70,656 | 4.3 |
| adult-#2 | {128, 69} | {69, 4} | BFloat16 | 1 | 26.20µs | 70,656 | 2.7 |
| adult-#2 | {128, 69} | {69, 4} | Float16 | 1 | 83.93µs | 70,656 | 0.8 |
| adult-#3 | {25, 4} | {4, 1} | Float32 | 1 | 3.54µs | 200 | 0.1 |
| adult-#3 | {25, 4} | {4, 1} | Float64 | 1 | 3.52µs | 200 | 0.1 |
| adult-#3 | {25, 4} | {4, 1} | BFloat16 | 1 | 3.50µs | 200 | 0.1 |
| adult-#3 | {25, 4} | {4, 1} | Float16 | 1 | 3.69µs | 200 | 0.1 |
| adult-#4 | {25, 69} | {69, 4} | Float32 | 1 | 6.41µs | 13,800 | 2.2 |
| adult-#4 | {25, 69} | {69, 4} | Float64 | 1 | 6.44µs | 13,800 | 2.1 |
| adult-#4 | {25, 69} | {69, 4} | BFloat16 | 1 | 8.41µs | 13,800 | 1.6 |
| adult-#4 | {25, 69} | {69, 4} | Float16 | 1 | 19.84µs | 13,800 | 0.7 |
| adult-#5 | {49, 4} | {4, 1} | Float32 | 1 | 3.46µs | 392 | 0.1 |
| adult-#5 | {49, 4} | {4, 1} | Float64 | 1 | 3.51µs | 392 | 0.1 |
| adult-#5 | {49, 4} | {4, 1} | BFloat16 | 1 | 3.50µs | 392 | 0.1 |
| adult-#5 | {49, 4} | {4, 1} | Float16 | 1 | 3.97µs | 392 | 0.1 |
| adult-#6 | {49, 69} | {69, 4} | Float32 | 1 | 8.69µs | 27,048 | 3.1 |
| adult-#6 | {49, 69} | {69, 4} | Float64 | 1 | 8.66µs | 27,048 | 3.1 |
| adult-#6 | {49, 69} | {69, 4} | BFloat16 | 1 | 12.57µs | 27,048 | 2.2 |
| adult-#6 | {49, 69} | {69, 4} | Float16 | 1 | 34.48µs | 27,048 | 0.8 |
| adult-#6-Normalized | {49, 69} | {4, 69} | Float32 | 1 | 9.06µs | 27,048 | 3.0 |
| adult-#6-Normalized | {49, 69} | {4, 69} | Float64 | 1 | 9.51µs | 27,048 | 2.8 |
| adult-#6-Normalized | {49, 69} | {4, 69} | BFloat16 | 1 | 15.33µs | 27,048 | 1.8 |
| adult-#6-Normalized | {49, 69} | {4, 69} | Float16 | 1 | 35.70µs | 27,048 | 0.8 |

With the experimental SIMD support enabled (AVX-512), the following Float32 results change (only the affected rows are listed):

| Test Name | LHS Dims | RHS Dims | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
| --- | --- | --- | --- | ---: | ---: | ---: | ---: |
| NoBatch-Medium | {128, 128} | {128, 256} | Float32 | 1 | 9.07µs | 8,388,608 | 924.5 |
| NoBatch-Large | {1536, 1920} | {1920, 1024} | Float32 | 1 | 2.95ms | 6,039,797,760 | 2045.5 |
| L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float32 | 1 | 11.31µs | 4,194,304 | 370.8 |
| LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float32 | 1024 | 806.62µs | 1,048,576 | 1.3 |
| LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float32 | 256 | 171.54µs | 2,097,152 | 12.2 |
| Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float32 | 16 | 48.60ms | 96,636,764,160 | 1988.3 |
| KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float32 | 192 | 113.87µs | 2,076,672 | 18.2 |
| KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float32 | 192 | 138.35µs | 2,076,672 | 15.0 |

Backend: xla:cpu / CPU AMD 9950X3D / 2026-01-07 / PJRT Plugin v0.83.4 (see pjrt-cpu-binaries)

```bash
GOMLX_BACKEND=xla:cpu go test -tags=perf ./backends/simplego/ \
    -test.run=TestDotGeneral_PerformanceTable -test.v -test.count=1 \
    -markdown
```
| Test Name | LHS Dims | RHS Dims | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
| --- | --- | --- | --- | ---: | ---: | ---: | ---: |
| NoBatch-Tiny | {128, 4} | {4, 1} | Float32 | 1 | 4.62µs | 1,024 | 0.2 |
| NoBatch-Tiny | {128, 4} | {4, 1} | Float64 | 1 | 4.58µs | 1,024 | 0.2 |
| NoBatch-Tiny | {128, 4} | {4, 1} | BFloat16 | 1 | 4.83µs | 1,024 | 0.2 |
| NoBatch-Tiny | {128, 4} | {4, 1} | Float16 | 1 | 5.12µs | 1,024 | 0.2 |
| NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float32 | 1 | 4.55µs | 1,024 | 0.2 |
| NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float64 | 1 | 4.20µs | 1,024 | 0.2 |
| NoBatch-Tiny-Norm | {128, 4} | {1, 4} | BFloat16 | 1 | 4.51µs | 1,024 | 0.2 |
| NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float16 | 1 | 4.46µs | 1,024 | 0.2 |
| NoBatch-Small | {16, 128} | {128, 32} | Float32 | 1 | 5.48µs | 131,072 | 23.9 |
| NoBatch-Small | {16, 128} | {128, 32} | Float64 | 1 | 9.63µs | 131,072 | 13.6 |
| NoBatch-Small | {16, 128} | {128, 32} | BFloat16 | 1 | 6.25µs | 131,072 | 21.0 |
| NoBatch-Small | {16, 128} | {128, 32} | Float16 | 1 | 6.39µs | 131,072 | 20.5 |
| NoBatch-Medium | {128, 128} | {128, 256} | Float32 | 1 | 31.59µs | 8,388,608 | 265.5 |
| NoBatch-Medium | {128, 128} | {128, 256} | Float64 | 1 | 70.09µs | 8,388,608 | 119.7 |
| NoBatch-Medium | {128, 128} | {128, 256} | BFloat16 | 1 | 35.01µs | 8,388,608 | 239.6 |
| NoBatch-Medium | {128, 128} | {128, 256} | Float16 | 1 | 35.33µs | 8,388,608 | 237.5 |
| NoBatch-Large | {1536, 1920} | {1920, 1024} | Float32 | 1 | 1.75ms | 6,039,797,760 | 3456.4 |
| NoBatch-Large | {1536, 1920} | {1920, 1024} | Float64 | 1 | 16.79ms | 6,039,797,760 | 359.8 |
| NoBatch-Large | {1536, 1920} | {1920, 1024} | BFloat16 | 1 | 2.83ms | 6,039,797,760 | 2134.5 |
| NoBatch-Large | {1536, 1920} | {1920, 1024} | Float16 | 1 | 2.77ms | 6,039,797,760 | 2184.4 |
| R-Unbalanced-Cross | {128} | {128, 256} | Float32 | 1 | 6.14µs | 65,536 | 10.7 |
| R-Unbalanced-Cross | {128} | {128, 256} | Float64 | 1 | 6.17µs | 65,536 | 10.6 |
| R-Unbalanced-Cross | {128} | {128, 256} | BFloat16 | 1 | 11.15µs | 65,536 | 5.9 |
| R-Unbalanced-Cross | {128} | {128, 256} | Float16 | 1 | 11.87µs | 65,536 | 5.5 |
| L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float32 | 1 | 15.05µs | 4,194,304 | 278.7 |
| L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float64 | 1 | 81.84µs | 4,194,304 | 51.3 |
| L-Unbalanced-Cross | {4096, 32} | {32, 16} | BFloat16 | 1 | 49.63µs | 4,194,304 | 84.5 |
| L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float16 | 1 | 39.84µs | 4,194,304 | 105.3 |
| LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float32 | 1024 | 121.14µs | 1,048,576 | 8.7 |
| LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float64 | 1024 | 67.71µs | 1,048,576 | 15.5 |
| LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | BFloat16 | 1024 | 140.54µs | 1,048,576 | 7.5 |
| LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float16 | 1024 | 135.41µs | 1,048,576 | 7.7 |
| LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float32 | 256 | 27.18µs | 2,097,152 | 77.2 |
| LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float64 | 256 | 123.81µs | 2,097,152 | 16.9 |
| LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | BFloat16 | 256 | 61.14µs | 2,097,152 | 34.3 |
| LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float16 | 256 | 64.62µs | 2,097,152 | 32.5 |
| LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float32 | 64 | 59.80µs | 67,108,864 | 1122.2 |
| LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float64 | 64 | 849.47µs | 67,108,864 | 79.0 |
| LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | BFloat16 | 64 | 119.78µs | 67,108,864 | 560.3 |
| LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float16 | 64 | 123.86µs | 67,108,864 | 541.8 |
| Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float32 | 16 | 51.30ms | 96,636,764,160 | 1883.7 |
| Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float64 | 16 | 220.27ms | 96,636,764,160 | 438.7 |
| Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | BFloat16 | 16 | 72.52ms | 96,636,764,160 | 1332.6 |
| Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float16 | 16 | 71.93ms | 96,636,764,160 | 1343.6 |
| KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float32 | 192 | 13.93µs | 2,076,672 | 149.0 |
| KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float64 | 192 | 118.63µs | 2,076,672 | 17.5 |
| KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | BFloat16 | 192 | 63.88µs | 2,076,672 | 32.5 |
| KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float16 | 192 | 47.36µs | 2,076,672 | 43.9 |
| KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float32 | 192 | 48.02µs | 2,076,672 | 43.2 |
| KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float64 | 192 | 121.54µs | 2,076,672 | 17.1 |
| KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | BFloat16 | 192 | 81.54µs | 2,076,672 | 25.5 |
| KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float16 | 192 | 82.65µs | 2,076,672 | 25.1 |
| KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float32 | 1 | 228.54µs | 245,366,784 | 1073.6 |
| KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float64 | 1 | 882.13µs | 245,366,784 | 278.2 |
| KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | BFloat16 | 1 | 374.36µs | 245,366,784 | 655.4 |
| KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float16 | 1 | 371.05µs | 245,366,784 | 661.3 |
| KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float32 | 1 | 147.57µs | 245,366,784 | 1662.7 |
| KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float64 | 1 | 881.09µs | 245,366,784 | 278.5 |
| KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | BFloat16 | 1 | 282.59µs | 245,366,784 | 868.3 |
| KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float16 | 1 | 279.47µs | 245,366,784 | 878.0 |
| KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float32 | 1 | 67.69µs | 61,341,696 | 906.3 |
| KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float64 | 1 | 282.96µs | 61,341,696 | 216.8 |
| KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | BFloat16 | 1 | 112.21µs | 61,341,696 | 546.7 |
| KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float16 | 1 | 117.72µs | 61,341,696 | 521.1 |
| adult-#1 | {128, 4} | {4, 1} | Float32 | 1 | 4.72µs | 1,024 | 0.2 |
| adult-#1 | {128, 4} | {4, 1} | Float64 | 1 | 4.24µs | 1,024 | 0.2 |
| adult-#1 | {128, 4} | {4, 1} | BFloat16 | 1 | 4.27µs | 1,024 | 0.2 |
| adult-#1 | {128, 4} | {4, 1} | Float16 | 1 | 4.21µs | 1,024 | 0.2 |
| adult-#2 | {128, 69} | {69, 4} | Float32 | 1 | 5.98µs | 70,656 | 11.8 |
| adult-#2 | {128, 69} | {69, 4} | Float64 | 1 | 7.87µs | 70,656 | 9.0 |
| adult-#2 | {128, 69} | {69, 4} | BFloat16 | 1 | 6.34µs | 70,656 | 11.1 |
| adult-#2 | {128, 69} | {69, 4} | Float16 | 1 | 6.27µs | 70,656 | 11.3 |
| adult-#3 | {25, 4} | {4, 1} | Float32 | 1 | 2.28µs | 200 | 0.1 |
| adult-#3 | {25, 4} | {4, 1} | Float64 | 1 | 2.29µs | 200 | 0.1 |
| adult-#3 | {25, 4} | {4, 1} | BFloat16 | 1 | 2.32µs | 200 | 0.1 |
| adult-#3 | {25, 4} | {4, 1} | Float16 | 1 | 2.35µs | 200 | 0.1 |
| adult-#4 | {25, 69} | {69, 4} | Float32 | 1 | 5.63µs | 13,800 | 2.5 |
| adult-#4 | {25, 69} | {69, 4} | Float64 | 1 | 5.88µs | 13,800 | 2.3 |
| adult-#4 | {25, 69} | {69, 4} | BFloat16 | 1 | 5.90µs | 13,800 | 2.3 |
| adult-#4 | {25, 69} | {69, 4} | Float16 | 1 | 5.97µs | 13,800 | 2.3 |
| adult-#5 | {49, 4} | {4, 1} | Float32 | 1 | 2.33µs | 392 | 0.2 |
| adult-#5 | {49, 4} | {4, 1} | Float64 | 1 | 2.26µs | 392 | 0.2 |
| adult-#5 | {49, 4} | {4, 1} | BFloat16 | 1 | 2.34µs | 392 | 0.2 |
| adult-#5 | {49, 4} | {4, 1} | Float16 | 1 | 2.33µs | 392 | 0.2 |
| adult-#6 | {49, 69} | {69, 4} | Float32 | 1 | 5.87µs | 27,048 | 4.6 |
| adult-#6 | {49, 69} | {69, 4} | Float64 | 1 | 5.95µs | 27,048 | 4.5 |
| adult-#6 | {49, 69} | {69, 4} | BFloat16 | 1 | 6.26µs | 27,048 | 4.3 |
| adult-#6 | {49, 69} | {69, 4} | Float16 | 1 | 6.39µs | 27,048 | 4.2 |
| adult-#6-Normalized | {49, 69} | {4, 69} | Float32 | 1 | 5.96µs | 27,048 | 4.5 |
| adult-#6-Normalized | {49, 69} | {4, 69} | Float64 | 1 | 6.07µs | 27,048 | 4.5 |
| adult-#6-Normalized | {49, 69} | {4, 69} | BFloat16 | 1 | 6.03µs | 27,048 | 4.5 |
| adult-#6-Normalized | {49, 69} | {4, 69} | Float16 | 1 | 6.09µs | 27,048 | 4.4 |

Backend: xla:cuda / GPU RTX 5090 / 2026-01-07 / PJRT Plugin v0.81 (Jax-build)

```bash
GOMLX_BACKEND=xla:cuda go test -tags=perf ./backends/simplego/ \
    -test.run=TestDotGeneral_PerformanceTable -test.v -test.count=1 \
    -markdown
```
| Test Name | LHS Dims | RHS Dims | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
| --- | --- | --- | --- | ---: | ---: | ---: | ---: |
| NoBatch-Tiny | {128, 4} | {4, 1} | Float32 | 1 | 17.53µs | 1,024 | 0.1 |
| NoBatch-Tiny | {128, 4} | {4, 1} | Float64 | 1 | 16.42µs | 1,024 | 0.1 |
| NoBatch-Tiny | {128, 4} | {4, 1} | BFloat16 | 1 | 15.72µs | 1,024 | 0.1 |
| NoBatch-Tiny | {128, 4} | {4, 1} | Float16 | 1 | 14.97µs | 1,024 | 0.1 |
| NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float32 | 1 | 13.44µs | 1,024 | 0.1 |
| NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float64 | 1 | 17.53µs | 1,024 | 0.1 |
| NoBatch-Tiny-Norm | {128, 4} | {1, 4} | BFloat16 | 1 | 26.20µs | 1,024 | 0.0 |
| NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float16 | 1 | 15.92µs | 1,024 | 0.1 |
| NoBatch-Small | {16, 128} | {128, 32} | Float32 | 1 | 27.23µs | 131,072 | 4.8 |
| NoBatch-Small | {16, 128} | {128, 32} | Float64 | 1 | 60.17µs | 131,072 | 2.2 |
| NoBatch-Small | {16, 128} | {128, 32} | BFloat16 | 1 | 32.50µs | 131,072 | 4.0 |
| NoBatch-Small | {16, 128} | {128, 32} | Float16 | 1 | 23.79µs | 131,072 | 5.5 |
| NoBatch-Medium | {128, 128} | {128, 256} | Float32 | 1 | 26.76µs | 8,388,608 | 313.5 |
| NoBatch-Medium | {128, 128} | {128, 256} | Float64 | 1 | 60.90µs | 8,388,608 | 137.7 |
| NoBatch-Medium | {128, 128} | {128, 256} | BFloat16 | 1 | 26.77µs | 8,388,608 | 313.4 |
| NoBatch-Medium | {128, 128} | {128, 256} | Float16 | 1 | 26.01µs | 8,388,608 | 322.5 |
| NoBatch-Large | {1536, 1920} | {1920, 1024} | Float32 | 1 | 121.00µs | 6,039,797,760 | 49915.7 |
| NoBatch-Large | {1536, 1920} | {1920, 1024} | Float64 | 1 | 3.72ms | 6,039,797,760 | 1624.6 |
| NoBatch-Large | {1536, 1920} | {1920, 1024} | BFloat16 | 1 | 91.86µs | 6,039,797,760 | 65751.5 |
| NoBatch-Large | {1536, 1920} | {1920, 1024} | Float16 | 1 | 76.70µs | 6,039,797,760 | 78741.6 |
| R-Unbalanced-Cross | {128} | {128, 256} | Float32 | 1 | 15.28µs | 65,536 | 4.3 |
| R-Unbalanced-Cross | {128} | {128, 256} | Float64 | 1 | 84.30µs | 65,536 | 0.8 |
| R-Unbalanced-Cross | {128} | {128, 256} | BFloat16 | 1 | 19.00µs | 65,536 | 3.4 |
| R-Unbalanced-Cross | {128} | {128, 256} | Float16 | 1 | 18.36µs | 65,536 | 3.6 |
| L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float32 | 1 | 24.77µs | 4,194,304 | 169.3 |
| L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float64 | 1 | 41.32µs | 4,194,304 | 101.5 |
| L-Unbalanced-Cross | {4096, 32} | {32, 16} | BFloat16 | 1 | 224.48µs | 4,194,304 | 18.7 |
| L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float16 | 1 | 19.41µs | 4,194,304 | 216.1 |
| LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float32 | 1024 | 14.82µs | 1,048,576 | 70.8 |
| LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float64 | 1024 | 27.46µs | 1,048,576 | 38.2 |
| LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | BFloat16 | 1024 | 14.84µs | 1,048,576 | 70.6 |
| LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float16 | 1024 | 13.19µs | 1,048,576 | 79.5 |
| LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float32 | 256 | 17.99µs | 2,097,152 | 116.6 |
| LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float64 | 256 | 29.11µs | 2,097,152 | 72.0 |
| LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | BFloat16 | 256 | 16.83µs | 2,097,152 | 124.6 |
| LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float16 | 256 | 13.51µs | 2,097,152 | 155.3 |
| LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float32 | 64 | 28.59µs | 67,108,864 | 2347.1 |
| LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float64 | 64 | 99.14µs | 67,108,864 | 676.9 |
| LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | BFloat16 | 64 | 24.59µs | 67,108,864 | 2728.7 |
| LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float16 | 64 | 25.75µs | 67,108,864 | 2605.9 |
| Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float32 | 16 | 927.42µs | 96,636,764,160 | 104199.5 |
| Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float64 | 16 | 57.21ms | 96,636,764,160 | 1689.2 |
| Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | BFloat16 | 16 | 473.19µs | 96,636,764,160 | 204223.6 |
| Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float16 | 16 | 476.39µs | 96,636,764,160 | 202853.9 |
| KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float32 | 192 | 29.05µs | 2,076,672 | 71.5 |
| KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float64 | 192 | 36.93µs | 2,076,672 | 56.2 |
| KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | BFloat16 | 192 | 18.15µs | 2,076,672 | 114.4 |
| KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float16 | 192 | 17.17µs | 2,076,672 | 120.9 |
| KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float32 | 192 | 23.30µs | 2,076,672 | 89.1 |
| KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float64 | 192 | 41.81µs | 2,076,672 | 49.7 |
| KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | BFloat16 | 192 | 18.89µs | 2,076,672 | 109.9 |
| KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float16 | 192 | 17.59µs | 2,076,672 | 118.0 |
| KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float32 | 1 | 36.02µs | 245,366,784 | 6812.3 |
| KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float64 | 1 | 374.37µs | 245,366,784 | 655.4 |
| KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | BFloat16 | 1 | 39.73µs | 245,366,784 | 6176.0 |
| KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float16 | 1 | 31.60µs | 245,366,784 | 7764.8 |
| KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float32 | 1 | 35.77µs | 245,366,784 | 6858.6 |
| KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float64 | 1 | 205.18µs | 245,366,784 | 1195.8 |
| KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | BFloat16 | 1 | 70.33µs | 245,366,784 | 3488.8 |
| KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float16 | 1 | 26.47µs | 245,366,784 | 9268.2 |
| KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float32 | 1 | 86.77µs | 61,341,696 | 707.0 |
| KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float64 | 1 | 114.17µs | 61,341,696 | 537.3 |
| KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | BFloat16 | 1 | 26.20µs | 61,341,696 | 2340.9 |
| KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float16 | 1 | 26.57µs | 61,341,696 | 2309.1 |
| adult-#1 | {128, 4} | {4, 1} | Float32 | 1 | 13.96µs | 1,024 | 0.1 |
| adult-#1 | {128, 4} | {4, 1} | Float64 | 1 | 14.06µs | 1,024 | 0.1 |
| adult-#1 | {128, 4} | {4, 1} | BFloat16 | 1 | 14.56µs | 1,024 | 0.1 |
| adult-#1 | {128, 4} | {4, 1} | Float16 | 1 | 15.22µs | 1,024 | 0.1 |
| adult-#2 | {128, 69} | {69, 4} | Float32 | 1 | 22.74µs | 70,656 | 3.1 |
| adult-#2 | {128, 69} | {69, 4} | Float64 | 1 | 46.36µs | 70,656 | 1.5 |
| adult-#2 | {128, 69} | {69, 4} | BFloat16 | 1 | 17.25µs | 70,656 | 4.1 |
| adult-#2 | {128, 69} | {69, 4} | Float16 | 1 | 22.23µs | 70,656 | 3.2 |
| adult-#3 | {25, 4} | {4, 1} | Float32 | 1 | 13.62µs | 200 | 0.0 |
| adult-#3 | {25, 4} | {4, 1} | Float64 | 1 | 16.46µs | 200 | 0.0 |
| adult-#3 | {25, 4} | {4, 1} | BFloat16 | 1 | 13.80µs | 200 | 0.0 |
| adult-#3 | {25, 4} | {4, 1} | Float16 | 1 | 13.51µs | 200 | 0.0 |
| adult-#4 | {25, 69} | {69, 4} | Float32 | 1 | 15.01µs | 13,800 | 0.9 |
| adult-#4 | {25, 69} | {69, 4} | Float64 | 1 | 49.26µs | 13,800 | 0.3 |
| adult-#4 | {25, 69} | {69, 4} | BFloat16 | 1 | 21.87µs | 13,800 | 0.6 |
| adult-#4 | {25, 69} | {69, 4} | Float16 | 1 | 17.55µs | 13,800 | 0.8 |
| adult-#5 | {49, 4} | {4, 1} | Float32 | 1 | 13.25µs | 392 | 0.0 |
| adult-#5 | {49, 4} | {4, 1} | Float64 | 1 | 15.64µs | 392 | 0.0 |
| adult-#5 | {49, 4} | {4, 1} | BFloat16 | 1 | 13.40µs | 392 | 0.0 |
| adult-#5 | {49, 4} | {4, 1} | Float16 | 1 | 13.97µs | 392 | 0.0 |
| adult-#6 | {49, 69} | {69, 4} | Float32 | 1 | 27.97µs | 27,048 | 1.0 |
| adult-#6 | {49, 69} | {69, 4} | Float64 | 1 | 46.09µs | 27,048 | 0.6 |
| adult-#6 | {49, 69} | {69, 4} | BFloat16 | 1 | 21.03µs | 27,048 | 1.3 |
| adult-#6 | {49, 69} | {69, 4} | Float16 | 1 | 19.12µs | 27,048 | 1.4 |
| adult-#6-Normalized | {49, 69} | {4, 69} | Float32 | 1 | 15.79µs | 27,048 | 1.7 |
| adult-#6-Normalized | {49, 69} | {4, 69} | Float64 | 1 | 49.39µs | 27,048 | 0.5 |
| adult-#6-Normalized | {49, 69} | {4, 69} | BFloat16 | 1 | 15.07µs | 27,048 | 1.8 |
| adult-#6-Normalized | {49, 69} | {4, 69} | Float16 | 1 | 17.34µs | 27,048 | 1.6 |
Last updated April 25, 2026