This document is generated by running the `TestDotGeneral_PerformanceTable` test.

To run it, from the root of the repository, run (for the simplego backend):

```bash
GOMLX_BACKEND=go go test -tags=perf ./backends/simplego/ \
    -test.run=TestDotGeneral_PerformanceTable -test.v -test.count=1 \
    -markdown
```
## Results
### Backend: go / CPU AMD 9950X3D / 2026-01-17 (gotip, built just after go1.26rc2 was made available)
```bash
GOMLX_BACKEND=go go test -tags=perf ./backends/simplego/ \
    -test.run=TestDotGeneral_PerformanceTable -test.v -test.count=1 \
    -markdown
```

| Test Name | LHS Dims | RHS Dims | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
|---|---|---|---|---|---|---|---|
NoBatch-Tiny | {128, 4} | {4, 1} | Float32 | 1 | 3.52µs | 1,024 | 0.3 |
NoBatch-Tiny | {128, 4} | {4, 1} | Float64 | 1 | 3.50µs | 1,024 | 0.3 |
NoBatch-Tiny | {128, 4} | {4, 1} | BFloat16 | 1 | 3.71µs | 1,024 | 0.3 |
NoBatch-Tiny | {128, 4} | {4, 1} | Float16 | 1 | 5.17µs | 1,024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float32 | 1 | 3.88µs | 1,024 | 0.3 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float64 | 1 | 3.92µs | 1,024 | 0.3 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | BFloat16 | 1 | 4.21µs | 1,024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float16 | 1 | 5.81µs | 1,024 | 0.2 |
NoBatch-Small | {16, 128} | {128, 32} | Float32 | 1 | 26.51µs | 131,072 | 4.9 |
NoBatch-Small | {16, 128} | {128, 32} | Float64 | 1 | 27.08µs | 131,072 | 4.8 |
NoBatch-Small | {16, 128} | {128, 32} | BFloat16 | 1 | 44.36µs | 131,072 | 3.0 |
NoBatch-Small | {16, 128} | {128, 32} | Float16 | 1 | 141.16µs | 131,072 | 0.9 |
NoBatch-Medium | {128, 128} | {128, 256} | Float32 | 1 | 130.12µs | 8,388,608 | 64.5 |
NoBatch-Medium | {128, 128} | {128, 256} | Float64 | 1 | 296.33µs | 8,388,608 | 28.3 |
NoBatch-Medium | {128, 128} | {128, 256} | BFloat16 | 1 | 656.75µs | 8,388,608 | 12.8 |
NoBatch-Medium | {128, 128} | {128, 256} | Float16 | 1 | 1.58ms | 8,388,608 | 5.3 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float32 | 1 | 17.76ms | 6,039,797,760 | 340.2 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float64 | 1 | 50.29ms | 6,039,797,760 | 120.1 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | BFloat16 | 1 | 124.82ms | 6,039,797,760 | 48.4 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float16 | 1 | 413.90ms | 6,039,797,760 | 14.6 |
R-Unbalanced-Cross | {128} | {128, 256} | Float32 | 1 | 22.95µs | 65,536 | 2.9 |
R-Unbalanced-Cross | {128} | {128, 256} | Float64 | 1 | 29.16µs | 65,536 | 2.2 |
R-Unbalanced-Cross | {128} | {128, 256} | BFloat16 | 1 | 26.57µs | 65,536 | 2.5 |
R-Unbalanced-Cross | {128} | {128, 256} | Float16 | 1 | 73.04µs | 65,536 | 0.9 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float32 | 1 | 212.43µs | 4,194,304 | 19.7 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float64 | 1 | 367.21µs | 4,194,304 | 11.4 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | BFloat16 | 1 | 598.47µs | 4,194,304 | 7.0 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float16 | 1 | 1.23ms | 4,194,304 | 3.4 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float32 | 1024 | 128.69µs | 1,048,576 | 8.1 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float64 | 1024 | 134.52µs | 1,048,576 | 7.8 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | BFloat16 | 1024 | 202.00µs | 1,048,576 | 5.2 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float16 | 1024 | 411.85µs | 1,048,576 | 2.5 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float32 | 256 | 504.32µs | 2,097,152 | 4.2 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float64 | 256 | 522.36µs | 2,097,152 | 4.0 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | BFloat16 | 256 | 555.87µs | 2,097,152 | 3.8 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float16 | 256 | 740.62µs | 2,097,152 | 2.8 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float32 | 64 | 947.11µs | 67,108,864 | 70.9 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float64 | 64 | 963.56µs | 67,108,864 | 69.6 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | BFloat16 | 64 | 1.92ms | 67,108,864 | 34.9 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float16 | 64 | 5.60ms | 67,108,864 | 12.0 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float32 | 16 | 240.61ms | 96,636,764,160 | 401.6 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float64 | 16 | 791.46ms | 96,636,764,160 | 122.1 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | BFloat16 | 16 | 1.87s | 96,636,764,160 | 51.6 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float16 | 16 | 6.32s | 96,636,764,160 | 15.3 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float32 | 192 | 314.50µs | 2,076,672 | 6.6 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float64 | 192 | 313.41µs | 2,076,672 | 6.6 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | BFloat16 | 192 | 389.75µs | 2,076,672 | 5.3 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float16 | 192 | 607.23µs | 2,076,672 | 3.4 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float32 | 192 | 323.64µs | 2,076,672 | 6.4 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float64 | 192 | 341.99µs | 2,076,672 | 6.1 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | BFloat16 | 192 | 386.01µs | 2,076,672 | 5.4 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float16 | 192 | 564.37µs | 2,076,672 | 3.7 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float32 | 1 | 2.62ms | 245,366,784 | 93.7 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float64 | 1 | 4.91ms | 245,366,784 | 49.9 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | BFloat16 | 1 | 9.18ms | 245,366,784 | 26.7 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float16 | 1 | 25.54ms | 245,366,784 | 9.6 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float32 | 1 | 2.18ms | 245,366,784 | 112.3 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float64 | 1 | 4.08ms | 245,366,784 | 60.1 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | BFloat16 | 1 | 7.41ms | 245,366,784 | 33.1 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float16 | 1 | 20.73ms | 245,366,784 | 11.8 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float32 | 1 | 698.36µs | 61,341,696 | 87.8 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float64 | 1 | 1.35ms | 61,341,696 | 45.5 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | BFloat16 | 1 | 2.40ms | 61,341,696 | 25.6 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float16 | 1 | 6.58ms | 61,341,696 | 9.3 |
adult-#1 | {128, 4} | {4, 1} | Float32 | 1 | 3.73µs | 1,024 | 0.3 |
adult-#1 | {128, 4} | {4, 1} | Float64 | 1 | 3.88µs | 1,024 | 0.3 |
adult-#1 | {128, 4} | {4, 1} | BFloat16 | 1 | 3.89µs | 1,024 | 0.3 |
adult-#1 | {128, 4} | {4, 1} | Float16 | 1 | 5.20µs | 1,024 | 0.2 |
adult-#2 | {128, 69} | {69, 4} | Float32 | 1 | 16.52µs | 70,656 | 4.3 |
adult-#2 | {128, 69} | {69, 4} | Float64 | 1 | 16.44µs | 70,656 | 4.3 |
adult-#2 | {128, 69} | {69, 4} | BFloat16 | 1 | 26.20µs | 70,656 | 2.7 |
adult-#2 | {128, 69} | {69, 4} | Float16 | 1 | 83.93µs | 70,656 | 0.8 |
adult-#3 | {25, 4} | {4, 1} | Float32 | 1 | 3.54µs | 200 | 0.1 |
adult-#3 | {25, 4} | {4, 1} | Float64 | 1 | 3.52µs | 200 | 0.1 |
adult-#3 | {25, 4} | {4, 1} | BFloat16 | 1 | 3.50µs | 200 | 0.1 |
adult-#3 | {25, 4} | {4, 1} | Float16 | 1 | 3.69µs | 200 | 0.1 |
adult-#4 | {25, 69} | {69, 4} | Float32 | 1 | 6.41µs | 13,800 | 2.2 |
adult-#4 | {25, 69} | {69, 4} | Float64 | 1 | 6.44µs | 13,800 | 2.1 |
adult-#4 | {25, 69} | {69, 4} | BFloat16 | 1 | 8.41µs | 13,800 | 1.6 |
adult-#4 | {25, 69} | {69, 4} | Float16 | 1 | 19.84µs | 13,800 | 0.7 |
adult-#5 | {49, 4} | {4, 1} | Float32 | 1 | 3.46µs | 392 | 0.1 |
adult-#5 | {49, 4} | {4, 1} | Float64 | 1 | 3.51µs | 392 | 0.1 |
adult-#5 | {49, 4} | {4, 1} | BFloat16 | 1 | 3.50µs | 392 | 0.1 |
adult-#5 | {49, 4} | {4, 1} | Float16 | 1 | 3.97µs | 392 | 0.1 |
adult-#6 | {49, 69} | {69, 4} | Float32 | 1 | 8.69µs | 27,048 | 3.1 |
adult-#6 | {49, 69} | {69, 4} | Float64 | 1 | 8.66µs | 27,048 | 3.1 |
adult-#6 | {49, 69} | {69, 4} | BFloat16 | 1 | 12.57µs | 27,048 | 2.2 |
adult-#6 | {49, 69} | {69, 4} | Float16 | 1 | 34.48µs | 27,048 | 0.8 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float32 | 1 | 9.06µs | 27,048 | 3.0 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float64 | 1 | 9.51µs | 27,048 | 2.8 |
adult-#6-Normalized | {49, 69} | {4, 69} | BFloat16 | 1 | 15.33µs | 27,048 | 1.8 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float16 | 1 | 35.70µs | 27,048 | 0.8 |
With the experimental SIMD support enabled for AVX-512, we get the following updated results:
| Test Name | LHS Dims | RHS Dims | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
|---|---|---|---|---|---|---|---|
NoBatch-Medium | {128, 128} | {128, 256} | Float32 | 1 | 9.07µs | 8,388,608 | 924.5 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float32 | 1 | 2.95ms | 6,039,797,760 | 2045.5 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float32 | 1 | 11.31µs | 4,194,304 | 370.8 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float32 | 1024 | 806.62µs | 1,048,576 | 1.3 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float32 | 256 | 171.54µs | 2,097,152 | 12.2 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float32 | 16 | 48.60ms | 96,636,764,160 | 1988.3 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float32 | 192 | 113.87µs | 2,076,672 | 18.2 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float32 | 192 | 138.35µs | 2,076,672 | 15.0 |
### Backend: xla:cpu / CPU AMD 9950X3D / 2026-01-07 / PJRT Plugin v0.83.4 (see pjrt-cpu-binaries)
```bash
GOMLX_BACKEND=xla:cpu go test -tags=perf ./backends/simplego/ \
    -test.run=TestDotGeneral_PerformanceTable -test.v -test.count=1 \
    -markdown
```

| Test Name | LHS Dims | RHS Dims | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
|---|---|---|---|---|---|---|---|
NoBatch-Tiny | {128, 4} | {4, 1} | Float32 | 1 | 4.62µs | 1,024 | 0.2 |
NoBatch-Tiny | {128, 4} | {4, 1} | Float64 | 1 | 4.58µs | 1,024 | 0.2 |
NoBatch-Tiny | {128, 4} | {4, 1} | BFloat16 | 1 | 4.83µs | 1,024 | 0.2 |
NoBatch-Tiny | {128, 4} | {4, 1} | Float16 | 1 | 5.12µs | 1,024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float32 | 1 | 4.55µs | 1,024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float64 | 1 | 4.20µs | 1,024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | BFloat16 | 1 | 4.51µs | 1,024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float16 | 1 | 4.46µs | 1,024 | 0.2 |
NoBatch-Small | {16, 128} | {128, 32} | Float32 | 1 | 5.48µs | 131,072 | 23.9 |
NoBatch-Small | {16, 128} | {128, 32} | Float64 | 1 | 9.63µs | 131,072 | 13.6 |
NoBatch-Small | {16, 128} | {128, 32} | BFloat16 | 1 | 6.25µs | 131,072 | 21.0 |
NoBatch-Small | {16, 128} | {128, 32} | Float16 | 1 | 6.39µs | 131,072 | 20.5 |
NoBatch-Medium | {128, 128} | {128, 256} | Float32 | 1 | 31.59µs | 8,388,608 | 265.5 |
NoBatch-Medium | {128, 128} | {128, 256} | Float64 | 1 | 70.09µs | 8,388,608 | 119.7 |
NoBatch-Medium | {128, 128} | {128, 256} | BFloat16 | 1 | 35.01µs | 8,388,608 | 239.6 |
NoBatch-Medium | {128, 128} | {128, 256} | Float16 | 1 | 35.33µs | 8,388,608 | 237.5 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float32 | 1 | 1.75ms | 6,039,797,760 | 3456.4 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float64 | 1 | 16.79ms | 6,039,797,760 | 359.8 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | BFloat16 | 1 | 2.83ms | 6,039,797,760 | 2134.5 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float16 | 1 | 2.77ms | 6,039,797,760 | 2184.4 |
R-Unbalanced-Cross | {128} | {128, 256} | Float32 | 1 | 6.14µs | 65,536 | 10.7 |
R-Unbalanced-Cross | {128} | {128, 256} | Float64 | 1 | 6.17µs | 65,536 | 10.6 |
R-Unbalanced-Cross | {128} | {128, 256} | BFloat16 | 1 | 11.15µs | 65,536 | 5.9 |
R-Unbalanced-Cross | {128} | {128, 256} | Float16 | 1 | 11.87µs | 65,536 | 5.5 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float32 | 1 | 15.05µs | 4,194,304 | 278.7 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float64 | 1 | 81.84µs | 4,194,304 | 51.3 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | BFloat16 | 1 | 49.63µs | 4,194,304 | 84.5 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float16 | 1 | 39.84µs | 4,194,304 | 105.3 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float32 | 1024 | 121.14µs | 1,048,576 | 8.7 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float64 | 1024 | 67.71µs | 1,048,576 | 15.5 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | BFloat16 | 1024 | 140.54µs | 1,048,576 | 7.5 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float16 | 1024 | 135.41µs | 1,048,576 | 7.7 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float32 | 256 | 27.18µs | 2,097,152 | 77.2 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float64 | 256 | 123.81µs | 2,097,152 | 16.9 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | BFloat16 | 256 | 61.14µs | 2,097,152 | 34.3 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float16 | 256 | 64.62µs | 2,097,152 | 32.5 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float32 | 64 | 59.80µs | 67,108,864 | 1122.2 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float64 | 64 | 849.47µs | 67,108,864 | 79.0 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | BFloat16 | 64 | 119.78µs | 67,108,864 | 560.3 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float16 | 64 | 123.86µs | 67,108,864 | 541.8 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float32 | 16 | 51.30ms | 96,636,764,160 | 1883.7 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float64 | 16 | 220.27ms | 96,636,764,160 | 438.7 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | BFloat16 | 16 | 72.52ms | 96,636,764,160 | 1332.6 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float16 | 16 | 71.93ms | 96,636,764,160 | 1343.6 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float32 | 192 | 13.93µs | 2,076,672 | 149.0 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float64 | 192 | 118.63µs | 2,076,672 | 17.5 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | BFloat16 | 192 | 63.88µs | 2,076,672 | 32.5 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float16 | 192 | 47.36µs | 2,076,672 | 43.9 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float32 | 192 | 48.02µs | 2,076,672 | 43.2 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float64 | 192 | 121.54µs | 2,076,672 | 17.1 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | BFloat16 | 192 | 81.54µs | 2,076,672 | 25.5 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float16 | 192 | 82.65µs | 2,076,672 | 25.1 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float32 | 1 | 228.54µs | 245,366,784 | 1073.6 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float64 | 1 | 882.13µs | 245,366,784 | 278.2 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | BFloat16 | 1 | 374.36µs | 245,366,784 | 655.4 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float16 | 1 | 371.05µs | 245,366,784 | 661.3 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float32 | 1 | 147.57µs | 245,366,784 | 1662.7 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float64 | 1 | 881.09µs | 245,366,784 | 278.5 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | BFloat16 | 1 | 282.59µs | 245,366,784 | 868.3 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float16 | 1 | 279.47µs | 245,366,784 | 878.0 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float32 | 1 | 67.69µs | 61,341,696 | 906.3 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float64 | 1 | 282.96µs | 61,341,696 | 216.8 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | BFloat16 | 1 | 112.21µs | 61,341,696 | 546.7 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float16 | 1 | 117.72µs | 61,341,696 | 521.1 |
adult-#1 | {128, 4} | {4, 1} | Float32 | 1 | 4.72µs | 1,024 | 0.2 |
adult-#1 | {128, 4} | {4, 1} | Float64 | 1 | 4.24µs | 1,024 | 0.2 |
adult-#1 | {128, 4} | {4, 1} | BFloat16 | 1 | 4.27µs | 1,024 | 0.2 |
adult-#1 | {128, 4} | {4, 1} | Float16 | 1 | 4.21µs | 1,024 | 0.2 |
adult-#2 | {128, 69} | {69, 4} | Float32 | 1 | 5.98µs | 70,656 | 11.8 |
adult-#2 | {128, 69} | {69, 4} | Float64 | 1 | 7.87µs | 70,656 | 9.0 |
adult-#2 | {128, 69} | {69, 4} | BFloat16 | 1 | 6.34µs | 70,656 | 11.1 |
adult-#2 | {128, 69} | {69, 4} | Float16 | 1 | 6.27µs | 70,656 | 11.3 |
adult-#3 | {25, 4} | {4, 1} | Float32 | 1 | 2.28µs | 200 | 0.1 |
adult-#3 | {25, 4} | {4, 1} | Float64 | 1 | 2.29µs | 200 | 0.1 |
adult-#3 | {25, 4} | {4, 1} | BFloat16 | 1 | 2.32µs | 200 | 0.1 |
adult-#3 | {25, 4} | {4, 1} | Float16 | 1 | 2.35µs | 200 | 0.1 |
adult-#4 | {25, 69} | {69, 4} | Float32 | 1 | 5.63µs | 13,800 | 2.5 |
adult-#4 | {25, 69} | {69, 4} | Float64 | 1 | 5.88µs | 13,800 | 2.3 |
adult-#4 | {25, 69} | {69, 4} | BFloat16 | 1 | 5.90µs | 13,800 | 2.3 |
adult-#4 | {25, 69} | {69, 4} | Float16 | 1 | 5.97µs | 13,800 | 2.3 |
adult-#5 | {49, 4} | {4, 1} | Float32 | 1 | 2.33µs | 392 | 0.2 |
adult-#5 | {49, 4} | {4, 1} | Float64 | 1 | 2.26µs | 392 | 0.2 |
adult-#5 | {49, 4} | {4, 1} | BFloat16 | 1 | 2.34µs | 392 | 0.2 |
adult-#5 | {49, 4} | {4, 1} | Float16 | 1 | 2.33µs | 392 | 0.2 |
adult-#6 | {49, 69} | {69, 4} | Float32 | 1 | 5.87µs | 27,048 | 4.6 |
adult-#6 | {49, 69} | {69, 4} | Float64 | 1 | 5.95µs | 27,048 | 4.5 |
adult-#6 | {49, 69} | {69, 4} | BFloat16 | 1 | 6.26µs | 27,048 | 4.3 |
adult-#6 | {49, 69} | {69, 4} | Float16 | 1 | 6.39µs | 27,048 | 4.2 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float32 | 1 | 5.96µs | 27,048 | 4.5 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float64 | 1 | 6.07µs | 27,048 | 4.5 |
adult-#6-Normalized | {49, 69} | {4, 69} | BFloat16 | 1 | 6.03µs | 27,048 | 4.5 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float16 | 1 | 6.09µs | 27,048 | 4.4 |
### Backend: xla:cuda / GPU RTX 5090 / 2026-01-07 / PJRT Plugin v0.81 (Jax-build)
```bash
GOMLX_BACKEND=xla:cuda go test -tags=perf ./backends/simplego/ \
    -test.run=TestDotGeneral_PerformanceTable -test.v -test.count=1 \
    -markdown
```

| Test Name | LHS Dims | RHS Dims | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
|---|---|---|---|---|---|---|---|
NoBatch-Tiny | {128, 4} | {4, 1} | Float32 | 1 | 17.53µs | 1,024 | 0.1 |
NoBatch-Tiny | {128, 4} | {4, 1} | Float64 | 1 | 16.42µs | 1,024 | 0.1 |
NoBatch-Tiny | {128, 4} | {4, 1} | BFloat16 | 1 | 15.72µs | 1,024 | 0.1 |
NoBatch-Tiny | {128, 4} | {4, 1} | Float16 | 1 | 14.97µs | 1,024 | 0.1 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float32 | 1 | 13.44µs | 1,024 | 0.1 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float64 | 1 | 17.53µs | 1,024 | 0.1 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | BFloat16 | 1 | 26.20µs | 1,024 | 0.0 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float16 | 1 | 15.92µs | 1,024 | 0.1 |
NoBatch-Small | {16, 128} | {128, 32} | Float32 | 1 | 27.23µs | 131,072 | 4.8 |
NoBatch-Small | {16, 128} | {128, 32} | Float64 | 1 | 60.17µs | 131,072 | 2.2 |
NoBatch-Small | {16, 128} | {128, 32} | BFloat16 | 1 | 32.50µs | 131,072 | 4.0 |
NoBatch-Small | {16, 128} | {128, 32} | Float16 | 1 | 23.79µs | 131,072 | 5.5 |
NoBatch-Medium | {128, 128} | {128, 256} | Float32 | 1 | 26.76µs | 8,388,608 | 313.5 |
NoBatch-Medium | {128, 128} | {128, 256} | Float64 | 1 | 60.90µs | 8,388,608 | 137.7 |
NoBatch-Medium | {128, 128} | {128, 256} | BFloat16 | 1 | 26.77µs | 8,388,608 | 313.4 |
NoBatch-Medium | {128, 128} | {128, 256} | Float16 | 1 | 26.01µs | 8,388,608 | 322.5 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float32 | 1 | 121.00µs | 6,039,797,760 | 49915.7 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float64 | 1 | 3.72ms | 6,039,797,760 | 1624.6 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | BFloat16 | 1 | 91.86µs | 6,039,797,760 | 65751.5 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float16 | 1 | 76.70µs | 6,039,797,760 | 78741.6 |
R-Unbalanced-Cross | {128} | {128, 256} | Float32 | 1 | 15.28µs | 65,536 | 4.3 |
R-Unbalanced-Cross | {128} | {128, 256} | Float64 | 1 | 84.30µs | 65,536 | 0.8 |
R-Unbalanced-Cross | {128} | {128, 256} | BFloat16 | 1 | 19.00µs | 65,536 | 3.4 |
R-Unbalanced-Cross | {128} | {128, 256} | Float16 | 1 | 18.36µs | 65,536 | 3.6 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float32 | 1 | 24.77µs | 4,194,304 | 169.3 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float64 | 1 | 41.32µs | 4,194,304 | 101.5 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | BFloat16 | 1 | 224.48µs | 4,194,304 | 18.7 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float16 | 1 | 19.41µs | 4,194,304 | 216.1 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float32 | 1024 | 14.82µs | 1,048,576 | 70.8 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float64 | 1024 | 27.46µs | 1,048,576 | 38.2 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | BFloat16 | 1024 | 14.84µs | 1,048,576 | 70.6 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float16 | 1024 | 13.19µs | 1,048,576 | 79.5 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float32 | 256 | 17.99µs | 2,097,152 | 116.6 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float64 | 256 | 29.11µs | 2,097,152 | 72.0 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | BFloat16 | 256 | 16.83µs | 2,097,152 | 124.6 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float16 | 256 | 13.51µs | 2,097,152 | 155.3 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float32 | 64 | 28.59µs | 67,108,864 | 2347.1 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float64 | 64 | 99.14µs | 67,108,864 | 676.9 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | BFloat16 | 64 | 24.59µs | 67,108,864 | 2728.7 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float16 | 64 | 25.75µs | 67,108,864 | 2605.9 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float32 | 16 | 927.42µs | 96,636,764,160 | 104199.5 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float64 | 16 | 57.21ms | 96,636,764,160 | 1689.2 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | BFloat16 | 16 | 473.19µs | 96,636,764,160 | 204223.6 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float16 | 16 | 476.39µs | 96,636,764,160 | 202853.9 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float32 | 192 | 29.05µs | 2,076,672 | 71.5 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float64 | 192 | 36.93µs | 2,076,672 | 56.2 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | BFloat16 | 192 | 18.15µs | 2,076,672 | 114.4 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float16 | 192 | 17.17µs | 2,076,672 | 120.9 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float32 | 192 | 23.30µs | 2,076,672 | 89.1 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float64 | 192 | 41.81µs | 2,076,672 | 49.7 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | BFloat16 | 192 | 18.89µs | 2,076,672 | 109.9 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float16 | 192 | 17.59µs | 2,076,672 | 118.0 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float32 | 1 | 36.02µs | 245,366,784 | 6812.3 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float64 | 1 | 374.37µs | 245,366,784 | 655.4 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | BFloat16 | 1 | 39.73µs | 245,366,784 | 6176.0 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float16 | 1 | 31.60µs | 245,366,784 | 7764.8 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float32 | 1 | 35.77µs | 245,366,784 | 6858.6 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float64 | 1 | 205.18µs | 245,366,784 | 1195.8 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | BFloat16 | 1 | 70.33µs | 245,366,784 | 3488.8 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float16 | 1 | 26.47µs | 245,366,784 | 9268.2 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float32 | 1 | 86.77µs | 61,341,696 | 707.0 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float64 | 1 | 114.17µs | 61,341,696 | 537.3 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | BFloat16 | 1 | 26.20µs | 61,341,696 | 2340.9 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float16 | 1 | 26.57µs | 61,341,696 | 2309.1 |
adult-#1 | {128, 4} | {4, 1} | Float32 | 1 | 13.96µs | 1,024 | 0.1 |
adult-#1 | {128, 4} | {4, 1} | Float64 | 1 | 14.06µs | 1,024 | 0.1 |
adult-#1 | {128, 4} | {4, 1} | BFloat16 | 1 | 14.56µs | 1,024 | 0.1 |
adult-#1 | {128, 4} | {4, 1} | Float16 | 1 | 15.22µs | 1,024 | 0.1 |
adult-#2 | {128, 69} | {69, 4} | Float32 | 1 | 22.74µs | 70,656 | 3.1 |
adult-#2 | {128, 69} | {69, 4} | Float64 | 1 | 46.36µs | 70,656 | 1.5 |
adult-#2 | {128, 69} | {69, 4} | BFloat16 | 1 | 17.25µs | 70,656 | 4.1 |
adult-#2 | {128, 69} | {69, 4} | Float16 | 1 | 22.23µs | 70,656 | 3.2 |
adult-#3 | {25, 4} | {4, 1} | Float32 | 1 | 13.62µs | 200 | 0.0 |
adult-#3 | {25, 4} | {4, 1} | Float64 | 1 | 16.46µs | 200 | 0.0 |
adult-#3 | {25, 4} | {4, 1} | BFloat16 | 1 | 13.80µs | 200 | 0.0 |
adult-#3 | {25, 4} | {4, 1} | Float16 | 1 | 13.51µs | 200 | 0.0 |
adult-#4 | {25, 69} | {69, 4} | Float32 | 1 | 15.01µs | 13,800 | 0.9 |
adult-#4 | {25, 69} | {69, 4} | Float64 | 1 | 49.26µs | 13,800 | 0.3 |
adult-#4 | {25, 69} | {69, 4} | BFloat16 | 1 | 21.87µs | 13,800 | 0.6 |
adult-#4 | {25, 69} | {69, 4} | Float16 | 1 | 17.55µs | 13,800 | 0.8 |
adult-#5 | {49, 4} | {4, 1} | Float32 | 1 | 13.25µs | 392 | 0.0 |
adult-#5 | {49, 4} | {4, 1} | Float64 | 1 | 15.64µs | 392 | 0.0 |
adult-#5 | {49, 4} | {4, 1} | BFloat16 | 1 | 13.40µs | 392 | 0.0 |
adult-#5 | {49, 4} | {4, 1} | Float16 | 1 | 13.97µs | 392 | 0.0 |
adult-#6 | {49, 69} | {69, 4} | Float32 | 1 | 27.97µs | 27,048 | 1.0 |
adult-#6 | {49, 69} | {69, 4} | Float64 | 1 | 46.09µs | 27,048 | 0.6 |
adult-#6 | {49, 69} | {69, 4} | BFloat16 | 1 | 21.03µs | 27,048 | 1.3 |
adult-#6 | {49, 69} | {69, 4} | Float16 | 1 | 19.12µs | 27,048 | 1.4 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float32 | 1 | 15.79µs | 27,048 | 1.7 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float64 | 1 | 49.39µs | 27,048 | 0.5 |
adult-#6-Normalized | {49, 69} | {4, 69} | BFloat16 | 1 | 15.07µs | 27,048 | 1.8 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float16 | 1 | 17.34µs | 27,048 | 1.6 |