This document is generated by running the TestDotGeneral_PerformanceTable test.
To regenerate it, run the following command from the root of the repository (this example selects the simplego backend, `GOMLX_BACKEND=go`):

```bash
GOMLX_BACKEND=go go test -tags=perf ./internal/perf/ \
-test.run=TestDotGeneral_PerformanceTable -test.v -test.count=1 \
-markdown
```
Results
Backend: go (new AVX-512 matmul implementation) / CPU AMD 9950X3D / 2026-05-11, go 1.26.3, Ubuntu 26.04, PJRT Plugin 0.104
| Test Name | LHS Dims | RHS Dims | Layout | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
|---|---|---|---|---|---|---|---|---|
NoBatch-Tiny | {128, 4} | {4, 1} | NonTransposed | Float32 | 1 | 4.4µs | 1_024 | 0.2 |
NoBatch-Tiny | {128, 4} | {4, 1} | NonTransposed | Float64 | 1 | 4.2µs | 1_024 | 0.2 |
NoBatch-Tiny | {128, 4} | {4, 1} | NonTransposed | BFloat16 | 1 | 6.1µs | 1_024 | 0.2 |
NoBatch-Tiny | {128, 4} | {4, 1} | NonTransposed | Float16 | 1 | 7.8µs | 1_024 | 0.1 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Transposed | Float32 | 1 | 4.9µs | 1_024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Transposed | Float64 | 1 | 4.6µs | 1_024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Transposed | BFloat16 | 1 | 7.9µs | 1_024 | 0.1 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Transposed | Float16 | 1 | 6.9µs | 1_024 | 0.1 |
NoBatch-Small | {16, 128} | {128, 32} | NonTransposed | Float32 | 1 | 8.2µs | 131_072 | 15.9 |
NoBatch-Small | {16, 128} | {128, 32} | NonTransposed | Float64 | 1 | 10.8µs | 131_072 | 12.1 |
NoBatch-Small | {16, 128} | {128, 32} | NonTransposed | BFloat16 | 1 | 51.1µs | 131_072 | 2.6 |
NoBatch-Small | {16, 128} | {128, 32} | NonTransposed | Float16 | 1 | 67.7µs | 131_072 | 1.9 |
NoBatch-Medium | {128, 128} | {128, 256} | NonTransposed | Float32 | 1 | 194.7µs | 8_388_608 | 43.1 |
NoBatch-Medium | {128, 128} | {128, 256} | NonTransposed | Float64 | 1 | 223.4µs | 8_388_608 | 37.6 |
NoBatch-Medium | {128, 128} | {128, 256} | NonTransposed | BFloat16 | 1 | 156µs | 8_388_608 | 53.8 |
NoBatch-Medium | {128, 128} | {128, 256} | NonTransposed | Float16 | 1 | 566.2µs | 8_388_608 | 14.8 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | NonTransposed | Float32 | 1 | 3.9ms | 6_039_797_760 | 1536.5 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | NonTransposed | Float64 | 1 | 7.7ms | 6_039_797_760 | 785.9 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | NonTransposed | BFloat16 | 1 | 5.5ms | 6_039_797_760 | 1092.2 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | NonTransposed | Float16 | 1 | 28.6ms | 6_039_797_760 | 211.5 |
R-Unbalanced-Cross | {128} | {128, 256} | NonTransposed | Float32 | 1 | 10.5µs | 65_536 | 6.2 |
R-Unbalanced-Cross | {128} | {128, 256} | NonTransposed | Float64 | 1 | 10.2µs | 65_536 | 6.5 |
R-Unbalanced-Cross | {128} | {128, 256} | NonTransposed | BFloat16 | 1 | 11.4µs | 65_536 | 5.8 |
R-Unbalanced-Cross | {128} | {128, 256} | NonTransposed | Float16 | 1 | 17.8µs | 65_536 | 3.7 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | NonTransposed | Float32 | 1 | 755.9µs | 4_194_304 | 5.5 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | NonTransposed | Float64 | 1 | 253.2µs | 4_194_304 | 16.6 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | NonTransposed | BFloat16 | 1 | 1.3ms | 4_194_304 | 3.2 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | NonTransposed | Float16 | 1 | 1.7ms | 4_194_304 | 2.5 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | NonTransposed | Float32 | 1024 | 181.4µs | 1_048_576 | 5.8 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | NonTransposed | Float64 | 1024 | 160µs | 1_048_576 | 6.6 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | NonTransposed | BFloat16 | 1024 | 242µs | 1_048_576 | 4.3 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | NonTransposed | Float16 | 1024 | 405.2µs | 1_048_576 | 2.6 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | NonTransposed | Float32 | 256 | 128.8µs | 2_097_152 | 16.3 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | NonTransposed | Float64 | 256 | 115.8µs | 2_097_152 | 18.1 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | NonTransposed | BFloat16 | 256 | 179.8µs | 2_097_152 | 11.7 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | NonTransposed | Float16 | 256 | 256.8µs | 2_097_152 | 8.2 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Transposed | Float32 | 64 | 211.1µs | 67_108_864 | 317.9 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Transposed | Float64 | 64 | 288.5µs | 67_108_864 | 232.6 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Transposed | BFloat16 | 64 | 292.6µs | 67_108_864 | 229.4 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Transposed | Float16 | 64 | 903.4µs | 67_108_864 | 74.3 |
Batched-Large-1 | {16, 1536, 1920} | {16, 1920, 1024} | NonTransposed | Float32 | 16 | 52.2ms | 96_636_764_160 | 1852.3 |
Batched-Large-1 | {16, 1536, 1920} | {16, 1920, 1024} | NonTransposed | Float64 | 16 | 108.1ms | 96_636_764_160 | 893.9 |
Batched-Large-1 | {16, 1536, 1920} | {16, 1920, 1024} | NonTransposed | BFloat16 | 16 | 72.1ms | 96_636_764_160 | 1340.0 |
Batched-Large-1 | {16, 1536, 1920} | {16, 1920, 1024} | NonTransposed | Float16 | 16 | 403.8ms | 96_636_764_160 | 239.3 |
Batched-Large-2 | {16, 1024, 1920} | {16, 1920, 1536} | NonTransposed | Float32 | 16 | 53.2ms | 96_636_764_160 | 1817.7 |
Batched-Large-2 | {16, 1024, 1920} | {16, 1920, 1536} | NonTransposed | Float64 | 16 | 112.4ms | 96_636_764_160 | 859.7 |
Batched-Large-2 | {16, 1024, 1920} | {16, 1920, 1536} | NonTransposed | BFloat16 | 16 | 71.6ms | 96_636_764_160 | 1349.0 |
Batched-Large-2 | {16, 1024, 1920} | {16, 1920, 1536} | NonTransposed | Float16 | 16 | 394.2ms | 96_636_764_160 | 245.1 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | NonTransposed | Float32 | 192 | 165.4µs | 2_076_672 | 12.6 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | NonTransposed | Float64 | 192 | 226.5µs | 2_076_672 | 9.2 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | NonTransposed | BFloat16 | 192 | 266.3µs | 2_076_672 | 7.8 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | NonTransposed | Float16 | 192 | 369µs | 2_076_672 | 5.6 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | NonTransposed | Float32 | 192 | 130.3µs | 2_076_672 | 15.9 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | NonTransposed | Float64 | 192 | 202.5µs | 2_076_672 | 10.3 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | NonTransposed | BFloat16 | 192 | 206.3µs | 2_076_672 | 10.1 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | NonTransposed | Float16 | 192 | 249.5µs | 2_076_672 | 8.3 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | NonTransposed | Float32 | 1 | 1.7ms | 245_366_784 | 147.1 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | NonTransposed | Float64 | 1 | 2.6ms | 245_366_784 | 96.2 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | NonTransposed | BFloat16 | 1 | 2.4ms | 245_366_784 | 103.9 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | NonTransposed | Float16 | 1 | 13.3ms | 245_366_784 | 18.5 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | NonTransposed | Float32 | 1 | 768.4µs | 245_366_784 | 319.3 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | NonTransposed | Float64 | 1 | 1.2ms | 245_366_784 | 211.7 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | NonTransposed | BFloat16 | 1 | 1.6ms | 245_366_784 | 151.6 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | NonTransposed | Float16 | 1 | 7.6ms | 245_366_784 | 32.2 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | NonTransposed | Float32 | 1 | 477.6µs | 61_341_696 | 128.4 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | NonTransposed | Float64 | 1 | 719µs | 61_341_696 | 85.3 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | NonTransposed | BFloat16 | 1 | 627µs | 61_341_696 | 97.8 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | NonTransposed | Float16 | 1 | 3.7ms | 61_341_696 | 16.4 |
adult-#1 | {128, 4} | {4, 1} | NonTransposed | Float32 | 1 | 8.3µs | 1_024 | 0.1 |
adult-#1 | {128, 4} | {4, 1} | NonTransposed | Float64 | 1 | 7.5µs | 1_024 | 0.1 |
adult-#1 | {128, 4} | {4, 1} | NonTransposed | BFloat16 | 1 | 7.4µs | 1_024 | 0.1 |
adult-#1 | {128, 4} | {4, 1} | NonTransposed | Float16 | 1 | 7.8µs | 1_024 | 0.1 |
adult-#2 | {128, 69} | {69, 4} | NonTransposed | Float32 | 1 | 15µs | 70_656 | 4.7 |
adult-#2 | {128, 69} | {69, 4} | NonTransposed | Float64 | 1 | 15.7µs | 70_656 | 4.5 |
adult-#2 | {128, 69} | {69, 4} | NonTransposed | BFloat16 | 1 | 30µs | 70_656 | 2.4 |
adult-#2 | {128, 69} | {69, 4} | NonTransposed | Float16 | 1 | 35.8µs | 70_656 | 2.0 |
adult-#3 | {25, 4} | {4, 1} | NonTransposed | Float32 | 1 | 4.6µs | 200 | 0.0 |
adult-#3 | {25, 4} | {4, 1} | NonTransposed | Float64 | 1 | 4.2µs | 200 | 0.0 |
adult-#3 | {25, 4} | {4, 1} | NonTransposed | BFloat16 | 1 | 6.7µs | 200 | 0.0 |
adult-#3 | {25, 4} | {4, 1} | NonTransposed | Float16 | 1 | 6.4µs | 200 | 0.0 |
adult-#4 | {25, 69} | {69, 4} | NonTransposed | Float32 | 1 | 7.1µs | 13_800 | 1.9 |
adult-#4 | {25, 69} | {69, 4} | NonTransposed | Float64 | 1 | 6.3µs | 13_800 | 2.2 |
adult-#4 | {25, 69} | {69, 4} | NonTransposed | BFloat16 | 1 | 10.4µs | 13_800 | 1.3 |
adult-#4 | {25, 69} | {69, 4} | NonTransposed | Float16 | 1 | 14.4µs | 13_800 | 1.0 |
adult-#5 | {49, 4} | {4, 1} | NonTransposed | Float32 | 1 | 5.9µs | 392 | 0.1 |
adult-#5 | {49, 4} | {4, 1} | NonTransposed | Float64 | 1 | 3.8µs | 392 | 0.1 |
adult-#5 | {49, 4} | {4, 1} | NonTransposed | BFloat16 | 1 | 5.1µs | 392 | 0.1 |
adult-#5 | {49, 4} | {4, 1} | NonTransposed | Float16 | 1 | 5.7µs | 392 | 0.1 |
adult-#6 | {49, 69} | {69, 4} | NonTransposed | Float32 | 1 | 9.5µs | 27_048 | 2.8 |
adult-#6 | {49, 69} | {69, 4} | NonTransposed | Float64 | 1 | 8.7µs | 27_048 | 3.1 |
adult-#6 | {49, 69} | {69, 4} | NonTransposed | BFloat16 | 1 | 14.5µs | 27_048 | 1.9 |
adult-#6 | {49, 69} | {69, 4} | NonTransposed | Float16 | 1 | 19.1µs | 27_048 | 1.4 |
adult-#6-Normalized | {49, 69} | {4, 69} | Transposed | Float32 | 1 | 8.9µs | 27_048 | 3.0 |
adult-#6-Normalized | {49, 69} | {4, 69} | Transposed | Float64 | 1 | 7.9µs | 27_048 | 3.4 |
adult-#6-Normalized | {49, 69} | {4, 69} | Transposed | BFloat16 | 1 | 10.4µs | 27_048 | 2.6 |
adult-#6-Normalized | {49, 69} | {4, 69} | Transposed | Float16 | 1 | 14.7µs | 27_048 | 1.8 |
Backend: go / CPU AMD 9950X3D / 2026-05-06, go 1.26.2, Ubuntu 26.04, PJRT Plugin 0.104
```bash
GOMLX_BACKEND=go go test -tags=perf ./internal/perf/ \
-test.run=TestDotGeneral_PerformanceTable -test.v -test.count=1 \
-markdown
```
| Test Name | LHS Dims | RHS Dims | Layout | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
|---|---|---|---|---|---|---|---|---|
NoBatch-Tiny | {128, 4} | {4, 1} | NonTransposed | Float32 | 1 | 4.6µs | 1_024 | 0.2 |
NoBatch-Tiny | {128, 4} | {4, 1} | NonTransposed | Float64 | 1 | 4µs | 1_024 | 0.3 |
NoBatch-Tiny | {128, 4} | {4, 1} | NonTransposed | BFloat16 | 1 | 14.3µs | 1_024 | 0.1 |
NoBatch-Tiny | {128, 4} | {4, 1} | NonTransposed | Float16 | 1 | 23.4µs | 1_024 | 0.0 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Transposed | Float32 | 1 | 6.4µs | 1_024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Transposed | Float64 | 1 | 16.3µs | 1_024 | 0.1 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Transposed | BFloat16 | 1 | 107.2µs | 1_024 | 0.0 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Transposed | Float16 | 1 | 15.1µs | 1_024 | 0.1 |
NoBatch-Small | {16, 128} | {128, 32} | NonTransposed | Float32 | 1 | 38.5µs | 131_072 | 3.4 |
NoBatch-Small | {16, 128} | {128, 32} | NonTransposed | Float64 | 1 | 28.6µs | 131_072 | 4.6 |
NoBatch-Small | {16, 128} | {128, 32} | NonTransposed | BFloat16 | 1 | 138.5µs | 131_072 | 0.9 |
NoBatch-Small | {16, 128} | {128, 32} | NonTransposed | Float16 | 1 | 177.5µs | 131_072 | 0.7 |
NoBatch-Medium | {128, 128} | {128, 256} | NonTransposed | Float32 | 1 | 161µs | 8_388_608 | 52.1 |
NoBatch-Medium | {128, 128} | {128, 256} | NonTransposed | Float64 | 1 | 369.1µs | 8_388_608 | 22.7 |
NoBatch-Medium | {128, 128} | {128, 256} | NonTransposed | BFloat16 | 1 | 664.7µs | 8_388_608 | 12.6 |
NoBatch-Medium | {128, 128} | {128, 256} | NonTransposed | Float16 | 1 | 2ms | 8_388_608 | 4.1 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | NonTransposed | Float32 | 1 | 16.6ms | 6_039_797_760 | 363.1 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | NonTransposed | Float64 | 1 | 49.3ms | 6_039_797_760 | 122.5 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | NonTransposed | BFloat16 | 1 | 123.4ms | 6_039_797_760 | 49.0 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | NonTransposed | Float16 | 1 | 442.5ms | 6_039_797_760 | 13.6 |
R-Unbalanced-Cross | {128} | {128, 256} | NonTransposed | Float32 | 1 | 23.2µs | 65_536 | 2.8 |
R-Unbalanced-Cross | {128} | {128, 256} | NonTransposed | Float64 | 1 | 29.8µs | 65_536 | 2.2 |
R-Unbalanced-Cross | {128} | {128, 256} | NonTransposed | BFloat16 | 1 | 127.8µs | 65_536 | 0.5 |
R-Unbalanced-Cross | {128} | {128, 256} | NonTransposed | Float16 | 1 | 158.7µs | 65_536 | 0.4 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | NonTransposed | Float32 | 1 | 225.9µs | 4_194_304 | 18.6 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | NonTransposed | Float64 | 1 | 1.5ms | 4_194_304 | 2.9 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | NonTransposed | BFloat16 | 1 | 555µs | 4_194_304 | 7.6 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | NonTransposed | Float16 | 1 | 1.1ms | 4_194_304 | 4.0 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | NonTransposed | Float32 | 1024 | 186.8µs | 1_048_576 | 5.6 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | NonTransposed | Float64 | 1024 | 216.6µs | 1_048_576 | 4.8 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | NonTransposed | BFloat16 | 1024 | 279.5µs | 1_048_576 | 3.8 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | NonTransposed | Float16 | 1024 | 457.5µs | 1_048_576 | 2.3 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | NonTransposed | Float32 | 256 | 454.8µs | 2_097_152 | 4.6 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | NonTransposed | Float64 | 256 | 482.2µs | 2_097_152 | 4.3 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | NonTransposed | BFloat16 | 256 | 548.6µs | 2_097_152 | 3.8 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | NonTransposed | Float16 | 256 | 677.4µs | 2_097_152 | 3.1 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Transposed | Float32 | 64 | 1ms | 67_108_864 | 66.7 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Transposed | Float64 | 64 | 1.1ms | 67_108_864 | 63.1 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Transposed | BFloat16 | 64 | 4.3ms | 67_108_864 | 15.7 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Transposed | Float16 | 64 | 6ms | 67_108_864 | 11.1 |
Batched-Large-1 | {16, 1536, 1920} | {16, 1920, 1024} | NonTransposed | Float32 | 16 | 240.9ms | 96_636_764_160 | 401.2 |
Batched-Large-1 | {16, 1536, 1920} | {16, 1920, 1024} | NonTransposed | Float64 | 16 | 771.6ms | 96_636_764_160 | 125.2 |
Batched-Large-1 | {16, 1536, 1920} | {16, 1920, 1024} | NonTransposed | BFloat16 | 16 | 1.8s | 96_636_764_160 | 52.8 |
Batched-Large-1 | {16, 1536, 1920} | {16, 1920, 1024} | NonTransposed | Float16 | 16 | 6.6s | 96_636_764_160 | 14.5 |
Batched-Large-2 | {16, 1024, 1920} | {16, 1920, 1536} | NonTransposed | Float32 | 16 | 295.5ms | 96_636_764_160 | 327.0 |
Batched-Large-2 | {16, 1024, 1920} | {16, 1920, 1536} | NonTransposed | Float64 | 16 | 781.6ms | 96_636_764_160 | 123.6 |
Batched-Large-2 | {16, 1024, 1920} | {16, 1920, 1536} | NonTransposed | BFloat16 | 16 | 1.8s | 96_636_764_160 | 53.3 |
Batched-Large-2 | {16, 1024, 1920} | {16, 1920, 1536} | NonTransposed | Float16 | 16 | 6.6s | 96_636_764_160 | 14.6 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | NonTransposed | Float32 | 192 | 310.2µs | 2_076_672 | 6.7 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | NonTransposed | Float64 | 192 | 403.7µs | 2_076_672 | 5.1 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | NonTransposed | BFloat16 | 192 | 497.4µs | 2_076_672 | 4.2 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | NonTransposed | Float16 | 192 | 665.5µs | 2_076_672 | 3.1 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | NonTransposed | Float32 | 192 | 304.5µs | 2_076_672 | 6.8 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | NonTransposed | Float64 | 192 | 360.8µs | 2_076_672 | 5.8 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | NonTransposed | BFloat16 | 192 | 436µs | 2_076_672 | 4.8 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | NonTransposed | Float16 | 192 | 519.6µs | 2_076_672 | 4.0 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | NonTransposed | Float32 | 1 | 2.5ms | 245_366_784 | 100.0 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | NonTransposed | Float64 | 1 | 7ms | 245_366_784 | 35.2 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | NonTransposed | BFloat16 | 1 | 12.5ms | 245_366_784 | 19.6 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | NonTransposed | Float16 | 1 | 27.8ms | 245_366_784 | 8.8 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | NonTransposed | Float32 | 1 | 35.7ms | 245_366_784 | 6.9 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | NonTransposed | Float64 | 1 | 5.4ms | 245_366_784 | 45.5 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | NonTransposed | BFloat16 | 1 | 6.3ms | 245_366_784 | 38.9 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | NonTransposed | Float16 | 1 | 20.5ms | 245_366_784 | 11.9 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | NonTransposed | Float32 | 1 | 648.1µs | 61_341_696 | 94.6 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | NonTransposed | Float64 | 1 | 1.4ms | 61_341_696 | 42.6 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | NonTransposed | BFloat16 | 1 | 2.3ms | 61_341_696 | 26.5 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | NonTransposed | Float16 | 1 | 7.2ms | 61_341_696 | 8.5 |
adult-#1 | {128, 4} | {4, 1} | NonTransposed | Float32 | 1 | 4.4µs | 1_024 | 0.2 |
adult-#1 | {128, 4} | {4, 1} | NonTransposed | Float64 | 1 | 15.9µs | 1_024 | 0.1 |
adult-#1 | {128, 4} | {4, 1} | NonTransposed | BFloat16 | 1 | 10.6µs | 1_024 | 0.1 |
adult-#1 | {128, 4} | {4, 1} | NonTransposed | Float16 | 1 | 25.9µs | 1_024 | 0.0 |
adult-#2 | {128, 69} | {69, 4} | NonTransposed | Float32 | 1 | 19.3µs | 70_656 | 3.7 |
adult-#2 | {128, 69} | {69, 4} | NonTransposed | Float64 | 1 | 18.8µs | 70_656 | 3.8 |
adult-#2 | {128, 69} | {69, 4} | NonTransposed | BFloat16 | 1 | 72.6µs | 70_656 | 1.0 |
adult-#2 | {128, 69} | {69, 4} | NonTransposed | Float16 | 1 | 132.3µs | 70_656 | 0.5 |
adult-#3 | {25, 4} | {4, 1} | NonTransposed | Float32 | 1 | 4µs | 200 | 0.1 |
adult-#3 | {25, 4} | {4, 1} | NonTransposed | Float64 | 1 | 6.3µs | 200 | 0.0 |
adult-#3 | {25, 4} | {4, 1} | NonTransposed | BFloat16 | 1 | 32.1µs | 200 | 0.0 |
adult-#3 | {25, 4} | {4, 1} | NonTransposed | Float16 | 1 | 10.9µs | 200 | 0.0 |
adult-#4 | {25, 69} | {69, 4} | NonTransposed | Float32 | 1 | 224.4µs | 13_800 | 0.1 |
adult-#4 | {25, 69} | {69, 4} | NonTransposed | Float64 | 1 | 7.1µs | 13_800 | 1.9 |
adult-#4 | {25, 69} | {69, 4} | NonTransposed | BFloat16 | 1 | 21.6µs | 13_800 | 0.6 |
adult-#4 | {25, 69} | {69, 4} | NonTransposed | Float16 | 1 | 25.4µs | 13_800 | 0.5 |
adult-#5 | {49, 4} | {4, 1} | NonTransposed | Float32 | 1 | 3.5µs | 392 | 0.1 |
adult-#5 | {49, 4} | {4, 1} | NonTransposed | Float64 | 1 | 4.7µs | 392 | 0.1 |
adult-#5 | {49, 4} | {4, 1} | NonTransposed | BFloat16 | 1 | 12.7µs | 392 | 0.0 |
adult-#5 | {49, 4} | {4, 1} | NonTransposed | Float16 | 1 | 9.4µs | 392 | 0.0 |
adult-#6 | {49, 69} | {69, 4} | NonTransposed | Float32 | 1 | 10.7µs | 27_048 | 2.5 |
adult-#6 | {49, 69} | {69, 4} | NonTransposed | Float64 | 1 | 10.7µs | 27_048 | 2.5 |
adult-#6 | {49, 69} | {69, 4} | NonTransposed | BFloat16 | 1 | 38µs | 27_048 | 0.7 |
adult-#6 | {49, 69} | {69, 4} | NonTransposed | Float16 | 1 | 43µs | 27_048 | 0.6 |
adult-#6-Normalized | {49, 69} | {4, 69} | Transposed | Float32 | 1 | 10.3µs | 27_048 | 2.6 |
adult-#6-Normalized | {49, 69} | {4, 69} | Transposed | Float64 | 1 | 24.3µs | 27_048 | 1.1 |
adult-#6-Normalized | {49, 69} | {4, 69} | Transposed | BFloat16 | 1 | 31.1µs | 27_048 | 0.9 |
adult-#6-Normalized | {49, 69} | {4, 69} | Transposed | Float16 | 1 | 38.5µs | 27_048 | 0.7 |
(Older results)
Backend: go / CPU AMD 9950X3D / 2026-01-17 (Go built from gotip, just after go1.26rc2 was made available)

```bash
GOMLX_BACKEND=go go test -tags=perf ./backends/simplego/ \
-test.run=TestDotGeneral_PerformanceTable -test.v -test.count=1 \
-markdown
```
| Test Name | LHS Dims | RHS Dims | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
|---|---|---|---|---|---|---|---|
NoBatch-Tiny | {128, 4} | {4, 1} | Float32 | 1 | 3.52µs | 1,024 | 0.3 |
NoBatch-Tiny | {128, 4} | {4, 1} | Float64 | 1 | 3.50µs | 1,024 | 0.3 |
NoBatch-Tiny | {128, 4} | {4, 1} | BFloat16 | 1 | 3.71µs | 1,024 | 0.3 |
NoBatch-Tiny | {128, 4} | {4, 1} | Float16 | 1 | 5.17µs | 1,024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float32 | 1 | 3.88µs | 1,024 | 0.3 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float64 | 1 | 3.92µs | 1,024 | 0.3 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | BFloat16 | 1 | 4.21µs | 1,024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float16 | 1 | 5.81µs | 1,024 | 0.2 |
NoBatch-Small | {16, 128} | {128, 32} | Float32 | 1 | 26.51µs | 131,072 | 4.9 |
NoBatch-Small | {16, 128} | {128, 32} | Float64 | 1 | 27.08µs | 131,072 | 4.8 |
NoBatch-Small | {16, 128} | {128, 32} | BFloat16 | 1 | 44.36µs | 131,072 | 3.0 |
NoBatch-Small | {16, 128} | {128, 32} | Float16 | 1 | 141.16µs | 131,072 | 0.9 |
NoBatch-Medium | {128, 128} | {128, 256} | Float32 | 1 | 130.12µs | 8,388,608 | 64.5 |
NoBatch-Medium | {128, 128} | {128, 256} | Float64 | 1 | 296.33µs | 8,388,608 | 28.3 |
NoBatch-Medium | {128, 128} | {128, 256} | BFloat16 | 1 | 656.75µs | 8,388,608 | 12.8 |
NoBatch-Medium | {128, 128} | {128, 256} | Float16 | 1 | 1.58ms | 8,388,608 | 5.3 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float32 | 1 | 17.76ms | 6,039,797,760 | 340.2 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float64 | 1 | 50.29ms | 6,039,797,760 | 120.1 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | BFloat16 | 1 | 124.82ms | 6,039,797,760 | 48.4 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float16 | 1 | 413.90ms | 6,039,797,760 | 14.6 |
R-Unbalanced-Cross | {128} | {128, 256} | Float32 | 1 | 22.95µs | 65,536 | 2.9 |
R-Unbalanced-Cross | {128} | {128, 256} | Float64 | 1 | 29.16µs | 65,536 | 2.2 |
R-Unbalanced-Cross | {128} | {128, 256} | BFloat16 | 1 | 26.57µs | 65,536 | 2.5 |
R-Unbalanced-Cross | {128} | {128, 256} | Float16 | 1 | 73.04µs | 65,536 | 0.9 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float32 | 1 | 212.43µs | 4,194,304 | 19.7 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float64 | 1 | 367.21µs | 4,194,304 | 11.4 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | BFloat16 | 1 | 598.47µs | 4,194,304 | 7.0 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float16 | 1 | 1.23ms | 4,194,304 | 3.4 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float32 | 1024 | 128.69µs | 1,048,576 | 8.1 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float64 | 1024 | 134.52µs | 1,048,576 | 7.8 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | BFloat16 | 1024 | 202.00µs | 1,048,576 | 5.2 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float16 | 1024 | 411.85µs | 1,048,576 | 2.5 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float32 | 256 | 504.32µs | 2,097,152 | 4.2 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float64 | 256 | 522.36µs | 2,097,152 | 4.0 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | BFloat16 | 256 | 555.87µs | 2,097,152 | 3.8 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float16 | 256 | 740.62µs | 2,097,152 | 2.8 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float32 | 64 | 947.11µs | 67,108,864 | 70.9 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float64 | 64 | 963.56µs | 67,108,864 | 69.6 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | BFloat16 | 64 | 1.92ms | 67,108,864 | 34.9 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float16 | 64 | 5.60ms | 67,108,864 | 12.0 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float32 | 16 | 240.61ms | 96,636,764,160 | 401.6 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float64 | 16 | 791.46ms | 96,636,764,160 | 122.1 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | BFloat16 | 16 | 1.87s | 96,636,764,160 | 51.6 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float16 | 16 | 6.32s | 96,636,764,160 | 15.3 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float32 | 192 | 314.50µs | 2,076,672 | 6.6 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float64 | 192 | 313.41µs | 2,076,672 | 6.6 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | BFloat16 | 192 | 389.75µs | 2,076,672 | 5.3 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float16 | 192 | 607.23µs | 2,076,672 | 3.4 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float32 | 192 | 323.64µs | 2,076,672 | 6.4 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float64 | 192 | 341.99µs | 2,076,672 | 6.1 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | BFloat16 | 192 | 386.01µs | 2,076,672 | 5.4 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float16 | 192 | 564.37µs | 2,076,672 | 3.7 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float32 | 1 | 2.62ms | 245,366,784 | 93.7 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float64 | 1 | 4.91ms | 245,366,784 | 49.9 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | BFloat16 | 1 | 9.18ms | 245,366,784 | 26.7 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float16 | 1 | 25.54ms | 245,366,784 | 9.6 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float32 | 1 | 2.18ms | 245,366,784 | 112.3 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float64 | 1 | 4.08ms | 245,366,784 | 60.1 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | BFloat16 | 1 | 7.41ms | 245,366,784 | 33.1 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float16 | 1 | 20.73ms | 245,366,784 | 11.8 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float32 | 1 | 698.36µs | 61,341,696 | 87.8 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float64 | 1 | 1.35ms | 61,341,696 | 45.5 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | BFloat16 | 1 | 2.40ms | 61,341,696 | 25.6 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float16 | 1 | 6.58ms | 61,341,696 | 9.3 |
adult-#1 | {128, 4} | {4, 1} | Float32 | 1 | 3.73µs | 1,024 | 0.3 |
adult-#1 | {128, 4} | {4, 1} | Float64 | 1 | 3.88µs | 1,024 | 0.3 |
adult-#1 | {128, 4} | {4, 1} | BFloat16 | 1 | 3.89µs | 1,024 | 0.3 |
adult-#1 | {128, 4} | {4, 1} | Float16 | 1 | 5.20µs | 1,024 | 0.2 |
adult-#2 | {128, 69} | {69, 4} | Float32 | 1 | 16.52µs | 70,656 | 4.3 |
adult-#2 | {128, 69} | {69, 4} | Float64 | 1 | 16.44µs | 70,656 | 4.3 |
adult-#2 | {128, 69} | {69, 4} | BFloat16 | 1 | 26.20µs | 70,656 | 2.7 |
adult-#2 | {128, 69} | {69, 4} | Float16 | 1 | 83.93µs | 70,656 | 0.8 |
adult-#3 | {25, 4} | {4, 1} | Float32 | 1 | 3.54µs | 200 | 0.1 |
adult-#3 | {25, 4} | {4, 1} | Float64 | 1 | 3.52µs | 200 | 0.1 |
adult-#3 | {25, 4} | {4, 1} | BFloat16 | 1 | 3.50µs | 200 | 0.1 |
adult-#3 | {25, 4} | {4, 1} | Float16 | 1 | 3.69µs | 200 | 0.1 |
adult-#4 | {25, 69} | {69, 4} | Float32 | 1 | 6.41µs | 13,800 | 2.2 |
adult-#4 | {25, 69} | {69, 4} | Float64 | 1 | 6.44µs | 13,800 | 2.1 |
adult-#4 | {25, 69} | {69, 4} | BFloat16 | 1 | 8.41µs | 13,800 | 1.6 |
adult-#4 | {25, 69} | {69, 4} | Float16 | 1 | 19.84µs | 13,800 | 0.7 |
adult-#5 | {49, 4} | {4, 1} | Float32 | 1 | 3.46µs | 392 | 0.1 |
adult-#5 | {49, 4} | {4, 1} | Float64 | 1 | 3.51µs | 392 | 0.1 |
adult-#5 | {49, 4} | {4, 1} | BFloat16 | 1 | 3.50µs | 392 | 0.1 |
adult-#5 | {49, 4} | {4, 1} | Float16 | 1 | 3.97µs | 392 | 0.1 |
adult-#6 | {49, 69} | {69, 4} | Float32 | 1 | 8.69µs | 27,048 | 3.1 |
adult-#6 | {49, 69} | {69, 4} | Float64 | 1 | 8.66µs | 27,048 | 3.1 |
adult-#6 | {49, 69} | {69, 4} | BFloat16 | 1 | 12.57µs | 27,048 | 2.2 |
adult-#6 | {49, 69} | {69, 4} | Float16 | 1 | 34.48µs | 27,048 | 0.8 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float32 | 1 | 9.06µs | 27,048 | 3.0 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float64 | 1 | 9.51µs | 27,048 | 2.8 |
adult-#6-Normalized | {49, 69} | {4, 69} | BFloat16 | 1 | 15.33µs | 27,048 | 1.8 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float16 | 1 | 35.70µs | 27,048 | 0.8 |
With the experimental SIMD support enabled (AVX-512), the following Float32 results change:
| Test Name | LHS Dims | RHS Dims | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
|---|---|---|---|---|---|---|---|
NoBatch-Medium | {128, 128} | {128, 256} | Float32 | 1 | 9.07µs | 8,388,608 | 924.5 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float32 | 1 | 2.95ms | 6,039,797,760 | 2045.5 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float32 | 1 | 11.31µs | 4,194,304 | 370.8 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float32 | 1024 | 806.62µs | 1,048,576 | 1.3 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float32 | 256 | 171.54µs | 2,097,152 | 12.2 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float32 | 16 | 48.60ms | 96,636,764,160 | 1988.3 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float32 | 192 | 113.87µs | 2,076,672 | 18.2 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float32 | 192 | 138.35µs | 2,076,672 | 15.0 |
Backend: xla:cpu / CPU AMD 9950X3D / 2026-01-07 / PJRT Plugin v0.83.4 (see pjrt-cpu-binaries)

```bash
GOMLX_BACKEND=xla:cpu go test -tags=perf ./backends/simplego/ \
-test.run=TestDotGeneral_PerformanceTable -test.v -test.count=1 \
-markdown
```
| Test Name | LHS Dims | RHS Dims | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
|---|---|---|---|---|---|---|---|
NoBatch-Tiny | {128, 4} | {4, 1} | Float32 | 1 | 4.62µs | 1,024 | 0.2 |
NoBatch-Tiny | {128, 4} | {4, 1} | Float64 | 1 | 4.58µs | 1,024 | 0.2 |
NoBatch-Tiny | {128, 4} | {4, 1} | BFloat16 | 1 | 4.83µs | 1,024 | 0.2 |
NoBatch-Tiny | {128, 4} | {4, 1} | Float16 | 1 | 5.12µs | 1,024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float32 | 1 | 4.55µs | 1,024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float64 | 1 | 4.20µs | 1,024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | BFloat16 | 1 | 4.51µs | 1,024 | 0.2 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float16 | 1 | 4.46µs | 1,024 | 0.2 |
NoBatch-Small | {16, 128} | {128, 32} | Float32 | 1 | 5.48µs | 131,072 | 23.9 |
NoBatch-Small | {16, 128} | {128, 32} | Float64 | 1 | 9.63µs | 131,072 | 13.6 |
NoBatch-Small | {16, 128} | {128, 32} | BFloat16 | 1 | 6.25µs | 131,072 | 21.0 |
NoBatch-Small | {16, 128} | {128, 32} | Float16 | 1 | 6.39µs | 131,072 | 20.5 |
NoBatch-Medium | {128, 128} | {128, 256} | Float32 | 1 | 31.59µs | 8,388,608 | 265.5 |
NoBatch-Medium | {128, 128} | {128, 256} | Float64 | 1 | 70.09µs | 8,388,608 | 119.7 |
NoBatch-Medium | {128, 128} | {128, 256} | BFloat16 | 1 | 35.01µs | 8,388,608 | 239.6 |
NoBatch-Medium | {128, 128} | {128, 256} | Float16 | 1 | 35.33µs | 8,388,608 | 237.5 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float32 | 1 | 1.75ms | 6,039,797,760 | 3456.4 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float64 | 1 | 16.79ms | 6,039,797,760 | 359.8 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | BFloat16 | 1 | 2.83ms | 6,039,797,760 | 2134.5 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float16 | 1 | 2.77ms | 6,039,797,760 | 2184.4 |
R-Unbalanced-Cross | {128} | {128, 256} | Float32 | 1 | 6.14µs | 65,536 | 10.7 |
R-Unbalanced-Cross | {128} | {128, 256} | Float64 | 1 | 6.17µs | 65,536 | 10.6 |
R-Unbalanced-Cross | {128} | {128, 256} | BFloat16 | 1 | 11.15µs | 65,536 | 5.9 |
R-Unbalanced-Cross | {128} | {128, 256} | Float16 | 1 | 11.87µs | 65,536 | 5.5 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float32 | 1 | 15.05µs | 4,194,304 | 278.7 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float64 | 1 | 81.84µs | 4,194,304 | 51.3 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | BFloat16 | 1 | 49.63µs | 4,194,304 | 84.5 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float16 | 1 | 39.84µs | 4,194,304 | 105.3 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float32 | 1024 | 121.14µs | 1,048,576 | 8.7 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float64 | 1024 | 67.71µs | 1,048,576 | 15.5 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | BFloat16 | 1024 | 140.54µs | 1,048,576 | 7.5 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float16 | 1024 | 135.41µs | 1,048,576 | 7.7 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float32 | 256 | 27.18µs | 2,097,152 | 77.2 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float64 | 256 | 123.81µs | 2,097,152 | 16.9 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | BFloat16 | 256 | 61.14µs | 2,097,152 | 34.3 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float16 | 256 | 64.62µs | 2,097,152 | 32.5 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float32 | 64 | 59.80µs | 67,108,864 | 1122.2 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float64 | 64 | 849.47µs | 67,108,864 | 79.0 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | BFloat16 | 64 | 119.78µs | 67,108,864 | 560.3 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float16 | 64 | 123.86µs | 67,108,864 | 541.8 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float32 | 16 | 51.30ms | 96,636,764,160 | 1883.7 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float64 | 16 | 220.27ms | 96,636,764,160 | 438.7 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | BFloat16 | 16 | 72.52ms | 96,636,764,160 | 1332.6 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float16 | 16 | 71.93ms | 96,636,764,160 | 1343.6 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float32 | 192 | 13.93µs | 2,076,672 | 149.0 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float64 | 192 | 118.63µs | 2,076,672 | 17.5 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | BFloat16 | 192 | 63.88µs | 2,076,672 | 32.5 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float16 | 192 | 47.36µs | 2,076,672 | 43.9 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float32 | 192 | 48.02µs | 2,076,672 | 43.2 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float64 | 192 | 121.54µs | 2,076,672 | 17.1 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | BFloat16 | 192 | 81.54µs | 2,076,672 | 25.5 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float16 | 192 | 82.65µs | 2,076,672 | 25.1 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float32 | 1 | 228.54µs | 245,366,784 | 1073.6 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float64 | 1 | 882.13µs | 245,366,784 | 278.2 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | BFloat16 | 1 | 374.36µs | 245,366,784 | 655.4 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float16 | 1 | 371.05µs | 245,366,784 | 661.3 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float32 | 1 | 147.57µs | 245,366,784 | 1662.7 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float64 | 1 | 881.09µs | 245,366,784 | 278.5 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | BFloat16 | 1 | 282.59µs | 245,366,784 | 868.3 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float16 | 1 | 279.47µs | 245,366,784 | 878.0 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float32 | 1 | 67.69µs | 61,341,696 | 906.3 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float64 | 1 | 282.96µs | 61,341,696 | 216.8 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | BFloat16 | 1 | 112.21µs | 61,341,696 | 546.7 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float16 | 1 | 117.72µs | 61,341,696 | 521.1 |
adult-#1 | {128, 4} | {4, 1} | Float32 | 1 | 4.72µs | 1,024 | 0.2 |
adult-#1 | {128, 4} | {4, 1} | Float64 | 1 | 4.24µs | 1,024 | 0.2 |
adult-#1 | {128, 4} | {4, 1} | BFloat16 | 1 | 4.27µs | 1,024 | 0.2 |
adult-#1 | {128, 4} | {4, 1} | Float16 | 1 | 4.21µs | 1,024 | 0.2 |
adult-#2 | {128, 69} | {69, 4} | Float32 | 1 | 5.98µs | 70,656 | 11.8 |
adult-#2 | {128, 69} | {69, 4} | Float64 | 1 | 7.87µs | 70,656 | 9.0 |
adult-#2 | {128, 69} | {69, 4} | BFloat16 | 1 | 6.34µs | 70,656 | 11.1 |
adult-#2 | {128, 69} | {69, 4} | Float16 | 1 | 6.27µs | 70,656 | 11.3 |
adult-#3 | {25, 4} | {4, 1} | Float32 | 1 | 2.28µs | 200 | 0.1 |
adult-#3 | {25, 4} | {4, 1} | Float64 | 1 | 2.29µs | 200 | 0.1 |
adult-#3 | {25, 4} | {4, 1} | BFloat16 | 1 | 2.32µs | 200 | 0.1 |
adult-#3 | {25, 4} | {4, 1} | Float16 | 1 | 2.35µs | 200 | 0.1 |
adult-#4 | {25, 69} | {69, 4} | Float32 | 1 | 5.63µs | 13,800 | 2.5 |
adult-#4 | {25, 69} | {69, 4} | Float64 | 1 | 5.88µs | 13,800 | 2.3 |
adult-#4 | {25, 69} | {69, 4} | BFloat16 | 1 | 5.90µs | 13,800 | 2.3 |
adult-#4 | {25, 69} | {69, 4} | Float16 | 1 | 5.97µs | 13,800 | 2.3 |
adult-#5 | {49, 4} | {4, 1} | Float32 | 1 | 2.33µs | 392 | 0.2 |
adult-#5 | {49, 4} | {4, 1} | Float64 | 1 | 2.26µs | 392 | 0.2 |
adult-#5 | {49, 4} | {4, 1} | BFloat16 | 1 | 2.34µs | 392 | 0.2 |
adult-#5 | {49, 4} | {4, 1} | Float16 | 1 | 2.33µs | 392 | 0.2 |
adult-#6 | {49, 69} | {69, 4} | Float32 | 1 | 5.87µs | 27,048 | 4.6 |
adult-#6 | {49, 69} | {69, 4} | Float64 | 1 | 5.95µs | 27,048 | 4.5 |
adult-#6 | {49, 69} | {69, 4} | BFloat16 | 1 | 6.26µs | 27,048 | 4.3 |
adult-#6 | {49, 69} | {69, 4} | Float16 | 1 | 6.39µs | 27,048 | 4.2 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float32 | 1 | 5.96µs | 27,048 | 4.5 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float64 | 1 | 6.07µs | 27,048 | 4.5 |
adult-#6-Normalized | {49, 69} | {4, 69} | BFloat16 | 1 | 6.03µs | 27,048 | 4.5 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float16 | 1 | 6.09µs | 27,048 | 4.4 |
Backend: xla:cuda / GPU RTX 5090 / 2026-01-07 / PJRT Plugin v0.81 (Jax-build)
GOMLX_BACKEND=xla:cuda go test -tags=perf ./backends/simplego/ \
-test.run=TestDotGeneral_PerformanceTable -test.v -test.count=1 \
-markdown
| Test Name | LHS Dims | RHS Dims | DType | BatchSize | Time/Run | Num Ops | GOps/Sec |
|---|---|---|---|---|---|---|---|
NoBatch-Tiny | {128, 4} | {4, 1} | Float32 | 1 | 17.53µs | 1,024 | 0.1 |
NoBatch-Tiny | {128, 4} | {4, 1} | Float64 | 1 | 16.42µs | 1,024 | 0.1 |
NoBatch-Tiny | {128, 4} | {4, 1} | BFloat16 | 1 | 15.72µs | 1,024 | 0.1 |
NoBatch-Tiny | {128, 4} | {4, 1} | Float16 | 1 | 14.97µs | 1,024 | 0.1 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float32 | 1 | 13.44µs | 1,024 | 0.1 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float64 | 1 | 17.53µs | 1,024 | 0.1 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | BFloat16 | 1 | 26.20µs | 1,024 | 0.0 |
NoBatch-Tiny-Norm | {128, 4} | {1, 4} | Float16 | 1 | 15.92µs | 1,024 | 0.1 |
NoBatch-Small | {16, 128} | {128, 32} | Float32 | 1 | 27.23µs | 131,072 | 4.8 |
NoBatch-Small | {16, 128} | {128, 32} | Float64 | 1 | 60.17µs | 131,072 | 2.2 |
NoBatch-Small | {16, 128} | {128, 32} | BFloat16 | 1 | 32.50µs | 131,072 | 4.0 |
NoBatch-Small | {16, 128} | {128, 32} | Float16 | 1 | 23.79µs | 131,072 | 5.5 |
NoBatch-Medium | {128, 128} | {128, 256} | Float32 | 1 | 26.76µs | 8,388,608 | 313.5 |
NoBatch-Medium | {128, 128} | {128, 256} | Float64 | 1 | 60.90µs | 8,388,608 | 137.7 |
NoBatch-Medium | {128, 128} | {128, 256} | BFloat16 | 1 | 26.77µs | 8,388,608 | 313.4 |
NoBatch-Medium | {128, 128} | {128, 256} | Float16 | 1 | 26.01µs | 8,388,608 | 322.5 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float32 | 1 | 121.00µs | 6,039,797,760 | 49915.7 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float64 | 1 | 3.72ms | 6,039,797,760 | 1624.6 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | BFloat16 | 1 | 91.86µs | 6,039,797,760 | 65751.5 |
NoBatch-Large | {1536, 1920} | {1920, 1024} | Float16 | 1 | 76.70µs | 6,039,797,760 | 78741.6 |
R-Unbalanced-Cross | {128} | {128, 256} | Float32 | 1 | 15.28µs | 65,536 | 4.3 |
R-Unbalanced-Cross | {128} | {128, 256} | Float64 | 1 | 84.30µs | 65,536 | 0.8 |
R-Unbalanced-Cross | {128} | {128, 256} | BFloat16 | 1 | 19.00µs | 65,536 | 3.4 |
R-Unbalanced-Cross | {128} | {128, 256} | Float16 | 1 | 18.36µs | 65,536 | 3.6 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float32 | 1 | 24.77µs | 4,194,304 | 169.3 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float64 | 1 | 41.32µs | 4,194,304 | 101.5 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | BFloat16 | 1 | 224.48µs | 4,194,304 | 18.7 |
L-Unbalanced-Cross | {4096, 32} | {32, 16} | Float16 | 1 | 19.41µs | 4,194,304 | 216.1 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float32 | 1024 | 14.82µs | 1,048,576 | 70.8 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float64 | 1024 | 27.46µs | 1,048,576 | 38.2 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | BFloat16 | 1024 | 14.84µs | 1,048,576 | 70.6 |
LargeBatch-Tiny | {1024, 128, 4} | {1024, 4, 1} | Float16 | 1024 | 13.19µs | 1,048,576 | 79.5 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float32 | 256 | 17.99µs | 2,097,152 | 116.6 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float64 | 256 | 29.11µs | 2,097,152 | 72.0 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | BFloat16 | 256 | 16.83µs | 2,097,152 | 124.6 |
LargeBatch-Small | {256, 8, 32} | {256, 32, 16} | Float16 | 256 | 13.51µs | 2,097,152 | 155.3 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float32 | 64 | 28.59µs | 67,108,864 | 2347.1 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float64 | 64 | 99.14µs | 67,108,864 | 676.9 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | BFloat16 | 64 | 24.59µs | 67,108,864 | 2728.7 |
LargeBatch-Medium | {64, 64, 128} | {64, 64, 128} | Float16 | 64 | 25.75µs | 67,108,864 | 2605.9 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float32 | 16 | 927.42µs | 96,636,764,160 | 104199.5 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float64 | 16 | 57.21ms | 96,636,764,160 | 1689.2 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | BFloat16 | 16 | 473.19µs | 96,636,764,160 | 204223.6 |
Batched-Large | {16, 1536, 1920} | {16, 1920, 1024} | Float16 | 16 | 476.39µs | 96,636,764,160 | 202853.9 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float32 | 192 | 29.05µs | 2,076,672 | 71.5 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float64 | 192 | 36.93µs | 2,076,672 | 56.2 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | BFloat16 | 192 | 18.15µs | 2,076,672 | 114.4 |
KA-Batch-16-#1 | {16, 12, 13, 13} | {16, 12, 13, 32} | Float16 | 192 | 17.17µs | 2,076,672 | 120.9 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float32 | 192 | 23.30µs | 2,076,672 | 89.1 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float64 | 192 | 41.81µs | 2,076,672 | 49.7 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | BFloat16 | 192 | 18.89µs | 2,076,672 | 109.9 |
KA-Batch-16-#2 | {16, 12, 13, 32} | {16, 12, 32, 13} | Float16 | 192 | 17.59µs | 2,076,672 | 118.0 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float32 | 1 | 36.02µs | 245,366,784 | 6812.3 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float64 | 1 | 374.37µs | 245,366,784 | 655.4 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | BFloat16 | 1 | 39.73µs | 245,366,784 | 6176.0 |
KA-Batch-16-#3 | {16, 13, 1536} | {1536, 384} | Float16 | 1 | 31.60µs | 245,366,784 | 7764.8 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float32 | 1 | 35.77µs | 245,366,784 | 6858.6 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float64 | 1 | 205.18µs | 245,366,784 | 1195.8 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | BFloat16 | 1 | 70.33µs | 245,366,784 | 3488.8 |
KA-Batch-16-#4 | {16, 13, 384} | {384, 1536} | Float16 | 1 | 26.47µs | 245,366,784 | 9268.2 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float32 | 1 | 86.77µs | 61,341,696 | 707.0 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float64 | 1 | 114.17µs | 61,341,696 | 537.3 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | BFloat16 | 1 | 26.20µs | 61,341,696 | 2340.9 |
KA-Batch-16-#5 | {16, 13, 384} | {384, 384} | Float16 | 1 | 26.57µs | 61,341,696 | 2309.1 |
adult-#1 | {128, 4} | {4, 1} | Float32 | 1 | 13.96µs | 1,024 | 0.1 |
adult-#1 | {128, 4} | {4, 1} | Float64 | 1 | 14.06µs | 1,024 | 0.1 |
adult-#1 | {128, 4} | {4, 1} | BFloat16 | 1 | 14.56µs | 1,024 | 0.1 |
adult-#1 | {128, 4} | {4, 1} | Float16 | 1 | 15.22µs | 1,024 | 0.1 |
adult-#2 | {128, 69} | {69, 4} | Float32 | 1 | 22.74µs | 70,656 | 3.1 |
adult-#2 | {128, 69} | {69, 4} | Float64 | 1 | 46.36µs | 70,656 | 1.5 |
adult-#2 | {128, 69} | {69, 4} | BFloat16 | 1 | 17.25µs | 70,656 | 4.1 |
adult-#2 | {128, 69} | {69, 4} | Float16 | 1 | 22.23µs | 70,656 | 3.2 |
adult-#3 | {25, 4} | {4, 1} | Float32 | 1 | 13.62µs | 200 | 0.0 |
adult-#3 | {25, 4} | {4, 1} | Float64 | 1 | 16.46µs | 200 | 0.0 |
adult-#3 | {25, 4} | {4, 1} | BFloat16 | 1 | 13.80µs | 200 | 0.0 |
adult-#3 | {25, 4} | {4, 1} | Float16 | 1 | 13.51µs | 200 | 0.0 |
adult-#4 | {25, 69} | {69, 4} | Float32 | 1 | 15.01µs | 13,800 | 0.9 |
adult-#4 | {25, 69} | {69, 4} | Float64 | 1 | 49.26µs | 13,800 | 0.3 |
adult-#4 | {25, 69} | {69, 4} | BFloat16 | 1 | 21.87µs | 13,800 | 0.6 |
adult-#4 | {25, 69} | {69, 4} | Float16 | 1 | 17.55µs | 13,800 | 0.8 |
adult-#5 | {49, 4} | {4, 1} | Float32 | 1 | 13.25µs | 392 | 0.0 |
adult-#5 | {49, 4} | {4, 1} | Float64 | 1 | 15.64µs | 392 | 0.0 |
adult-#5 | {49, 4} | {4, 1} | BFloat16 | 1 | 13.40µs | 392 | 0.0 |
adult-#5 | {49, 4} | {4, 1} | Float16 | 1 | 13.97µs | 392 | 0.0 |
adult-#6 | {49, 69} | {69, 4} | Float32 | 1 | 27.97µs | 27,048 | 1.0 |
adult-#6 | {49, 69} | {69, 4} | Float64 | 1 | 46.09µs | 27,048 | 0.6 |
adult-#6 | {49, 69} | {69, 4} | BFloat16 | 1 | 21.03µs | 27,048 | 1.3 |
adult-#6 | {49, 69} | {69, 4} | Float16 | 1 | 19.12µs | 27,048 | 1.4 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float32 | 1 | 15.79µs | 27,048 | 1.7 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float64 | 1 | 49.39µs | 27,048 | 0.5 |
adult-#6-Normalized | {49, 69} | {4, 69} | BFloat16 | 1 | 15.07µs | 27,048 | 1.8 |
adult-#6-Normalized | {49, 69} | {4, 69} | Float16 | 1 | 17.34µs | 27,048 | 1.6 |