update benchmarks

bjarthur · bjarthur · commit d5ceb0e302a1 · 2024-01-23T07:56:07.000-05:00
diff --git a/README.md b/README.md
@@ -5,11 +5,12 @@ can only be scalars.  BatchedBLAS.jl extends support for batched arrays by
 `ger`, `syr`, and `spr` that work with arrays of AbstractFloats and Integers,
 and scaling coefficients which can be scalars or Vectors.
 
-In addition to the type flexibility, there is a performance benefit for
-symmetric and packed symmetric matrices, where execution times for `syr`
-and `spr` are faster than the equivalent batched `gemm`.  Benchmarks on an
-A100 follow.  The dashed lines are for the transposed version of `gemv` and
-the upper-triangle versions of all other functions.  Lower numbers are better.
+In addition to the type flexibility, there is a performance benefit for rank-1
+updates: execution times for `ger`, `syr`, and `spr` are shorter than for the
+equivalent batched `gemm` over the range of parameters tested.  `dot` is also
+faster for small matrices.  Benchmarks on an H100 follow.  The dashed lines are
+for the transposed version of `gemv` and the upper-triangle versions of all
+other functions.  Lower numbers are better.
 
 ![benchmarks](/bench/bench.svg)
 
diff --git a/bench/Project.toml b/bench/Project.toml
@@ -8,5 +8,8 @@ JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
-NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d"
 SymmetricFormats = "a91e544d-b3d6-4431-ae28-0549b1291c16"
+
+[compat]
+NNlib = "0.9"
+julia = "1.9"
diff --git a/bench/bench.svg b/bench/bench.svg
diff --git a/bench/runbench.jl b/bench/runbench.jl
@@ -1,5 +1,5 @@
 using LinearAlgebra, BatchedBLAS, NNlib, SymmetricFormats, BenchmarkTools, DataFrames, Gadfly, JLD2
-using KernelAbstractions, CUDA, NNlibCUDA
+using KernelAbstractions, CUDA
 
 macro belapsed_median(args...)
     esc(:(time(median(@benchmark $(args...))) / 1e9))