diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index bd11ea2..1875e00 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -34,4 +34,4 @@ steps: env: JULIA_PKG_SERVER: "" # it often struggles with our large artifacts - CODECOV_TOKEN: "ea64fa23-14d4-4123-a7ce-b4f4208cd455" + CODECOV_TOKEN: "17a4c091-2903-476b-8609-c613436a30f8" diff --git a/.gitignore b/.gitignore index 0ecfb73..4381a61 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ *.jl.*.cov *.jl.cov *.jl.mem -/Manifest.toml test.jl +Manifest.toml diff --git a/Project.toml b/Project.toml index 46a8a4e..ff4661c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "BinomialGPU" uuid = "c5bbfde1-2136-42cd-9b65-d5719df69ebf" authors = ["Simone Carlo Surace"] -version = "0.2.6" +version = "0.3.0" [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" diff --git a/README.md b/README.md index bf4ba98..f073cb7 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # BinomialGPU [![Build status](https://badge.buildkite.com/70a8c11259658ad6f836a4981791ed144bac80e65302291d0d.svg?branch=master)](https://buildkite.com/julialang/binomialgpu-dot-jl) -[![Coverage](https://codecov.io/gh/simsurace/BinomialGPU.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/simsurace/BinomialGPU.jl) +[![Coverage](https://codecov.io/gh/JuliaGPU/BinomialGPU.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaGPU/BinomialGPU.jl) This package provides a function `rand_binomial!` to produce `CuArrays` with binomially distributed entries, analogous to `CUDA.rand_poisson!` for Poisson-distributed ones. @@ -42,8 +42,5 @@ rand_binomial!(A, count = counts, prob = probs) ## Issues -* The sampler is fast: it is about one order of magnitude faster than other samplers. 
But it is still an open question whether it can be made faster, whether there are other samplers with competitive speed, and it shows some non-intuitive behavior: - * The functionality to draw random numbers within CUDA.jl kernels is still under development. A new function `rand()` has recently become available, but it hasn't been tried within this package. See [issue #7](https://github.com/JuliaGPU/BinomialGPU.jl/issues/7). - * The speed is faster in Julia 1.5.4 than in the current Julia 1.6 release candidate. See [issue #8](https://github.com/JuliaGPU/BinomialGPU.jl/issues/8). - * The speed is slower when using optimal thread allocation than when defaulting to 256 threads. See [issue #2](https://github.com/JuliaGPU/BinomialGPU.jl/issues/2) - * Are there any other samplers that are comparably fast or faster? I compared the following: sample an array of size `(1024, 1024)` with `count = 128` and `prob` of size `(1024, 1024)` with uniformly drawn entries. Timings on an RTX2070 card: BinomialGPU.jl 1.4ms, PyTorch 11ms, CuPy 18ms, tensorflow 400ms. Please let me know if you know samplers that are not yet listed. +* The speed is slower when using optimal thread allocation than when defaulting to 256 threads. See [issue #2](https://github.com/JuliaGPU/BinomialGPU.jl/issues/2) +* Are there any other samplers that are comparably fast or faster? I compared the following: sample an array of size `(1024, 1024)` with `count = 128` and `prob` of size `(1024, 1024)` with uniformly drawn entries. Timings on an RTX2070 card: BinomialGPU.jl 0.8ms, PyTorch 11ms, CuPy 18ms, tensorflow 400ms. Timings for other samplers are very welcome; please open an issue if you find one. diff --git a/src/BinomialGPU.jl b/src/BinomialGPU.jl index 9d1bbf5..6aa5237 100644 --- a/src/BinomialGPU.jl +++ b/src/BinomialGPU.jl @@ -5,6 +5,7 @@ using Random using CUDA: cuda_rng, i32 + # user-level API include("rand_binomial.jl") export rand_binomial! 
diff --git a/src/kernels.jl b/src/kernels.jl index 1e33683..0d3c107 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -18,7 +18,7 @@ function stirling_approx_tail(k)::Float32 elseif k == 5 return 0.0138761288230707f0 elseif k == 6 - return 0.0118967099458917f0 + return 0.0118967099458917f0 elseif k == 7 return 0.0104112652619720f0 elseif k == 8 @@ -31,6 +31,7 @@ function stirling_approx_tail(k)::Float32 end + # BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc) ## Kernel for scalar parameters @@ -268,6 +269,4 @@ function kernel_naive_full!(A, count, prob, randstates) return end - - ## COV_EXCL_STOP