Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Safetensors changes #913

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
Prev Previous commit
Next Next commit
avoid conv1d bound for cudnn
swfsql committed Feb 9, 2024
commit 4e3f7c7a24728668f72cf3617a66f4476280f6fb
50 changes: 39 additions & 11 deletions dfdx-core/src/tensor_ops/utilities/device.rs
Original file line number Diff line number Diff line change
@@ -114,33 +114,61 @@ pub trait Device<E: Dtype>:
+ crate::tensor_ops::axpy::AxpyKernel<E>

// conv1d
+ super::super::conv1d::Conv1DKernel<E>
+ NonCudnnCuda<E>
{
}

#[cfg(feature = "cudnn")]
pub trait NonCudnnCuda<E: Dtype> {}

#[cfg(not(feature = "cudnn"))]
pub trait NonCudnnCuda<E: Dtype>:
// conv1d
super::super::conv1d::Conv1DKernel<E>
{
}

#[cfg(feature = "f16")]
impl Device<f16> for crate::tensor::Cpu {}
#[cfg(feature = "f16")]
impl Device<AMP<f16>> for crate::tensor::Cpu {}
mod f16_ {
use super::*;
impl Device<f16> for crate::tensor::Cpu {}
impl NonCudnnCuda<f16> for crate::tensor::Cpu {}
impl Device<AMP<f16>> for crate::tensor::Cpu {}
impl NonCudnnCuda<AMP<f16>> for crate::tensor::Cpu {}
}
impl Device<f32> for crate::tensor::Cpu {}
impl NonCudnnCuda<f32> for crate::tensor::Cpu {}
impl Device<f64> for crate::tensor::Cpu {}
impl NonCudnnCuda<f64> for crate::tensor::Cpu {}

#[cfg(all(feature = "cuda", feature = "f16"))]
impl Device<f16> for crate::tensor::Cuda {}
#[cfg(all(feature = "cuda", feature = "f16"))]
impl Device<AMP<f16>> for crate::tensor::Cuda {}
#[cfg(feature = "cuda")]
impl Device<f32> for crate::tensor::Cuda {}
mod cuda_f16 {
use super::*;
impl Device<f16> for crate::tensor::Cuda {}
impl NonCudnnCuda<f16> for crate::tensor::Cuda {}
impl Device<AMP<f16>> for crate::tensor::Cuda {}
impl NonCudnnCuda<AMP<f16>> for crate::tensor::Cuda {}
}
#[cfg(feature = "cuda")]
impl Device<f64> for crate::tensor::Cuda {}
mod cuda {
use super::*;
impl Device<f32> for crate::tensor::Cuda {}
impl NonCudnnCuda<f32> for crate::tensor::Cuda {}
impl Device<f64> for crate::tensor::Cuda {}
impl NonCudnnCuda<f64> for crate::tensor::Cuda {}
}

// TODO: How can we implement this for f16 when WGSL doesn't support f16 yet?
// #[cfg(all(feature = "webgpu", feature = "f16"))]
// impl Device<f16> for crate::tensor::Webgpu {}
// #[cfg(all(feature = "webgpu", feature = "f16"))]
// impl Device<AMP<f16>> for crate::tensor::Webgpu {}
#[cfg(feature = "webgpu")]
impl Device<f32> for crate::tensor::Webgpu {}
mod webgpu {
use super::*;
impl Device<f32> for crate::tensor::Webgpu {}
impl NonCudnnCuda<f32> for crate::tensor::Webgpu {}
}

// TODO: How can we implement this for f64 when WGSL doesn't support f64 yet?
// #[cfg(feature = "webgpu")]