Skip to content

Commit

Permalink
ENH: _Blazingly fast_ schedules.
Browse files Browse the repository at this point in the history
Use auto-schedulers to obtain optimized schedules for CPU and GPU.

CPU schedule outperforms everything else _by far_ except for unreasonably large variances, in which case the GPU schedule wins.

CPU schedule is faster than current ITK CPU filter by over an order of magnitude. It is even faster than the ITK GPU filter, but the contest is closer.
  • Loading branch information
allemangD committed Sep 13, 2024
1 parent 72bf196 commit e41d66a
Show file tree
Hide file tree
Showing 7 changed files with 572 additions and 39 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ project(HalideFilters)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED YES)

option(Module_HalideFilters_USE_AUTOSCHEDULER "Use auto-schedulers for Halide filters" OFF)

# Update the following variables to update the version of Halide used
set(HALIDE_VERSION "18.0.0")
set(HALIDE_VERSION_COMMIT "8c651b459a4e3744b413c23a29b5c5d968702bb7")
Expand Down
2 changes: 2 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ project(HalideFiltersExamples)

set(ExampleSpecificComponents
HalideFilters
ITKGPUSmoothing
ITKImageNoise
)

if(NOT ITK_SOURCE_DIR)
Expand Down
99 changes: 85 additions & 14 deletions examples/SampleHalidePipeline.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,100 @@
*=========================================================================*/

#include "itkHalideDiscreteGaussianImageFilter.h"
#include "itkDiscreteGaussianImageFilter.h"
#include "itkHalideGPUDiscreteGaussianImageFilter.h"
#include "itkGPUDiscreteGaussianImageFilter.h"
#include "itkAdditiveGaussianNoiseImageFilter.h"
#include "itkCastImageFilter.h"
#include "itkImage.h"
#include "itkGPUImage.h"

#include "itkCommand.h"
#include "itkImageFileReader.h"
#include "itkImageFileWriter.h"

#define BENCH(label, samples, block) \
do \
{ \
auto start = std::chrono::high_resolution_clock::now(); \
for (size_t __i = 0; __i < samples; __i++) \
{ \
block; \
} \
auto end = std::chrono::high_resolution_clock::now(); \
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count(); \
std::cout << #label << " " << (ms / samples) << "ms" << std::endl; \
} while (0)

int main( int argc, char * argv[] )
using ImageType = itk::Image<float, 3>;
using NoiseFilter = itk::AdditiveGaussianNoiseImageFilter<ImageType, ImageType>;
using GPUImageType = itk::GPUImage<float, 3>;
using CastToGPUImage = itk::CastImageFilter<ImageType, GPUImageType>;

using CPUBlur = itk::DiscreteGaussianImageFilter<ImageType, ImageType>;
using HalideBlur = itk::HalideDiscreteGaussianImageFilter<ImageType, ImageType>;

using GPUBlur = itk::GPUDiscreteGaussianImageFilter<GPUImageType, GPUImageType>;
using HalideGPUBlur = itk::HalideGPUDiscreteGaussianImageFilter<ImageType, ImageType>;

int
main(int argc, char * argv[])
{
if( argc < 4 )
{
std::cerr << "Missing parameters." << std::endl;
std::cerr << "Usage: " << argv[0]
<< " inputImage"
<< " outputImage"
<< " parameters" << std::endl;
return EXIT_FAILURE;
}
ImageType::IndexValueType SIZE = 300;
float VARIANCE = 90;

ImageType::IndexType index;
index.Fill(0);
ImageType::SizeType size;
size.Fill(SIZE);
ImageType::RegionType region;
region.SetIndex(index);
region.SetSize(size);

ImageType::Pointer image = ImageType::New();
image->SetRegions(region);
image->Allocate();
image->FillBuffer(0.0f);

NoiseFilter::Pointer noise = NoiseFilter::New();
noise->SetInput(image);
noise->SetMean(0);
noise->SetStandardDeviation(2.0);
noise->Update();

image = noise->GetOutput();

BENCH(cpu_blur, 1, {
CPUBlur::Pointer filter = CPUBlur::New();
filter->SetInput(image);
filter->SetVariance(VARIANCE);
filter->Update();
});

BENCH(halide_cpu_blur, 1, {
HalideBlur::Pointer filter = HalideBlur::New();
filter->SetInput(image);
filter->SetVariance(VARIANCE);
filter->Update();
});

// Please, write a complete, self-containted and useful example that
// demonstrate a class when being used along with other ITK classes or in
// the context of a wider or specific application.
BENCH(gpu_blur, 1, {
CastToGPUImage::Pointer cast = CastToGPUImage::New();
cast->SetInput(image);
GPUBlur::Pointer filter = GPUBlur::New();
GPUImageType::Pointer gpu_image = cast->GetOutput();
cast->Update();
filter->SetInput(gpu_image);
filter->SetVariance(VARIANCE);
filter->Update();
filter->GetOutput()->UpdateBuffers();
});

BENCH(halide_gpu_blur, 1, {
HalideGPUBlur::Pointer filter = HalideGPUBlur::New();
filter->SetInput(image);
filter->SetVariance(VARIANCE);
filter->Update();
});

return EXIT_SUCCESS;
}
109 changes: 109 additions & 0 deletions include/itkHalideGPUDiscreteGaussianImageFilter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*=========================================================================
*
* Copyright NumFOCUS
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*=========================================================================*/
#ifndef itkHalideGPUDiscreteGaussianImageFilter_h
#define itkHalideGPUDiscreteGaussianImageFilter_h

#include "itkImageToImageFilter.h"

namespace itk
{

/** \class HalideGPUDiscreteGaussianImageFilter
*
* \brief Filters a image by iterating over its pixels.
*
* Filters a image by iterating over its pixels in a multi-threaded way
* and {to be completed by the developer}.
*
* \ingroup HalideFilters
*
* Limitations compared te itkDiscreteGaussianImageFilter:
* - Only supports isotropic variance and maximum error (to simplify wrapper)
* - Only supports 3d images (to simplify wrapper)
*
*/
template <typename TInputImage, typename TOutputImage>
class HalideGPUDiscreteGaussianImageFilter : public ImageToImageFilter<TInputImage, TOutputImage>
{
public:
ITK_DISALLOW_COPY_AND_MOVE(HalideGPUDiscreteGaussianImageFilter);

static constexpr unsigned int InputImageDimension = TInputImage::ImageDimension;
static constexpr unsigned int OutputImageDimension = TOutputImage::ImageDimension;

using InputImageType = TInputImage;
using OutputImageType = TOutputImage;
using InputPixelType = typename InputImageType::PixelType;
using OutputPixelType = typename OutputImageType::PixelType;

/** Standard class aliases. */
using Self = HalideGPUDiscreteGaussianImageFilter<InputImageType, OutputImageType>;
using Superclass = ImageToImageFilter<InputImageType, OutputImageType>;
using Pointer = SmartPointer<Self>;
using ConstPointer = SmartPointer<const Self>;

/** Run-time type information. */
itkOverrideGetNameOfClassMacro(HalideGPUDiscreteGaussianImageFilter);

/** Standard New macro. */
itkNewMacro(Self);

itkSetMacro(Variance, float);
itkGetMacro(Variance, float);

itkSetMacro(MaximumError, float);
itkGetMacro(MaximumError, float);

itkGetMacro(MaximumKernelWidth, unsigned int);
itkSetMacro(MaximumKernelWidth, unsigned int);

itkGetMacro(UseImageSpacing, bool);
itkSetMacro(UseImageSpacing, bool);
itkBooleanMacro(UseImageSpacing);

protected:
HalideGPUDiscreteGaussianImageFilter();
~
HalideGPUDiscreteGaussianImageFilter() override = default;

void
PrintSelf(std::ostream & os, Indent indent) const override;

using OutputRegionType = typename OutputImageType::RegionType;

void
GenerateData() override;

private:
#ifdef ITK_USE_CONCEPT_CHECKING
// Add concept checking such as
itkConceptMacro(FloatingPointPixel, (itk::Concept::IsFloatingPoint<typename InputImageType::PixelType>));
#endif

float m_Variance = 0;
float m_MaximumError = 0.01;
unsigned int m_MaximumKernelWidth = 32;
bool m_UseImageSpacing = true;
};
} // namespace itk

#ifndef ITK_MANUAL_INSTANTIATION
# include "itkHalideGPUDiscreteGaussianImageFilter.hxx"
#endif

#endif // itkHalideGPUDiscreteGaussianImageFilter
100 changes: 100 additions & 0 deletions include/itkHalideGPUDiscreteGaussianImageFilter.hxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*=========================================================================
*
* Copyright NumFOCUS
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*=========================================================================*/
#ifndef itkHalideGPUDiscreteGaussianImageFilter_hxx
#define itkHalideGPUDiscreteGaussianImageFilter_hxx

#include "itkHalideGPUDiscreteGaussianImageFilter.h"

#include "itkHalideGPUSeparableConvolutionImpl.h"

#include "itkGaussianOperator.h"

#include <Halide.h>
#include <HalideBuffer.h>
#include <iomanip>

namespace itk
{

template <typename TInputImage, typename TOutputImage>
HalideGPUDiscreteGaussianImageFilter<TInputImage, TOutputImage>::HalideGPUDiscreteGaussianImageFilter()
{
this->DynamicMultiThreadingOff();
this->ThreaderUpdateProgressOff();
}


template <typename TInputImage, typename TOutputImage>
void
HalideGPUDiscreteGaussianImageFilter<TInputImage, TOutputImage>::PrintSelf(std::ostream & os, Indent indent) const
{
Superclass::PrintSelf(os, indent);
}


template <typename TInputImage, typename TOutputImage>
void
HalideGPUDiscreteGaussianImageFilter<TInputImage, TOutputImage>::GenerateData()
{
const InputImageType * input = this->GetInput();
typename InputImageType::RegionType inputRegion = input->GetBufferedRegion();
typename InputImageType::SizeType inputSize = inputRegion.GetSize();
typename InputImageType::SpacingType inputSpacing = input->GetSpacing();

std::vector<Halide::Runtime::Buffer<float, 1>> kernel_buffers{};

// compute kernel coefficients with itk::GaussianOperator to match behavior with itk::DiscreteGaussianImageFilter
for (int dim = 0; dim < InputImageDimension; ++dim)
{
GaussianOperator<float, 1> oper{};
oper.SetMaximumError(m_MaximumError);
oper.SetMaximumKernelWidth(m_MaximumKernelWidth);

float variance = m_Variance;
if (m_UseImageSpacing)
{
variance /= inputSpacing[dim];
}
oper.SetVariance(variance);

oper.CreateDirectional();

Halide::Runtime::Buffer<float, 1> & buf = kernel_buffers.emplace_back(static_cast<int>(oper.GetSize(0)));
buf.set_min(-static_cast<int>(oper.GetRadius(0)));
std::copy(oper.Begin(), oper.End(), buf.begin());
buf.set_host_dirty();
}

OutputImageType * output = this->GetOutput();
output->SetRegions(inputRegion);
output->Allocate();

std::vector<int> sizes(3, 1);
std::copy(inputSize.begin(), inputSize.end(), sizes.begin());

Halide::Runtime::Buffer<const InputPixelType> inputBuffer(input->GetBufferPointer(), sizes);
Halide::Runtime::Buffer<OutputPixelType> outputBuffer(output->GetBufferPointer(), sizes);

inputBuffer.set_host_dirty();
itkHalideGPUSeparableConvolutionImpl(inputBuffer, kernel_buffers[0], kernel_buffers[1], kernel_buffers[2], outputBuffer);
outputBuffer.copy_to_host();
}

} // end namespace itk

#endif // itkHalideGPUDiscreteGaussianImageFilter_hxx
41 changes: 35 additions & 6 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,45 @@ find_package(Halide REQUIRED shared)
add_executable(itkHalideGenerators generators.cpp)
target_link_libraries(itkHalideGenerators PRIVATE Halide::Generator)

add_halide_library(itkHalideSeparableConvolutionImpl
FROM itkHalideGenerators
HEADER itkHalideSeparableConvolutionImpl_h
FEATURES cuda
)
if(Module_HalideFilters_USE_AUTOSCHEDULER)
add_halide_library(itkHalideSeparableConvolutionImpl
FROM itkHalideGenerators
GENERATOR itkHalideSeparableConvolutionImpl
HEADER itkHalideSeparableConvolutionImpl_h
SCHEDULE itkHalideSeparableConvolutionSchedule
AUTOSCHEDULER Halide::Adams2019
)

add_halide_library(itkHalideGPUSeparableConvolutionImpl
FROM itkHalideGenerators
GENERATOR itkHalideSeparableConvolutionImpl
HEADER itkGPUHalideSeparableConvolutionImpl_h
SCHEDULE itkHalideGPUSeparableConvolutionSchedule
FEATURES cuda
AUTOSCHEDULER Halide::Anderson2021
)
else()
add_halide_library(itkHalideSeparableConvolutionImpl
FROM itkHalideGenerators
GENERATOR itkHalideSeparableConvolutionImpl
HEADER itkHalideSeparableConvolutionImpl_h
PARAMS use_gpu=false
)

add_halide_library(itkHalideGPUSeparableConvolutionImpl
FROM itkHalideGenerators
GENERATOR itkHalideSeparableConvolutionImpl
HEADER itkGPUHalideSeparableConvolutionImpl_h
FEATURES cuda
PARAMS use_gpu=true
)
endif()

set(HalideFilters_SRCS
${itkHalideGPUSeparableConvolutionImpl_h}
${itkHalideSeparableConvolutionImpl_h}
)

itk_module_add_library(HalideFilters ${HalideFilters_SRCS})
target_include_directories(HalideFilters PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
target_link_libraries(HalideFilters PUBLIC itkHalideSeparableConvolutionImpl)
target_link_libraries(HalideFilters PUBLIC itkHalideSeparableConvolutionImpl itkHalideGPUSeparableConvolutionImpl)
Loading

0 comments on commit e41d66a

Please sign in to comment.