Skip to content

Commit e41d66a

Browse files
committed
ENH: _Blazingly fast_ schedules.
Use auto-schedulers to obtain optimized schedules for CPU and GPU. CPU schedule outperforms everything else _by far_ except for unreasonably large variances, in which case the GPU schedule wins. CPU schedule is faster than current ITK CPU filter by over an order of magnitude. It is even faster than the ITK GPU filter, but the contest is closer.
1 parent 72bf196 commit e41d66a

File tree

7 files changed

+572
-39
lines changed

7 files changed

+572
-39
lines changed

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ project(HalideFilters)
44
set(CMAKE_CXX_STANDARD 17)
55
set(CMAKE_CXX_STANDARD_REQUIRED YES)
66

7+
option(Module_HalideFilters_USE_AUTOSCHEDULER "Use auto-schedulers for Halide filters" OFF)
8+
79
# Update the following variables to update the version of Halide used
810
set(HALIDE_VERSION "18.0.0")
911
set(HALIDE_VERSION_COMMIT "8c651b459a4e3744b413c23a29b5c5d968702bb7")

examples/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ project(HalideFiltersExamples)
33

44
set(ExampleSpecificComponents
55
HalideFilters
6+
ITKGPUSmoothing
7+
ITKImageNoise
68
)
79

810
if(NOT ITK_SOURCE_DIR)

examples/SampleHalidePipeline.cxx

Lines changed: 85 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,29 +17,100 @@
1717
*=========================================================================*/
1818

1919
#include "itkHalideDiscreteGaussianImageFilter.h"
20+
#include "itkDiscreteGaussianImageFilter.h"
21+
#include "itkHalideGPUDiscreteGaussianImageFilter.h"
22+
#include "itkGPUDiscreteGaussianImageFilter.h"
23+
#include "itkAdditiveGaussianNoiseImageFilter.h"
24+
#include "itkCastImageFilter.h"
25+
#include "itkImage.h"
26+
#include "itkGPUImage.h"
2027

21-
#include "itkCommand.h"
2228
#include "itkImageFileReader.h"
2329
#include "itkImageFileWriter.h"
2430

31+
/** Time a statement block and print its mean wall-clock duration.
 *
 * Usage: BENCH(label, samples, { statements; });
 *
 * \param label   Bare token that is stringified and printed in front of the timing.
 * \param samples Number of times the block is executed. The expression is
 *                evaluated exactly once; 0 executes nothing and prints "0ms".
 * \param ...     The block to time. It is variadic so that a block containing a
 *                top-level comma (e.g. `int a = 1, b = 2;`) is still accepted.
 *
 * Output format: `<label> <mean milliseconds, truncated to an integer>ms`.
 *
 * All locals carry a `bench_` prefix so they do not capture identifiers that
 * the timed block refers to, and std::chrono::steady_clock is used because it
 * is monotonic, which interval measurements require.
 */
#define BENCH(label, samples, ...)                                                                         \
  do                                                                                                       \
  {                                                                                                        \
    const size_t bench_count = static_cast<size_t>(samples);                                               \
    const auto   bench_start = std::chrono::steady_clock::now();                                           \
    for (size_t bench_i = 0; bench_i < bench_count; ++bench_i)                                             \
    {                                                                                                      \
      __VA_ARGS__;                                                                                         \
    }                                                                                                      \
    const auto bench_end = std::chrono::steady_clock::now();                                               \
    const long long bench_ms =                                                                             \
      std::chrono::duration_cast<std::chrono::milliseconds>(bench_end - bench_start).count();              \
    const long long bench_mean = (bench_count != 0) ? bench_ms / static_cast<long long>(bench_count) : 0;  \
    std::cout << #label << " " << bench_mean << "ms" << std::endl;                                         \
  } while (0)
2543

26-
int main( int argc, char * argv[] )
44+
using ImageType = itk::Image<float, 3>;
45+
using NoiseFilter = itk::AdditiveGaussianNoiseImageFilter<ImageType, ImageType>;
46+
using GPUImageType = itk::GPUImage<float, 3>;
47+
using CastToGPUImage = itk::CastImageFilter<ImageType, GPUImageType>;
48+
49+
using CPUBlur = itk::DiscreteGaussianImageFilter<ImageType, ImageType>;
50+
using HalideBlur = itk::HalideDiscreteGaussianImageFilter<ImageType, ImageType>;
51+
52+
using GPUBlur = itk::GPUDiscreteGaussianImageFilter<GPUImageType, GPUImageType>;
53+
using HalideGPUBlur = itk::HalideGPUDiscreteGaussianImageFilter<ImageType, ImageType>;
54+
55+
int
56+
main(int argc, char * argv[])
2757
{
28-
if( argc < 4 )
29-
{
30-
std::cerr << "Missing parameters." << std::endl;
31-
std::cerr << "Usage: " << argv[0]
32-
<< " inputImage"
33-
<< " outputImage"
34-
<< " parameters" << std::endl;
35-
return EXIT_FAILURE;
36-
}
58+
ImageType::IndexValueType SIZE = 300;
59+
float VARIANCE = 90;
60+
61+
ImageType::IndexType index;
62+
index.Fill(0);
63+
ImageType::SizeType size;
64+
size.Fill(SIZE);
65+
ImageType::RegionType region;
66+
region.SetIndex(index);
67+
region.SetSize(size);
68+
69+
ImageType::Pointer image = ImageType::New();
70+
image->SetRegions(region);
71+
image->Allocate();
72+
image->FillBuffer(0.0f);
73+
74+
NoiseFilter::Pointer noise = NoiseFilter::New();
75+
noise->SetInput(image);
76+
noise->SetMean(0);
77+
noise->SetStandardDeviation(2.0);
78+
noise->Update();
79+
80+
image = noise->GetOutput();
81+
82+
BENCH(cpu_blur, 1, {
83+
CPUBlur::Pointer filter = CPUBlur::New();
84+
filter->SetInput(image);
85+
filter->SetVariance(VARIANCE);
86+
filter->Update();
87+
});
3788

89+
BENCH(halide_cpu_blur, 1, {
90+
HalideBlur::Pointer filter = HalideBlur::New();
91+
filter->SetInput(image);
92+
filter->SetVariance(VARIANCE);
93+
filter->Update();
94+
});
3895

39-
// Please, write a complete, self-containted and useful example that
40-
// demonstrate a class when being used along with other ITK classes or in
41-
// the context of a wider or specific application.
96+
BENCH(gpu_blur, 1, {
97+
CastToGPUImage::Pointer cast = CastToGPUImage::New();
98+
cast->SetInput(image);
99+
GPUBlur::Pointer filter = GPUBlur::New();
100+
GPUImageType::Pointer gpu_image = cast->GetOutput();
101+
cast->Update();
102+
filter->SetInput(gpu_image);
103+
filter->SetVariance(VARIANCE);
104+
filter->Update();
105+
filter->GetOutput()->UpdateBuffers();
106+
});
42107

108+
BENCH(halide_gpu_blur, 1, {
109+
HalideGPUBlur::Pointer filter = HalideGPUBlur::New();
110+
filter->SetInput(image);
111+
filter->SetVariance(VARIANCE);
112+
filter->Update();
113+
});
43114

44115
return EXIT_SUCCESS;
45116
}
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
/*=========================================================================
2+
*
3+
* Copyright NumFOCUS
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* https://www.apache.org/licenses/LICENSE-2.0.txt
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*=========================================================================*/
18+
#ifndef itkHalideGPUDiscreteGaussianImageFilter_h
19+
#define itkHalideGPUDiscreteGaussianImageFilter_h
20+
21+
#include "itkImageToImageFilter.h"
22+
23+
namespace itk
24+
{
25+
26+
/** \class HalideGPUDiscreteGaussianImageFilter
27+
*
28+
* \brief Filters a image by iterating over its pixels.
29+
*
30+
* Filters a image by iterating over its pixels in a multi-threaded way
31+
* and {to be completed by the developer}.
32+
*
33+
* \ingroup HalideFilters
34+
*
35+
* Limitations compared te itkDiscreteGaussianImageFilter:
36+
* - Only supports isotropic variance and maximum error (to simplify wrapper)
37+
* - Only supports 3d images (to simplify wrapper)
38+
*
39+
*/
40+
template <typename TInputImage, typename TOutputImage>
41+
class HalideGPUDiscreteGaussianImageFilter : public ImageToImageFilter<TInputImage, TOutputImage>
42+
{
43+
public:
44+
ITK_DISALLOW_COPY_AND_MOVE(HalideGPUDiscreteGaussianImageFilter);
45+
46+
static constexpr unsigned int InputImageDimension = TInputImage::ImageDimension;
47+
static constexpr unsigned int OutputImageDimension = TOutputImage::ImageDimension;
48+
49+
using InputImageType = TInputImage;
50+
using OutputImageType = TOutputImage;
51+
using InputPixelType = typename InputImageType::PixelType;
52+
using OutputPixelType = typename OutputImageType::PixelType;
53+
54+
/** Standard class aliases. */
55+
using Self = HalideGPUDiscreteGaussianImageFilter<InputImageType, OutputImageType>;
56+
using Superclass = ImageToImageFilter<InputImageType, OutputImageType>;
57+
using Pointer = SmartPointer<Self>;
58+
using ConstPointer = SmartPointer<const Self>;
59+
60+
/** Run-time type information. */
61+
itkOverrideGetNameOfClassMacro(HalideGPUDiscreteGaussianImageFilter);
62+
63+
/** Standard New macro. */
64+
itkNewMacro(Self);
65+
66+
itkSetMacro(Variance, float);
67+
itkGetMacro(Variance, float);
68+
69+
itkSetMacro(MaximumError, float);
70+
itkGetMacro(MaximumError, float);
71+
72+
itkGetMacro(MaximumKernelWidth, unsigned int);
73+
itkSetMacro(MaximumKernelWidth, unsigned int);
74+
75+
itkGetMacro(UseImageSpacing, bool);
76+
itkSetMacro(UseImageSpacing, bool);
77+
itkBooleanMacro(UseImageSpacing);
78+
79+
protected:
80+
HalideGPUDiscreteGaussianImageFilter();
81+
~
82+
HalideGPUDiscreteGaussianImageFilter() override = default;
83+
84+
void
85+
PrintSelf(std::ostream & os, Indent indent) const override;
86+
87+
using OutputRegionType = typename OutputImageType::RegionType;
88+
89+
void
90+
GenerateData() override;
91+
92+
private:
93+
#ifdef ITK_USE_CONCEPT_CHECKING
94+
// Add concept checking such as
95+
itkConceptMacro(FloatingPointPixel, (itk::Concept::IsFloatingPoint<typename InputImageType::PixelType>));
96+
#endif
97+
98+
float m_Variance = 0;
99+
float m_MaximumError = 0.01;
100+
unsigned int m_MaximumKernelWidth = 32;
101+
bool m_UseImageSpacing = true;
102+
};
103+
} // namespace itk
104+
105+
#ifndef ITK_MANUAL_INSTANTIATION
106+
# include "itkHalideGPUDiscreteGaussianImageFilter.hxx"
107+
#endif
108+
109+
#endif // itkHalideGPUDiscreteGaussianImageFilter
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/*=========================================================================
2+
*
3+
* Copyright NumFOCUS
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* https://www.apache.org/licenses/LICENSE-2.0.txt
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*=========================================================================*/
18+
#ifndef itkHalideGPUDiscreteGaussianImageFilter_hxx
19+
#define itkHalideGPUDiscreteGaussianImageFilter_hxx
20+
21+
#include "itkHalideGPUDiscreteGaussianImageFilter.h"
22+
23+
#include "itkHalideGPUSeparableConvolutionImpl.h"
24+
25+
#include "itkGaussianOperator.h"
26+
27+
#include <Halide.h>
28+
#include <HalideBuffer.h>
29+
#include <iomanip>
30+
31+
namespace itk
32+
{
33+
34+
template <typename TInputImage, typename TOutputImage>
35+
HalideGPUDiscreteGaussianImageFilter<TInputImage, TOutputImage>::HalideGPUDiscreteGaussianImageFilter()
36+
{
37+
this->DynamicMultiThreadingOff();
38+
this->ThreaderUpdateProgressOff();
39+
}
40+
41+
42+
template <typename TInputImage, typename TOutputImage>
43+
void
44+
HalideGPUDiscreteGaussianImageFilter<TInputImage, TOutputImage>::PrintSelf(std::ostream & os, Indent indent) const
45+
{
46+
Superclass::PrintSelf(os, indent);
47+
}
48+
49+
50+
template <typename TInputImage, typename TOutputImage>
51+
void
52+
HalideGPUDiscreteGaussianImageFilter<TInputImage, TOutputImage>::GenerateData()
53+
{
54+
const InputImageType * input = this->GetInput();
55+
typename InputImageType::RegionType inputRegion = input->GetBufferedRegion();
56+
typename InputImageType::SizeType inputSize = inputRegion.GetSize();
57+
typename InputImageType::SpacingType inputSpacing = input->GetSpacing();
58+
59+
std::vector<Halide::Runtime::Buffer<float, 1>> kernel_buffers{};
60+
61+
// compute kernel coefficients with itk::GaussianOperator to match behavior with itk::DiscreteGaussianImageFilter
62+
for (int dim = 0; dim < InputImageDimension; ++dim)
63+
{
64+
GaussianOperator<float, 1> oper{};
65+
oper.SetMaximumError(m_MaximumError);
66+
oper.SetMaximumKernelWidth(m_MaximumKernelWidth);
67+
68+
float variance = m_Variance;
69+
if (m_UseImageSpacing)
70+
{
71+
variance /= inputSpacing[dim];
72+
}
73+
oper.SetVariance(variance);
74+
75+
oper.CreateDirectional();
76+
77+
Halide::Runtime::Buffer<float, 1> & buf = kernel_buffers.emplace_back(static_cast<int>(oper.GetSize(0)));
78+
buf.set_min(-static_cast<int>(oper.GetRadius(0)));
79+
std::copy(oper.Begin(), oper.End(), buf.begin());
80+
buf.set_host_dirty();
81+
}
82+
83+
OutputImageType * output = this->GetOutput();
84+
output->SetRegions(inputRegion);
85+
output->Allocate();
86+
87+
std::vector<int> sizes(3, 1);
88+
std::copy(inputSize.begin(), inputSize.end(), sizes.begin());
89+
90+
Halide::Runtime::Buffer<const InputPixelType> inputBuffer(input->GetBufferPointer(), sizes);
91+
Halide::Runtime::Buffer<OutputPixelType> outputBuffer(output->GetBufferPointer(), sizes);
92+
93+
inputBuffer.set_host_dirty();
94+
itkHalideGPUSeparableConvolutionImpl(inputBuffer, kernel_buffers[0], kernel_buffers[1], kernel_buffers[2], outputBuffer);
95+
outputBuffer.copy_to_host();
96+
}
97+
98+
} // end namespace itk
99+
100+
#endif // itkHalideGPUDiscreteGaussianImageFilter_hxx

src/CMakeLists.txt

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,45 @@ find_package(Halide REQUIRED shared)
33
add_executable(itkHalideGenerators generators.cpp)
44
target_link_libraries(itkHalideGenerators PRIVATE Halide::Generator)
55

6-
add_halide_library(itkHalideSeparableConvolutionImpl
7-
FROM itkHalideGenerators
8-
HEADER itkHalideSeparableConvolutionImpl_h
9-
FEATURES cuda
10-
)
6+
# Build the CPU and GPU variants of the separable-convolution pipeline from the
# single itkHalideSeparableConvolutionImpl generator.
#
# HEADER names the variable that receives the path of the generated header.
# Those variables are expanded in HalideFilters_SRCS below, so the GPU one must
# be spelled itkHalideGPUSeparableConvolutionImpl_h to match that reference.
if(Module_HalideFilters_USE_AUTOSCHEDULER)
  # Auto-scheduled build: SCHEDULE names the variable receiving the emitted
  # schedule, AUTOSCHEDULER selects the scheduler plugin for each variant.
  add_halide_library(itkHalideSeparableConvolutionImpl
    FROM itkHalideGenerators
    GENERATOR itkHalideSeparableConvolutionImpl
    HEADER itkHalideSeparableConvolutionImpl_h
    SCHEDULE itkHalideSeparableConvolutionSchedule
    AUTOSCHEDULER Halide::Adams2019
  )

  add_halide_library(itkHalideGPUSeparableConvolutionImpl
    FROM itkHalideGenerators
    GENERATOR itkHalideSeparableConvolutionImpl
    HEADER itkHalideGPUSeparableConvolutionImpl_h
    SCHEDULE itkHalideGPUSeparableConvolutionSchedule
    FEATURES cuda
    AUTOSCHEDULER Halide::Anderson2021
  )
else()
  # Default build: the generator's use_gpu parameter selects the variant.
  add_halide_library(itkHalideSeparableConvolutionImpl
    FROM itkHalideGenerators
    GENERATOR itkHalideSeparableConvolutionImpl
    HEADER itkHalideSeparableConvolutionImpl_h
    PARAMS use_gpu=false
  )

  add_halide_library(itkHalideGPUSeparableConvolutionImpl
    FROM itkHalideGenerators
    GENERATOR itkHalideSeparableConvolutionImpl
    HEADER itkHalideGPUSeparableConvolutionImpl_h
    FEATURES cuda
    PARAMS use_gpu=true
  )
endif()
1139

1240
set(HalideFilters_SRCS
41+
${itkHalideGPUSeparableConvolutionImpl_h}
1342
${itkHalideSeparableConvolutionImpl_h}
1443
)
1544

1645
itk_module_add_library(HalideFilters ${HalideFilters_SRCS})
1746
target_include_directories(HalideFilters PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
18-
target_link_libraries(HalideFilters PUBLIC itkHalideSeparableConvolutionImpl)
47+
target_link_libraries(HalideFilters PUBLIC itkHalideSeparableConvolutionImpl itkHalideGPUSeparableConvolutionImpl)

0 commit comments

Comments
 (0)