[ReverseAD] Add support for user-defined hessians #1819

Merged: 5 commits, May 30, 2022
1 change: 1 addition & 0 deletions docs/src/submodules/Nonlinear/reference.md
@@ -60,6 +60,7 @@ Nonlinear.eval_univariate_gradient
Nonlinear.eval_univariate_hessian
Nonlinear.eval_multivariate_function
Nonlinear.eval_multivariate_gradient
Nonlinear.eval_multivariate_hessian
Nonlinear.eval_logic_function
Nonlinear.eval_comparison_function
```
38 changes: 35 additions & 3 deletions src/Nonlinear/ReverseAD/forward_over_reverse.jl
@@ -126,6 +126,7 @@ function _hessian_slice_inner(d, ex, input_ϵ, output_ϵ, ::Type{T}) where {T}
for i in ex.dependent_subexpressions
subexpr = d.subexpressions[i]
subexpr_forward_values_ϵ[i] = _forward_eval_ϵ(
d,
subexpr,
_reinterpret_unsafe(T, subexpr.forward_storage_ϵ),
_reinterpret_unsafe(T, subexpr.partials_storage_ϵ),
@@ -135,6 +136,7 @@ function _hessian_slice_inner(d, ex, input_ϵ, output_ϵ, ::Type{T}) where {T}
)
end
_forward_eval_ϵ(
d,
ex,
_reinterpret_unsafe(T, d.forward_storage_ϵ),
_reinterpret_unsafe(T, d.partials_storage_ϵ),
@@ -178,6 +180,7 @@ end

"""
_forward_eval_ϵ(
d::NLPEvaluator,
ex::Union{_FunctionStorage,_SubexpressionStorage},
storage_ϵ::AbstractVector{ForwardDiff.Partials{N,T}},
partials_storage_ϵ::AbstractVector{ForwardDiff.Partials{N,T}},
@@ -195,6 +198,7 @@ components separate so that we don't need to recompute the real components.
This assumes that `_reverse_model(d, x)` has already been called.
"""
function _forward_eval_ϵ(
d::NLPEvaluator,
ex::Union{_FunctionStorage,_SubexpressionStorage},
storage_ϵ::AbstractVector{ForwardDiff.Partials{N,T}},
partials_storage_ϵ::AbstractVector{ForwardDiff.Partials{N,T}},
@@ -328,10 +332,38 @@ function _forward_eval_ϵ(
recip_denominator,
)
elseif op > 6
error(
"User-defined operators not supported for hessian " *
"computations",
f_input = _UnsafeVectorView(d.jac_storage, n_children)
for (i, c) in enumerate(children_idx)
f_input[i] = ex.forward_storage[children_arr[c]]
end
H = _UnsafeLowerTriangularMatrixView(
d.user_output_buffer,
Review comment (Member): Where is it enforced that user_output_buffer is large enough?

Review comment (Member): So the constructor could break other unsafe views on the same array. That sounds extra unsafe... At least needs to be documented.

Reply (Member Author, @odow): Yes. For now though, the only usage of this is evaluating the user-defined hessians, and we use the .user_output_buffer as storage, so it's a safe usage of the unsafe behavior.

n_children,
)
has_hessian = Nonlinear.eval_multivariate_hessian(
user_operators,
user_operators.multivariate_operators[node.index],
H,
f_input,
)
# This might be `false` if we extend this code to all
# multivariate functions.
@assert has_hessian
for col in 1:n_children
dual = zero(ForwardDiff.Partials{N,T})
for row in 1:n_children
# Make sure we get the lower-triangular component.
h = row >= col ? H[row, col] : H[col, row]
# Performance optimization: hessians can be quite
# sparse
if !iszero(h)
i = children_arr[children_idx[row]]
dual += h * storage_ϵ[i]
end
end
i = children_arr[children_idx[col]]
partials_storage_ϵ[i] = dual
end
end
elseif node.type == Nonlinear.NODE_CALL_UNIVARIATE
@inbounds child_idx = children_arr[ex.adj.colptr[k]]
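For intuition, the new user-defined-operator branch above accumulates, for each child `col`, the dual `∑_row H[row, col] * ẋ[row]`, always reading the lower triangle of the symmetric Hessian. A standalone sketch of that accumulation (hypothetical function name, independent of the evaluator internals):

```julia
# Sketch: y = H_sym * ẋ, where only the lower triangle of the symmetric
# matrix H is stored, mirroring the loop in the new branch of _forward_eval_ϵ.
function symmetric_lower_hvp(H::AbstractMatrix, ẋ::AbstractVector)
    n = length(ẋ)
    y = zeros(n)
    for col in 1:n, row in 1:n
        # Always read the lower-triangular entry, regardless of index order.
        h = row >= col ? H[row, col] : H[col, row]
        if !iszero(h)  # user Hessians are often sparse; skip stored zeros
            y[col] += h * ẋ[row]
        end
    end
    return y
end
```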
13 changes: 9 additions & 4 deletions src/Nonlinear/ReverseAD/mathoptinterface_api.jl
@@ -5,10 +5,12 @@
# in the LICENSE.md file or at https://opensource.org/licenses/MIT.

function MOI.features_available(d::NLPEvaluator)
# Check if we have any user-defined multivariate operators, in which case we
# need to disable hessians. The result of features_available depends on this.
d.disable_2ndorder =
length(d.data.operators.registered_multivariate_operators) > 0
# Check if we are missing any hessians for user-defined multivariate
# operators, in which case we need to disable :Hess and :HessVec.
d.disable_2ndorder = any(
op -> op.∇²f === nothing,
d.data.operators.registered_multivariate_operators,
)
if d.disable_2ndorder
return [:Grad, :Jac, :JacVec]
end
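
For illustration, a hedged sketch of what this check means for users; the operator name `:prod2` and its callbacks are hypothetical, and the API calls mirror those in the tests below:

```julia
import MathOptInterface as MOI
const Nonlinear = MOI.Nonlinear

f(x...) = x[1] * x[2]
∇f(g, x...) = (g[1] = x[2]; g[2] = x[1]; nothing)
∇²f(H, x...) = (H[2, 1] = 1.0; nothing)  # lower triangle only

x = MOI.VariableIndex.(1:2)

# Registered without a Hessian: second-order features stay disabled.
m1 = Nonlinear.Model()
Nonlinear.register_operator(m1, :prod2, 2, f, ∇f)
Nonlinear.set_objective(m1, :(prod2($(x[1]), $(x[2]))))
e1 = Nonlinear.Evaluator(m1, Nonlinear.SparseReverseMode(), x)
MOI.features_available(e1)  # [:Grad, :Jac, :JacVec]

# Registered with ∇²f: :Hess (and :HessVec) become available.
m2 = Nonlinear.Model()
Nonlinear.register_operator(m2, :prod2, 2, f, ∇f, ∇²f)
Nonlinear.set_objective(m2, :(prod2($(x[1]), $(x[2]))))
e2 = Nonlinear.Evaluator(m2, Nonlinear.SparseReverseMode(), x)
MOI.features_available(e2)  # includes :Hess
```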
@@ -286,6 +288,7 @@ function MOI.eval_hessian_lagrangian_product(d::NLPEvaluator, h, x, v, σ, μ)
for i in d.subexpression_order
subexpr = d.subexpressions[i]
subexpr_forward_values_ϵ[i] = _forward_eval_ϵ(
d,
subexpr,
reinterpret(T, subexpr.forward_storage_ϵ),
reinterpret(T, subexpr.partials_storage_ϵ),
@@ -302,6 +305,7 @@ end
fill!(output_ϵ, zero(T))
if d.objective !== nothing
_forward_eval_ϵ(
d,
d.objective,
reinterpret(T, d.forward_storage_ϵ),
reinterpret(T, d.partials_storage_ϵ),
@@ -322,6 +326,7 @@ end
end
for (i, con) in enumerate(d.constraints)
_forward_eval_ϵ(
d,
con,
reinterpret(T, d.forward_storage_ϵ),
reinterpret(T, d.partials_storage_ϵ),
92 changes: 92 additions & 0 deletions src/Nonlinear/ReverseAD/utils.jl
@@ -53,13 +53,105 @@ Base.length(v::_UnsafeVectorView) = v.len

Base.size(v::_UnsafeVectorView) = (v.len,)

"""
_UnsafeVectorView(x::Vector, N::Int)

Create a new [`_UnsafeVectorView`](@ref) from `x`, and resize `x` if needed to
ensure it has a length of at least `N`.

## Unsafe behavior

In addition to the unsafe behavior of `_UnsafeVectorView`, this constructor is
additionally unsafe because it may resize `x`. Only call it if you are sure that
the usage of `_UnsafeVectorView(x, N)` is short-lived, and that there are no
other views to `x` while the returned value is within scope.
"""
function _UnsafeVectorView(x::Vector, N::Int)
if length(x) < N
resize!(x, N)
end
return _UnsafeVectorView(0, N, pointer(x))
end

"""
_UnsafeLowerTriangularMatrixView(x, N)

Lightweight unsafe view that converts a vector `x` into the lower-triangular
component of a symmetric `N`-by-`N` matrix.

## Motivation

`_UnsafeLowerTriangularMatrixView` is needed as an allocation-free equivalent of
`view`. Other alternatives, like `reshape(view(x, 1:N^2), N, N)` or a struct
like
```julia
struct _SafeView{T}
x::Vector{T}
len::Int
end
```
will allocate so that `x` can be tracked by Julia's GC.
`_UnsafeLowerTriangularMatrixView` relies on the fact that the use-cases of
`_UnsafeLowerTriangularMatrixView` only temporarily wrap a long-lived vector
like `d.jac_storage` so that we don't have to worry about the GC removing
`d.jac_storage` while `_UnsafeLowerTriangularMatrixView` exists. This lets us
use a `Ptr{T}` and create a struct that is `isbitstype` and therefore does not
allocate.

## Unsafe behavior

`_UnsafeLowerTriangularMatrixView` is unsafe because it assumes that the vector
`x` remains valid during the usage of `_UnsafeLowerTriangularMatrixView`.
"""
struct _UnsafeLowerTriangularMatrixView <: AbstractMatrix{Float64}
N::Int
ptr::Ptr{Float64}
end

Base.size(x::_UnsafeLowerTriangularMatrixView) = (x.N, x.N)

function _linear_index(row, col)
if row < col
error("Unable to access upper-triangular component: ($row, $col)")
end
return div((row - 1) * row, 2) + col
end
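
For reference, a quick check of the packed row-major layout this formula implies for a 3-by-3 lower triangle:

```julia
# _linear_index(row, col) = (row - 1) * row ÷ 2 + col walks the lower triangle
# row by row: (1,1)→1, (2,1)→2, (2,2)→3, (3,1)→4, (3,2)→5, (3,3)→6.
@assert _linear_index(1, 1) == 1
@assert _linear_index(2, 2) == 3
@assert _linear_index(3, 1) == 4
@assert _linear_index(3, 3) == 6
```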

function Base.getindex(x::_UnsafeLowerTriangularMatrixView, i, j)
return unsafe_load(x.ptr, _linear_index(i, j))
end

function Base.setindex!(x::_UnsafeLowerTriangularMatrixView, value, i, j)
unsafe_store!(x.ptr, value, _linear_index(i, j))
return value
end

"""
_UnsafeLowerTriangularMatrixView(x::Vector{Float64}, N::Int)

Create a new [`_UnsafeLowerTriangularMatrixView`](@ref) from `x`, zero the
elements in `x`, and resize `x` if needed to ensure it has a length of at least
`N * (N + 1) / 2`.

## Unsafe behavior

In addition to the unsafe behavior of `_UnsafeLowerTriangularMatrixView`, this
constructor is additionally unsafe because it may resize `x`. Only call it if
you are sure that the usage of `_UnsafeLowerTriangularMatrixView(x, N)` is
short-lived, and that there are no other views to `x` while the returned value
is within scope.
"""
function _UnsafeLowerTriangularMatrixView(x::Vector{Float64}, N::Int)
z = div(N * (N + 1), 2)
if length(x) < z
resize!(x, z)
end
for i in 1:z
x[i] = 0.0
end
return _UnsafeLowerTriangularMatrixView(N, pointer(x))
end
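
For illustration, a usage sketch of this internal constructor; `buffer` stands in for a long-lived array such as `d.user_output_buffer`:

```julia
buffer = Float64[]
H = _UnsafeLowerTriangularMatrixView(buffer, 3)  # resizes buffer to length 6 and zeros it
H[2, 1] = -400.0
H[3, 3] = 200.0
@assert buffer[2] == -400.0  # packed slot _linear_index(2, 1)
@assert buffer[6] == 200.0   # packed slot _linear_index(3, 3)
# H[1, 2] throws: only the lower triangle is addressable.
# This is only safe because `buffer` outlives `H` and is not resized while
# `H` is in use.
```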

function _reinterpret_unsafe(::Type{T}, x::Vector{R}) where {T,R}
# how many T's fit into x?
@assert isbitstype(T) && isbitstype(R)
9 changes: 8 additions & 1 deletion src/Nonlinear/model.jl
@@ -203,8 +203,15 @@ Register the user-defined operator `op` with `nargs` input arguments in `model`.
* `∇f(g::AbstractVector{T}, x::T...)::T` is a function that takes a cache vector
`g` of length `length(x)`, and fills each element `g[i]` with the partial
derivative of `f` with respect to `x[i]`.
* `∇²f(H::AbstractMatrix, x::T...)::T` is a function that takes a matrix `H` and
fills the lower-triangular components `H[i, j]` with the Hessian of `f` with
respect to `x[i]` and `x[j]` for `i >= j`.

Hessian are not supported for multivariate functions.
### Notes for multivariate Hessians

* `H` has `size(H) == (length(x), length(x))`, but you must not access
elements `H[i, j]` for `i > j`.
* `H` is dense, but you do not need to fill structural zeros.
"""
function register_operator(model::Model, op::Symbol, nargs::Int, f::Function...)
return register_operator(model.operators, op, nargs, f...)
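To make the `∇²f` convention above concrete, here is a hedged sketch of conforming callbacks for a hypothetical three-argument operator; registration then proceeds via `register_operator(model, :my_op, 3, f, ∇f, ∇²f)`:

```julia
f(x...) = x[1]^2 * x[2] + x[3]

function ∇f(g, x...)
    g[1] = 2 * x[1] * x[2]
    g[2] = x[1]^2
    g[3] = 1.0
    return
end

function ∇²f(H, x...)
    # H is 3-by-3 and pre-zeroed. Fill only entries H[i, j] with i >= j;
    # structural zeros (H[2, 2] and every entry involving x[3]) may be skipped.
    H[1, 1] = 2 * x[2]
    H[2, 1] = 2 * x[1]
    return
end
```

Providing `∇²f` is also what keeps `:Hess` in `MOI.features_available` for models that use the operator (see `features_available` in `mathoptinterface_api.jl` above).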
19 changes: 18 additions & 1 deletion src/Nonlinear/operators.jl
@@ -631,7 +631,24 @@ end

_nan_to_zero(x) = isnan(x) ? 0.0 : x

# No docstring because this function is still a WIP.
"""
eval_multivariate_hessian(
registry::OperatorRegistry,
op::Symbol,
H::AbstractMatrix,
x::AbstractVector{T},
) where {T}

Evaluate the Hessian of operator `∇²op(x)`, where `op` is a multivariate
function in `registry`.

The Hessian is stored in the lower-triangular part of the matrix `H`.

!!! note
Implementations of the Hessian operators will not fill structural zeros.
Therefore, before calling this function you should pre-populate the matrix
`H` with `0`.
"""
function eval_multivariate_hessian(
registry::OperatorRegistry,
op::Symbol,
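For illustration, a hedged end-to-end sketch of calling this function through a model's registry; the operator `:sq_norm` is hypothetical, and `model.operators` is the `OperatorRegistry` that `register_operator(model, ...)` writes to:

```julia
import MathOptInterface as MOI
const Nonlinear = MOI.Nonlinear

f(x...) = x[1]^2 + x[2]^2
∇f(g, x...) = (g[1] = 2 * x[1]; g[2] = 2 * x[2]; nothing)
∇²f(H, x...) = (H[1, 1] = 2.0; H[2, 2] = 2.0; nothing)  # off-diagonal zero skipped

model = Nonlinear.Model()
Nonlinear.register_operator(model, :sq_norm, 2, f, ∇f, ∇²f)

H = zeros(2, 2)  # pre-populate with zeros, per the note above
Nonlinear.eval_multivariate_hessian(model.operators, :sq_norm, H, [3.0, 4.0])
# Now H[1, 1] == 2.0 and H[2, 2] == 2.0, while H[2, 1] remains 0.0;
# only the lower triangle should be read.
```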
88 changes: 73 additions & 15 deletions test/Nonlinear/ReverseAD.jl
@@ -53,6 +53,34 @@ function test_objective_quadratic_univariate()
return
end

function test_objective_and_constraints_quadratic_univariate()
x = MOI.VariableIndex(1)
model = Nonlinear.Model()
Nonlinear.set_objective(model, :($x^2 + 1))
Nonlinear.add_constraint(model, :($x^2), MOI.LessThan(2.0))
evaluator = Nonlinear.Evaluator(model, Nonlinear.SparseReverseMode(), [x])
MOI.initialize(evaluator, [:Grad, :Jac, :Hess])
@test MOI.eval_objective(evaluator, [1.2]) == 1.2^2 + 1
g = [NaN]
MOI.eval_objective_gradient(evaluator, g, [1.2])
@test g == [2.4]
@test MOI.hessian_lagrangian_structure(evaluator) == [(1, 1), (1, 1)]
H = [NaN, NaN]
MOI.eval_hessian_lagrangian(evaluator, H, [1.2], 1.5, Float64[1.3])
@test H == [1.5, 1.3] .* [2.0, 2.0]
Hp = [NaN]
MOI.eval_hessian_lagrangian_product(
evaluator,
Hp,
[1.2],
[1.2],
1.5,
Float64[1.3],
)
@test Hp == [1.5 * 2.0 * 1.2 + 1.3 * 2.0 * 1.2]
return
end

function test_objective_quadratic_multivariate()
x = MOI.VariableIndex(1)
y = MOI.VariableIndex(2)
@@ -287,14 +315,13 @@ function test_hessian_sparsity_registered_function()
Nonlinear.set_objective(model, :(f($x, $z) + $y^2))
evaluator =
Nonlinear.Evaluator(model, Nonlinear.SparseReverseMode(), [x, y, z])
@test_broken :Hess in MOI.features_available(evaluator)
# TODO(odow): re-enable these tests when user-defined hessians are supported
# MOI.initialize(evaluator, [:Grad, :Jac, :Hess])
# @test MOI.hessian_lagrangian_structure(evaluator) ==
# [(1, 1), (2, 2), (3, 3), (3, 1)]
# H = fill(NaN, 4)
# MOI.eval_hessian_lagrangian(evaluator, H, rand(3), 1.5, Float64[])
# @test H == 1.5 .* [2.0, 2.0, 2.0, 0.0]
@test :Hess in MOI.features_available(evaluator)
MOI.initialize(evaluator, [:Grad, :Jac, :Hess])
@test MOI.hessian_lagrangian_structure(evaluator) ==
[(1, 1), (2, 2), (3, 3), (3, 1)]
H = fill(NaN, 4)
MOI.eval_hessian_lagrangian(evaluator, H, rand(3), 1.5, Float64[])
@test H == 1.5 .* [2.0, 2.0, 2.0, 0.0]
return
end

@@ -308,6 +335,7 @@ function test_hessian_sparsity_registered_rosenbrock()
return
end
function ∇²f(H, x...)
@assert size(H) == (2, 2)
H[1, 1] = 1200 * x[1]^2 - 400 * x[2] + 2
H[2, 1] = -400 * x[1]
H[2, 2] = 200.0
@@ -318,13 +346,43 @@ function test_hessian_sparsity_registered_rosenbrock()
Nonlinear.set_objective(model, :(rosenbrock($x, $y)))
evaluator =
Nonlinear.Evaluator(model, Nonlinear.SparseReverseMode(), [x, y])
@test_broken :Hess in MOI.features_available(evaluator)
# TODO(odow): re-enable these tests when user-defined hessians are supported
# MOI.initialize(evaluator, [:Grad, :Jac, :Hess])
# @test MOI.hessian_lagrangian_structure(evaluator) == [(1, 1), (2, 2), (2, 1)]
# H = fill(NaN, 3)
# MOI.eval_hessian_lagrangian(evaluator, H, [1.0, 1.0], 1.5, Float64[])
# @test H == 1.5 .* [802, 200, -400]
@test :Hess in MOI.features_available(evaluator)
MOI.initialize(evaluator, [:Grad, :Jac, :Hess])
@test MOI.hessian_lagrangian_structure(evaluator) ==
[(1, 1), (2, 2), (2, 1)]
H = fill(NaN, 3)
MOI.eval_hessian_lagrangian(evaluator, H, [1.0, 1.0], 1.5, Float64[])
@test H == 1.5 .* [802.0, 200.0, -400.0]
return
end

function test_hessian_registered_error()
x = MOI.VariableIndex(1)
y = MOI.VariableIndex(2)
f(x...) = (1 - x[1])^2 + 100 * (x[2] - x[1]^2)^2
function ∇f(g, x...)
g[1] = 400 * x[1]^3 - 400 * x[1] * x[2] + 2 * x[1] - 2
g[2] = 200 * (x[2] - x[1]^2)
return
end
function ∇²f(H, x...)
H[1, 1] = 1200 * x[1]^2 - 400 * x[2] + 2
# Wrong index! Should be [2, 1]
H[1, 2] = -400 * x[1]
H[2, 2] = 200.0
return
end
model = Nonlinear.Model()
Nonlinear.register_operator(model, :rosenbrock, 2, f, ∇f, ∇²f)
Nonlinear.set_objective(model, :(rosenbrock($x, $y)))
evaluator =
Nonlinear.Evaluator(model, Nonlinear.SparseReverseMode(), [x, y])
MOI.initialize(evaluator, [:Grad, :Jac, :Hess])
H = fill(NaN, 3)
@test_throws(
ErrorException("Unable to access upper-triangular component: (1, 2)"),
MOI.eval_hessian_lagrangian(evaluator, H, [1.0, 1.0], 1.5, Float64[]),
)
return
end

Expand Down