Add support for user-defined multivariate hessians (#2961)

odow · web-flow · commit 8d7e89f6324a · 2022-08-02T14:08:19.000+12:00
diff --git a/docs/make.jl b/docs/make.jl
@@ -138,6 +138,7 @@ const _PAGES = [
             "tutorials/nonlinear/rosenbrock.md",
             "tutorials/nonlinear/mle.md",
             "tutorials/nonlinear/clnlbeam.md",
+            "tutorials/nonlinear/user_defined_hessians.md",
             "tutorials/nonlinear/querying_hessians.md",
         ],
         "Conic programs" => [
diff --git a/docs/src/manual/nlp.md b/docs/src/manual/nlp.md
@@ -356,12 +356,6 @@ The above code creates a JuMP model with the objective function
     However, it's more readable if it does. Make sure you use `my_f`
     and not `f` in the macros.
 
-!!! warning
-    If you use multi-variate user-defined functions, JuMP will disable
-    second-derivative information. This can lead to significant slow-downs in
-    some cases. Only use a user-defined function if you cannot write out the
-    expression algebraically in the macro.
-
 !!! warning
     User-defined functions cannot be re-registered and will not update if you
     modify the underlying Julia function. If you want to change a user-defined
@@ -417,9 +411,11 @@ register(model, :my_square, 2, f, ∇f)
 
 ### Register a function, gradient, and hessian
 
-!!! warning
-    The ability to explicitly register a hessian is only available for
-    univariate functions.
+You can also register a function with the second-order derivative information,
+which is a scalar for univariate functions, and a symmetric matrix for
+multivariate functions.
+
+#### Univariate functions
 
 Instead of automatically differentiating the hessian, you can instead pass a
 function which returns a number representing the second-order derivative.
@@ -435,6 +431,43 @@ register(model, :my_square, 1, f, ∇f, ∇²f)
 @NLobjective(model, Min, my_square(x))
 ```
 
+#### Multivariate functions
+
+For multivariate functions, the hessian function `∇²f` must take an
+`AbstractMatrix` as the first argument, the lower triangle of which is filled
+in-place:
+```@example
+using JuMP #hide
+f(x...) = (1 - x[1])^2 + 100 * (x[2] - x[1]^2)^2  # reads x[1] and x[2] only
+function ∇f(g, x...)  # writes the gradient of `f` into `g` in-place
+    g[1] = 400 * x[1]^3 - 400 * x[1] * x[2] + 2 * x[1] - 2  # ∂f/∂x[1]
+    g[2] = 200 * (x[2] - x[1]^2)  # ∂f/∂x[2]
+    return
+end
+function ∇²f(H, x...)  # writes the Hessian of `f` into `H` in-place
+    H[1, 1] = 1200 * x[1]^2 - 400 * x[2] + 2  # ∂²f/∂x[1]²
+    # H[1, 2] = -400 * x[1]  <-- Not needed. Fill the lower-triangular only.
+    H[2, 1] = -400 * x[1]  # ∂²f/∂x[2]∂x[1]
+    H[2, 2] = 200.0  # ∂²f/∂x[2]²
+    return
+end
+
+model = Model()  # no optimizer attached; the model is only built here
+register(model, :rosenbrock, 2, f, ∇f, ∇²f)  # 2 is the number of arguments
+@variable(model, x[1:2])
+@NLobjective(model, Min, rosenbrock(x[1], x[2]))  # uses the registered name
+```
+
+!!! warning
+    You may assume the Hessian matrix `H` is initialized with zeros, and because
+    `H` is symmetric, you need only to fill in the non-zero lower-triangular
+    terms. The matrix type passed in as `H` depends on the
+    automatic differentiation system, so make sure the first argument to the
+    Hessian function supports an `AbstractMatrix` (it may be something other
+    than `Matrix{Float64}`). However, you may assume only that `H` supports
+    `size(H)` and `setindex!`. Finally, the matrix is treated as dense, so the
+    performance will be poor on functions with high-dimensional input.
+
 ### User-defined functions with vector inputs
 
 User-defined functions which take vectors as input arguments (for example,
diff --git a/docs/src/tutorials/nonlinear/user_defined_hessians.jl b/docs/src/tutorials/nonlinear/user_defined_hessians.jl
@@ -0,0 +1,269 @@
+# Copyright (c) 2022 Oscar Dowson and contributors                               #src
+#                                                                                #src
+# Permission is hereby granted, free of charge, to any person obtaining a copy   #src
+# of this software and associated documentation files (the "Software"), to deal  #src
+# in the Software without restriction, including without limitation the rights   #src
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell      #src
+# copies of the Software, and to permit persons to whom the Software is          #src
+# furnished to do so, subject to the following conditions:                       #src
+#                                                                                #src
+# The above copyright notice and this permission notice shall be included in all #src
+# copies or substantial portions of the Software.                                #src
+#                                                                                #src
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR     #src
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,       #src
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE    #src
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER         #src
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,  #src
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE  #src
+# SOFTWARE.                                                                      #src
+
+# # User-defined Hessians
+
+# In this tutorial, we explain how to write a user-defined function (see
+# [User-defined Functions](@ref)) with a Hessian matrix explicitly provided by
+# the user.
+
+# This tutorial uses the following packages:
+
+using JuMP
+import Ipopt
+
+# ## Rosenbrock example
+
+# As a simple example, we first consider the Rosenbrock function:
+
+rosenbrock(x...) = (1 - x[1])^2 + 100 * (x[2] - x[1]^2)^2  # reads x[1], x[2]
+
+# which has the gradient vector:
+
+function ∇rosenbrock(g::AbstractVector, x...)  # fills `g` in-place
+    g[1] = 400 * x[1]^3 - 400 * x[1] * x[2] + 2 * x[1] - 2  # ∂f/∂x[1]
+    g[2] = 200 * (x[2] - x[1]^2)  # ∂f/∂x[2]
+    return
+end
+
+# and the Hessian matrix:
+
+function ∇²rosenbrock(H::AbstractMatrix, x...)  # fills `H` in-place
+    H[1, 1] = 1200 * x[1]^2 - 400 * x[2] + 2  # ∂²f/∂x[1]²
+    ## H[1, 2] = -400 * x[1] <-- not needed because Hessian is symmetric
+    H[2, 1] = -400 * x[1]  # ∂²f/∂x[2]∂x[1]
+    H[2, 2] = 200.0  # ∂²f/∂x[2]²
+    return
+end
+
+# You may assume the Hessian matrix `H` is initialized with zeros, and
+# because it is symmetric you need only to fill in the non-zero
+# lower-triangular terms.
+
+# The matrix type passed in as `H` depends on the automatic differentiation
+# system, so make sure the first argument to the Hessian function supports an
+# `AbstractMatrix` (it may be something other than `Matrix{Float64}`). However,
+# you may assume only that `H` supports `size(H)` and `setindex!`.
+
+# Now that we have the function, its gradient, and its Hessian, we can construct
+# a JuMP model, register the function, and use it in a `@NL` macro:
+
+model = Model(Ipopt.Optimizer)
+@variable(model, x[1:2])  # one variable per argument of `rosenbrock`
+register(model, :rosenbrock, 2, rosenbrock, ∇rosenbrock, ∇²rosenbrock)
+@NLobjective(model, Min, rosenbrock(x[1], x[2]))  # uses the registered name
+optimize!(model)
+solution_summary(model; verbose = true)
+
+# ## Bilevel optimization
+
+# User-defined Hessian functions can be useful when solving more complicated
+# problems. In the rest of this tutorial, our goal is to solve the bilevel
+# optimization problem:
+
+# ```math
+# \begin{array}{r l}
+# \min\limits_{x,z} & x_1^2 + x_2^2 + z \\
+# s.t.            & \begin{array}{r l}
+#                       z \ge \max\limits_{y} & x_1^2 y_1 + x_2^2 y_2  - x_1 y_1^4 - 2 x_2 y_2^4 \\
+#                       s.t.                  & (y_1 - 10)^2 + (y_2 - 10)^2 \le 25
+#                   \end{array} \\
+#                 & x \ge 0.
+# \end{array}
+# ```
+
+# This bilevel optimization problem is composed of two nested optimization
+# problems. An _upper_ level, involving variables ``x``, and a _lower_ level,
+# involving variables ``y``. From the perspective of the lower-level problem,
+# the values of ``x`` are fixed parameters, and so the model optimizes ``y``
+# given those fixed parameters. Simultaneously, the upper-level problem
+# optimizes ``x`` and ``z`` given the response of ``y``.
+
+# ## Decomposition
+
+# There are a few ways to solve this problem, but we are going to use a
+# nonlinear decomposition method. The first step is to write a function to
+# compute the lower-level problem:
+
+# ```math
+# \begin{array}{r l}
+#   V(x_1, x_2) = \max\limits_{y} & x_1^2 y_1 + x_2^2 y_2  - x_1 y_1^4 - 2 x_2 y_2^4 \\
+#                            s.t. & (y_1 - 10)^2 + (y_2 - 10)^2 \le 25
+# \end{array}
+# ```
+
+function solve_lower_level(x...)  # `x` enters the model only as fixed data
+    model = Model(Ipopt.Optimizer)  # a fresh model is built on every call
+    set_silent(model)
+    @variable(model, y[1:2])
+    @NLobjective(
+        model,
+        Max,
+        x[1]^2 * y[1] + x[2]^2 * y[2] - x[1] * y[1]^4 - 2 * x[2] * y[2]^4,
+    )
+    @constraint(model, (y[1] - 10)^2 + (y[2] - 10)^2 <= 25)
+    optimize!(model)
+    @assert termination_status(model) == LOCALLY_SOLVED  # stop on any other status
+    return objective_value(model), value.(y)  # (optimal objective, optimal y)
+end
+
+# This function takes a value of ``x`` and returns the optimal lower-level
+# objective-value and the optimal response ``y``. The reason why we need both
+# the objective and the optimal ``y`` will be made clear shortly, but for now
+# let us define ``V``, which returns only the objective value:
+
+function V(x...)
+    f, _ = solve_lower_level(x...)  # the optimal `y` is discarded here
+    return f
+end
+
+# Then, we can substitute ``V`` into our full problem to create:
+
+# ```math
+# \begin{array}{r l}
+# \min\limits_{x} & x_1^2 + x_2^2 + V(x_1, x_2) \\
+# s.t.            & x \ge 0.
+# \end{array}
+# ```
+
+# This looks like a nonlinear optimization problem with a user-defined function
+# ``V``! However, because ``V`` solves an optimization problem internally, we
+# can't use automatic differentiation to compute the first and second
+# derivatives. Instead, we can use JuMP's ability to pass callback functions
+# for the gradient and Hessian.
+
+# First up, we need to define the gradient of ``V`` with respect to ``x``. In
+# general, this may be difficult to compute, but because ``x`` appears only in
+# the objective, we can just differentiate the objective function with respect
+# to ``x``, giving:
+
+function ∇V(g::AbstractVector, x...)  # fills `g` in-place
+    _, y = solve_lower_level(x...)  # only the optimal `y` is needed
+    g[1] = 2 * x[1] * y[1] - y[1]^4  # objective differentiated w.r.t. x[1]
+    g[2] = 2 * x[2] * y[2] - 2 * y[2]^4  # objective differentiated w.r.t. x[2]
+    return
+end
+
+# Second, we need to define the Hessian of ``V`` with respect to ``x``. This is
+# a symmetric matrix, but in our example only the diagonal elements are
+# non-zero:
+
+function ∇²V(H::AbstractMatrix, x...)  # fills `H` in-place
+    _, y = solve_lower_level(x...)  # only the optimal `y` is needed
+    H[1, 1] = 2 * y[1]  # off-diagonal entries are left untouched
+    H[2, 2] = 2 * y[2]
+    return
+end
+
+# We now have enough to define our bilevel optimization problem:
+
+model = Model(Ipopt.Optimizer)
+@variable(model, x[1:2] >= 0)  # the upper-level constraint x >= 0
+register(model, :V, 2, V, ∇V, ∇²V)  # value, gradient, and Hessian callbacks
+@NLobjective(model, Min, x[1]^2 + x[2]^2 + V(x[1], x[2]))
+optimize!(model)
+solution_summary(model)
+
+# The optimal objective value is:
+
+objective_value(model)
+
+# and the optimal upper-level decision variables ``x`` are:
+
+value.(x)
+
+# To compute the optimal lower-level decision variables, we need to call
+# `solve_lower_level` with the optimal upper-level decision variables:
+
+_, y = solve_lower_level(value.(x)...)  # splat the optimal `x` as arguments
+y
+
+# ## Improving performance
+
+# Our solution approach works, but it has a performance problem: every time
+# we need to compute the value, gradient, or Hessian of ``V``, we have to
+# re-solve the lower-level optimization problem! This is wasteful, because we
+# will often call the gradient and Hessian at the same point, and so solving the
+# problem twice with the same input repeats work unnecessarily.
+
+# We can work around this by using a cache:
+
+mutable struct Cache
+    x::Any  # inputs of the most recent lower-level solve
+    f::Float64  # lower-level objective value at `x`
+    y::Vector{Float64}  # lower-level solution at `x`
+end
+
+# with a function to update the cache if needed:
+
+function _update_if_needed(cache::Cache, x...)
+    if cache.x !== x  # re-solve only when the inputs differ from the cached ones
+        cache.f, cache.y = solve_lower_level(x...)
+        cache.x = x  # store the argument tuple for the next comparison
+    end
+    return
+end
+
+# Then, we define cached versions of our three functions which call
+# `_update_if_needed` and return values from the cache.
+
+function cached_f(cache::Cache, x...)
+    _update_if_needed(cache, x...)  # refreshes `cache.f` if `x` changed
+    return cache.f
+end
+
+function cached_∇f(cache::Cache, g::AbstractVector, x...)  # fills `g` in-place
+    _update_if_needed(cache, x...)  # refreshes `cache.y` if `x` changed
+    g[1] = 2 * x[1] * cache.y[1] - cache.y[1]^4
+    g[2] = 2 * x[2] * cache.y[2] - 2 * cache.y[2]^4
+    return
+end
+
+function cached_∇²f(cache::Cache, H::AbstractMatrix, x...)  # fills `H` in-place
+    _update_if_needed(cache, x...)  # refreshes `cache.y` if `x` changed
+    H[1, 1] = 2 * cache.y[1]  # off-diagonal entries are left untouched
+    H[2, 2] = 2 * cache.y[2]
+    return
+end
+
+# Now we're ready to set up and solve the upper-level optimization problem:
+
+model = Model(Ipopt.Optimizer)
+@variable(model, x[1:2] >= 0)
+cache = Cache(Float64[], NaN, Float64[])  # placeholders; nothing solved yet
+register(
+    model,
+    :V,
+    2,
+    (x...) -> cached_f(cache, x...),  # all three closures share `cache`
+    (g, x...) -> cached_∇f(cache, g, x...),
+    (H, x...) -> cached_∇²f(cache, H, x...),
+)
+@NLobjective(model, Min, x[1]^2 + x[2]^2 + V(x[1], x[2]))
+optimize!(model)
+solution_summary(model)
+
+# and we can check that we get the same objective value:
+
+objective_value(model)
+
+# and upper-level decision variable ``x``:
+
+value.(x)
diff --git a/src/nlp.jl b/src/nlp.jl
@@ -801,11 +801,6 @@ function register(
     ∇f::Function,
     ∇²f::Function,
 )
-    if dimension > 1
-        error(
-            "Providing hessians for multivariate functions is not yet supported",
-        )
-    end
     _init_NLP(model)
     MOI.Nonlinear.register_operator(model.nlp_model, op, dimension, f, ∇f, ∇²f)
     return
diff --git a/test/nlp.jl b/test/nlp.jl