Source code for pymanopt.optimizers.trust_regions

# References, taken from trustregions.m in manopt:

# Please cite the Manopt paper as well as the research paper:
#     @Article{genrtr,
#       Title    = {Trust-region methods on {Riemannian} manifolds},
#       Author   = {Absil, P.-A. and Baker, C. G. and Gallivan, K. A.},
#       Journal  = {Foundations of Computational Mathematics},
#       Year     = {2007},
#       Number   = {3},
#       Pages    = {303--330},
#       Volume   = {7},
#       Doi      = {10.1007/s10208-005-0179-9}
#     }
#
# See also: steepestdescent conjugategradient manopt/examples

# An explicit, general listing of this algorithm, with preconditioning,
# can be found in the following paper:
#     @Article{boumal2015lowrank,
#       Title   = {Low-rank matrix completion via preconditioned optimization
#                   on the {G}rassmann manifold},
#       Author  = {Boumal, N. and Absil, P.-A.},
#       Journal = {Linear Algebra and its Applications},
#       Year    = {2015},
#       Pages   = {200--239},
#       Volume  = {475},
#       Doi     = {10.1016/j.laa.2015.02.027},
#     }

# When the Hessian is not specified, it is approximated with
# finite-differences of the gradient. The resulting method is called
# RTR-FD. Some convergence theory for it is available in this paper:
# @incollection{boumal2015rtrfd,
# 	author={Boumal, N.},
# 	title={Riemannian trust regions with finite-difference Hessian
#                      approximations are globally convergent},
# 	year={2015},
# 	booktitle={Geometric Science of Information}
# }
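
# A small illustrative sketch (not part of the original code) of the
# finite-difference Hessian-vector approximation that RTR-FD refers to: the
# Riemannian gradient is evaluated at the retracted point R_x(step * u),
# transported back to the tangent space at x, and the scaled difference with
# grad f(x) approximates Hess f(x)[u]. The helper name and the fixed step
# size are assumptions for illustration; this is not pymanopt's built-in
# mechanism.
def _finite_difference_hessian_vector_product(
    manifold, gradient, x, u, step_size=1e-4
):
    norm_u = manifold.norm(x, u)
    if norm_u == 0:
        return manifold.zero_vector(x)
    # Normalize the step so the retraction moves a distance of roughly
    # step_size along u.
    step = step_size / norm_u
    x_new = manifold.retraction(x, step * u)
    # Transport the gradient at the new point back to T_x M before
    # differencing.
    grad_new = manifold.transport(x_new, x, gradient(x_new))
    return (grad_new - gradient(x)) / step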


# This file is part of Manopt: www.manopt.org.
# This code is an adaptation to Manopt of the original GenRTR code:
# RTR - Riemannian Trust-Region
# (c) 2004-2007, P.-A. Absil, C. G. Baker, K. A. Gallivan
# Florida State University
# School of Computational Science
# (http://www.math.fsu.edu/~cbaker/GenRTR/?page=download)
# See accompanying license file.
# The adaptation was executed by Nicolas Boumal.

# Ported to pymanopt by Jamie Townsend. January 2016.

import time

import numpy as np

from pymanopt.optimizers.optimizer import Optimizer, OptimizerResult


class TrustRegions(Optimizer):
    (
        NEGATIVE_CURVATURE,
        EXCEEDED_TR,
        REACHED_TARGET_LINEAR,
        REACHED_TARGET_SUPERLINEAR,
        MAX_INNER_ITER,
        MODEL_INCREASED,
    ) = range(6)

    TCG_STOP_REASONS = {
        NEGATIVE_CURVATURE: "negative curvature",
        EXCEEDED_TR: "exceeded trust region",
        REACHED_TARGET_LINEAR: "reached target residual-kappa (linear)",
        REACHED_TARGET_SUPERLINEAR: "reached target residual-theta "
        "(superlinear)",
        MAX_INNER_ITER: "maximum inner iterations",
        MODEL_INCREASED: "model increased",
    }

    def __init__(
        self,
        miniter=3,
        kappa=0.1,
        theta=1.0,
        rho_prime=0.1,
        use_rand=False,
        rho_regularization=1e3,
        *args,
        **kwargs,
    ):
        """Riemannian Trust-Regions algorithm.

        Second-order method that approximates the objective function by a
        quadratic surface and then updates the current estimate based on
        that. Also included is the truncated (Steihaug-Toint) conjugate
        gradient algorithm.
        """
        super().__init__(*args, **kwargs)

        self.miniter = miniter
        self.kappa = kappa
        self.theta = theta
        self.rho_prime = rho_prime
        self.use_rand = use_rand
        self.rho_regularization = rho_regularization
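
    # At each outer iteration, the method builds the quadratic model
    #     m_x(eta) = f(x) + <grad f(x), eta>_x + 0.5 * <Hess f(x)[eta], eta>_x
    # on the tangent space at x, approximately minimizes it over the ball
    # ||eta||_x <= Delta using truncated CG (see
    # _truncated_conjugate_gradient below), and then accepts or rejects the
    # retracted step based on the ratio rho of actual to model-predicted
    # cost decrease.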
    def run(
        self,
        problem,
        *,
        initial_point=None,
        mininner=1,
        maxinner=None,
        Delta_bar=None,
        Delta0=None,
    ) -> OptimizerResult:
        manifold = problem.manifold

        if maxinner is None:
            maxinner = manifold.dim

        # Set default Delta_bar and Delta0 separately to deal with additional
        # logic: if Delta_bar is provided but not Delta0, let Delta0
        # automatically be some fraction of the provided Delta_bar.
        if Delta_bar is None:
            try:
                Delta_bar = manifold.typical_dist
            except NotImplementedError:
                Delta_bar = np.sqrt(manifold.dim)
        if Delta0 is None:
            Delta0 = Delta_bar / 8

        cost = problem.cost
        gradient = problem.riemannian_gradient
        hess = problem.riemannian_hessian

        # If no starting point is specified, generate one at random.
        if initial_point is None:
            x = manifold.random_point()
        else:
            x = initial_point

        # Initializations
        start_time = time.time()

        # Number of outer (TR) iterations. The semantic is that `iteration`
        # counts the number of iterations fully executed so far.
        iteration = 0

        # Initialize solution and companion measures: f(x), fgrad(x)
        fx = cost(x)
        fgradx = gradient(x)
        norm_grad = manifold.norm(x, fgradx)

        # Initialize the trust region radius.
        Delta = Delta0

        # To keep track of consecutive radius changes, so that we can warn
        # the user if it appears necessary.
        consecutive_TRplus = 0
        consecutive_TRminus = 0

        # ** Display:
        if self._verbosity >= 1:
            print("Optimizing...")
        if self._verbosity >= 2:
            print(f"{' ':44s}f: {fx:+.6e} |grad|: {norm_grad:.6e}")

        self._initialize_log()

        while True:
            iteration += 1

            # *************************
            # ** Begin TR Subproblem **
            # *************************

            # Determine eta0
            if not self.use_rand:
                # Pick the zero vector.
                eta = manifold.zero_vector(x)
            else:
                # Random vector in T_x M (this has to be very small).
                eta = 1e-6 * manifold.random_tangent_vector(x)
                # Must be inside trust region.
                while manifold.norm(x, eta) > Delta:
                    eta = np.sqrt(np.sqrt(np.spacing(1))) * eta

            # Solve TR subproblem approximately.
            eta, Heta, numit, stop_inner = self._truncated_conjugate_gradient(
                problem,
                x,
                fgradx,
                eta,
                Delta,
                self.theta,
                self.kappa,
                mininner,
                maxinner,
            )

            srstr = self.TCG_STOP_REASONS[stop_inner]

            # If using randomized approach, compare result with the Cauchy
            # point. Convergence proofs assume that we achieve at least (a
            # fraction of) the reduction of the Cauchy point. After this
            # if-block, either all eta-related quantities have been changed
            # consistently, or none of them have.
            if self.use_rand:
                used_cauchy = False
                # Check the curvature
                Hg = hess(x, fgradx)
                g_Hg = manifold.inner_product(x, fgradx, Hg)
                if g_Hg <= 0:
                    tau_c = 1
                else:
                    tau_c = min(norm_grad**3 / (Delta * g_Hg), 1)

                # and generate the Cauchy point.
                eta_c = -tau_c * Delta / norm_grad * fgradx
                Heta_c = -tau_c * Delta / norm_grad * Hg

                # Now that we have computed the Cauchy point in addition to
                # the returned eta, we might as well keep the best of them.
                mdle = (
                    fx
                    + manifold.inner_product(x, fgradx, eta)
                    + 0.5 * manifold.inner_product(x, Heta, eta)
                )
                mdlec = (
                    fx
                    + manifold.inner_product(x, fgradx, eta_c)
                    + 0.5 * manifold.inner_product(x, Heta_c, eta_c)
                )
                if mdlec < mdle:
                    eta = eta_c
                    Heta = Heta_c
                    used_cauchy = True

            # This is only computed for logging purposes, because it may be
            # useful for some user-defined stopping criteria. If this is not
            # cheap for specific applications (compared to evaluating the
            # cost), we should reconsider this.
            # norm_eta = manifold.norm(x, eta)

            # Compute the tentative next iterate (the proposal).
            x_prop = manifold.retraction(x, eta)

            # Compute the function value of the proposal.
            fx_prop = cost(x_prop)

            # Will we accept the proposal or not? Check the performance of
            # the quadratic model against the actual cost.
            rhonum = fx - fx_prop
            rhoden = -manifold.inner_product(
                x, fgradx, eta
            ) - 0.5 * manifold.inner_product(x, eta, Heta)
            # rhonum could be anything.
            # rhoden should be nonnegative, as guaranteed by tCG, barring
            # numerical errors.

            # Heuristic -- added Dec. 2, 2013 (NB) to replace the former
            # heuristic. This heuristic is documented in the book by Conn,
            # Gould and Toint on trust-region methods, section 17.4.2.
            # rhonum measures the difference between two numbers. Close to
            # convergence, these two numbers are very close to each other,
            # so that computing their difference is numerically challenging:
            # there may be a significant loss in accuracy. Since the
            # acceptance or rejection of the step is conditioned on the
            # ratio between rhonum and rhoden, large errors in rhonum result
            # in a very large error in rho, hence in erratic acceptance /
            # rejection. Meanwhile, close to convergence, steps are usually
            # trustworthy and we should transition to a Newton-like method,
            # with rho=1 consistently. The heuristic thus shifts both rhonum
            # and rhoden by a small amount such that far from convergence,
            # the shift is irrelevant and close to convergence, the ratio
            # rho goes to 1, effectively promoting acceptance of the step.
            # The rationale is that close to convergence, both rhonum and
            # rhoden are quadratic in the distance between x and x_prop.
            # Thus, when this distance is on the order of sqrt(eps), the
            # value of rhonum and rhoden is on the order of eps, which is
            # indistinguishable from the numerical error, resulting in badly
            # estimated rho's.
            # For abs(fx) < 1, this heuristic is invariant under offsets of
            # f but not under scaling of f. For abs(fx) > 1, the opposite
            # holds. This should not alarm us, as this heuristic only
            # triggers at the very last iterations if very fine convergence
            # is demanded.
            rho_reg = (
                max(1, abs(fx)) * np.spacing(1) * self.rho_regularization
            )
            rhonum = rhonum + rho_reg
            rhoden = rhoden + rho_reg

            # This is always true if a linear, symmetric operator is used
            # for the Hessian (approximation) and if we had infinite
            # numerical precision. In practice, nonlinear approximations of
            # the Hessian such as the built-in finite difference
            # approximation and finite numerical accuracy can cause the
            # model to increase. In such scenarios, we decide to force a
            # rejection of the step and a reduction of the trust-region
            # radius. We test the sign of the regularized rhoden since the
            # regularization is supposed to capture the accuracy to which
            # rhoden is computed: if rhoden were negative before
            # regularization but not after, that should not be (and is not)
            # detected as a failure.
            #
            # Note (Feb. 17, 2015, NB): the most recent version of tCG
            # already includes a mechanism to ensure model decrease if the
            # Cauchy step attained a decrease (which is theoretically the
            # case under very lax assumptions). This being said, it is
            # always possible that numerical errors will prevent this, so it
            # is good to keep a safeguard.
            #
            # The current strategy is that, if this should happen, then we
            # reject the step and reduce the trust region radius. This also
            # ensures that the actual cost values are monotonically
            # decreasing.
            model_decreased = rhoden >= 0
            if not model_decreased:
                srstr = srstr + ", model did not decrease"

            try:
                rho = rhonum / rhoden
            except ZeroDivisionError:
                # Added June 30, 2015 following observation by BM. With this
                # modification, it is guaranteed that a step rejection is
                # always accompanied by a TR reduction. This prevents
                # stagnation in this "corner case" (NaN's really aren't
                # supposed to occur, but it's nice if we can handle them
                # nonetheless).
                print(
                    "rho is NaN! Forcing a radius decrease. This should "
                    "not happen."
                )
                rho = np.nan

            # Choose the new TR radius based on the model performance.
            trstr = "   "
            # If the actual decrease is smaller than 1/4 of the predicted
            # decrease, then reduce the TR radius.
            if rho < 1.0 / 4 or not model_decreased or np.isnan(rho):
                trstr = "TR-"
                Delta = Delta / 4
                consecutive_TRplus = 0
                consecutive_TRminus = consecutive_TRminus + 1
                if consecutive_TRminus >= 5 and self._verbosity >= 1:
                    consecutive_TRminus = -np.inf
                    print(
                        " +++ Detected many consecutive TR- (radius "
                        "decreases)."
                    )
                    print(
                        " +++ Consider decreasing options.Delta_bar "
                        "by an order of magnitude."
                    )
                    print(
                        f" +++ Current values: Delta_bar = {Delta_bar:g} "
                        f"and Delta0 = {Delta0:g}"
                    )
            # If the actual decrease is at least 3/4 of the predicted
            # decrease and the tCG (inner solve) hit the TR boundary,
            # increase the TR radius. We also keep track of the number of
            # consecutive trust-region radius increases. If there are many,
            # this may indicate the need to adapt the initial and maximum
            # radii.
            elif rho > 3.0 / 4 and (
                stop_inner in (self.NEGATIVE_CURVATURE, self.EXCEEDED_TR)
            ):
                trstr = "TR+"
                Delta = min(2 * Delta, Delta_bar)
                consecutive_TRminus = 0
                consecutive_TRplus = consecutive_TRplus + 1
                if consecutive_TRplus >= 5 and self._verbosity >= 1:
                    consecutive_TRplus = -np.inf
                    print(
                        " +++ Detected many consecutive TR+ (radius "
                        "increases)."
                    )
                    print(
                        " +++ Consider increasing options.Delta_bar "
                        "by an order of magnitude."
                    )
                    print(
                        f" +++ Current values: Delta_bar = {Delta_bar:g} "
                        f"and Delta0 = {Delta0:g}."
                    )
            else:
                # Otherwise, keep the TR radius constant.
                consecutive_TRplus = 0
                consecutive_TRminus = 0

            # Choose to accept or reject the proposed step based on the
            # model performance. Note the strict inequality.
            if model_decreased and rho > self.rho_prime:
                # accept = True
                accstr = "acc"
                x = x_prop
                fx = fx_prop
                fgradx = gradient(x)
                norm_grad = manifold.norm(x, fgradx)
            else:
                # accept = False
                accstr = "REJ"

            # ** Display:
            if self._verbosity == 2:
                print(
                    f"{accstr:.3s} {trstr:.3s} k: {iteration:5d} "
                    f"num_inner: {numit:5d} f: {fx:+e} |grad|: "
                    f"{norm_grad:e} {srstr:s}"
                )
            elif self._verbosity > 2:
                if self.use_rand and used_cauchy:
                    print("USED CAUCHY POINT")
                print(
                    f"{accstr:.3s} {trstr:.3s} k: {iteration:5d} "
                    f"num_inner: {numit:5d} {srstr:s}"
                )
                print(f" f(x) : {fx:+e} |grad| : {norm_grad:e}")
                print(f" rho : {rho:e}")

            # ** CHECK STOPPING criteria
            stopping_criterion = self._check_stopping_criterion(
                start_time=start_time,
                gradient_norm=norm_grad,
                iteration=iteration,
            )

            if stopping_criterion:
                if self._verbosity >= 1:
                    print(stopping_criterion)
                    print("")
                break

        return self._return_result(
            start_time=start_time,
            point=x,
            cost=fx,
            iterations=iteration,
            stopping_criterion=stopping_criterion,
            gradient_norm=norm_grad,
        )

    def _truncated_conjugate_gradient(
        self, problem, x, fgradx, eta, Delta, theta, kappa, mininner, maxinner
    ):
        manifold = problem.manifold
        inner = manifold.inner_product
        hess = problem.riemannian_hessian
        preconditioner = problem.preconditioner

        if not self.use_rand:  # and therefore, eta == 0
            Heta = manifold.zero_vector(x)
            r = fgradx
            e_Pe = 0
        else:  # and therefore, no preconditioner
            # eta (presumably) ~= 0 was provided by the caller.
            Heta = hess(x, eta)
            r = fgradx + Heta
            e_Pe = inner(x, eta, eta)

        r_r = inner(x, r, r)
        norm_r = np.sqrt(r_r)
        norm_r0 = norm_r

        # Precondition the residual.
        if not self.use_rand:
            z = preconditioner(x, r)
        else:
            z = r

        # Compute z'*r.
        z_r = inner(x, z, r)
        d_Pd = z_r

        # Initial search direction.
        delta = -z
        if not self.use_rand:
            e_Pd = 0
        else:
            e_Pd = inner(x, eta, delta)

        # If the Hessian or a linear Hessian approximation is in use, it is
        # theoretically guaranteed that the model value decreases strictly
        # with each iteration of tCG. Hence, there is no need to monitor the
        # model value. But, when a nonlinear Hessian approximation is used
        # (such as the built-in finite-difference approximation for
        # example), the model may increase. It is then important to
        # terminate the tCG iterations and return the previous (the
        # best-so-far) iterate. The variable below will hold the model
        # value.
        def model_fun(eta, Heta):
            return inner(x, eta, fgradx) + 0.5 * inner(x, eta, Heta)

        if not self.use_rand:
            model_value = 0
        else:
            model_value = model_fun(eta, Heta)

        # Pre-assume termination because j == end.
        stop_tCG = self.MAX_INNER_ITER

        # Begin inner/tCG loop.
        for j in range(int(maxinner)):
            # This call is the computationally intensive step.
            Hdelta = hess(x, delta)

            # Compute curvature (often called kappa).
            d_Hd = inner(x, delta, Hdelta)

            # Note that if d_Hd == 0, we will exit at the next "if" anyway.
            if d_Hd != 0:
                alpha = z_r / d_Hd
                # <neweta,neweta>_P =
                # <eta,eta>_P + 2*alpha*<eta,delta>_P + alpha*alpha*<delta,delta>_P
                e_Pe_new = e_Pe + 2 * alpha * e_Pd + alpha**2 * d_Pd
            else:
                e_Pe_new = e_Pe

            # Check against negative curvature and trust-region radius
            # violation. If either condition triggers, we bail out.
            if d_Hd <= 0 or e_Pe_new >= Delta**2:
                # want
                #  ee = <eta,eta>_prec,x
                #  ed = <eta,delta>_prec,x
                #  dd = <delta,delta>_prec,x
                tau = (
                    -e_Pd + np.sqrt(e_Pd * e_Pd + d_Pd * (Delta**2 - e_Pe))
                ) / d_Pd
                eta = eta + tau * delta

                # If only a nonlinear Hessian approximation is available,
                # this is only approximately correct, but saves an
                # additional Hessian call.
                Heta = Heta + tau * Hdelta

                # Technically, we may want to verify that this new eta is
                # indeed better than the previous eta before returning it
                # (this is always the case if the Hessian approximation is
                # linear, but I am unsure whether it is the case or not for
                # nonlinear approximations.) At any rate, the impact should
                # be limited, so in the interest of code conciseness (if we
                # can still hope for that), we omit this.
                if d_Hd <= 0:
                    stop_tCG = self.NEGATIVE_CURVATURE
                else:
                    stop_tCG = self.EXCEEDED_TR
                break

            # No negative curvature and eta_prop inside TR: accept it.
            e_Pe = e_Pe_new
            new_eta = eta + alpha * delta

            # If only a nonlinear Hessian approximation is available, this
            # is only approximately correct, but saves an additional Hessian
            # call.
            new_Heta = Heta + alpha * Hdelta

            # Verify that the model cost decreased in going from eta to
            # new_eta.
            # If it did not (which can only occur if the Hessian
            # approximation is nonlinear or because of numerical errors),
            # then we return the previous eta (which necessarily is the best
            # reached so far, according to the model cost). Otherwise, we
            # accept the new eta and go on.
            new_model_value = model_fun(new_eta, new_Heta)
            if new_model_value >= model_value:
                stop_tCG = self.MODEL_INCREASED
                break

            eta = new_eta
            Heta = new_Heta
            model_value = new_model_value

            # Update the residual.
            r = r + alpha * Hdelta

            # Compute new norm of r.
            r_r = inner(x, r, r)
            norm_r = np.sqrt(r_r)

            # Check kappa/theta stopping criterion.
            # Note that it is somewhat arbitrary whether to check this
            # stopping criterion on the r's (the gradients) or on the z's
            # (the preconditioned gradients). [CGT2000], page 206, mentions
            # both as acceptable criteria.
            if j >= mininner and norm_r <= norm_r0 * min(
                norm_r0**theta, kappa
            ):
                # Residual is small enough to quit.
                if kappa < norm_r0**theta:
                    stop_tCG = self.REACHED_TARGET_LINEAR
                else:
                    stop_tCG = self.REACHED_TARGET_SUPERLINEAR
                break

            # Precondition the residual.
            if not self.use_rand:
                z = preconditioner(x, r)
            else:
                z = r

            # Save the old z'*r.
            zold_rold = z_r
            # Compute new z'*r.
            z_r = inner(x, z, r)

            # Compute new search direction.
            beta = z_r / zold_rold
            delta = -z + beta * delta

            # Re-tangentialize delta to make sure it remains within the
            # tangent space.
            delta = manifold.to_tangent_space(x, delta)

            # Update new P-norms and P-dots [CGT2000, eq. 7.5.6 & 7.5.7].
            e_Pd = beta * (e_Pd + alpha * d_Pd)
            d_Pd = z_r + beta * beta * d_Pd

        return eta, Heta, j, stop_tCG
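

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). It assumes the
# pymanopt 2.x API and the optional autograd backend: we estimate a dominant
# eigenvector of a random symmetric matrix by minimizing the negated Rayleigh
# quotient -x^T A x over the unit sphere. Hessian-vector products come from
# the autograd backend in this sketch.
if __name__ == "__main__":
    import autograd.numpy as anp

    import pymanopt
    from pymanopt.manifolds import Sphere

    n = 16
    rng = np.random.default_rng(0)
    matrix = rng.normal(size=(n, n))
    matrix = (matrix + matrix.T) / 2

    manifold = Sphere(n)

    @pymanopt.function.autograd(manifold)
    def cost(point):
        # Negated Rayleigh quotient; its minimizers over the sphere are
        # dominant eigenvectors of `matrix`.
        return -anp.dot(point, anp.dot(matrix, point))

    problem = pymanopt.Problem(manifold, cost)
    optimizer = TrustRegions()
    result = optimizer.run(problem)
    print("Estimated dominant eigenvalue:", -result.cost)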