Learning rate #106

Closed · wants to merge 16 commits

Changes from 10 commits
File: AdaDelta.java
@@ -68,8 +68,6 @@ public class AdaDelta extends Optimizer {
public static final float RHO_DEFAULT = 0.95f;
public static final float EPSILON_DEFAULT = 1e-7f;

- private final float learningRate;
-
private final float rho;

private final float epsilon;
@@ -97,8 +95,7 @@ public AdaDelta(Graph graph, float learningRate) {
* @param epsilon A constant epsilon used to better conditioning the grad update
*/
public AdaDelta(Graph graph, float learningRate, float rho, float epsilon) {
- super(graph);
- this.learningRate = learningRate;
+ super(graph, learningRate);
this.rho = rho;
this.epsilon = epsilon;
}
@@ -124,8 +121,7 @@ public AdaDelta(Graph graph, String name, float learningRate) {
* @param epsilon A constant epsilon used to better conditioning the grad update
*/
public AdaDelta(Graph graph, String name, float learningRate, float rho, float epsilon) {
- super(graph, name);
- this.learningRate = learningRate;
+ super(graph, name, learningRate);
this.rho = rho;
this.epsilon = epsilon;
}
@@ -162,7 +158,7 @@ protected <T extends TType> Op applyDense(Output<T> gradient, Output<T> variable
variable,
accumSlot,
accumUpdateSlot,
- tf.dtypes.cast(tf.constant(learningRate), gradient.dataType()),
+ tf.dtypes.cast(getLearningRateOperand(), gradient.dataType()),
tf.dtypes.cast(tf.constant(rho), gradient.dataType()),
tf.dtypes.cast(tf.constant(epsilon), gradient.dataType()),
gradient);
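
The same pattern repeats in every optimizer this PR touches: the per-class learningRate field goes away, each constructor forwards the rate to the Optimizer superclass, and applyDense reads it back through getLearningRateOperand(). The base-class side of the change is not part of this excerpt; the sketch below is only what those call sites imply, assuming the rate is simply wrapped in a constant (the PR may instead back it with a placeholder or variable so it can be updated between steps; everything except the getLearningRateOperand() name and the super(graph, learningRate) signature is an assumption).

import org.tensorflow.Graph;
import org.tensorflow.Operand;
import org.tensorflow.op.Ops;
import org.tensorflow.types.TFloat32;

public abstract class Optimizer {
  protected final Ops tf;
  private final float learningRate;

  // New constructor that subclasses call via super(graph, learningRate);
  // a (Graph, String name, float learningRate) overload exists as well, per the super calls above.
  protected Optimizer(Graph graph, float learningRate) {
    this.tf = Ops.create(graph);
    this.learningRate = learningRate;
  }

  // Operand that each applyDense casts to the gradient's dtype.
  protected Operand<TFloat32> getLearningRateOperand() {
    return tf.constant(learningRate);
  }
}
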
File: AdaGrad.java
@@ -44,8 +44,6 @@ public class AdaGrad extends Optimizer {
public static final float LEARNING_RATE_DEFAULT = 0.001f;
public static final float INITIAL_ACCUMULATOR_DEFAULT = 0.01f;

- private final float learningRate;
-
private final float initialAccumulatorValue;

/**
@@ -76,7 +74,7 @@ public AdaGrad(Graph graph, float learningRate) {
* @throws java.lang.IllegalArgumentException if initialAccumulatorValue is negative
*/
public AdaGrad(Graph graph, float learningRate, float initialAccumulatorValue) {
- super(graph);
+ super(graph, learningRate);
if (initialAccumulatorValue < 0F) {
throw new IllegalArgumentException(
String.format(
@@ -107,13 +105,12 @@ public AdaGrad(Graph graph, String name, float learningRate) {
* @throws java.lang.IllegalArgumentException if initialAccumulatorValue is negative
*/
public AdaGrad(Graph graph, String name, float learningRate, float initialAccumulatorValue) {
- super(graph, name);
+ super(graph, name, learningRate);
if (initialAccumulatorValue < 0F) {
throw new IllegalArgumentException(
String.format(
"initialAccumulatorValue must be non-negative: %f", initialAccumulatorValue));
}
- this.learningRate = learningRate;
this.initialAccumulatorValue = initialAccumulatorValue;
}

@@ -142,7 +139,7 @@ private <T extends TType> void createAdaGradSlot(Output<T> v) {
protected <T extends TType> Op applyDense(Output<T> gradient, Output<T> variable) {
Variable<T> slot = getSlot(variable, ACCUMULATOR).get();
return tf.train.applyAdagrad(
- variable, slot, tf.dtypes.cast(tf.constant(learningRate), gradient.dataType()), gradient);
+ variable, slot, tf.dtypes.cast(getLearningRateOperand(), gradient.dataType()), gradient);
}

/** {@inheritDoc} */
File: AdaGradDA.java
@@ -53,7 +53,6 @@ public class AdaGradDA extends Optimizer {
public static final float L1_STRENGTH_DEFAULT = 0.0F;
public static final float L2_STRENGTH_DEFAULT = 0.0F;

- private final float learningRate;
private final float initialAccumulatorValue;
private final float l1Strength;
private final float l2Strength;
@@ -101,7 +100,7 @@ public AdaGradDA(
float initialAccumulatorValue,
float l1Strength,
float l2Strength) {
- super(graph);
+ super(graph, learningRate);
if (initialAccumulatorValue <= 0F) {
throw new IllegalArgumentException(
String.format(
@@ -115,7 +114,6 @@ public AdaGradDA(
throw new IllegalArgumentException(
String.format("l2Strength must not be negative: %f", l2Strength));
}
- this.learningRate = learningRate;
this.initialAccumulatorValue = initialAccumulatorValue;
this.l1Strength = l1Strength;
this.l2Strength = l2Strength;
@@ -157,7 +155,7 @@ public AdaGradDA(
float initialAccumulatorValue,
float l1Strength,
float l2Strength) {
- super(graph, name);
+ super(graph, name, learningRate);
if (initialAccumulatorValue <= 0F) {
throw new IllegalArgumentException(
String.format(
@@ -171,7 +169,6 @@ public AdaGradDA(
throw new IllegalArgumentException(
String.format("l2Strength must not be negative: %f", l2Strength));
}
- this.learningRate = learningRate;
this.initialAccumulatorValue = initialAccumulatorValue;
this.l1Strength = l1Strength;
this.l2Strength = l2Strength;
@@ -218,7 +215,7 @@ protected <T extends TType> Op applyDense(Output<T> gradient, Output<T> variable
gradSlot,
gradSquaredSlot,
gradient,
- tf.dtypes.cast(tf.constant(learningRate), gradient.dataType()),
+ tf.dtypes.cast(getLearningRateOperand(), gradient.dataType()),
tf.dtypes.cast(tf.constant(l1Strength), gradient.dataType()),
tf.dtypes.cast(tf.constant(l2Strength), gradient.dataType()),
globalStep);
File: Adam.java
@@ -56,15 +56,12 @@ public class Adam extends Optimizer {
public static final float BETA_ONE_DEFAULT = 0.9f;
public static final float BETA_TWO_DEFAULT = 0.999f;

- private final float learningRate;
-
private final float betaOne;

private final float betaTwo;

private final float epsilon;

- private Constant<TFloat32> learningRateConst;
private Constant<TFloat32> epsilonConst;
private Constant<TFloat32> betaOneConst;
private Constant<TFloat32> betaTwoConst;
@@ -102,8 +99,7 @@ public Adam(Graph graph, float learningRate) {
* 1 of the paper. Defaults to 1e-8.
*/
public Adam(Graph graph, float learningRate, float betaOne, float betaTwo, float epsilon) {
- super(graph);
- this.learningRate = learningRate;
+ super(graph, learningRate);
this.betaOne = betaOne;
this.betaTwo = betaTwo;
this.epsilon = epsilon;
@@ -134,8 +130,7 @@ public Adam(Graph graph, String name, float learningRate) {
*/
public Adam(
Graph graph, String name, float learningRate, float betaOne, float betaTwo, float epsilon) {
- super(graph, name);
- this.learningRate = learningRate;
+ super(graph, name, learningRate);
this.betaOne = betaOne;
this.betaTwo = betaTwo;
this.epsilon = epsilon;
@@ -202,7 +197,6 @@ protected void createSlots(List<Output<? extends TType>> variables) {
protected Optional<Op> prepare(String scopeName) {
betaOneConst = tf.constant(betaOne);
betaTwoConst = tf.constant(betaTwo);
- learningRateConst = tf.constant(learningRate);
epsilonConst = tf.constant(epsilon);
return Optional.empty();
}
@@ -233,7 +227,7 @@ protected <T extends TType> Op applyDense(Output<T> gradient, Output<T> variable
secondMomentSlot,
tf.dtypes.cast(betaOnePower, gradient.dataType()),
tf.dtypes.cast(betaTwoPower, gradient.dataType()),
- tf.dtypes.cast(learningRateConst, gradient.dataType()),
+ tf.dtypes.cast(getLearningRateOperand(), gradient.dataType()),
tf.dtypes.cast(betaOneConst, gradient.dataType()),
tf.dtypes.cast(betaTwoConst, gradient.dataType()),
tf.dtypes.cast(epsilonConst, gradient.dataType()),
File: Adamax.java
@@ -33,12 +33,10 @@ public class Adamax extends Optimizer {
public static final float BETA_ONE_DEFAULT = 0.9f;
public static final float BETA_TWO_DEFAULT = 0.999f;

- private float learningRate;
private final float betaOne;
private final float betaTwo;
private final float epsilon;

- private Constant<TFloat32> learningRateConst;
private Constant<TFloat32> epsilonConst;
private Constant<TFloat32> betaOneConst;
private Constant<TFloat32> betaTwoConst;
@@ -94,8 +92,7 @@ public Adamax(Graph graph, String name, float learningRate) {
* @param epsilon A small constant for numerical stability.
*/
public Adamax(Graph graph, float learningRate, float betaOne, float betaTwo, float epsilon) {
- super(graph);
- this.learningRate = learningRate;
+ super(graph, learningRate);
this.betaOne = betaOne;
this.betaTwo = betaTwo;
this.epsilon = epsilon;
@@ -113,8 +110,7 @@ public Adamax(Graph graph, float learningRate, float betaOne, float betaTwo, flo
*/
public Adamax(
Graph graph, String name, float learningRate, float betaOne, float betaTwo, float epsilon) {
- super(graph, name);
- this.learningRate = learningRate;
+ super(graph, name, learningRate);
this.betaOne = betaOne;
this.betaTwo = betaTwo;
this.epsilon = epsilon;
@@ -125,7 +121,6 @@ public Adamax(
protected Optional<Op> prepare(String scopeName) {
betaOneConst = tf.constant(betaOne);
betaTwoConst = tf.constant(betaTwo);
- learningRateConst = tf.constant(learningRate);
epsilonConst = tf.constant(epsilon);

return Optional.empty();
@@ -168,7 +163,7 @@ protected <T extends TType> Op applyDense(Output<T> gradient, Output<T> variable
firstMomentSlot,
secondMomentSlot,
tf.dtypes.cast(betaOnePower, gradient.dataType()),
- tf.dtypes.cast(learningRateConst, gradient.dataType()),
+ tf.dtypes.cast(getLearningRateOperand(), gradient.dataType()),
tf.dtypes.cast(betaOneConst, gradient.dataType()),
tf.dtypes.cast(betaTwoConst, gradient.dataType()),
tf.dtypes.cast(epsilonConst, gradient.dataType()),
File: Ftrl.java
@@ -30,7 +30,6 @@ public class Ftrl extends Optimizer {
public static final float L2STRENGTH_DEFAULT = 0.0f;
public static final float L2_SHRINKAGE_REGULARIZATION_STRENGTH_DEFAULT = 0.0f;

- private float learningRate;
private final float learningRatePower;
private final float initialAccumulatorValue;
private final float l1RegularizationStrength;
@@ -133,8 +132,7 @@ public Ftrl(
float l1Strength,
float l2Strength,
float l2ShrinkageRegularizationStrength) {
- super(graph);
- this.learningRate = learningRate;
+ super(graph, learningRate);
this.learningRatePower = learningRatePower;
this.initialAccumulatorValue = initialAccumulatorValue;
this.l1RegularizationStrength = l1Strength;
@@ -171,8 +169,7 @@ public Ftrl(
float l1Strength,
float l2Strength,
float l2ShrinkageRegularizationStrength) {
- super(graph, name);
- this.learningRate = learningRate;
+ super(graph, name, learningRate);
this.learningRatePower = learningRatePower;
this.initialAccumulatorValue = initialAccumulatorValue;
this.l1RegularizationStrength = l1Strength;
@@ -242,19 +239,17 @@ private <T extends TType> void createFtrlSlot(Output<T> v) {
protected <T extends TType> Op applyDense(Output<T> gradient, Output<T> variable) {
Variable<T> accumSlot = getSlot(variable, ACCUMULATOR).get();
Variable<T> linearSlot = getSlot(variable, LINEAR_ACCUMULATOR).get();
- ApplyFtrl.Options options = ApplyFtrl.useLocking(true);
return this.tf.train.applyFtrl(
variable,
- accumSlot, // accum
- linearSlot, // linear
- gradient, // gradient
- tf.dtypes.cast(tf.constant(learningRate), gradient.dataType()), // lr
- tf.dtypes.cast(tf.constant(l1RegularizationStrength), gradient.dataType()), // l1
- tf.dtypes.cast(tf.constant(l2RegularizationStrength), gradient.dataType()), // l2
- tf.dtypes.cast(
- tf.constant(l2ShrinkageRegularizationStrength), gradient.dataType()), // l2Shrinkage
- tf.dtypes.cast(tf.constant(learningRatePower), gradient.dataType()), // lrPower
- options);
+ accumSlot,
+ linearSlot,
+ gradient,
+ tf.dtypes.cast(this.getLearningRateOperand(), gradient.dataType()),
+ tf.dtypes.cast(tf.constant(l1RegularizationStrength), gradient.dataType()),
+ tf.dtypes.cast(tf.constant(l2RegularizationStrength), gradient.dataType()),
+ tf.dtypes.cast(tf.constant(l2ShrinkageRegularizationStrength), gradient.dataType()),
+ tf.dtypes.cast(tf.constant(learningRatePower), gradient.dataType()),
+ ApplyFtrl.useLocking(true));
}

/** {@inheritDoc} */
File: GradientDescent.java
@@ -28,8 +28,6 @@ public class GradientDescent extends Optimizer {

public static final float LEARNING_RATE_DEFAULT = 0.01f;

- private final float learningRate;
-
/**
* Creates a GradientDescent Optimizer
*
@@ -46,8 +44,7 @@ public GradientDescent(Graph graph) {
* @param learningRate the learning rate, defaults to 0.01
*/
public GradientDescent(Graph graph, float learningRate) {
- super(graph);
- this.learningRate = learningRate;
+ super(graph, learningRate);
}

/**
@@ -58,15 +55,14 @@ public GradientDescent(Graph graph, float learningRate) {
* @param learningRate the learning rate, defaults to 0.01
*/
public GradientDescent(Graph graph, String name, float learningRate) {
- super(graph, name);
- this.learningRate = learningRate;
+ super(graph, name, learningRate);
}

/** {@inheritDoc} */
@Override
protected <T extends TType> Op applyDense(Output<T> gradient, Output<T> variable) {
return tf.train.applyGradientDescent(
- variable, tf.dtypes.cast(tf.constant(learningRate), gradient.dataType()), gradient);
+ variable, tf.dtypes.cast(getLearningRateOperand(), gradient.dataType()), gradient);
}

/** {@inheritDoc} */
File: Momentum.java
@@ -39,8 +39,6 @@ public class Momentum extends Optimizer {

public static final String MOMENTUM = "momentum";

- private final float learningRate;
-
private final float momentum;

private final boolean useNesterov;
@@ -86,8 +84,7 @@ public Momentum(Graph graph, float learningRate, float momentum) {
* @param useNesterov Whether to apply Nesterov momentum. Defaults to false.
*/
public Momentum(Graph graph, float learningRate, float momentum, boolean useNesterov) {
- super(graph);
- this.learningRate = learningRate;
+ super(graph, learningRate);
this.momentum = momentum;
this.useNesterov = useNesterov;
}
@@ -104,8 +101,7 @@ public Momentum(Graph graph, float learningRate, float momentum, boolean useNest
*/
public Momentum(
Graph graph, String name, float learningRate, float momentum, boolean useNesterov) {
- super(graph, name);
- this.learningRate = learningRate;
+ super(graph, name, learningRate);
this.momentum = momentum;
this.useNesterov = useNesterov;
}
@@ -136,7 +132,7 @@ protected <T extends TType> Op applyDense(Output<T> gradient, Output<T> variable
return tf.train.applyMomentum(
variable,
slot,
- tf.dtypes.cast(tf.constant(learningRate), gradient.dataType()),
+ tf.dtypes.cast(getLearningRateOperand(), gradient.dataType()),
gradient,
tf.dtypes.cast(tf.constant(momentum), gradient.dataType()),
ApplyMomentum.useNesterov(useNesterov));
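
From the caller's side nothing changes: the learning rate is still a plain float handed to the optimizer's constructor, and training is driven exactly as before. Below is a minimal usage sketch, assuming the framework's Optimizer.minimize(Operand) method and the Session/Ops calls shown here; the class name, variable, and loss are illustrative only, not part of this PR.

import org.tensorflow.Graph;
import org.tensorflow.Operand;
import org.tensorflow.Session;
import org.tensorflow.framework.optimizers.GradientDescent;
import org.tensorflow.ndarray.Shape;
import org.tensorflow.op.Op;
import org.tensorflow.op.Ops;
import org.tensorflow.op.core.Variable;
import org.tensorflow.types.TFloat32;

public class LearningRateExample {
  public static void main(String[] args) {
    try (Graph graph = new Graph()) {
      Ops tf = Ops.create(graph);
      Variable<TFloat32> weight = tf.variable(Shape.scalar(), TFloat32.DTYPE); // trainable scalar
      Op init = tf.assign(weight, tf.constant(3f));                            // initializer op
      Operand<TFloat32> loss = tf.math.square(weight);                         // minimize w^2
      GradientDescent sgd = new GradientDescent(graph, 0.01f);                 // rate is still a float
      Op trainOp = sgd.minimize(loss);          // internally builds applyGradientDescent with getLearningRateOperand()
      try (Session session = new Session(graph)) {
        session.runner().addTarget(init).run();    // initialize the variable
        session.runner().addTarget(trainOp).run(); // one training step
      }
    }
  }
}
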