Skip to content

Commit 2734f18

Browse files
authored
Monty-related performance improvements (#777)
Trying to figure out massive slowdowns `crypto-primes` experiences for boxed uints (up to 4x). Could be the reason for the slowdowns in `RSA` as well. Public changes: - Added `Monty::div_by_2_assign()` (with a blanket impl). - Added `BoxedUint::inv_mod2k_vartime()`. - Made `BoxedUint::inv_mod2k()` public. - Added `Monty::Multiplier` associated type and `Monty::copy_montgomery_from()` to assist with tight loops (specifically, Lucas test in `crypto-primes`). - Cleaned up AMM, added comments and references, and reduced the size of the internal buffer to N from 2N. Also made it `const fn`. Closes #782 **Note:** the multiplier for `Uint` is called `DynMontyMultiplier`. Not happy with the name, but we already have `MontyMultiplier` as a trait, and it clashes. **Note:** I'm not sure about the exact way `MontyMultiplier` is exposed or about the naming, and also not sure how hazmat we want to make them. Potentially AMM can be exposed too, but it would be good to wrap the results in some struct that will propagate the "reduction level". Not for this PR, I need to finalize the minimum viable solution. Fixes: - Fixed a bug in `BoxedUnsatInt::to_uint()` which created a 64-bit number instead of a 32-bit one on 32-bit targets. Internal: - Added tests for `BoxedUint::inv_mod2k()` and `inv_mod2k_vartime()`. - Removed allocations inside the loop in `BoxedUint::inv_mod2k()`. - Used `inv_mod2k_vartime()` in `BoxedMontyParams::new_vartime()` and `new()` - since it's only vartime in the `k`, which is fixed. - `new_vartime()` can be made even faster (~15% for Uint, 25% for Boxed) if we make a variant of `inv_mod2k` that is vartime in both arguments. Currently added in the commit as `inv_mod2k_full_vartime()` (crate-private). **Can be removed if that's too much detail.** - Removed an unnecessary allocation in Add/SubAssign of `BoxedMontyForm`. Performance notes: - `BoxedUint::div_by_2()` uses `div_by_2_assign()` because it is faster and does not allocate. 
- `Uint::div_by_2()` uses the same approach, gets rid of one addition and one `shr1()`, so it is marginally faster (~10%). - As expected, because of the `inv_mod2k_vartime()` usage, `MontyParams::new/_vartime()` became massively faster (~10x for Uint, ~15x for Boxed, 4096 bits). - I tried using AMM in `Uint`, but it leads to performance degradation for smaller uints (U256). So for now we'll keep the status quo with `Uint` using multiply + reduce. Worth investigating later.
1 parent 07005a4 commit 2734f18

20 files changed

+584
-223
lines changed

benches/monty.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,17 @@ fn bench_montgomery_ops<M: Measurement>(group: &mut BenchmarkGroup<'_, M>) {
180180
)
181181
});
182182

183+
group.bench_function("div_by_2, U256", |b| {
184+
b.iter_batched(
185+
|| {
186+
let x = U256::random_mod(&mut rng, params.modulus().as_nz_ref());
187+
MontyForm::new(&x, params)
188+
},
189+
|x| black_box(x.div_by_2()),
190+
BatchSize::SmallInput,
191+
)
192+
});
193+
183194
#[cfg(feature = "alloc")]
184195
for i in [1, 2, 3, 4, 10, 100] {
185196
group.bench_function(

src/modular/boxed_monty_form.rs

Lines changed: 57 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,9 @@ mod neg;
88
mod pow;
99
mod sub;
1010

11-
use super::{
12-
ConstMontyParams, Retrieve, div_by_2,
13-
reduction::{montgomery_reduction_boxed, montgomery_reduction_boxed_mut},
14-
};
11+
use super::{ConstMontyParams, Retrieve, div_by_2};
12+
use mul::BoxedMontyMultiplier;
13+
1514
use crate::{BoxedUint, Limb, Monty, Odd, Word};
1615
use alloc::sync::Arc;
1716
use subtle::Choice;
@@ -59,7 +58,27 @@ impl BoxedMontyParams {
5958
.rem(&modulus.as_nz_ref().widen(bits_precision * 2))
6059
.shorten(bits_precision);
6160

62-
Self::new_inner(modulus, one, r2)
61+
// The modular inverse should always exist, because it was ensured odd above, which also ensures it's non-zero
62+
let (inv_mod_limb, inv_mod_limb_exists) = modulus.inv_mod2k_vartime(Word::BITS);
63+
debug_assert!(bool::from(inv_mod_limb_exists));
64+
65+
let mod_neg_inv = Limb(Word::MIN.wrapping_sub(inv_mod_limb.limbs[0].0));
66+
67+
let mod_leading_zeros = modulus.as_ref().leading_zeros().min(Word::BITS - 1);
68+
69+
let r3 = {
70+
let mut mm = BoxedMontyMultiplier::new(&modulus, mod_neg_inv);
71+
mm.square(&r2)
72+
};
73+
74+
Self {
75+
modulus,
76+
one,
77+
r2,
78+
r3,
79+
mod_neg_inv,
80+
mod_leading_zeros,
81+
}
6382
}
6483

6584
/// Instantiates a new set of [`BoxedMontyParams`] representing the given `modulus`, which
@@ -82,23 +101,18 @@ impl BoxedMontyParams {
82101
.rem_vartime(&modulus.as_nz_ref().widen(bits_precision * 2))
83102
.shorten(bits_precision);
84103

85-
Self::new_inner(modulus, one, r2)
86-
}
87-
88-
/// Common functionality of `new` and `new_vartime`.
89-
fn new_inner(modulus: Odd<BoxedUint>, one: BoxedUint, r2: BoxedUint) -> Self {
90-
debug_assert_eq!(one.bits_precision(), modulus.bits_precision());
91-
debug_assert_eq!(r2.bits_precision(), modulus.bits_precision());
92-
93-
// If the inverse exists, it means the modulus is odd.
94-
let (inv_mod_limb, modulus_is_odd) = modulus.inv_mod2k(Word::BITS);
95-
debug_assert!(bool::from(modulus_is_odd));
104+
// The modular inverse should always exist, because it was ensured odd above, which also ensures it's non-zero
105+
let (inv_mod_limb, inv_mod_limb_exists) = modulus.inv_mod2k_full_vartime(Word::BITS);
106+
debug_assert!(bool::from(inv_mod_limb_exists));
96107

97108
let mod_neg_inv = Limb(Word::MIN.wrapping_sub(inv_mod_limb.limbs[0].0));
98109

99110
let mod_leading_zeros = modulus.as_ref().leading_zeros().min(Word::BITS - 1);
100111

101-
let r3 = montgomery_reduction_boxed(&mut r2.square(), &modulus, mod_neg_inv);
112+
let r3 = {
113+
let mut mm = BoxedMontyMultiplier::new(&modulus, mod_neg_inv);
114+
mm.square(&r2)
115+
};
102116

103117
Self {
104118
modulus,
@@ -173,19 +187,8 @@ impl BoxedMontyForm {
173187

174188
/// Retrieves the integer currently encoded in this [`BoxedMontyForm`], guaranteed to be reduced.
175189
pub fn retrieve(&self) -> BoxedUint {
176-
let mut montgomery_form = self.montgomery_form.widen(self.bits_precision() * 2);
177-
178-
let ret = montgomery_reduction_boxed(
179-
&mut montgomery_form,
180-
&self.params.modulus,
181-
self.params.mod_neg_inv,
182-
);
183-
184-
#[cfg(feature = "zeroize")]
185-
montgomery_form.zeroize();
186-
187-
debug_assert!(ret < self.params.modulus);
188-
ret
190+
let mut mm = BoxedMontyMultiplier::from(self.params.as_ref());
191+
mm.mul_by_one(&self.montgomery_form)
189192
}
190193

191194
/// Instantiates a new `ConstMontyForm` that represents zero.
@@ -256,6 +259,12 @@ impl BoxedMontyForm {
256259
params: self.params.clone(),
257260
}
258261
}
262+
263+
/// Performs division by 2 inplace, that is finds `x` such that `x + x = self`
264+
/// and writes it into `self`.
265+
pub fn div_by_2_assign(&mut self) {
266+
div_by_2::div_by_2_boxed_assign(&mut self.montgomery_form, &self.params.modulus)
267+
}
259268
}
260269

261270
impl Retrieve for BoxedMontyForm {
@@ -268,6 +277,7 @@ impl Retrieve for BoxedMontyForm {
268277
impl Monty for BoxedMontyForm {
269278
type Integer = BoxedUint;
270279
type Params = BoxedMontyParams;
280+
type Multiplier<'a> = BoxedMontyMultiplier<'a>;
271281

272282
fn new_params_vartime(modulus: Odd<Self::Integer>) -> Self::Params {
273283
BoxedMontyParams::new_vartime(modulus)
@@ -293,6 +303,17 @@ impl Monty for BoxedMontyForm {
293303
&self.montgomery_form
294304
}
295305

306+
fn copy_montgomery_from(&mut self, other: &Self) {
307+
debug_assert_eq!(
308+
self.montgomery_form.bits_precision(),
309+
other.montgomery_form.bits_precision()
310+
);
311+
debug_assert_eq!(self.params, other.params);
312+
self.montgomery_form
313+
.limbs
314+
.copy_from_slice(&other.montgomery_form.limbs);
315+
}
316+
296317
fn double(&self) -> Self {
297318
BoxedMontyForm::double(self)
298319
}
@@ -301,6 +322,10 @@ impl Monty for BoxedMontyForm {
301322
BoxedMontyForm::div_by_2(self)
302323
}
303324

325+
fn div_by_2_assign(&mut self) {
326+
BoxedMontyForm::div_by_2_assign(self)
327+
}
328+
304329
fn lincomb_vartime(products: &[(&Self, &Self)]) -> Self {
305330
BoxedMontyForm::lincomb_vartime(products)
306331
}
@@ -317,11 +342,8 @@ impl Zeroize for BoxedMontyForm {
317342
/// Convert the given integer into the Montgomery domain.
318343
#[inline]
319344
fn convert_to_montgomery(integer: &mut BoxedUint, params: &BoxedMontyParams) {
320-
let mut product = integer.mul(&params.r2);
321-
montgomery_reduction_boxed_mut(&mut product, &params.modulus, params.mod_neg_inv, integer);
322-
323-
#[cfg(feature = "zeroize")]
324-
product.zeroize();
345+
let mut mm = BoxedMontyMultiplier::from(params);
346+
mm.mul_assign(integer, &params.r2);
325347
}
326348

327349
#[cfg(test)]

src/modular/boxed_monty_form/add.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,8 @@ impl Add<BoxedMontyForm> for BoxedMontyForm {
5858
impl AddAssign<&BoxedMontyForm> for BoxedMontyForm {
5959
fn add_assign(&mut self, rhs: &BoxedMontyForm) {
6060
debug_assert_eq!(self.params, rhs.params);
61-
self.montgomery_form = self
62-
.montgomery_form
63-
.add_mod(&rhs.montgomery_form, &self.params.modulus)
61+
self.montgomery_form
62+
.add_mod_assign(&rhs.montgomery_form, &self.params.modulus);
6463
}
6564
}
6665

0 commit comments

Comments
 (0)