@@ -396,7 +396,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
396
396
// but easier on the compiler side
397
397
call_assignment_no_alias (dst, lhs.lazyProduct (rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
398
398
}
399
-
399
+
400
400
template <typename Dst>
401
401
static EIGEN_STRONG_INLINE void addTo (Dst& dst, const Lhs& lhs, const Rhs& rhs)
402
402
{
@@ -410,6 +410,32 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
410
410
// dst.noalias() -= lhs.lazyProduct(rhs);
411
411
call_assignment_no_alias (dst, lhs.lazyProduct (rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
412
412
}
413
+
414
+ // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
415
+ // dst {,+,-}= s * (A.lazyProduct(B))
416
+ // This is a huge benefit for heap-allocated matrix types as it saves one costly allocation.
417
+ // For them, this strategy is also faster than simply by-passing the heap allocation through
418
+ // stack allocation.
419
+ // For fixed-size matrices, this is less obvious: it is sometimes x2 faster, but sometimes x3 slower,
420
+ // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only,
421
+ // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
422
// eval_dynamic overload for a left factor of the form (scalar constant) * expression.
// The lhs type is a CwiseBinaryOp with scalar_product_op whose first operand is a
// CwiseNullaryOp<scalar_constant_op<Scalar1>, Plain1> and whose second operand is Xpr2.
//   dst  - destination passed to call_assignment_no_alias
//   lhs  - scaled left factor; lhs.lhs() holds the constant, lhs.rhs() the unscaled expression
//   rhs  - right factor of the product
//   func - assignment functor forwarded unchanged to call_assignment_no_alias
+ template <typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
423
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
424
+ void eval_dynamic (Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
425
+ const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
426
+ {
427
// Read the scalar from the constant functor (m_other) and apply it to the lazy product
// of the unscaled operand with rhs, i.e. s * (A.lazyProduct(B)) instead of (s*A)*B.
+ call_assignment_no_alias (dst, lhs.lhs ().functor ().m_other * lhs.rhs ().lazyProduct (rhs), func);
428
+ }
429
+
430
+ // Here, we always have LhsT==Lhs, but we need to make it a template type to make the above
431
+ // overload more specialized.
432
// Generic eval_dynamic overload: assigns lhs.lazyProduct(rhs) to dst through func.
//   dst  - destination passed to call_assignment_no_alias
//   lhs  - left factor of the product (LhsT is a deduced template parameter)
//   rhs  - right factor of the product
//   func - assignment functor forwarded unchanged to call_assignment_no_alias
+ template <typename Dst, typename LhsT, typename Func>
433
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
434
+ void eval_dynamic (Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
435
+ {
436
// No scalar factor is extracted here; the product expression is built as-is.
+ call_assignment_no_alias (dst, lhs.lazyProduct (rhs), func);
437
+ }
438
+
413
439
414
440
// template<typename Dst>
415
441
// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
0 commit comments