diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md
new file mode 100644
index 000000000..be23adeb5
--- /dev/null
+++ b/doc/specs/stdlib_intrinsics.md
@@ -0,0 +1,158 @@
+---
+title: intrinsics
+---
+
+# The `stdlib_intrinsics` module
+
+[TOC]
+
+## Introduction
+
+The `stdlib_intrinsics` module provides replacements for some of the well known intrinsic functions found in Fortran compilers for which either a faster and/or more accurate implementation is found which has also proven of interest to the Fortran community.
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `stdlib_sum` function
+
+#### Description
+
+The `stdlib_sum` function can replace the intrinsic `sum` for `real`, `complex` or `integer` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when summing large (e..g, >2**10 elements) arrays, for repetitive summation of smaller arrays consider the classical `sum`.
+
+#### Syntax
+
+`res = ` [[stdlib_intrinsics(module):stdlib_sum(interface)]] ` (x [,mask] )`
+
+`res = ` [[stdlib_intrinsics(module):stdlib_sum(interface)]] ` (x, dim [,mask] )`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Pure function.
+
+#### Argument(s)
+
+`x`: N-D array of either `real`, `complex` or `integer` type. This argument is `intent(in)`.
+
+`dim` (optional): scalar of type `integer` with a value in the range from 1 to n, where n equals the rank of `x`.
+
+`mask` (optional): N-D array of `logical` values, with the same shape as `x`. This argument is `intent(in)`.
+
+#### Output value or Result value
+
+If `dim` is absent, the output is a scalar of the same `type` and `kind` as to that of `x`. Otherwise, an array of rank n-1, where n equals the rank of `x`, and a shape similar to that of `x` with dimension `dim` dropped is returned.
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `stdlib_sum_kahan` function
+
+#### Description
+
+The `stdlib_sum_kahan` function can replace the intrinsic `sum` for `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential complemented by an `elemental` kernel based on the [kahan summation](https://doi.org/10.1145%2F363707.363723) strategy to reduce the round-off error:
+
+```fortran
+elemental subroutine kahan_kernel_<kind>(a,s,c)
+    type(<kind>), intent(in) :: a
+    type(<kind>), intent(inout) :: s
+    type(<kind>), intent(inout) :: c
+    type(<kind>) :: t, y
+    y = a - c
+    t = s + y
+    c = (t - s) - y
+    s = t
+end subroutine
+```
+
+#### Syntax
+
+`res = ` [[stdlib_intrinsics(module):stdlib_sum_kahan(interface)]] ` (x [,mask] )`
+
+`res = ` [[stdlib_intrinsics(module):stdlib_sum_kahan(interface)]] ` (x, dim [,mask] )`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Pure function.
+
+#### Argument(s)
+
+`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+
+`dim` (optional): scalar of type `integer` with a value in the range from 1 to n, where n equals the rank of `x`.
+
+`mask` (optional): N-D array of `logical` values, with the same shape as `x`. This argument is `intent(in)`.
+
+#### Output value or Result value
+
+If `dim` is absent, the output is a scalar of the same type and kind as to that of `x`. Otherwise, an array of rank n-1, where n equals the rank of `x`, and a shape similar to that of `x` with dimension `dim` dropped is returned.
+
+#### Example
+
+```fortran
+{!example/intrinsics/example_sum.f90!}
+```
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `stdlib_dot_product` function
+
+#### Description
+
+The `stdlib_dot_product` function can replace the intrinsic `dot_product` for 1D `real`, `complex` or `integer` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when crunching large arrays, for repetitive products of smaller arrays consider the classical `dot_product`.
+
+#### Syntax
+
+`res = ` [[stdlib_intrinsics(module):stdlib_dot_product(interface)]] ` (x, y)`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Pure function.
+
+#### Argument(s)
+
+`x`: 1D array of either `real`, `complex` or `integer` type. This argument is `intent(in)`.
+
+`y`: 1D array of the same type and kind as `x`. This argument is `intent(in)`.
+
+#### Output value or Result value
+
+The output is a scalar of `type` and `kind` same as to that of `x` and `y`.
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `stdlib_dot_product_kahan` function
+
+#### Description
+
+The `stdlib_dot_product_kahan` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential, complemented by the same `elemental` kernel based on the [kahan summation](https://doi.org/10.1145%2F363707.363723) used for `stdlib_sum` to reduce the round-off error.
+
+#### Syntax
+
+`res = ` [[stdlib_intrinsics(module):stdlib_dot_product_kahan(interface)]] ` (x, y)`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Pure function.
+
+#### Argument(s)
+
+`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+
+`y`: 1D array of the same type and kind as `x`. This argument is `intent(in)`.
+
+#### Output value or Result value
+
+The output is a scalar of the same type and kind as to that of `x` and `y`.
+
+```fortran
+{!example/intrinsics/example_dot_product.f90!}
+```
\ No newline at end of file
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index cbef7f075..968a048b0 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -13,6 +13,7 @@ add_subdirectory(constants)
 add_subdirectory(error)
 add_subdirectory(hashmaps)
 add_subdirectory(hash_procedures)
+add_subdirectory(intrinsics)
 add_subdirectory(io)
 add_subdirectory(linalg)
 add_subdirectory(logger)
diff --git a/example/intrinsics/CMakeLists.txt b/example/intrinsics/CMakeLists.txt
new file mode 100644
index 000000000..1645ba8a1
--- /dev/null
+++ b/example/intrinsics/CMakeLists.txt
@@ -0,0 +1,2 @@
+ADD_EXAMPLE(sum)
+ADD_EXAMPLE(dot_product)
\ No newline at end of file
diff --git a/example/intrinsics/example_dot_product.f90 b/example/intrinsics/example_dot_product.f90
new file mode 100644
index 000000000..fb6e40610
--- /dev/null
+++ b/example/intrinsics/example_dot_product.f90
@@ -0,0 +1,18 @@
+program example_dot_product
+    use stdlib_kinds, only: sp
+    use stdlib_intrinsics, only: stdlib_dot_product, stdlib_dot_product_kahan
+    implicit none
+
+    real(sp), allocatable :: x(:), y(:)
+    real(sp) :: total_prod(3)
+
+    allocate( x(1000), y(1000) )
+    call random_number(x)
+    call random_number(y)
+
+    total_prod(1) = dot_product(x,y) !> compiler intrinsic
+    total_prod(2) = stdlib_dot_product(x,y)       !> chunked summation over inner product
+    total_prod(3) = stdlib_dot_product_kahan(x,y) !> chunked kahan summation over inner product
+    print *, total_prod(1:3)
+    
+end program example_dot_product
\ No newline at end of file
diff --git a/example/intrinsics/example_sum.f90 b/example/intrinsics/example_sum.f90
new file mode 100644
index 000000000..0aec4835c
--- /dev/null
+++ b/example/intrinsics/example_sum.f90
@@ -0,0 +1,17 @@
+program example_sum
+    use stdlib_kinds, only: sp
+    use stdlib_intrinsics, only: stdlib_sum, stdlib_sum_kahan
+    implicit none
+
+    real(sp), allocatable :: x(:)
+    real(sp) :: total_sum(3)
+
+    allocate( x(1000) )
+    call random_number(x)
+
+    total_sum(1) = sum(x)       !> compiler intrinsic
+    total_sum(2) = stdlib_sum(x)      !> chunked summation
+    total_sum(3) = stdlib_sum_kahan(x)!> chunked kahan summation
+    print *, total_sum(1:3)
+    
+end program example_sum
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d8a4ac1a9..7778097f4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,6 +16,9 @@ set(fppFiles
     stdlib_hash_64bit_fnv.fypp
     stdlib_hash_64bit_pengy.fypp
     stdlib_hash_64bit_spookyv2.fypp
+    stdlib_intrinsics_dot_product.fypp
+    stdlib_intrinsics_sum.fypp
+    stdlib_intrinsics.fypp
     stdlib_io.fypp
     stdlib_io_npy.fypp
     stdlib_io_npy_load.fypp
diff --git a/src/stdlib_constants.fypp b/src/stdlib_constants.fypp
index e169dbb3b..df61e11b2 100644
--- a/src/stdlib_constants.fypp
+++ b/src/stdlib_constants.fypp
@@ -1,9 +1,13 @@
 #:include "common.fypp"
 #:set KINDS = REAL_KINDS
+#:set I_KINDS_TYPES = list(zip(INT_KINDS, INT_TYPES, INT_KINDS))
+#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
+#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
+
 module stdlib_constants
     !! Constants
     !! ([Specification](../page/specs/stdlib_constants.html))
-    use stdlib_kinds, only: #{for k in KINDS[:-1]}#${k}$, #{endfor}#${KINDS[-1]}$
+    use stdlib_kinds
     use stdlib_codata, only: SPEED_OF_LIGHT_IN_VACUUM, &
                              VACUUM_ELECTRIC_PERMITTIVITY, &
                              VACUUM_MAG_PERMEABILITY, &
@@ -60,5 +64,17 @@ module stdlib_constants
     real(dp), parameter, public :: u = ATOMIC_MASS_CONSTANT%value !! Atomic mass constant
 
     ! Additional constants if needed
+    #:for k, t, s in I_KINDS_TYPES
+    ${t}$, parameter, public :: zero_${s}$ = 0_${k}$
+    ${t}$, parameter, public :: one_${s}$  = 1_${k}$
+    #:endfor
+    #:for k, t, s in R_KINDS_TYPES
+    ${t}$, parameter, public :: zero_${s}$ = 0._${k}$
+    ${t}$, parameter, public :: one_${s}$  = 1._${k}$
+    #:endfor
+    #:for k, t, s in C_KINDS_TYPES
+    ${t}$, parameter, public :: zero_${s}$ = (0._${k}$,0._${k}$)
+    ${t}$, parameter, public :: one_${s}$  = (1._${k}$,0._${k}$)
+    #:endfor
 
 end module stdlib_constants
diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp
new file mode 100644
index 000000000..b2c16a5a6
--- /dev/null
+++ b/src/stdlib_intrinsics.fypp
@@ -0,0 +1,176 @@
+#:include "common.fypp"
+#:set I_KINDS_TYPES = list(zip(INT_KINDS, INT_TYPES, INT_KINDS))
+#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
+#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
+#:set RANKS = range(2, MAXRANK + 1)
+
+module stdlib_intrinsics
+    !!Alternative implementations of some Fortran intrinsic functions offering either faster and/or more accurate evaluation.
+    !! ([Specification](../page/specs/stdlib_intrinsics.html))
+    use stdlib_kinds
+    implicit none
+    private
+
+    interface stdlib_sum
+        !! version: experimental 
+        !!
+        !!### Summary 
+        !! Sum elements of rank N arrays. 
+        !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_sum))
+        !!
+        !!### Description
+        !! 
+        !! This interface provides standard conforming call for sum of elements of any rank.
+        !! The 1-D base implementation follows a chunked approach for optimizing performance and increasing accuracy.
+        !! The `N-D` interfaces calls upon the `(N-1)-D` implementation. 
+        !! Supported data types include `real`, `complex` and `integer`.
+        !!
+        #:for k, t, s in I_KINDS_TYPES + R_KINDS_TYPES + C_KINDS_TYPES
+        pure module function stdlib_sum_1d_${s}$(a) result(s)
+            ${t}$, intent(in) :: a(:)
+            ${t}$ :: s
+        end function
+        pure module function stdlib_sum_1d_${s}$_mask(a,mask) result(s)
+            ${t}$, intent(in) :: a(:)
+            logical, intent(in) :: mask(:)
+            ${t}$ :: s
+        end function
+        #:for rank in RANKS
+        pure module function stdlib_sum_${rank}$d_${s}$( x, mask ) result( s )
+            ${t}$, intent(in) :: x${ranksuffix(rank)}$
+            logical, intent(in), optional :: mask${ranksuffix(rank)}$
+            ${t}$ :: s
+        end function
+        pure module function stdlib_sum_${rank}$d_dim_${s}$( x , dim, mask ) result( s )
+            ${t}$, intent(in) :: x${ranksuffix(rank)}$
+            integer, intent(in):: dim
+            logical, intent(in), optional :: mask${ranksuffix(rank)}$
+            ${t}$ :: s${reduced_shape('x', rank, 'dim')}$
+        end function
+        #:endfor
+        #:endfor
+    end interface
+    public :: stdlib_sum
+
+    interface stdlib_sum_kahan
+        !! version: experimental 
+        !!
+        !!### Summary 
+        !! Sum elements of rank N arrays. 
+        !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_sum_kahan))
+        !!
+        !!### Description
+        !! 
+        !! This interface provides standard conforming call for sum of elements of any rank.
+        !! The 1-D base implementation follows a chunked approach combined with a kahan kernel for optimizing performance and increasing accuracy.
+        !! The `N-D` interfaces calls upon the `(N-1)-D` implementation. 
+        !! Supported data types include `real` and `complex`.
+        !!
+        #:for k, t, s in R_KINDS_TYPES + C_KINDS_TYPES
+        pure module function stdlib_sum_kahan_1d_${s}$(a) result(s)
+            ${t}$, intent(in) :: a(:)
+            ${t}$ :: s
+        end function
+        pure module function stdlib_sum_kahan_1d_${s}$_mask(a,mask) result(s)
+            ${t}$, intent(in) :: a(:)
+            logical, intent(in) :: mask(:)
+            ${t}$ :: s
+        end function
+        #:for rank in RANKS
+        pure module function stdlib_sum_kahan_${rank}$d_${s}$( x, mask ) result( s )
+            ${t}$, intent(in) :: x${ranksuffix(rank)}$
+            logical, intent(in), optional :: mask${ranksuffix(rank)}$
+            ${t}$ :: s
+        end function
+        pure module function stdlib_sum_kahan_${rank}$d_dim_${s}$( x , dim, mask ) result( s )
+            ${t}$, intent(in) :: x${ranksuffix(rank)}$
+            integer, intent(in):: dim
+            logical, intent(in), optional :: mask${ranksuffix(rank)}$
+            ${t}$ :: s${reduced_shape('x', rank, 'dim')}$
+        end function
+        #:endfor
+        #:endfor
+    end interface
+    public :: stdlib_sum_kahan
+
+    interface stdlib_dot_product
+        !! version: experimental 
+        !!
+        !!### Summary 
+        !! dot_product of rank 1 arrays. 
+        !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_dot_product))
+        !!
+        !!### Description
+        !! 
+        !! compute the dot_product of rank 1 arrays.
+        !! The 1-D base implementation follows a chunked approach for optimizing performance and increasing accuracy.
+        !! Supported data types include `real`, `complex` and `integer`.
+        !!
+        #:for k, t, s in I_KINDS_TYPES + R_KINDS_TYPES + C_KINDS_TYPES
+        pure module function stdlib_dot_product_${s}$(a,b) result(p)
+            ${t}$, intent(in) :: a(:)
+            ${t}$, intent(in) :: b(:)
+            ${t}$ :: p
+        end function
+        #:endfor
+    end interface
+    public :: stdlib_dot_product
+
+    interface stdlib_dot_product_kahan
+        !! version: experimental 
+        !!
+        !!### Summary 
+        !! dot_product of rank 1 arrays. 
+        !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_dot_product_kahan))
+        !!
+        !!### Description
+        !! 
+        !! compute the dot_product of rank 1 arrays.
+        !! The implementation follows a chunked approach combined with a kahan kernel for optimizing performance and increasing accuracy.
+        !! Supported data types include `real` and `complex`.
+        !!
+        #:for k, t, s in R_KINDS_TYPES + C_KINDS_TYPES
+        pure module function stdlib_dot_product_kahan_${s}$(a,b) result(p)
+            ${t}$, intent(in) :: a(:)
+            ${t}$, intent(in) :: b(:)
+            ${t}$ :: p
+        end function
+        #:endfor
+    end interface
+    public :: stdlib_dot_product_kahan
+
+    interface kahan_kernel 
+        #:for k, t, s in R_KINDS_TYPES + C_KINDS_TYPES
+        module procedure :: kahan_kernel_${s}$
+        module procedure :: kahan_kernel_m_${s}$
+        #:endfor
+    end interface
+    public :: kahan_kernel
+    
+contains
+
+#:for k, t, s in R_KINDS_TYPES + C_KINDS_TYPES
+elemental subroutine kahan_kernel_${s}$(a,s,c)
+    ${t}$, intent(in) :: a
+    ${t}$, intent(inout) :: s
+    ${t}$, intent(inout) :: c
+    ${t}$ :: t, y
+    y = a - c
+    t = s + y
+    c = (t - s) - y
+    s = t
+end subroutine  
+elemental subroutine kahan_kernel_m_${s}$(a,s,c,m)
+    ${t}$, intent(in) :: a
+    ${t}$, intent(inout) :: s
+    ${t}$, intent(inout) :: c
+    logical, intent(in) :: m
+    ${t}$ :: t, y
+    y = a - c
+    t = s + y
+    c = (t - s) - y
+    s = merge( s , t , m )
+end subroutine 
+#:endfor
+
+end module stdlib_intrinsics
diff --git a/src/stdlib_intrinsics_dot_product.fypp b/src/stdlib_intrinsics_dot_product.fypp
new file mode 100644
index 000000000..ce6188c8a
--- /dev/null
+++ b/src/stdlib_intrinsics_dot_product.fypp
@@ -0,0 +1,77 @@
+#:include "common.fypp"
+#:set I_KINDS_TYPES = list(zip(INT_KINDS, INT_TYPES, INT_KINDS))
+#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
+#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
+
+#:def cnjg(type,expression)
+#:if 'complex' in type
+conjg(${expression}$)
+#:else
+${expression}$
+#:endif
+#:enddef
+
+submodule(stdlib_intrinsics) stdlib_intrinsics_dot_product
+    !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations.
+    !! ([Specification](../page/specs/stdlib_intrinsics.html))
+    use stdlib_kinds
+    use stdlib_constants
+    implicit none
+
+    integer, parameter :: ilp = int64
+    
+contains
+! This implementation is based on https://github.com/jalvesz/fast_math
+#:for k, t, s in I_KINDS_TYPES + R_KINDS_TYPES + C_KINDS_TYPES
+pure module function stdlib_dot_product_${s}$(a,b) result(p)
+    integer(ilp), parameter :: chunk = 64
+    ${t}$, intent(in) :: a(:)
+    ${t}$, intent(in) :: b(:)
+    ${t}$ :: p
+    ${t}$ :: abatch(chunk)
+    integer(ilp) :: i, n, r
+    ! -----------------------------
+    n = size(a,kind=ilp)
+    r = mod(n,chunk)
+
+    abatch(1:r)       = a(1:r)*${cnjg(t,'b(1:r)')}$
+    abatch(r+1:chunk) = zero_${s}$
+    do i = r+1, n-r, chunk
+        abatch(1:chunk) = abatch(1:chunk) + a(i:i+chunk-1)*${cnjg(t,'b(i:i+chunk-1)')}$
+    end do
+
+    p = zero_${s}$
+    do i = 1, chunk/2
+        p = p + abatch(i)+abatch(chunk/2+i)
+    end do
+end function
+#:endfor
+
+#:for k, t, s in R_KINDS_TYPES + C_KINDS_TYPES
+pure module function stdlib_dot_product_kahan_${s}$(a,b) result(p)
+    integer(ilp), parameter :: chunk = 64
+    ${t}$, intent(in) :: a(:)
+    ${t}$, intent(in) :: b(:)
+    ${t}$ :: p
+    ${t}$ :: abatch(chunk)
+    ${t}$ :: cbatch(chunk)
+    integer(ilp) :: i, n, r
+    ! -----------------------------
+    n = size(a,kind=ilp)
+    r = mod(n,chunk)
+
+    abatch(1:r)       = a(1:r)*${cnjg(t,'b(1:r)')}$
+    abatch(r+1:chunk) = zero_${s}$
+    cbatch = zero_${s}$
+    do i = r+1, n-r, chunk
+        call kahan_kernel( a(i:i+chunk-1)*${cnjg(t,'b(i:i+chunk-1)')}$ , abatch(1:chunk) , cbatch(1:chunk) )
+    end do     
+
+    p = zero_${s}$
+    do i = 1, chunk
+        call kahan_kernel( abatch(i) , p , cbatch(i) )
+    end do      
+end function
+#:endfor
+
+end submodule stdlib_intrinsics_dot_product
diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp
new file mode 100644
index 000000000..373efcd39
--- /dev/null
+++ b/src/stdlib_intrinsics_sum.fypp
@@ -0,0 +1,266 @@
+#:include "common.fypp"
+#:set I_KINDS_TYPES = list(zip(INT_KINDS, INT_TYPES, INT_KINDS))
+#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
+#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
+#:set RANKS = range(2, MAXRANK + 1)
+
+submodule(stdlib_intrinsics) stdlib_intrinsics_sum
+    !! ([Specification](../page/specs/stdlib_intrinsics.html))
+    use stdlib_kinds
+    use stdlib_constants
+    implicit none
+
+    integer, parameter :: ilp = int64
+    
+contains
+
+!================= 1D Base implementations ============
+! This implementation is based on https://github.com/jalvesz/fast_math
+#:for k, t, s in I_KINDS_TYPES + R_KINDS_TYPES + C_KINDS_TYPES
+pure module function stdlib_sum_1d_${s}$(a) result(s)
+    integer(ilp), parameter :: chunk = 64
+    ${t}$, intent(in) :: a(:)
+    ${t}$ :: s
+    ${t}$ :: abatch(chunk)
+    integer(ilp) :: i, n, r
+    ! -----------------------------
+    n = size(a,kind=ilp)
+    r = mod(n,chunk)
+    
+    abatch(1:r)       = a(1:r)
+    abatch(r+1:chunk) = zero_${s}$
+    do i = r+1, n-r, chunk
+     abatch(1:chunk) = abatch(1:chunk) + a(i:i+chunk-1)
+    end do
+
+    s = zero_${s}$
+    do i = 1, chunk/2
+      s = s + abatch(i)+abatch(chunk/2+i)
+    end do
+end function
+
+pure module function stdlib_sum_1d_${s}$_mask(a,mask) result(s)
+    integer(ilp), parameter :: chunk = 64
+    ${t}$, intent(in) :: a(:)
+    logical, intent(in) :: mask(:)
+    ${t}$ :: s
+    ${t}$ :: abatch(chunk)
+    integer(ilp) :: i, n, r
+    ! -----------------------------
+    n = size(a,kind=ilp)
+    r = mod(n,chunk)
+
+    abatch(1:r)       = merge( zero_${s}$ , a(1:r) , mask(1:r) )
+    abatch(r+1:chunk) = zero_${s}$
+    do i = r+1, n-r, chunk
+        abatch(1:chunk) = abatch(1:chunk) + merge( zero_${s}$ , a(i:i+chunk-1), mask(i:i+chunk-1) )
+    end do
+    
+    s = zero_${s}$
+    do i = 1, chunk/2
+        s = s + abatch(i)+abatch(chunk/2+i)
+    end do
+end function
+#:endfor
+
+#:for k, t, s in R_KINDS_TYPES + C_KINDS_TYPES
+pure module function stdlib_sum_kahan_1d_${s}$(a) result(s)
+    integer(ilp), parameter :: chunk = 64
+    ${t}$, intent(in) :: a(:)
+    ${t}$ :: s
+    ${t}$ :: sbatch(chunk)
+    ${t}$ :: cbatch(chunk)
+    integer(ilp) :: i, n, r
+    ! -----------------------------
+    n = size(a,kind=ilp)
+    r = mod(n,chunk)
+
+    sbatch(1:r) = a(1:r)
+    sbatch(r+1:chunk)  = zero_${s}$
+    cbatch = zero_${s}$
+    do i = r+1, n-r, chunk
+      call kahan_kernel( a(i:i+chunk-1) , sbatch(1:chunk) , cbatch(1:chunk) )
+    end do 
+
+    s = zero_${s}$
+    do i = 1,chunk
+        call kahan_kernel( sbatch(i) , s , cbatch(i) )
+    end do
+end function
+
+pure module function stdlib_sum_kahan_1d_${s}$_mask(a,mask) result(s)
+    integer(ilp), parameter :: chunk = 64
+    ${t}$, intent(in) :: a(:)
+    logical, intent(in) :: mask(:)
+    ${t}$ :: s
+    ${t}$ :: sbatch(chunk)
+    ${t}$ :: cbatch(chunk)
+    integer(ilp) :: i, n, r
+    ! -----------------------------
+    n = size(a,kind=ilp)
+    r = mod(n,chunk)
+    
+    sbatch(1:r) = merge( zero_${s}$ , a(1:r) , mask(1:r) )
+    sbatch(r+1:chunk)  = zero_${s}$
+    cbatch = zero_${s}$
+    do i = r+1, n-r, chunk
+      call kahan_kernel( a(i:i+chunk-1) , sbatch(1:chunk) , cbatch(1:chunk), mask(i:i+chunk-1) )
+    end do 
+
+    s = zero_${s}$
+    do i = 1,chunk
+        call kahan_kernel( sbatch(i) , s , cbatch(i) )
+    end do
+end function
+#:endfor
+
+!================= N-D implementations ============
+#:for k, t, s in I_KINDS_TYPES + R_KINDS_TYPES + C_KINDS_TYPES
+#:for rank in RANKS
+pure module function stdlib_sum_${rank}$d_${s}$( x , mask ) result( s )
+    ${t}$, intent(in) :: x${ranksuffix(rank)}$
+    logical, intent(in), optional :: mask${ranksuffix(rank)}$
+    ${t}$ :: s
+    if(.not.present(mask)) then
+        s = sum_recast(x,size(x,kind=ilp))
+    else
+        s = sum_recast_mask(x,mask,size(x,kind=ilp))
+    end if
+contains
+    pure ${t}$ function sum_recast(b,n)
+      integer(ilp), intent(in) :: n
+      ${t}$, intent(in) :: b(n)
+      sum_recast = stdlib_sum(b)
+    end function
+    pure ${t}$ function sum_recast_mask(b,m,n)
+      integer(ilp), intent(in) :: n
+      ${t}$, intent(in) :: b(n)
+      logical, intent(in) :: m(n)
+      sum_recast_mask = stdlib_sum(b,m)
+    end function
+end function
+
+pure module function stdlib_sum_${rank}$d_dim_${s}$( x , dim, mask ) result( s )
+    ${t}$, intent(in) :: x${ranksuffix(rank)}$
+    integer, intent(in):: dim
+    logical, intent(in), optional :: mask${ranksuffix(rank)}$
+    ${t}$ :: s${reduced_shape('x', rank, 'dim')}$
+    integer :: j 
+
+    if(.not.present(mask)) then
+        if(dim<${rank}$)then
+            do j = 1, size(x,dim=${rank}$)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(rank, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim )
+                #:endif
+            end do
+        else
+            do j = 1, size(x,dim=1)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(1, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$ )
+                #:endif
+            end do
+        end if
+    else 
+        if(dim<${rank}$)then
+            do j = 1, size(x,dim=${rank}$)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(rank, 'j')])}$, mask=mask${select_subarray(rank, [(rank, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim, mask=mask${select_subarray(rank, [(rank, 'j')])}$ )
+                #:endif
+            end do
+        else
+            do j = 1, size(x,dim=1)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(1, 'j')])}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ )
+                #:endif
+            end do
+        end if
+    end if
+
+end function
+#:endfor
+#:endfor
+
+#:for k, t, s in R_KINDS_TYPES + C_KINDS_TYPES
+#:for rank in RANKS
+pure module function stdlib_sum_kahan_${rank}$d_${s}$( x , mask ) result( s )
+    ${t}$, intent(in) :: x${ranksuffix(rank)}$
+    logical, intent(in), optional :: mask${ranksuffix(rank)}$
+    ${t}$ :: s
+    if(.not.present(mask)) then
+        s = sum_recast(x,size(x,kind=ilp))
+    else
+        s = sum_recast_mask(x,mask,size(x,kind=ilp))
+    end if
+contains
+    pure ${t}$ function sum_recast(b,n)
+      integer(ilp), intent(in) :: n
+      ${t}$, intent(in) :: b(n)
+      sum_recast = stdlib_sum_kahan(b)
+    end function
+    pure ${t}$ function sum_recast_mask(b,m,n)
+      integer(ilp), intent(in) :: n
+      ${t}$, intent(in) :: b(n)
+      logical, intent(in) :: m(n)
+      sum_recast_mask = stdlib_sum_kahan(b,m)
+    end function
+end function
+
+pure module function stdlib_sum_kahan_${rank}$d_dim_${s}$( x , dim, mask ) result( s )
+    ${t}$, intent(in) :: x${ranksuffix(rank)}$
+    integer, intent(in):: dim
+    logical, intent(in), optional :: mask${ranksuffix(rank)}$
+    ${t}$ :: s${reduced_shape('x', rank, 'dim')}$
+    integer :: j 
+
+    if(.not.present(mask)) then
+        if(dim<${rank}$)then
+            do j = 1, size(x,dim=${rank}$)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(rank, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim )
+                #:endif
+            end do
+        else
+            do j = 1, size(x,dim=1)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(1, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$ )
+                #:endif
+            end do
+        end if
+    else 
+        if(dim<${rank}$)then
+            do j = 1, size(x,dim=${rank}$)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(rank, 'j')])}$, mask=mask${select_subarray(rank, [(rank, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim, mask=mask${select_subarray(rank, [(rank, 'j')])}$ )
+                #:endif
+            end do
+        else
+            do j = 1, size(x,dim=1)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(1, 'j')])}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ )
+                #:endif
+            end do
+        end if
+    end if
+
+end function
+#:endfor
+#:endfor
+
+end submodule stdlib_intrinsics_sum
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 4d83548db..566c0e6d8 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -17,6 +17,7 @@ add_subdirectory(constants)
 add_subdirectory(hash_functions)
 add_subdirectory(hash_functions_perf)
 add_subdirectory(hashmaps)
+add_subdirectory(intrinsics)
 add_subdirectory(io)
 add_subdirectory(linalg)
 add_subdirectory(logger)
diff --git a/test/intrinsics/CMakeLists.txt b/test/intrinsics/CMakeLists.txt
new file mode 100644
index 000000000..cc5e9fb87
--- /dev/null
+++ b/test/intrinsics/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(
+  fppFiles
+  "test_intrinsics.fypp"
+)
+fypp_f90("${fyppFlags}" "${fppFiles}" outFiles)
+
+ADDTEST(intrinsics)
diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp
new file mode 100644
index 000000000..8aefe09d3
--- /dev/null
+++ b/test/intrinsics/test_intrinsics.fypp
@@ -0,0 +1,279 @@
+#:include "common.fypp"
+#:set I_KINDS_TYPES = list(zip(INT_KINDS, INT_TYPES, INT_KINDS))
+#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
+#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
+
+module test_intrinsics
+    use testdrive, only : new_unittest, unittest_type, error_type, check, skip_test
+    use stdlib_kinds, only: sp, dp, xdp, qp, int8, int16, int32, int64
+    use stdlib_intrinsics
+    use stdlib_math, only: swap
+    implicit none
+    
+contains
+
+!> Collect all exported unit tests
+subroutine collect_suite(testsuite)
+    !> Collection of tests
+    type(unittest_type), allocatable, intent(out) :: testsuite(:)
+
+    testsuite = [ &
+        new_unittest('sum', test_sum), &
+        new_unittest('dot_product', test_dot_product) &
+    ]
+end subroutine
+
+subroutine test_sum(error)
+    !> Error handling
+    type(error_type), allocatable, intent(out) :: error
+
+    !> Internal parameters and variables
+    integer, parameter :: n = 1e3, ncalc = 3
+    real(sp) :: u
+    integer :: iter, i, j
+    !====================================================================================
+    #:for k, t, s in I_KINDS_TYPES
+    #:if not k in ["int8","int16"] # skip int8 and int16
+    block
+        ${t}$, allocatable :: x(:)
+        ${t}$, parameter :: total_sum = 0_${k}$
+        ${t}$ :: xsum(ncalc), err(ncalc)
+        logical, allocatable :: mask(:), nmask(:)
+
+        allocate(x(n+1))
+        do i = 1, n+1
+            x(i) = i - n/2 - 1
+        end do
+        allocate(mask(n+1),source=.false.); mask(1:n+1:2) = .true.
+        allocate(nmask(n+1)); nmask = .not.mask
+        ! scramble array
+        do i = 1, n+1
+            call random_number(u) 
+            j = 1 + floor(n*u)
+            call swap( x(i), x(j) )
+            call swap( mask(i), mask(j) )
+            call swap( nmask(i), nmask(j) )
+        end do
+        
+        xsum(1) = sum(x)        ! compiler intrinsic
+        xsum(2) = stdlib_sum(x) ! chunked summation
+        err(1:2) = abs(total_sum-xsum(1:2))
+        
+        call check(error, all(err(1:2)==0_${k}$) , "real sum is not accurate" )
+        if (allocated(error)) return
+
+        xsum(1) = sum(x,mask)+sum(x,nmask) ! compiler intrinsic
+        xsum(2) = stdlib_sum(x,mask)+stdlib_sum(x,nmask) ! chunked summation
+        err(1:2) = abs(total_sum-xsum(1:2))
+        
+        call check(error, all(err(1:2)==0_${k}$) , "masked real sum is not accurate" )
+        if (allocated(error)) return
+    end block
+    #:endif
+    #:endfor
+
+    #:for k, t, s in R_KINDS_TYPES
+    block
+        ${t}$, allocatable :: x(:)
+        ${t}$, parameter :: total_sum = 4*atan(1._${k}$), tolerance = epsilon(1._${k}$)*100
+        ${t}$ :: xsum(ncalc), err(ncalc)
+        logical, allocatable :: mask(:), nmask(:)
+
+        allocate(x(n))
+        do i = 1, n 
+            x(i) = 8*atan(1._${k}$)*(real(i,kind=${k}$)-0.5_${k}$)/real(n,kind=${k}$)**2
+        end do
+        allocate(mask(n),source=.false.); mask(1:n:2) = .true.
+        allocate(nmask(n)); nmask = .not.mask
+        ! scramble array
+        do i = 1, n
+            call random_number(u) 
+            j = 1 + floor(n*u)
+            call swap( x(i), x(j) )
+            call swap( mask(i), mask(j) )
+            call swap( nmask(i), nmask(j) )
+        end do
+        
+        xsum(1) = sum(x)        ! compiler intrinsic
+        xsum(2) = stdlib_sum_kahan(x) ! chunked Kahan summation
+        xsum(3) = stdlib_sum(x)       ! chunked summation
+        err(1:ncalc) = abs(1._${k}$-xsum(1:ncalc)/total_sum)
+        
+        call check(error, all(err(:)<tolerance) , "real sum is not accurate" )
+        if (allocated(error)) return
+
+        xsum(1) = sum(x,mask)+sum(x,nmask) ! compiler intrinsic
+        xsum(2) = stdlib_sum_kahan(x,mask)+stdlib_sum_kahan(x,nmask) ! chunked Kahan summation
+        xsum(3) = stdlib_sum(x,mask)+stdlib_sum(x,nmask) ! chunked summation
+        err(1:ncalc) = abs(1._${k}$-xsum(1:ncalc)/total_sum)
+        
+        call check(error, all(err(:)<tolerance) , "masked real sum is not accurate" )
+        if (allocated(error)) return
+    end block
+    #:endfor
+
+    #:for k, t, s in C_KINDS_TYPES
+    block
+        ${t}$, allocatable :: x(:)
+        real(${k}$), parameter :: total_sum = 4*atan(1._${k}$), tolerance = epsilon(1._${k}$)*100
+        real(${k}$) :: err(ncalc)
+        ${t}$ :: xsum(ncalc)
+        logical, allocatable :: mask(:), nmask(:)
+
+        allocate(x(n))
+        do i = 1, n
+            x(i) = (8*atan(1._${k}$)*(real(i,kind=${k}$)-0.5_${k}$)/n**2)*cmplx(1._${k}$,1._${k}$)
+        end do
+        
+        allocate(mask(n),source=.false.); mask(1:n:2) = .true.
+        allocate(nmask(n)); nmask = .not.mask
+        ! scramble array
+        do i = 1, n
+            call random_number(u) 
+            j = 1 + floor(n*u)
+            call swap( x(i), x(j) )
+            call swap( mask(i), mask(j) )
+            call swap( nmask(i), nmask(j) )
+        end do
+        
+        xsum(1) = sum(x)        ! compiler intrinsic
+        xsum(2) = stdlib_sum_kahan(x) ! chunked Kahan summation
+        xsum(3) = stdlib_sum(x)       ! chunked summation
+        err(1:ncalc) = abs(1._${k}$-(xsum(1:ncalc)%re)/total_sum)
+        
+        
+        call check(error, all(err(:)<tolerance) , "complex sum is not accurate" )
+        if (allocated(error)) return
+
+        xsum(1) = sum(x,mask)+sum(x,nmask) ! compiler intrinsic
+        xsum(2) = stdlib_sum_kahan(x,mask)+stdlib_sum_kahan(x,nmask) ! chunked Kahan summation
+        xsum(3) = stdlib_sum(x,mask)+stdlib_sum(x,nmask) ! chunked summation
+        err(1:ncalc) = abs(1._${k}$-(xsum(1:ncalc)%re)/total_sum)
+        
+        call check(error, all(err(:)<tolerance) , "complex masked sum is not accurate" )
+        if (allocated(error)) return
+    end block
+    #:endfor
+
+    ndarray : block
+        use stdlib_strings, only: to_string
+        real(sp), allocatable :: x(:,:,:)
+        real(sp), parameter :: tolerance = epsilon(1._sp)*100
+        integer :: i 
+
+        allocate(x(100,100,10))
+        call random_number(x)
+        !> sum all elements
+        call check(error, abs( sum(x) - stdlib_sum(x) )<tolerance*size(x) , "KO: full ndarray stdlib_sum" )
+        if (allocated(error)) return
+
+        call check(error, abs( sum(x) - stdlib_sum_kahan(x) )<tolerance*size(x) , "KO: full ndarray stdlib_sum_kahan" )
+        if (allocated(error)) return
+
+        !> sum over specific rank dim
+        do i = 1, rank(x)
+            call check(error, norm2( sum(x,dim=i) - stdlib_sum(x,dim=i) )<tolerance*size(x) ,&
+                        "KO: ndarray stdlib_sum over dim "//to_string(i) )
+            if (allocated(error)) return
+
+            call check(error, norm2( sum(x,dim=i) - stdlib_sum_kahan(x,dim=i) )<tolerance*size(x) ,&
+                        "KO: ndarray stdlib_sum_kahan over dim "//to_string(i) )
+            if (allocated(error)) return
+        end do
+    end block ndarray
+
+end subroutine
+
+subroutine test_dot_product(error)
+    !> Error handling
+    type(error_type), allocatable, intent(out) :: error
+
+    !> Internal parameters and variables
+    integer, parameter :: n = 1e3, ncalc = 3
+    real(sp) :: u
+    integer :: iter, i, j
+    !====================================================================================
+    #:for k, t, s in R_KINDS_TYPES
+    block
+        ${t}$, allocatable :: x(:)
+        ${t}$, parameter :: total_sum = 4*atan(1._${k}$), tolerance = epsilon(1._${k}$)*100
+        ${t}$ :: xsum(ncalc), err(ncalc)
+
+        allocate(x(n))
+        do i = 1, n 
+            x(i) = 2*sqrt( 2*atan(1._${k}$)*(real(i,kind=${k}$)-0.5_${k}$) )/n
+        end do
+        ! scramble array
+        do i = 1, n
+            call random_number(u) 
+            j = 1 + floor(n*u)
+            call swap( x(i), x(j) )
+        end do
+        
+        xsum(1) = dot_product(x,x) ! compiler intrinsic
+        xsum(2) = stdlib_dot_product_kahan(x,x) ! chunked Kahan summation
+        xsum(3) = stdlib_dot_product(x,x)       ! chunked summation
+        err(1:ncalc) = abs(1._${k}$-xsum(1:ncalc)/total_sum)
+        
+        call check(error, all(err(:)<tolerance) , "real dot_product is not accurate" )
+        if (allocated(error)) return
+    end block
+    #:endfor
+
+    #:for k, t, s in C_KINDS_TYPES
+    block
+        ${t}$, allocatable :: x(:)
+        real(${k}$), parameter :: total_sum = 4*atan(1._${k}$), tolerance = epsilon(1._${k}$)*100
+        real(${k}$) :: err(ncalc)
+        ${t}$ :: xsum(ncalc)
+
+        allocate(x(n))
+        do i = 1, n
+            x(i) = ( 2*sqrt( 2*atan(1._${k}$)*(real(i,kind=${k}$)-0.5_${k}$) ) / n )*cmplx(1._${k}$,1._${k}$)
+        end do
+        ! scramble array
+        do i = 1, n
+            call random_number(u) 
+            j = 1 + floor(n*u)
+            call swap( x(i), x(j) )
+        end do
+        
+        xsum(1) = dot_product(x,x) ! compiler intrinsic
+        xsum(2) = stdlib_dot_product_kahan(x,x) ! chunked Kahan dot_product
+        xsum(3) = stdlib_dot_product(x,x)       ! chunked dot_product
+        err(1:ncalc) = abs(1._${k}$-xsum(1:ncalc)%re/(2*total_sum))
+         
+        call check(error, all(err(:)<tolerance) , "complex dot_product is not accurate" )
+        if (allocated(error)) return
+    end block
+    #:endfor
+
+end subroutine
+    
+end module test_intrinsics
+
+program tester
+    use, intrinsic :: iso_fortran_env, only : error_unit
+    use testdrive, only : run_testsuite, new_testsuite, testsuite_type
+    use test_intrinsics, only : collect_suite
+    implicit none
+    integer :: stat, is
+    type(testsuite_type), allocatable :: testsuites(:)
+    character(len=*), parameter :: fmt = '("#", *(1x, a))'
+
+    stat = 0
+
+    testsuites = [ &
+        new_testsuite("sparse", collect_suite) &
+        ]
+
+    do is = 1, size(testsuites)
+        write(error_unit, fmt) "Testing:", testsuites(is)%name
+        call run_testsuite(testsuites(is)%collect, error_unit, stat)
+    end do
+
+    if (stat > 0) then
+        write(error_unit, '(i0, 1x, a)') stat, "test(s) failed!"
+        error stop
+    end if
+end program
\ No newline at end of file