diff --git a/.travis.yml b/.travis.yml
index 5198c5a8..150d62dd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,7 +10,8 @@ notifications:
 
 env:
   global:
-    - NUMPY_VERSION=1.9
+    - NUMPY_VERSION=1.12
+    - DOCTEST=TRUE
 
 addons: 
   apt:
@@ -32,12 +33,6 @@ matrix:
     #     - brew update
     #     - brew tap homebrew/versions && brew reinstall gcc49 --without-multilib
     #     - wget http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh
-    #     - bash miniconda.sh -b -p $HOME/miniconda
-    #     - export PATH="$HOME/miniconda/bin:$PATH"
-    #     - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
-    #     - source activate test
-    #     - conda install -q --yes -c asmeurer gsl
-
 
     # - os: osx
     #   compiler: clang
@@ -47,62 +42,35 @@ matrix:
     #     - brew outdated xctool || brew upgrade xctool
     #     - brew tap homebrew/versions && brew install clang-omp
     #     - wget http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh
-    #     - bash miniconda.sh -b -p $HOME/miniconda
-    #     - export PATH="$HOME/miniconda/bin:$PATH"
-    #     - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
-    #     - source activate test
-    #     - conda install -q --yes -c asmeurer gsl        
 
     - os: osx
-      osx_image: xcode8
+      osx_image: xcode9
       compiler: clang
-      env: COMPILER=clang FAMILY=clang V='Apple LLVM 8.0.0' PYTHON_VERSION=3.5
+      env: COMPILER=clang FAMILY=clang V='Apple LLVM 7.0.0' PYTHON_VERSION=3.6 DOCTEST=FALSE
       before_install:
         - wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
-        - bash miniconda.sh -b -p $HOME/miniconda
-        - export PATH="$HOME/miniconda/bin:$PATH"
-        - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
-        - source activate test
-        - conda install -q --yes -c asmeurer gsl
+
 
     - os: osx
-      osx_image: xcode7.3
+      osx_image: xcode8
       compiler: clang
-      env: COMPILER=clang FAMILY=clang V='Apple LLVM 7.0.0' PYTHON_VERSION=3.5
+      env: COMPILER=clang FAMILY=clang V='Apple LLVM 7.0.0' PYTHON_VERSION=3.5 DOCTEST=FALSE
       before_install:
         - wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
-        - bash miniconda.sh -b -p $HOME/miniconda
-        - export PATH="$HOME/miniconda/bin:$PATH"
-        - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
-        - source activate test
-        - conda install -q --yes -c asmeurer gsl
-
 
     - os: osx
-      osx_image: xcode9
+      osx_image: xcode7.3
       compiler: clang
-      env: COMPILER=clang FAMILY=clang V='Apple LLVM 9.0.0' PYTHON_VERSION=3.5
+      env: COMPILER=clang FAMILY=clang V='Apple LLVM 7.0.0' PYTHON_VERSION=2.7 DOCTEST=FALSE
       before_install:
-        - wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
-        - bash miniconda.sh -b -p $HOME/miniconda
-        - export PATH="$HOME/miniconda/bin:$PATH"
-        - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
-        - source activate test
-        - conda install -q --yes -c asmeurer gsl
-
+        - wget http://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh
 
     - os: osx
       osx_image: xcode6.4
       compiler: clang
-      env: COMPILER=clang FAMILY=clang V='Apple LLVM 7.0.0' PYTHON_VERSION=3.5
+      env: COMPILER=clang FAMILY=clang V='Apple LLVM 7.0.0' PYTHON_VERSION=2.6 NUMPY_VERSION=1.7 DOCTEST=FALSE
       before_install:
-        - wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
-        - bash miniconda.sh -b -p $HOME/miniconda
-        - export PATH="$HOME/miniconda/bin:$PATH"
-        - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
-        - source activate test
-        - conda install -q --yes -c asmeurer gsl
-
+        - wget http://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh
 
     # - os: osx
     #   compiler: gcc
@@ -110,11 +78,6 @@ matrix:
     #   before_install:
     #     - brew update && brew tap homebrew/versions && brew install gcc48 --without-multilib
     #     - wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
-    #     - bash miniconda.sh -b -p $HOME/miniconda
-    #     - export PATH="$HOME/miniconda/bin:$PATH"
-    #     - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
-    #     - source activate test
-    #     - conda install -q --yes -c asmeurer gsl
         
     # - os: linux
     #   dist: trusty
@@ -127,10 +90,6 @@ matrix:
     #   env: COMPILER=clang-3.6 V=3.6 PYTHON_VERSION=2.7 
     #   before_install:
     #     - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
-    #     - bash miniconda.sh -b -p $HOME/miniconda
-    #     - export PATH="$HOME/miniconda/bin:$PATH"
-    #     - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
-    #     - source activate test
 
     # - os: linux
     #   dist: trusty
@@ -143,22 +102,14 @@ matrix:
     #   env: COMPILER=clang-3.6 V=3.6 PYTHON_VERSION=3.5
     #   before_install:
     #     - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
-    #     - bash miniconda.sh -b -p $HOME/miniconda
-    #     - export PATH="$HOME/miniconda/bin:$PATH"
-    #     - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
-    #     - source activate test
         
     - os: linux
       dist: trusty
       sudo: required
       compiler: gcc
-      env: COMPILER=gcc PYTHON_VERSION=2.6 NUMPY_VERSION=1.7
+      env: COMPILER=gcc PYTHON_VERSION=2.6 NUMPY_VERSION=1.7 DOCTEST=FALSE
       before_install:
         - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
-        - bash miniconda.sh -b -p $HOME/miniconda
-        - export PATH="$HOME/miniconda/bin:$PATH"
-        - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
-        - source activate test
 
     - os: linux
       dist: trusty
@@ -167,22 +118,14 @@ matrix:
       env: COMPILER=gcc PYTHON_VERSION=2.7
       before_install:
         - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
-        - bash miniconda.sh -b -p $HOME/miniconda
-        - export PATH="$HOME/miniconda/bin:$PATH"
-        - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
-        - source activate test
         
     - os: linux
       dist: trusty
       sudo: required
       compiler: gcc
-      env: COMPILER=gcc PYTHON_VERSION=3.3 
+      env: COMPILER=gcc PYTHON_VERSION=3.4 
       before_install:
         - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
-        - bash miniconda.sh -b -p $HOME/miniconda
-        - export PATH="$HOME/miniconda/bin:$PATH"
-        - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
-        - source activate test
 
     - os: linux
       dist: trusty
@@ -191,22 +134,28 @@ matrix:
       env: COMPILER=gcc PYTHON_VERSION=3.5
       before_install:
         - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
-        - bash miniconda.sh -b -p $HOME/miniconda
-        - export PATH="$HOME/miniconda/bin:$PATH"
-        - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
-        - source activate test
-        
+
+    - os: linux
+      dist: trusty
+      sudo: required
+      compiler: gcc
+      env: COMPILER=gcc PYTHON_VERSION=3.6
+      before_install:
+        - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
 
 install:
+  - bash miniconda.sh -b -p $HOME/miniconda
+  - export PATH="$HOME/miniconda/bin:$PATH"
+  - conda create -q --yes -n test python=$PYTHON_VERSION numpy=$NUMPY_VERSION sphinx
+  - source activate test
+  - conda install -q --yes -c asmeurer gsl
   - make -r CC=$COMPILER
   - make install CC=$COMPILER
   - python setup.py install
         
 script:
   - make tests CC=$COMPILER
-
-after_success:
-  - cd docs && make html && cd ../ 
-  - if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then cd docs && make doctest && cd ../; fi
+  - make -C docs html
+  - if [[ "${DOCTEST}" == "TRUE" ]]; then make -C docs doctest ; fi
 
   
diff --git a/Corrfunc/__init__.py b/Corrfunc/__init__.py
index 237e0521..91baba19 100644
--- a/Corrfunc/__init__.py
+++ b/Corrfunc/__init__.py
@@ -10,7 +10,7 @@
                         unicode_literals)
 import os
 
-__version__ = "2.0.1"
+__version__ = "2.1.0"
 __author__ = "Manodeep Sinha <manodeep@gmail.com>"
 
 
diff --git a/Corrfunc/call_correlation_functions.py b/Corrfunc/call_correlation_functions.py
index 5e92838d..c041f613 100644
--- a/Corrfunc/call_correlation_functions.py
+++ b/Corrfunc/call_correlation_functions.py
@@ -22,7 +22,8 @@
     countpairs_rp_pi as DDrppi_extn,\
     countpairs_wp as wp_extn,\
     countpairs_xi as xi_extn,\
-    countspheres_vpf as vpf_extn
+    countspheres_vpf as vpf_extn,\
+    countpairs_s_mu as DDsmu_extn
 
 
 def main():
@@ -74,6 +75,26 @@ def main():
               .format(items[0], items[1], items[2], items[3], items[4], items[5]))
     print("------------------------------------------------------------------------")
 
+    mu_max = 0.5
+    nmu_bins = 10
+
+    print("\nRunning 2-D correlation function DD(s,mu)")
+    results_DDsmu, _ = DDsmu_extn(autocorr, nthreads, binfile,
+                                    mu_max, nmu_bins,
+                                    x, y, z,
+                                    weights1=np.ones_like(x), weight_type='pair_product',
+                                    verbose=True, periodic=periodic,
+                                    boxsize=boxsize, output_savg=True)
+    print("\n#            ****** DD(s,mu): first {0} bins  *******      "
+          .format(numbins_to_print))
+    print("#      smin        smax       savg     mu_max     npairs    weightavg")
+    print("########################################################################")
+    for ibin in range(numbins_to_print):
+        items = results_DDsmu[ibin]
+        print("{0:12.4f} {1:12.4f} {2:10.4f} {3:10.1f} {4:10d} {5:10.4f}"
+              .format(items[0], items[1], items[2], items[3], items[4], items[5]))
+    print("------------------------------------------------------------------------")
+
     print("\nRunning 2-D projected correlation function wp(rp)")
     results_wp, _, _ = wp_extn(boxsize, pimax, nthreads,
                             binfile, x, y, z,
diff --git a/Corrfunc/call_correlation_functions_mocks.py b/Corrfunc/call_correlation_functions_mocks.py
index 5dae9112..e73bc565 100644
--- a/Corrfunc/call_correlation_functions_mocks.py
+++ b/Corrfunc/call_correlation_functions_mocks.py
@@ -19,6 +19,7 @@ def main():
     from Corrfunc.io import read_catalog
     from Corrfunc._countpairs_mocks import\
         countpairs_rp_pi_mocks as rp_pi_mocks_extn,\
+        countpairs_s_mu_mocks as s_mu_mocks_extn,\
         countpairs_theta_mocks as theta_mocks_extn,\
         countspheres_vpf_mocks as vpf_mocks_extn
 
@@ -52,7 +53,7 @@ def main():
                                          output_rpavg=True, verbose=True)
     print("\n#            ****** DD(rp,pi): first {0} bins  *******      "
           .format(numbins_to_print))
-    print("#      rmin        rmax       rpavg     pi_upper     npairs    weightavg")
+    print("#      rmin        rmax       rpavg     pi_upper     npairs   weightavg")
     print("########################################################################")
     for ibin in range(numbins_to_print):
         items = results_DDrppi[ibin]
@@ -61,9 +62,29 @@ def main():
 
     print("------------------------------------------------------------------------")
 
+    nmu_bins = 10
+    mu_max = 1.0
+
+    print("\nRunning 2-D correlation function xi(s,mu)")
+    results_DDsmu, _ = s_mu_mocks_extn(autocorr, cosmology, nthreads,
+                                       mu_max, nmu_bins, binfile,
+                                       ra, dec, cz, weights1=np.ones_like(ra),
+                                       output_savg=True, verbose=True,
+                                       weight_type='pair_product')
+    print("\n#            ****** DD(s,mu): first {0} bins  *******      "
+          .format(numbins_to_print))
+    print("#      smin        smax       savg     mu_upper       npairs    weight_avg")
+    print("###########################################################################")
+    for ibin in range(numbins_to_print):
+        items = results_DDsmu[ibin]
+        print("{0:12.4f} {1:12.4f} {2:10.4f} {3:10.1f} {4:10d} {5:12.4f}"
+              .format(items[0], items[1], items[2], items[3], items[4], items[5]))
+
+    print("--------------------------------------------------------------------------")
+
     binfile = pjoin(dirname(abspath(__file__)),
                     "../mocks/tests/", "angular_bins")
-    print("\nRunning angular correlation function w(theta)")
+    print("\nRunning angular correlation function DD(theta)")
     results_wtheta, _ = theta_mocks_extn(autocorr, nthreads, binfile,
                                          ra, dec, RA2=ra, DEC2=dec,
                                          weights1=np.ones_like(ra),
@@ -71,10 +92,10 @@ def main():
                                          weight_type='pair_product',
                                          output_thetaavg=True, fast_acos=True,
                                          verbose=1)
-    print("\n#         ******  wtheta: first {0} bins  *******        "
+    print("\n#         ******  DD(theta): first {0} bins  *******        "
           .format(numbins_to_print))
-    print("#      thetamin        thetamax       thetaavg      npairs    weightavg")
-    print("#######################################################################")
+    print("#      thetamin        thetamax       thetaavg        npairs      weightavg")
+    print("############################################################################")
     for ibin in range(numbins_to_print):
         items = results_wtheta[ibin]
         print("{0:14.4f} {1:14.4f} {2:14.4f} {3:14d} {4:14.4f}"
diff --git a/Corrfunc/io.py b/Corrfunc/io.py
index 7cd4bce8..69eab92a 100644
--- a/Corrfunc/io.py
+++ b/Corrfunc/io.py
@@ -10,6 +10,10 @@
 from os.path import dirname, abspath, splitext, exists as file_exists,\
     join as pjoin
 import numpy as np
+try:
+    import pandas as pd
+except ImportError:
+    pd = None
 
 
 __all__ = ('read_fastfood_catalog', 'read_ascii_catalog', 'read_catalog')
@@ -220,11 +224,6 @@ def read_ascii_catalog(filename, return_dtype=None):
 
     # check if pandas is available - much faster to read in the data
     # using pandas
-    try:
-        import pandas as pd
-    except ImportError:
-        pd = None
-
     if pd is not None:
         df = pd.read_csv(filename, header=None,
                          engine="c",
diff --git a/Corrfunc/mocks/DDrppi_mocks.py b/Corrfunc/mocks/DDrppi_mocks.py
index ec548f4c..e98a8be6 100644
--- a/Corrfunc/mocks/DDrppi_mocks.py
+++ b/Corrfunc/mocks/DDrppi_mocks.py
@@ -50,11 +50,11 @@ def DDrppi_mocks(autocorr, cosmology, nthreads, pimax, binfile,
     Parameters
     -----------
 
-    autocorr: boolean, required
+    autocorr : boolean, required
         Boolean flag for auto/cross-correlation. If autocorr is set to 1,
         then the second set of particle positions are not required.
 
-    cosmology: integer, required
+    cosmology : integer, required
         Integer choice for setting cosmology. Valid values are 1->LasDamas
         cosmology and 2->Planck cosmology. If you need arbitrary cosmology,
         easiest way is to convert the ``CZ`` values into co-moving distance,
@@ -69,65 +69,62 @@ def DDrppi_mocks(autocorr, cosmology, nthreads, pimax, binfile,
         ``init_cosmology`` in ``ROOT/utils/cosmology_params.c`` and re-install
         the entire package.
 
-    nthreads: integer
+    nthreads : integer
         The number of OpenMP threads to use. Has no effect if OpenMP was not
         enabled during library compilation.
 
-    pimax: double
+    pimax : double
         A double-precision value for the maximum separation along
         the Z-dimension. 
 
         Distances along the :math:`\\pi` direction are binned with unit
         depth. For instance, if ``pimax=40``, then 40 bins will be created
-        along the ``pi`` direction.
+        along the ``pi`` direction. Only pairs with ``0 <= dz < pimax``
+        are counted (no equality).
 
+    binfile : string or a list/array of floats
+        For string input: filename specifying the ``rp`` bins for
+        ``DDrppi_mocks``. The file should contain white-space separated values
+        of (rpmin, rpmax)  for each ``rp`` wanted. The bins need to be
+        contiguous and sorted in increasing order (smallest bins come first).
+
+        For array-like input: A sequence of ``rp`` values that provides the
+        bin-edges. For example,
+        ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
+        input specifying **14** (logarithmic) bins between 0.1 and 10.0. This
+        array does not need to be sorted.         
+
+    RA1 : array-like, real (float/double)
+        The array of Right Ascensions for the first set of points. RA's
+        are expected to be in [0.0, 360.0], but the code will try to fix cases
+        where the RA's are in [-180, 180.0]. For peace of mind, always supply
+        RA's in [0.0, 360.0].
 
-    .. note:: Only pairs with ``0 <= dz < pimax`` are counted (no equality).
+        Calculations are done in the precision of the supplied arrays.
 
+    DEC1 : array-like, real (float/double)
+        Array of Declinations for the first set of points. DEC's are expected
+        to be in the [-90.0, 90.0], but the code will try to fix cases where
+        the DEC's are in [0.0, 180.0]. Again, for peace of mind, always supply
+        DEC's in [-90.0, 90.0].
 
-    binfile: string or an list/array of floats
-       For string input: filename specifying the ``rp`` bins for
-       ``DDrppi_mocks``. The file should contain white-space separated values
-       of (rpmin, rpmax)  for each ``rp`` wanted. The bins need to be
-       contiguous and sorted in increasing order (smallest bins come first).
-
-       For array-like input: A sequence of ``rp`` values that provides the
-       bin-edges. For example,
-       ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
-       input specifying **14** (logarithmic) bins between 0.1 and 10.0. This
-       array does not need to be sorted.
-
-    RA1: array-like, real (float/double)
-       The array of Right Ascensions for the first set of points. RA's
-       are expected to be in [0.0, 360.0], but the code will try to fix cases
-       where the RA's are in [-180, 180.0]. For peace of mind, always supply
-       RA's in [0.0, 360.0].
-
-       Calculations are done in the precision of the supplied arrays.
-
-    DEC1: array-like, real (float/double)
-       Array of Declinations for the first set of points. DEC's are expected
-       to be in the [-90.0, 90.0], but the code will try to fix cases where
-       the DEC's are in [0.0, 180.0]. Again, for peace of mind, always supply
-       DEC's in [-90.0, 90.0].
-
-       Must be of same precision type as RA1.
-
-    CZ1: array-like, real (float/double)
-       Array of (Speed Of Light * Redshift) values for the first set of
-       points. Code will try to detect cases where ``redshifts`` have been
-       passed and multiply the entire array with the ``speed of light``.
-
-       If is_comoving_dist is set, then ``CZ1`` is interpreted as the
-       co-moving distance, rather than `cz`.
+        Must be of same precision type as RA1.
+
+    CZ1 : array-like, real (float/double)
+        Array of (Speed Of Light * Redshift) values for the first set of
+        points. Code will try to detect cases where ``redshifts`` have been
+        passed and multiply the entire array with the ``speed of light``.
+ 
+        If is_comoving_dist is set, then ``CZ1`` is interpreted as the
+        co-moving distance, rather than `cz`.
        
-    weights1: array_like, real (float/double), optional
-       A scalar, or an array of weights of shape (n_weights, n_positions) or (n_positions,).
-       `weight_type` specifies how these weights are used; results are returned
-       in the `weightavg` field.  If only one of weights1 and weights2 is
-       specified, the other will be set to uniform weights.
+    weights1 : array_like, real (float/double), optional
+        A scalar, or an array of weights of shape (n_weights, n_positions) or (n_positions,).
+        `weight_type` specifies how these weights are used; results are returned
+        in the `weightavg` field.  If only one of weights1 and weights2 is
+        specified, the other will be set to uniform weights.
 
-    RA2: array-like, real (float/double)
+    RA2 : array-like, real (float/double)
         The array of Right Ascensions for the second set of points. RA's
         are expected to be in [0.0, 360.0], but the code will try to fix cases
         where the RA's are in [-180, 180.0]. For peace of mind, always supply
@@ -135,7 +132,7 @@ def DDrppi_mocks(autocorr, cosmology, nthreads, pimax, binfile,
 
         Must be of same precision type as RA1/DEC1/CZ1.
 
-    DEC2: array-like, real (float/double)
+    DEC2 : array-like, real (float/double)
         Array of Declinations for the second set of points. DEC's are expected
         to be in the [-90.0, 90.0], but the code will try to fix cases where
         the DEC's are in [0.0, 180.0]. Again, for peace of mind, always supply
@@ -143,7 +140,7 @@ def DDrppi_mocks(autocorr, cosmology, nthreads, pimax, binfile,
 
         Must be of same precision type as RA1/DEC1/CZ1.
 
-    CZ2: array-like, real (float/double)
+    CZ2 : array-like, real (float/double)
         Array of (Speed Of Light * Redshift) values for the second set of
         points. Code will try to detect cases where ``redshifts`` have been
         passed and multiply the entire array with the ``speed of light``.
@@ -153,47 +150,45 @@ def DDrppi_mocks(autocorr, cosmology, nthreads, pimax, binfile,
 
         Must be of same precision type as RA1/DEC1/CZ1.
         
-    weights2: array-like, real (float/double), optional
+    weights2 : array-like, real (float/double), optional
         Same as weights1, but for the second set of positions
 
-    is_comoving_dist: boolean (default false)
+    is_comoving_dist : boolean (default false)
         Boolean flag to indicate that ``cz`` values have already been
         converted into co-moving distances. This flag allows arbitrary
         cosmologies to be used in ``Corrfunc``.
 
-    verbose: boolean (default false)
+    verbose : boolean (default false)
         Boolean flag to control output of informational messages
 
-    output_rpavg: boolean (default false)
+    output_rpavg : boolean (default false)
         Boolean flag to output the average ``rp`` for each bin. Code will
-        run slower if you set this flag. Also, 
-
-
-    .. note:: If you are calculating in single-precision, ``rpavg`` will 
+        run slower if you set this flag.
+    
+        If you are calculating in single-precision, ``rpavg`` will
         suffer from numerical loss of precision and can not be trusted. If 
         you need accurate ``rpavg`` values, then pass in double precision 
         arrays for the particle positions.
-
-
-    fast_divide: boolean (default false)
+    
+    fast_divide : boolean (default false)
         Boolean flag to replace the division in ``AVX`` implementation with an
-        approximate reciprocal, followed by a Newton-Raphson step. Improves
+        approximate reciprocal, followed by two Newton-Raphson steps. Improves
         runtime by ~15-20%. Loss of precision is at the 5-6th decimal place.
 
-    (xyz)bin_refine_factor: integer, default is (2,2,1); typically within [1-3]
-       Controls the refinement on the cell sizes. Can have up to a 20% impact
-       on runtime.
+    (xyz)bin_refine_factor : integer, default is (2,2,1); typically within [1-3]
+        Controls the refinement on the cell sizes. Can have up to a 20% impact
+        on runtime.
 
     max_cells_per_dim: integer, default is 100, typical values in [50-300]
-       Controls the maximum number of cells per dimension. Total number of
-       cells can be up to (max_cells_per_dim)^3. Only increase if ``rpmax`` is
-       too small relative to the boxsize (and increasing helps the runtime).
+        Controls the maximum number of cells per dimension. Total number of
+        cells can be up to (max_cells_per_dim)^3. Only increase if ``rpmax`` is
+        too small relative to the boxsize (and increasing helps the runtime).
 
-    c_api_timer: boolean (default false)
+    c_api_timer : boolean (default false)
         Boolean flag to measure actual time spent in the C libraries. Here
         to allow for benchmarking and scaling studies.
 
-    isa: string (default ``fastest``)
+    isa : string (default ``fastest``)
         Controls the runtime dispatch for the instruction set to use. Possible
         options are: [``fastest``, ``avx``, ``sse42``, ``fallback``]
 
@@ -208,25 +203,25 @@ def DDrppi_mocks(autocorr, cosmology, nthreads, pimax, binfile,
         benchmarking, then the string supplied here gets translated into an
         ``enum`` for the instruction set defined in ``utils/defs.h``.
         
-    weight_type: string, optional
+    weight_type : string, optional
         The type of weighting to apply.  One of ["pair_product", None].  Default: None.
 
     Returns
     --------
 
-    results: Numpy structured array
+    results : Numpy structured array
 
-       A numpy structured array containing [rpmin, rpmax, rpavg, pimax, npairs, weightavg]
-       for each radial bin specified in the ``binfile``. If ``output_ravg`` is
-       not set, then ``rpavg`` will be set to 0.0 for all bins; similarly for
-       ``weightavg``. ``npairs``
-       contains the number of pairs in that bin and can be used to compute the
-       actual :math:`\\xi(r_p, \pi)` or :math:`wp(rp)` by combining with
-       (DR, RR) counts.
+        A numpy structured array containing [rpmin, rpmax, rpavg, pimax, npairs, weightavg]
+        for each radial bin specified in the ``binfile``. If ``output_rpavg`` is
+        not set, then ``rpavg`` will be set to 0.0 for all bins; similarly for
+        ``weightavg``. ``npairs``
+        contains the number of pairs in that bin and can be used to compute the
+        actual :math:`\\xi(r_p, \pi)` or :math:`wp(rp)` by combining with
+        (DR, RR) counts.
 
-    api_time: float, optional
-       Only returned if ``c_api_timer`` is set.  ``api_time`` measures only the time
-       spent within the C library and ignores all python overhead.
+    api_time : float, optional
+        Only returned if ``c_api_timer`` is set.  ``api_time`` measures only the time
+        spent within the C library and ignores all python overhead.
 
     Example
     --------
diff --git a/Corrfunc/mocks/DDsmu_mocks.py b/Corrfunc/mocks/DDsmu_mocks.py
new file mode 100755
index 00000000..3d4a3ace
--- /dev/null
+++ b/Corrfunc/mocks/DDsmu_mocks.py
@@ -0,0 +1,332 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Python wrapper around the C extension for the pair counter in
+``mocks/DDsmu``. This python wrapper is :py:mod:`Corrfunc.mocks.DDsmu_mocks`
+"""
+
+from __future__ import (division, print_function, absolute_import,
+                        unicode_literals)
+
+__author__ = ('Manodeep Sinha', 'Nick Hand')
+__all__ = ('DDsmu_mocks', )
+
+
+def DDsmu_mocks(autocorr, cosmology, nthreads, mu_max, nmu_bins, binfile,
+                RA1, DEC1, CZ1, weights1=None,
+                RA2=None, DEC2=None, CZ2=None, weights2=None,
+                is_comoving_dist=False,
+                verbose=False, output_savg=False,
+                fast_divide=False, xbin_refine_factor=2,
+                ybin_refine_factor=2, zbin_refine_factor=1,
+                max_cells_per_dim=100,
+                c_api_timer=False, isa='fastest', weight_type=None):
+    """
+    Calculate the 2-D pair-counts corresponding to the projected correlation
+    function, :math:`\\xi(s, \\mu)`. The pairs are counted in bins of
+    radial separation and cosine of angle to the line-of-sight (LOS). The
+    input positions are expected to be on-sky co-ordinates. This module is
+    suitable for calculating correlation functions for mock catalogs.
+
+    If ``weights`` are provided, the resulting pair counts are weighted.  The
+    weighting scheme depends on ``weight_type``.
+
+    Returns a numpy structured array containing the pair counts for the
+    specified bins.
+
+
+    .. note:: This module only returns pair counts and not the actual
+       correlation function :math:`\\xi(s, \\mu)`. See the
+       utilities :py:mod:`Corrfunc.utils.convert_3d_counts_to_cf`
+       for computing :math:`\\xi(s, \\mu)` from the pair counts.
+
+    .. versionadded:: 2.1.0
+
+    Parameters
+    ----------
+
+    autocorr : boolean, required
+        Boolean flag for auto/cross-correlation. If autocorr is set to 1,
+        then the second set of particle positions are not required.
+
+    cosmology : integer, required
+        Integer choice for setting cosmology. Valid values are 1->LasDamas
+        cosmology and 2->Planck cosmology. If you need arbitrary cosmology,
+        easiest way is to convert the ``CZ`` values into co-moving distance,
+        based on your preferred cosmology. Set ``is_comoving_dist=True``, to
+        indicate that the co-moving distance conversion has already been done.
+
+        Choices:
+                 1. LasDamas cosmology. :math:`\\Omega_m=0.25`, :math:`\\Omega_\\Lambda=0.75`
+                 2. Planck   cosmology. :math:`\\Omega_m=0.302`, :math:`\\Omega_\\Lambda=0.698`
+
+        To setup a new cosmology, add an entry to the function,
+        ``init_cosmology`` in ``ROOT/utils/cosmology_params.c`` and re-install
+        the entire package.
+
+    nthreads : integer
+        The number of OpenMP threads to use. Has no effect if OpenMP was not
+        enabled during library compilation.
+
+    mu_max : double. Must be in range (0.0, 1.0]
+        A double-precision value for the maximum cosine of the angular
+        separation from the line of sight (LOS). Here, ``mu`` is defined as
+        the cosine of the angle between ``s`` and ``l``. If :math:`v_1` and
+        :math:`v_2` represent the vectors to each point constituting the pair,
+        then :math:`s := v_1 - v_2` and :math:`l := 1/2 (v_1 + v_2)`.
+
+        Note: Only pairs with :math:`0 <= cos(\\theta_{LOS}) < \\mu_{max}`
+        are counted (no equality).
+
+    nmu_bins : int
+        The number of linear ``mu`` bins, with the bins ranging from
+        (0, ``mu_max``)
+
+    binfile : string or a list/array of floats
+        For string input: filename specifying the ``s`` bins for
+        ``DDsmu_mocks``. The file should contain white-space separated values
+        of (smin, smax) specifying each ``s`` bin wanted. The bins
+        need to be contiguous and sorted in increasing order (smallest bins
+        come first).
+
+        For array-like input: A sequence of ``s`` values that provides the
+        bin-edges. For example,
+        ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
+        input specifying **14** (logarithmic) bins between 0.1 and 10.0. This
+        array does not need to be sorted.
+
+    RA1 : array-like, real (float/double)
+        The array of Right Ascensions for the first set of points. RA's
+        are expected to be in [0.0, 360.0], but the code will try to fix cases
+        where the RA's are in [-180, 180.0]. For peace of mind, always supply
+        RA's in [0.0, 360.0].
+
+        Calculations are done in the precision of the supplied arrays.
+
+    DEC1 : array-like, real (float/double)
+        Array of Declinations for the first set of points. DEC's are expected
+        to be in the [-90.0, 90.0], but the code will try to fix cases where
+        the DEC's are in [0.0, 180.0]. Again, for peace of mind, always supply
+        DEC's in [-90.0, 90.0].
+
+        Must be of same precision type as RA1.
+
+    CZ1 : array-like, real (float/double)
+        Array of (Speed Of Light * Redshift) values for the first set of
+        points. Code will try to detect cases where ``redshifts`` have been
+        passed and multiply the entire array with the ``speed of light``.
+
+        If is_comoving_dist is set, then ``CZ1`` is interpreted as the
+        co-moving distance, rather than `cz`.
+
+    weights1 : array_like, real (float/double), optional
+        A scalar, or an array of weights of shape (n_weights, n_positions) or (n_positions,).
+        `weight_type` specifies how these weights are used; results are returned
+        in the `weightavg` field.  If only one of weights1 and weights2 is
+        specified, the other will be set to uniform weights.
+
+    RA2 : array-like, real (float/double)
+        The array of Right Ascensions for the second set of points. RA's
+        are expected to be in [0.0, 360.0], but the code will try to fix cases
+        where the RA's are in [-180, 180.0]. For peace of mind, always supply
+        RA's in [0.0, 360.0].
+
+        Must be of same precision type as RA1/DEC1/CZ1.
+
+    DEC2 : array-like, real (float/double)
+        Array of Declinations for the second set of points. DEC's are expected
+        to be in the [-90.0, 90.0], but the code will try to fix cases where
+        the DEC's are in [0.0, 180.0]. Again, for peace of mind, always supply
+        DEC's in [-90.0, 90.0].
+
+        Must be of same precision type as RA1/DEC1/CZ1.
+
+    CZ2 : array-like, real (float/double)
+        Array of (Speed Of Light * Redshift) values for the second set of
+        points. Code will try to detect cases where ``redshifts`` have been
+        passed and multiply the entire array with the ``speed of light``.
+
+        If is_comoving_dist is set, then ``CZ2`` is interpreted as the
+        co-moving distance, rather than `cz`.
+
+        Must be of same precision type as RA1/DEC1/CZ1.
+
+    weights2 : array-like, real (float/double), optional
+        Same as weights1, but for the second set of positions
+
+    is_comoving_dist : boolean (default false)
+        Boolean flag to indicate that ``cz`` values have already been
+        converted into co-moving distances. This flag allows arbitrary
+        cosmologies to be used in ``Corrfunc``.
+
+    verbose : boolean (default false)
+        Boolean flag to control output of informational messages
+
+    output_savg : boolean (default false)
+        Boolean flag to output the average ``s`` for each bin. Code will
+        run slower if you set this flag. Also, note, if you are calculating
+        in single-precision, ``savg`` will suffer from numerical loss of
+        precision and can not be trusted. If you need accurate ``savg``
+        values, then pass in double precision arrays for the particle
+        positions.
+
+    fast_divide : boolean (default false)
+        Boolean flag to replace the division in ``AVX`` implementation with an
+        approximate reciprocal, followed by a Newton-Raphson step. Improves
+        runtime by ~15-20%. Loss of precision is at the 5-6th decimal place.
+
+    (xyz)bin_refine_factor : integer, default is (2,2,1); typically within [1-3]
+        Controls the refinement on the cell sizes. Can have up to a 20% impact
+        on runtime.
+
+    max_cells_per_dim : integer, default is 100, typical values in [50-300]
+        Controls the maximum number of cells per dimension. Total number of
+        cells can be up to (max_cells_per_dim)^3. Only increase if ``rpmax`` is
+        too small relative to the boxsize (and increasing helps the runtime).
+
+    c_api_timer : boolean (default false)
+        Boolean flag to measure actual time spent in the C libraries. Here
+        to allow for benchmarking and scaling studies.
+
+    isa : string (default ``fastest``)
+        Controls the runtime dispatch for the instruction set to use. Possible
+        options are: [``fastest``, ``avx``, ``sse42``, ``fallback``]
+
+        Setting isa to ``fastest`` will pick the fastest available instruction
+        set on the current computer. However, if you set ``isa`` to, say,
+        ``avx`` and ``avx`` is not available on the computer, then the code
+        will revert to using ``fallback`` (even though ``sse42`` might be
+        available).
+
+        Unless you are benchmarking the different instruction sets, you should
+        always leave ``isa`` to the default value. And if you *are*
+        benchmarking, then the string supplied here gets translated into an
+        ``enum`` for the instruction set defined in ``utils/defs.h``.
+
+    weight_type : string, optional
+        The type of weighting to apply.  One of ["pair_product", None].  Default: None.
+
+    Returns
+    --------
+
+    results : Numpy structured array
+        A numpy structured array containing [smin, smax, savg, mumax, npairs, weightavg]
+        for each separation bin specified in the ``binfile``. If ``output_savg`` is
+        not set, then ``savg`` will be set to 0.0 for all bins; similarly for
+        ``weightavg``. ``npairs`` contains the number of pairs in that bin and
+        can be used to compute the actual :math:`\\xi(s, \\mu)` by combining
+        with (DR, RR) counts.
+
+    api_time : float, optional
+        Only returned if ``c_api_timer`` is set.  ``api_time`` measures only
+        the time spent within the C library and ignores all python overhead.
+    """
+    try:
+        from Corrfunc._countpairs_mocks import countpairs_s_mu_mocks as\
+            DDsmu_extn
+    except ImportError:
+        msg = "Could not import the C extension for the on-sky "\
+              "pair counter."
+        raise ImportError(msg)
+
+    import numpy as np
+    from Corrfunc.utils import translate_isa_string_to_enum, fix_ra_dec,\
+        return_file_with_rbins
+    from future.utils import bytes_to_native_str
+
+    # Broadcast scalar weights to arrays
+    if weights1 is not None:
+        weights1 = np.atleast_1d(weights1)
+    if weights2 is not None:
+        weights2 = np.atleast_1d(weights2)
+
+    # Check if mu_max is scalar
+    if not np.isscalar(mu_max):
+        msg = "The parameter `mu_max` = {0}, has size = {1}. "\
+              "The code is expecting a scalar quantity (and "\
+              "not a list, array)".format(mu_max, np.size(mu_max))
+        raise TypeError(msg)
+
+    # Check that mu_max is within (0.0, 1.0]
+    if mu_max <= 0.0 or mu_max > 1.0:
+        msg = "The parameter `mu_max` = {0}, is the max. of cosine of an "\
+              "angle and should be within (0.0, 1.0]".format(mu_max)
+        raise ValueError(msg)
+
+    if not autocorr:
+        if RA2 is None or DEC2 is None or CZ2 is None:
+            msg = "Must pass valid arrays for RA2/DEC2/CZ2 for "\
+                  "computing cross-correlation"
+            raise ValueError(msg)
+
+        # If only one set of points has weights, set the other to uniform weights
+        if weights1 is None and weights2 is not None:
+            weights1 = np.ones_like(weights2)
+        if weights2 is None and weights1 is not None:
+            weights2 = np.ones_like(weights1)
+
+    else:
+        RA2 = np.empty(1)
+        DEC2 = np.empty(1)
+        CZ2 = np.empty(1)
+
+    fix_ra_dec(RA1, DEC1)
+    if autocorr == 0:
+        fix_ra_dec(RA2, DEC2)
+
+    # Passing None parameters breaks the parsing code, so avoid this
+    kwargs = {}
+    for k in ['weights1', 'weights2', 'weight_type', 'RA2', 'DEC2', 'CZ2']:
+        v = locals()[k]
+        if v is not None:
+            kwargs[k] = v
+
+    integer_isa = translate_isa_string_to_enum(isa)
+    sbinfile, delete_after_use = return_file_with_rbins(binfile)
+    extn_results, api_time = DDsmu_extn(autocorr, cosmology, nthreads,
+                                        mu_max, nmu_bins, sbinfile,
+                                        RA1, DEC1, CZ1,
+                                        is_comoving_dist=is_comoving_dist,
+                                        verbose=verbose,
+                                        output_savg=output_savg,
+                                        fast_divide=fast_divide,
+                                        xbin_refine_factor=xbin_refine_factor,
+                                        ybin_refine_factor=ybin_refine_factor,
+                                        zbin_refine_factor=zbin_refine_factor,
+                                        max_cells_per_dim=max_cells_per_dim,
+                                        c_api_timer=c_api_timer,
+                                        isa=integer_isa, **kwargs)
+    if delete_after_use:  # clean up the bin file before any error is raised
+        import os
+        os.remove(sbinfile)
+
+    if extn_results is None:
+        msg = "RuntimeError occurred"
+        raise RuntimeError(msg)
+
+    results_dtype = np.dtype([(bytes_to_native_str(b'smin'), np.float64),
+                              (bytes_to_native_str(b'smax'), np.float64),
+                              (bytes_to_native_str(b'savg'), np.float64),
+                              (bytes_to_native_str(b'mumax'), np.float64),
+                              (bytes_to_native_str(b'npairs'), np.uint64),
+                              (bytes_to_native_str(b'weightavg'), np.float64)])
+
+    nbin = len(extn_results)
+    results = np.zeros(nbin, dtype=results_dtype)
+    for ii, r in enumerate(extn_results):
+        results['smin'][ii] = r[0]
+        results['smax'][ii] = r[1]
+        results['savg'][ii] = r[2]
+        results['mumax'][ii] = r[3]
+        results['npairs'][ii] = r[4]
+        results['weightavg'][ii] = r[5]
+
+    if not c_api_timer:
+        return results
+    else:
+        return results, api_time
+
+if __name__ == '__main__':  # only when this file is executed as a script
+    import doctest
+    doctest.testmod()  # run any doctest examples found in this module
diff --git a/Corrfunc/mocks/DDtheta_mocks.py b/Corrfunc/mocks/DDtheta_mocks.py
index 084022ee..dcb21279 100644
--- a/Corrfunc/mocks/DDtheta_mocks.py
+++ b/Corrfunc/mocks/DDtheta_mocks.py
@@ -43,26 +43,26 @@ def DDtheta_mocks(autocorr, nthreads, binfile,
     Parameters
     -----------
 
-    autocorr: boolean, required
+    autocorr : boolean, required
         Boolean flag for auto/cross-correlation. If autocorr is set to 1,
         then the second set of particle positions are not required.
 
-    nthreads: integer
+    nthreads : integer
        Number of threads to use.
 
-    binfile: string or an list/array of floats, units: degrees
-       For string input: filename specifying the ``rp`` bins for
-       ``DDtheta_mocks``. The file should contain white-space separated values
-       of (thetapmin, thetamax)  for each ``theta`` wanted. The bins need to be
-       contiguous and sorted in increasing order (smallest bins come first).
+    binfile : string or a list/array of floats. Units: degrees.
+        For string input: filename specifying the ``theta`` bins for
+        ``DDtheta_mocks``. The file should contain white-space separated values
+        of (thetamin, thetamax)  for each ``theta`` wanted. The bins need to be
+        contiguous and sorted in increasing order (smallest bins come first).
 
-       For array-like input: A sequence of ``theta`` values that provides the
-       bin-edges. For example,
-       ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
-       input specifying **14** (logarithmic) bins between 0.1 and 10.0 degrees. 
-       This array does not need to be sorted.
+        For array-like input: A sequence of ``theta`` values that provides the
+        bin-edges. For example,
+        ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
+        input specifying **14** (logarithmic) bins between 0.1 and 10.0
+        degrees. This array does not need to be sorted.         
 
-    RA1: array-like, real (float/double)
+    RA1 : array-like, real (float/double)
         The array of Right Ascensions for the first set of points. RA's
         are expected to be in [0.0, 360.0], but the code will try to fix cases
         where the RA's are in [-180, 180.0]. For peace of mind, always supply
@@ -70,75 +70,70 @@ def DDtheta_mocks(autocorr, nthreads, binfile,
 
         Calculations are done in the precision of the supplied arrays.
 
-    DEC1: array-like, real (float/double)
+    DEC1 : array-like, real (float/double)
        Array of Declinations for the first set of points. DEC's are expected
        to be in the [-90.0, 90.0], but the code will try to fix cases where
        the DEC's are in [0.0, 180.0]. Again, for peace of mind, always supply
        DEC's in [-90.0, 90.0].
        Must be of same precision type as RA1.
        
-    weights1: array_like, real (float/double), optional
+    weights1 : array_like, real (float/double), optional
        A scalar, or an array of weights of shape (n_weights, n_positions) or 
        (n_positions,). `weight_type` specifies how these weights are used; 
        results are returned in the `weightavg` field.  If only one of weights1 
        and weights2 is specified, the other will be set to uniform weights.
 
-    RA2: array-like, real (float/double)
+    RA2 : array-like, real (float/double)
        The array of Right Ascensions for the second set of points. RA's
        are expected to be in [0.0, 360.0], but the code will try to fix cases
        where the RA's are in [-180, 180.0]. For peace of mind, always supply
        RA's in [0.0, 360.0].
        Must be of same precision type as RA1/DEC1.
 
-    DEC2: array-like, real (float/double)
+    DEC2 : array-like, real (float/double)
        Array of Declinations for the second set of points. DEC's are expected
        to be in the [-90.0, 90.0], but the code will try to fix cases where
        the DEC's are in [0.0, 180.0]. Again, for peace of mind, always supply
        DEC's in [-90.0, 90.0].
        Must be of same precision type as RA1/DEC1.
        
-    weights2: array-like, real (float/double), optional
+    weights2 : array-like, real (float/double), optional
        Same as weights1, but for the second set of positions
 
-    link_in_dec: boolean (default True)
+    link_in_dec : boolean (default True)
        Boolean flag to create lattice in Declination. Code runs faster with
        this option. However, if the angular separations are too small, then
        linking in declination might produce incorrect results. When running
        for the first time, check your results by comparing with the output
        of the code for ``link_in_dec=False`` and ``link_in_ra=False``.
 
-    link_in_ra: boolean (default True)
+    link_in_ra : boolean (default True)
        Boolean flag to create lattice in Right Ascension. Setting this option
        implies ``link_in_dec=True``. Similar considerations as ``link_in_dec``
        described above.
 
-
-    .. note:: If you disable both ``link_in_dec`` and ``link_in_ra``, then
+       If you disable both ``link_in_dec`` and ``link_in_ra``, then
        the code reduces to a brute-force pair counter. No lattices are created
        at all. For very small angular separations, the brute-force method 
        might be the most numerically stable method.
 
-
-    verbose: boolean (default false)
+    verbose : boolean (default false)
        Boolean flag to control output of informational messages
 
-    output_thetaavg: boolean (default false)
+    output_thetaavg : boolean (default false)
        Boolean flag to output the average ``\theta`` for each bin. Code will
        run slower if you set this flag. 
 
-    
-    .. note:: If you are calculating in single-precision, ``thetaavg`` will 
+       If you are calculating in single-precision, ``thetaavg`` will 
        suffer from numerical loss of precision and can not be trusted. If you 
        need accurate ``thetaavg`` values, then pass in double precision arrays 
        for ``RA/DEC``.
 
-
-    .. note:: Code will run significantly slower if you enable this option.
+       Code will run significantly slower if you enable this option.
        Use the keyword ``fast_acos`` if you can tolerate some loss of 
        precision.
 
-
-    fast_acos: boolean (default false)
+    fast_acos : boolean (default false)
        Flag to use numerical approximation for the ``arccos`` - gives better
        performance at the expense of some precision. Relevant only if
        ``output_thetaavg==True``.
@@ -149,30 +144,26 @@ def DDtheta_mocks(autocorr, nthreads, binfile,
        if you know your ``theta`` range is limited. If you implement a new
        version, then you will have to reinstall the entire Corrfunc package.
 
+       Note: Tests will fail if you run the tests with ``fast_acos=True``.
 
-    .. note:: Tests will fail if you run the tests with``fast_acos=True``.
-
-
-    (radec)_refine_factor: integer, default is (2,2); typically within [1-3]
+    (radec)_refine_factor : integer, default is (2,2); typically within [1-3]
        Controls the refinement on the cell sizes. Can have up to a 20% impact
        on runtime. 
 
-
-    .. note:: Only two refine factors are to be specified and these
+       Only two refine factors are to be specified and these
        correspond to ``ra`` and ``dec`` (rather, than the usual three of
        ``(xyz)bin_refine_factor`` for all other correlation functions).
 
-
-    max_cells_per_dim: integer, default is 100, typical values in [50-300]
+    max_cells_per_dim : integer, default is 100, typical values in [50-300]
        Controls the maximum number of cells per dimension. Total number of
        cells can be up to (max_cells_per_dim)^3. Only increase if ``thetamax``
        is too small relative to the boxsize (and increasing helps the runtime).
 
-    c_api_timer: boolean (default false)
+    c_api_timer : boolean (default false)
        Boolean flag to measure actual time spent in the C libraries. Here
        to allow for benchmarking and scaling studies.
 
-    isa: string (default ``fastest``)
+    isa : string (default ``fastest``)
        Controls the runtime dispatch for the instruction set to use. Possible
        options are: [``fastest``, ``avx``, ``sse42``, ``fallback``]
 
@@ -189,15 +180,14 @@ def DDtheta_mocks(autocorr, nthreads, binfile,
     Returns
     --------
 
-    results: Numpy structured array
-
+    results : Numpy structured array
        A numpy structured array containing [thetamin, thetamax, thetaavg,
        npairs, weightavg] for each angular bin specified in the ``binfile``. If
        ``output_thetaavg`` is not set then ``thetavg`` will be set to 0.0 for
        all bins; similarly for
        ``weightavg``. ``npairs`` contains the number of pairs in that bin.
 
-    api_time: float, optional
+    api_time : float, optional
        Only returned if ``c_api_timer`` is set.  ``api_time`` measures only the time
        spent within the C library and ignores all python overhead.
 
diff --git a/Corrfunc/mocks/__init__.py b/Corrfunc/mocks/__init__.py
index cf8b4294..d559fbdb 100644
--- a/Corrfunc/mocks/__init__.py
+++ b/Corrfunc/mocks/__init__.py
@@ -9,12 +9,13 @@
                         unicode_literals)
 
 __author__ = ('Manodeep Sinha')
-__all__ = ("DDrppi_mocks", "DDtheta_mocks", "vpf_mocks", )
+__all__ = ("DDrppi_mocks", "DDtheta_mocks", "vpf_mocks", "DDsmu_mocks" )
 
 import sys
 from .DDrppi_mocks import DDrppi_mocks
 from .DDtheta_mocks import DDtheta_mocks
 from .vpf_mocks import vpf_mocks
+from .DDsmu_mocks import DDsmu_mocks
 
 if sys.version_info[0] < 3:
     __all__ = [n.encode('ascii') for n in __all__]
diff --git a/Corrfunc/mocks/vpf_mocks.py b/Corrfunc/mocks/vpf_mocks.py
index 39e51271..5872de4b 100644
--- a/Corrfunc/mocks/vpf_mocks.py
+++ b/Corrfunc/mocks/vpf_mocks.py
@@ -39,12 +39,12 @@ def vpf_mocks(rmax, nbins, nspheres, numpN,
        Number of bins in the counts-in-cells. Radius of first shell
        is rmax/nbins
 
-    nspheres: integer (>= 0)
+    nspheres : integer (>= 0)
        Number of random spheres to place within the particle distribution.
        For a small number of spheres, the error is larger in the measured
        pN's.
 
-    numpN: integer (>= 1)
+    numpN : integer (>= 1)
        Governs how many unique pN's are to returned. If ``numpN`` is set to 1,
        then only the vpf (p0) is returned. For ``numpN=2``, p0 and p1 are
        returned.
@@ -62,18 +62,16 @@ def vpf_mocks(rmax, nbins, nspheres, numpN,
 
        and so on...
 
+       Note: ``p0`` is the vpf
 
-    .. note:: p0 is the vpf
-
-
-    threshold_ngb: integer
+    threshold_ngb : integer
        Minimum number of random points needed in a ``rmax`` sphere such that it
        is considered to be entirely within the mock footprint. The
        command-line version, ``mocks/vpf/vpf_mocks.c``, assumes that the
        minimum number of randoms can be at most a 1-sigma deviation from
        the expected random number density.
 
-    centers_file: string, filename
+    centers_file : string, filename
        A file containing random sphere centers. If the file does not exist,
        then a list of random centers will be written out. In that case, the
        randoms arrays, ``RAND_RA``, ``RAND_DEC`` and ``RAND_CZ`` are used to
@@ -81,13 +79,11 @@ def vpf_mocks(rmax, nbins, nspheres, numpN,
        exist but either ``rmax`` is too small or there are not enough centers
        then the file will be overwritten.
 
-    
-    .. note:: If the centers file has to be written, the code will take
+       Note: If the centers file has to be written, the code will take
        significantly longer to finish. However, subsequent runs can re-use
        that centers file and will be faster.
 
-
-    cosmology: integer, required
+    cosmology : integer, required
         Integer choice for setting cosmology. Valid values are 1->LasDamas
         cosmology and 2->Planck cosmology. If you need arbitrary cosmology,
         easiest way is to convert the ``CZ`` values into co-moving distance,
@@ -102,7 +98,7 @@ def vpf_mocks(rmax, nbins, nspheres, numpN,
         ``init_cosmology`` in ``ROOT/utils/cosmology_params.c`` and re-install
         the entire package.
 
-    RA: array-like, real (float/double)
+    RA : array-like, real (float/double)
        The array of Right Ascensions for the first set of points. RA's
        are expected to be in [0.0, 360.0], but the code will try to fix cases
        where the RA's are in [-180, 180.0]. For peace of mind, always supply
@@ -110,7 +106,7 @@ def vpf_mocks(rmax, nbins, nspheres, numpN,
 
        Calculations are done in the precision of the supplied arrays.
 
-    DEC: array-like, real (float/double)
+    DEC : array-like, real (float/double)
        Array of Declinations for the first set of points. DEC's are expected
        to be in the [-90.0, 90.0], but the code will try to fix cases where
        the DEC's are in [0.0, 180.0]. Again, for peace of mind, always supply
@@ -118,7 +114,7 @@ def vpf_mocks(rmax, nbins, nspheres, numpN,
 
        Must be of same precision type as RA.
 
-    CZ: array-like, real (float/double)
+    CZ : array-like, real (float/double)
        Array of (Speed Of Light * Redshift) values for the first set of
        points. Code will try to detect cases where ``redshifts`` have been
        passed and multiply the entire array with the ``speed of light``.
@@ -126,7 +122,7 @@ def vpf_mocks(rmax, nbins, nspheres, numpN,
        If ``is_comoving_dist`` is set, then ``CZ`` is interpreted as the
        co-moving distance, rather than (Speed Of Light * Redshift).
 
-    RAND_RA: array-like, real (float/double)
+    RAND_RA : array-like, real (float/double)
        The array of Right Ascensions for the randoms. RA's are expected to be
        in [0.0, 360.0], but the code will try to fix cases where the RA's are
        in [-180, 180.0]. For peace of mind, always supply RA's in
@@ -134,7 +130,7 @@ def vpf_mocks(rmax, nbins, nspheres, numpN,
 
        Must be of same precision type as RA/DEC/CZ.
 
-    RAND_DEC: array-like, real (float/double)
+    RAND_DEC : array-like, real (float/double)
        Array of Declinations for the randoms. DEC's are expected to be in the
        [-90.0, 90.0], but the code will try to fix cases where the DEC's are
        in [0.0, 180.0]. Again, for peace of mind, always supply DEC's in
@@ -142,7 +138,7 @@ def vpf_mocks(rmax, nbins, nspheres, numpN,
 
        Must be of same precision type as RA/DEC/CZ.
 
-    RAND_CZ: array-like, real (float/double)
+    RAND_CZ : array-like, real (float/double)
        Array of (Speed Of Light * Redshift) values for the randoms. Code
        will try to detect cases where ``redshifts`` have been
        passed and multiply the entire array with the ``speed of light``.
@@ -150,40 +146,36 @@ def vpf_mocks(rmax, nbins, nspheres, numpN,
        If ``is_comoving_dist`` is set, then ``CZ2`` is interpreted as the
        co-moving distance, rather than ``(Speed Of Light * Redshift)``.
 
-     
-    .. note:: RAND_RA, RAND_DEC and RAND_CZ are only used when the
+       Note: RAND_RA, RAND_DEC and RAND_CZ are only used when the
           ``centers_file``  needs to be written out. In that case, the
           RAND_RA, RAND_DEC, and RAND_CZ are used as random centers.
 
-
-    verbose: boolean (default false)
+    verbose : boolean (default false)
        Boolean flag to control output of informational messages
 
-    is_comoving_dist: boolean (default false)
+    is_comoving_dist : boolean (default false)
        Boolean flag to indicate that ``cz`` values have already been
        converted into co-moving distances. This flag allows arbitrary
        cosmologies to be used in ``Corrfunc``.
 
-    (xyz)bin_refine_factor: integer, default is (1,1,1); typically within [1-3]
+    (xyz)bin_refine_factor : integer, default is (1,1,1); typically within [1-3]
        Controls the refinement on the cell sizes. Can have up to a 20% impact
        on runtime. 
 
-
-    .. note:: Since the counts in spheres calculation is symmetric
+       Note: Since the counts in spheres calculation is symmetric
        in all 3 dimensions, the defaults are different from the clustering
        routines.
 
-
-    max_cells_per_dim: integer, default is 100, typical values in [50-300]
+    max_cells_per_dim : integer, default is 100, typical values in [50-300]
        Controls the maximum number of cells per dimension. Total number of
        cells can be up to (max_cells_per_dim)^3. Only increase if ``rmax`` is
        too small relative to the boxsize (and increasing helps the runtime).
 
-    c_api_timer: boolean (default false)
+    c_api_timer : boolean (default false)
        Boolean flag to measure actual time spent in the C libraries. Here
        to allow for benchmarking and scaling studies.
 
-    isa: string (default ``fastest``)
+    isa : string (default ``fastest``)
        Controls the runtime dispatch for the instruction set to use. Possible
        options are: [``fastest``, ``avx``, ``sse42``, ``fallback``]
 
@@ -201,8 +193,7 @@ def vpf_mocks(rmax, nbins, nspheres, numpN,
     Returns
     --------
 
-    results: Numpy structured array
-
+    results : Numpy structured array
        A numpy structured array containing [rmax, pN[numpN]] with ``nbins``
        elements. Each row contains the maximum radius of the sphere and the
        ``numpN`` elements in the ``pN`` array. Each element of this array
@@ -211,9 +202,9 @@ def vpf_mocks(rmax, nbins, nspheres, numpN,
        function) is the probability that a sphere of radius ``rmax`` contains 0
        galaxies.
 
-       if ``c_api_timer`` is set, then the return value is a tuple containing
-       (results, api_time). ``api_time`` measures only the time spent within
-       the C library and ignores all python overhead.
+    api_time : float, optional
+       Only returned if ``c_api_timer`` is set.  ``api_time`` measures only the time
+       spent within the C library and ignores all python overhead.
 
 
     Example
diff --git a/Corrfunc/theory/DD.py b/Corrfunc/theory/DD.py
index 02a52267..29c2c044 100644
--- a/Corrfunc/theory/DD.py
+++ b/Corrfunc/theory/DD.py
@@ -53,8 +53,8 @@ def DD(autocorr, nthreads, binfile, X1, Y1, Z1, weights1=None, periodic=True,
         bin-edges. For example,
         ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
         input specifying **14** (logarithmic) bins between 0.1 and 10.0. This
-        array does not need to be sorted.
-
+        array does not need to be sorted.         
+    
     X1/Y1/Z1: array_like, real (float/double)
         The array of X/Y/Z positions for the first set of points.
         Calculations are done in the precision of the supplied arrays.
@@ -88,8 +88,7 @@ def DD(autocorr, nthreads, binfile, X1, Y1, Z1, weights1=None, periodic=True,
        Boolean flag to output the average ``r`` for each bin. Code will
        run slower if you set this flag. 
 
-
-    .. note:: If you are calculating in single-precision, ``ravg`` will 
+       Note: If you are calculating in single-precision, ``ravg`` will 
        suffer from numerical loss of precision and can not be trusted. 
        If you need accurate ``ravg`` values, then pass in double precision 
        arrays for the particle positions.
diff --git a/Corrfunc/theory/DDrppi.py b/Corrfunc/theory/DDrppi.py
index 8b2f4695..482e366a 100644
--- a/Corrfunc/theory/DDrppi.py
+++ b/Corrfunc/theory/DDrppi.py
@@ -57,21 +57,20 @@ def DDrppi(autocorr, nthreads, pimax, binfile, X1, Y1, Z1, weights1=None,
        depth. For instance, if ``pimax=40``, then 40 bins will be created
        along the ``pi`` direction.
 
-
-    .. note:: Only pairs with ``0 <= dz < pimax`` are counted (no equality).
+       Note: Only pairs with ``0 <= dz < pimax`` are counted (no equality).
 
 
     binfile: string or an list/array of floats
-       For string input: filename specifying the ``rp`` bins for
-       ``DDrppi``. The file should contain white-space separated values
-       of (rpmin, rpmax)  for each ``rp`` wanted. The bins need to be
-       contiguous and sorted in increasing order (smallest bins come first).
+        For string input: filename specifying the ``rp`` bins for
+        ``DDrppi``. The file should contain white-space separated values
+        of (rpmin, rpmax)  for each ``rp`` wanted. The bins need to be
+        contiguous and sorted in increasing order (smallest bins come first).
 
-       For array-like input: A sequence of ``rp`` values that provides the
-       bin-edges. For example,
-       ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
-       input, specifying **14** (logarithmic) bins between 0.1 and 10.0. This
-       array does not need to be sorted.
+        For array-like input: A sequence of ``rp`` values that provides the
+        bin-edges. For example,
+        ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
+        input specifying **14** (logarithmic) bins between 0.1 and 10.0. This
+        array does not need to be sorted.         
 
     X1/Y1/Z1: array-like, real (float/double)
        The array of X/Y/Z positions for the first set of points.
@@ -106,11 +105,10 @@ def DDrppi(autocorr, nthreads, pimax, binfile, X1, Y1, Z1, weights1=None,
        Boolean flag to output the average ``rp`` for each bin. Code will
        run slower if you set this flag. 
 
-
-    .. note:: If you are calculating in single-precision, ``rpavg`` will 
-        suffer from numerical loss of precision and can not be trusted. If 
-        you need accurate ``rpavg`` values, then pass in double precision 
-        arrays for the particle positions.
+       Note: If you are calculating in single-precision, ``rpavg`` will 
+       suffer from numerical loss of precision and can not be trusted. If 
+       you need accurate ``rpavg`` values, then pass in double precision 
+       arrays for the particle positions.
 
 
     (xyz)bin_refine_factor: integer, default is (2,2,1); typically within [1-3]
diff --git a/Corrfunc/theory/DDsmu.py b/Corrfunc/theory/DDsmu.py
new file mode 100644
index 00000000..335ea677
--- /dev/null
+++ b/Corrfunc/theory/DDsmu.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Python wrapper around the C extension for the pair counter in
+``theory/DDsmu/``. This wrapper is in :py:mod:`Corrfunc.theory.DDsmu`
+"""
+
+from __future__ import (division, print_function, absolute_import,
+                        unicode_literals)
+
+__author__ = ('Manodeep Sinha', 'Nick Hand')
+__all__ = ('DDsmu', )
+
+
+def DDsmu(autocorr, nthreads, binfile, mu_max, nmu_bins, X1, Y1, Z1, weights1=None,
+           periodic=True, X2=None, Y2=None, Z2=None, weights2=None,
+           verbose=False, boxsize=0.0, output_savg=False,
+           xbin_refine_factor=2, ybin_refine_factor=2,
+           zbin_refine_factor=1, max_cells_per_dim=100,
+           c_api_timer=False, isa=r'fastest', weight_type=None):
+    """
+    Calculate the 2-D pair-counts corresponding to the redshift-space
+    correlation function, :math:`\\xi(s, \\mu)`. Pairs which are separated
+    by less than the ``s`` bins (specified in ``binfile``) in 3-D, and
+    less than ``s*mu_max`` in the Z-dimension are counted.
+
+    If ``weights`` are provided, the resulting pair counts are weighted.  The
+    weighting scheme depends on ``weight_type``.
+
+    .. note:: This module only returns pair counts and not the actual
+       correlation function :math:`\\xi(s, \\mu)`. See the
+       utilities :py:mod:`Corrfunc.utils.convert_3d_counts_to_cf`
+       for computing :math:`\\xi(s, \\mu)` from the pair counts.
+
+    .. versionadded:: 2.1.0
+
+    Parameters
+    ----------
+
+    autocorr: boolean, required
+        Boolean flag for auto/cross-correlation. If autocorr is set to 1,
+        then the second set of particle positions are not required.
+
+    nthreads: integer
+        The number of OpenMP threads to use. Has no effect if OpenMP was not
+        enabled during library compilation.
+
+    binfile: string or a list/array of floats
+        For string input: filename specifying the ``s`` bins for
+        ``DDsmu``. The file should contain white-space separated values
+        of (smin, smax) specifying each ``s`` bin wanted. The bins
+        need to be contiguous and sorted in increasing order (smallest bins
+        come first).
+
+        For array-like input: A sequence of ``s`` values that provides the
+        bin-edges. For example,
+        ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
+        input specifying **14** (logarithmic) bins between 0.1 and 10.0. This
+        array does not need to be sorted.
+
+    mu_max: double. Must be in range (0.0, 1.0]
+        A double-precision value for the maximum cosine of the angular
+        separation from the line of sight (LOS). Here, LOS is taken to be
+        along the Z direction.
+
+        Note: Only pairs with :math:`0 <= cos(\\theta_{LOS}) < \\mu_{max}`
+        are counted (no equality).
+
+    nmu_bins: int
+        The number of linear ``mu`` bins, with the bins ranging from
+        (0, ``mu_max``)
+
+    X1/Y1/Z1 : array-like, real (float/double)
+        The array of X/Y/Z positions for the first set of points.
+        Calculations are done in the precision of the supplied arrays.
+
+    weights1 : array-like, real (float/double), shape (n_particles,) or \
+        (n_weights_per_particle,n_particles), optional
+        Weights for computing a weighted pair count.
+
+    weight_type : str, optional
+        The type of pair weighting to apply.
+        Options: "pair_product", None; Default: None.
+
+    periodic : boolean
+        Boolean flag to indicate periodic boundary conditions.
+
+    X2/Y2/Z2 : array-like, real (float/double)
+        Array of XYZ positions for the second set of points. *Must* be the same
+        precision as the X1/Y1/Z1 arrays. Only required when ``autocorr==0``.
+
+    weights2 : array-like, real (float/double), shape (n_particles,) or \
+        (n_weights_per_particle,n_particles), optional
+        Weights for computing a weighted pair count.
+
+    verbose : boolean (default false)
+        Boolean flag to control output of informational messages
+
+    boxsize : double
+        The side-length of the cube in the cosmological simulation.
+        Present to facilitate exact calculations for periodic wrapping.
+        If boxsize is not supplied, then the wrapping is done based on
+        the maximum difference within each dimension of the X/Y/Z arrays.
+
+    output_savg : boolean (default false)
+        Boolean flag to output the average ``s`` for each bin. Code will
+        run slower if you set this flag. Also, note, if you are calculating
+        in single-precision, ``savg`` will suffer from numerical loss of
+        precision and can not be trusted. If you need accurate ``savg``
+        values, then pass in double precision arrays for the particle positions.
+
+    (xyz)bin_refine_factor: integer (default (2,2,1) typical values in [1-3])
+        Controls the refinement on the cell sizes. Can have up to a 20% impact
+        on runtime.
+
+    max_cells_per_dim: integer (default 100, typical values in [50-300])
+        Controls the maximum number of cells per dimension. Total number of
+        cells can be up to (max_cells_per_dim)^3. Only increase if ``rmax`` is
+        too small relative to the boxsize (and increasing helps the runtime).
+
+    c_api_timer : boolean (default false)
+        Boolean flag to measure actual time spent in the C libraries. Here
+        to allow for benchmarking and scaling studies.
+
+    isa : string (default ``fastest``)
+      Controls the runtime dispatch for the instruction set to use. Possible
+      options are: [``fastest``, ``avx``, ``sse42``, ``fallback``]
+
+      Setting isa to ``fastest`` will pick the fastest available instruction
+      set on the current computer. However, if you set ``isa`` to, say,
+      ``avx`` and ``avx`` is not available on the computer, then the code will
+      revert to using ``fallback`` (even though ``sse42`` might be available).
+
+      Unless you are benchmarking the different instruction sets, you should
+      always leave ``isa`` to the default value. And if you *are* benchmarking,
+      then the string supplied here is translated into the ``enum`` for the
+      instruction set defined in ``utils/defs.h``.
+
+    Returns
+    --------
+    results : Numpy structured array
+        A numpy structured array containing [smin, smax, savg, mu_max, npairs,
+        weightavg] for each spatial bin specified in the ``binfile``. There will
+        be a total of ``nmu_bins`` rows, ranging from [0, ``mu_max``), *per*
+        spatial bin. If ``output_savg`` is not set, then ``savg`` will be 0.0 for
+        all bins; similarly for ``weightavg``. ``npairs`` is the pair count per bin.
+
+    api_time : float, optional
+        Only returned if ``c_api_timer`` is set. Time spent within the C API.
+
+    Example
+    -------
+    >>> from __future__ import print_function
+    >>> import numpy as np
+    >>> from os.path import dirname, abspath, join as pjoin
+    >>> import Corrfunc
+    >>> from Corrfunc.theory.DDsmu import DDsmu
+    >>> binfile = pjoin(dirname(abspath(Corrfunc.__file__)),
+    ...                 "../theory/tests/", "bins")
+    >>> N = 10000
+    >>> boxsize = 420.0
+    >>> nthreads = 4
+    >>> autocorr = 1
+    >>> mu_max = 1.0
+    >>> seed = 42
+    >>> nmu_bins = 10
+    >>> np.random.seed(seed)
+    >>> X = np.random.uniform(0, boxsize, N)
+    >>> Y = np.random.uniform(0, boxsize, N)
+    >>> Z = np.random.uniform(0, boxsize, N)
+    >>> weights = np.ones_like(X)
+    >>> results = DDsmu(autocorr, nthreads, binfile, mu_max, nmu_bins,
+    ...                  X, Y, Z, weights1=weights, weight_type='pair_product', output_savg=True)
+    >>> for r in results[100:]: print("{0:10.6f} {1:10.6f} {2:10.6f} {3:10.1f}"
+    ...                               " {4:10d} {5:10.6f}".format(r['smin'], r['smax'],
+    ...                               r['savg'], r['mu_max'], r['npairs'], r['weightavg']))
+    ...                         # doctest: +NORMALIZE_WHITESPACE
+     5.788530   8.249250   7.148213        0.1        230   1.000000
+     5.788530   8.249250   7.157218        0.2        236   1.000000
+     5.788530   8.249250   7.165338        0.3        208   1.000000
+     5.788530   8.249250   7.079905        0.4        252   1.000000
+     5.788530   8.249250   7.251661        0.5        184   1.000000
+     5.788530   8.249250   7.118536        0.6        222   1.000000
+     5.788530   8.249250   7.083466        0.7        238   1.000000
+     5.788530   8.249250   7.198184        0.8        170   1.000000
+     5.788530   8.249250   7.127409        0.9        208   1.000000
+     5.788530   8.249250   6.973090        1.0        206   1.000000
+     8.249250  11.756000  10.149183        0.1        592   1.000000
+     8.249250  11.756000  10.213009        0.2        634   1.000000
+     8.249250  11.756000  10.192220        0.3        532   1.000000
+     8.249250  11.756000  10.246931        0.4        544   1.000000
+     8.249250  11.756000  10.102675        0.5        530   1.000000
+     8.249250  11.756000  10.276180        0.6        644   1.000000
+     8.249250  11.756000  10.251264        0.7        666   1.000000
+     8.249250  11.756000  10.138399        0.8        680   1.000000
+     8.249250  11.756000  10.191916        0.9        566   1.000000
+     8.249250  11.756000  10.243229        1.0        608   1.000000
+    11.756000  16.753600  14.552776        0.1       1734   1.000000
+    11.756000  16.753600  14.579991        0.2       1806   1.000000
+    11.756000  16.753600  14.599611        0.3       1802   1.000000
+    11.756000  16.753600  14.471100        0.4       1820   1.000000
+    11.756000  16.753600  14.480192        0.5       1740   1.000000
+    11.756000  16.753600  14.493679        0.6       1746   1.000000
+    11.756000  16.753600  14.547713        0.7       1722   1.000000
+    11.756000  16.753600  14.465390        0.8       1750   1.000000
+    11.756000  16.753600  14.547465        0.9       1798   1.000000
+    11.756000  16.753600  14.440975        1.0       1828   1.000000
+    16.753600  23.875500  20.720406        0.1       5094   1.000000
+    16.753600  23.875500  20.735403        0.2       5004   1.000000
+    16.753600  23.875500  20.721069        0.3       5172   1.000000
+    16.753600  23.875500  20.723648        0.4       5014   1.000000
+    16.753600  23.875500  20.650621        0.5       5094   1.000000
+    16.753600  23.875500  20.688135        0.6       5076   1.000000
+    16.753600  23.875500  20.735691        0.7       4910   1.000000
+    16.753600  23.875500  20.714097        0.8       4864   1.000000
+    16.753600  23.875500  20.751836        0.9       4954   1.000000
+    16.753600  23.875500  20.721183        1.0       5070   1.000000
+    """
+    try:
+        from Corrfunc._countpairs import countpairs_s_mu as DDsmu_extn
+    except ImportError:
+        msg = "Could not import the C extension for the 3-D "\
+              "redshift-space pair counter."
+        raise ImportError(msg)
+
+    import numpy as np
+    from Corrfunc.utils import translate_isa_string_to_enum,\
+        return_file_with_rbins
+    from future.utils import bytes_to_native_str
+
+    # Broadcast scalar weights to arrays
+    if weights1 is not None:
+        weights1 = np.atleast_1d(weights1)
+    if weights2 is not None:
+        weights2 = np.atleast_1d(weights2)
+
+    # Check if mu_max is scalar
+    if not np.isscalar(mu_max):
+        msg = "The parameter `mu_max` = {0}, has size = {1}. "\
+              "The code is expecting a scalar quantity (and not "\
+              "a list, array)".format(mu_max, np.size(mu_max))
+        raise TypeError(msg)
+
+    # Check that mu_max is within (0.0, 1.0]
+    if mu_max <= 0.0 or mu_max > 1.0:
+        msg = "The parameter `mu_max` = {0}, is the max. of cosine of an "\
+              "angle and should be within (0.0, 1.0]".format(mu_max)
+        raise ValueError(msg)
+
+    if not autocorr:
+        if X2 is None or Y2 is None or Z2 is None:
+            msg = "Must pass valid arrays for X2/Y2/Z2 for "\
+                "computing cross-correlation"
+            raise ValueError(msg)
+
+        # If only one set of points has weights, set the other to uniform weights
+        if weights1 is None and weights2 is not None:
+            weights1 = np.ones_like(weights2)
+        if weights2 is None and weights1 is not None:
+            weights2 = np.ones_like(weights1)
+
+    else:
+        X2 = np.empty(1)
+        Y2 = np.empty(1)
+        Z2 = np.empty(1)
+
+    # Passing None parameters breaks the parsing code, so avoid this
+    kwargs = {}
+    for k in ['weights1', 'weights2', 'weight_type', 'X2', 'Y2', 'Z2']:
+        v = locals()[k]
+        if v is not None:
+            kwargs[k] = v
+
+    integer_isa = translate_isa_string_to_enum(isa)
+    sbinfile, delete_after_use = return_file_with_rbins(binfile)
+    extn_results, api_time = DDsmu_extn(autocorr, nthreads,
+                                        sbinfile,
+                                        mu_max, nmu_bins,
+                                        X1, Y1, Z1,
+                                        periodic=periodic,
+                                        verbose=verbose,
+                                        boxsize=boxsize,
+                                        output_savg=output_savg,
+                                        xbin_refine_factor=xbin_refine_factor,
+                                        ybin_refine_factor=ybin_refine_factor,
+                                        zbin_refine_factor=zbin_refine_factor,
+                                        max_cells_per_dim=max_cells_per_dim,
+                                        c_api_timer=c_api_timer,
+                                        isa=integer_isa, **kwargs)
+
+    # Clean up the temporary bin file *before* the failure check (no leak)
+    if delete_after_use:
+        import os
+        os.remove(sbinfile)
+
+    if extn_results is None:
+        msg = "RuntimeError occurred"
+        raise RuntimeError(msg)
+
+    # np.float64 is the dtype the removed ``np.float`` alias used to resolve to
+    results_dtype = np.dtype([(bytes_to_native_str(b'smin'), np.float64),
+                              (bytes_to_native_str(b'smax'), np.float64),
+                              (bytes_to_native_str(b'savg'), np.float64),
+                              (bytes_to_native_str(b'mu_max'), np.float64),
+                              (bytes_to_native_str(b'npairs'), np.uint64),
+                              (bytes_to_native_str(b'weightavg'), np.float64)])
+    results = np.array(extn_results, dtype=results_dtype)
+
+    if not c_api_timer:
+        return results
+    else:
+        return results, api_time
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
diff --git a/Corrfunc/theory/__init__.py b/Corrfunc/theory/__init__.py
index 7d19322d..cf1afcfa 100644
--- a/Corrfunc/theory/__init__.py
+++ b/Corrfunc/theory/__init__.py
@@ -9,7 +9,7 @@
                         unicode_literals)
 
 __author__ = ('Manodeep Sinha')
-__all__ = ('DD', 'DDrppi', 'wp', 'xi', 'vpf', )
+__all__ = ('DD', 'DDrppi', 'wp', 'xi', 'vpf', 'DDsmu',)
 
 import sys
 
@@ -18,7 +18,7 @@
 from .wp import wp
 from .xi import xi
 from .vpf import vpf
-
+from .DDsmu import DDsmu
 
 if sys.version_info[0] < 3:
     __all__ = [n.encode('ascii') for n in __all__]
diff --git a/Corrfunc/theory/vpf.py b/Corrfunc/theory/vpf.py
index 8b76c8c1..2696f31d 100644
--- a/Corrfunc/theory/vpf.py
+++ b/Corrfunc/theory/vpf.py
@@ -59,8 +59,7 @@ def vpf(rmax, nbins, nspheres, numpN, seed,
 
        and so on...
 
-    
-    .. note:: p0 is the vpf
+       Note: ``p0`` is the vpf
 
 
     seed: unsigned integer
@@ -93,8 +92,7 @@ def vpf(rmax, nbins, nspheres, numpN, seed,
        Controls the refinement on the cell sizes. Can have up to a 20% impact
        on runtime. 
 
-
-    .. note:: Since the counts in spheres calculation is symmetric
+       Note: Since the counts in spheres calculation is symmetric
        in all 3 dimensions, the defaults are different from the clustering
        routines.
 
diff --git a/Corrfunc/theory/wp.py b/Corrfunc/theory/wp.py
index cd72fc55..b78b5d93 100644
--- a/Corrfunc/theory/wp.py
+++ b/Corrfunc/theory/wp.py
@@ -36,8 +36,7 @@ def find_fastest_wp_bin_refs(boxsize, pimax, nthreads, binfile, X, Y, Z,
        A double-precision value for the maximum separation along
        the Z-dimension. 
 
-
-    .. note:: Only pairs with ``0 <= dz < pimax`` are counted (no equality).
+       Note: Only pairs with ``0 <= dz < pimax`` are counted (no equality).
 
 
     nthreads: integer
@@ -52,8 +51,8 @@ def find_fastest_wp_bin_refs(boxsize, pimax, nthreads, binfile, X, Y, Z,
        For array-like input: A sequence of ``rp`` values that provides the
        bin-edges. For example,
        ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
-       input, specifying **14** (logarithmic) bins between 0.1 and 10.0. This
-       array does not need to be sorted.
+       input specifying **14** (logarithmic) bins between 0.1 and 10.0. This
+       array does not need to be sorted.         
 
     X/Y/Z: arraytype, real (float/double)
        Particle positions in the 3 axes. Must be within [0, boxsize]
@@ -72,11 +71,10 @@ def find_fastest_wp_bin_refs(boxsize, pimax, nthreads, binfile, X, Y, Z,
        Boolean flag to output the average ``rp`` for each bin. Code will
        run slower if you set this flag. 
 
-
-    .. note:: If you are calculating in single-precision, ``rpavg`` will 
-        suffer from numerical loss of precision and can not be trusted. If 
-        you need accurate ``rpavg`` values, then pass in double precision 
-        arrays for the particle positions.
+       Note: If you are calculating in single-precision, ``rpavg`` will 
+       suffer from numerical loss of precision and can not be trusted. If 
+       you need accurate ``rpavg`` values, then pass in double precision 
+       arrays for the particle positions.
 
 
     max_cells_per_dim: integer, default is 100, typical values in [50-300]
@@ -119,7 +117,7 @@ def find_fastest_wp_bin_refs(boxsize, pimax, nthreads, binfile, X, Y, Z,
        The combination of ``bin refine factors`` along each dimension that
        produces the fastest code.
 
-    runtimes: numpy structured array
+    runtimes : numpy structured array
 
        if ``return_runtimes`` is set, then the return value is a tuple
        containing ((nx, ny, nz), runtimes). ``runtimes`` is a ``numpy``
@@ -318,8 +316,7 @@ def wp(boxsize, pimax, nthreads, binfile, X, Y, Z,
        A double-precision value for the maximum separation along
        the Z-dimension. 
 
-
-    .. note:: Only pairs with ``0 <= dz < pimax`` are counted (no equality).
+       Note: Only pairs with ``0 <= dz < pimax`` are counted (no equality).
 
 
     nthreads: integer
@@ -334,8 +331,8 @@ def wp(boxsize, pimax, nthreads, binfile, X, Y, Z,
        For array-like input: A sequence of ``rp`` values that provides the
        bin-edges. For example,
        ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
-       input, specifying **14** (logarithmic) bins between 0.1 and 10.0. This
-       array does not need to be sorted.
+       input specifying **14** (logarithmic) bins between 0.1 and 10.0. This
+       array does not need to be sorted.         
 
     X/Y/Z: arraytype, real (float/double)
        Particle positions in the 3 axes. Must be within [0, boxsize]
@@ -348,9 +345,9 @@ def wp(boxsize, pimax, nthreads, binfile, X, Y, Z,
        are double precision arrays (C double type).
        
     weights: array_like, real (float/double), optional
-        A scalar, or an array of weights of shape (n_weights, n_positions) or (n_positions,).
-        `weight_type` specifies how these weights are used; results are returned
-        in the `weightavg` field.
+       A scalar, or an array of weights of shape (n_weights, n_positions) or (n_positions,).
+       `weight_type` specifies how these weights are used; results are returned
+       in the `weightavg` field.
 
     verbose: boolean (default false)
        Boolean flag to control output of informational messages
@@ -359,11 +356,10 @@ def wp(boxsize, pimax, nthreads, binfile, X, Y, Z,
        Boolean flag to output the average ``rp`` for each bin. Code will
        run slower if you set this flag. 
 
-
-    .. note:: If you are calculating in single-precision, ``rpavg`` will 
-        suffer from numerical loss of precision and can not be trusted. If 
-        you need accurate ``rpavg`` values, then pass in double precision 
-        arrays for the particle positions.
+       Note: If you are calculating in single-precision, ``rpavg`` will 
+       suffer from numerical loss of precision and can not be trusted. If 
+       you need accurate ``rpavg`` values, then pass in double precision 
+       arrays for the particle positions.
 
 
     (xyz)bin_refine_factor: integer, default is (2,2,1); typically within [1-3]
diff --git a/Corrfunc/theory/xi.py b/Corrfunc/theory/xi.py
index 5fe30077..557f96d5 100644
--- a/Corrfunc/theory/xi.py
+++ b/Corrfunc/theory/xi.py
@@ -46,16 +46,16 @@ def xi(boxsize, nthreads, binfile, X, Y, Z,
        Number of threads to use.
 
     binfile: string or an list/array of floats
-       For string input: filename specifying the ``r`` bins for
-       ``xi``. The file should contain white-space separated values
-       of (rmin, rmax)  for each ``r`` wanted. The bins need to be
-       contiguous and sorted in increasing order (smallest bins come first).
+        For string input: filename specifying the ``r`` bins for
+        ``xi``. The file should contain white-space separated values
+        of (rmin, rmax)  for each ``r`` wanted. The bins need to be
+        contiguous and sorted in increasing order (smallest bins come first).
 
-       For array-like input: A sequence of ``r`` values that provides the
-       bin-edges. For example,
-       ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
-       input specifying **14** (logarithmic) bins between 0.1 and 10.0. This
-       array does not need to be sorted.
+        For array-like input: A sequence of ``r`` values that provides the
+        bin-edges. For example,
+        ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
+        input specifying **14** (logarithmic) bins between 0.1 and 10.0. This
+        array does not need to be sorted.         
 
     X/Y/Z: arraytype, real (float/double)
        Particle positions in the 3 axes. Must be within [0, boxsize]
@@ -68,9 +68,9 @@ def xi(boxsize, nthreads, binfile, X, Y, Z,
        are double precision arrays (C double type).
        
     weights: array_like, real (float/double), optional
-        A scalar, or an array of weights of shape (n_weights, n_positions) or 
-        (n_positions,). `weight_type` specifies how these weights are used; 
-        results are returned in the `weightavg` field.
+       A scalar, or an array of weights of shape (n_weights, n_positions) or 
+       (n_positions,). `weight_type` specifies how these weights are used; 
+       results are returned in the `weightavg` field.
 
     verbose: boolean (default false)
        Boolean flag to control output of informational messages
@@ -79,11 +79,10 @@ def xi(boxsize, nthreads, binfile, X, Y, Z,
        Boolean flag to output the average ``r`` for each bin. Code will
        run slower if you set this flag. 
 
-
-    .. note:: If you are calculating in single-precision, ``rpavg`` will 
-        suffer from numerical loss of precision and can not be trusted. If 
-        you need accurate ``rpavg`` values, then pass in double precision 
-        arrays for the particle positions.
+       Note: If you are calculating in single-precision, ``ravg`` will
+       suffer from numerical loss of precision and can not be trusted. If
+       you need accurate ``ravg`` values, then pass in double precision
+       arrays for the particle positions.
 
 
     (xyz)bin_refine_factor: integer, default is (2,2,1); typically within [1-3]
diff --git a/Corrfunc/utils.py b/Corrfunc/utils.py
index 1877515e..a183f3ac 100644
--- a/Corrfunc/utils.py
+++ b/Corrfunc/utils.py
@@ -19,11 +19,6 @@
 except NameError:
     xrange = range
 
-try:
-    long
-except NameError:
-    long = int
-    
 def convert_3d_counts_to_cf(ND1, ND2, NR1, NR2,
                             D1D2, D1R2, D2R1, R1R2,
                             estimator='LS'):
@@ -571,7 +566,7 @@ def compute_nbins(max_diff, binsize,
         msg = 'Error: Invalid value for max_diff = {0} or binsize = {1}. '\
               'Both must be positive'.format(max_diff, binsize)
         raise ValueError(msg)
-    if max_nbins < 1:
+    if max_nbins is not None and max_nbins < 1:
         msg = 'Error: Invalid for the max. number of bins allowed = {0}.'\
               'Max. nbins must be >= 1'.format(max_nbins)
         raise ValueError(msg)
@@ -582,7 +577,7 @@ def compute_nbins(max_diff, binsize,
         raise ValueError(msg)
 
     # At least 1 bin
-    ngrid = max(1, long(max_diff/binsize))
+    ngrid = max(int(1), int(max_diff/binsize))
 
     # Then refine
     ngrid *= refine_factor
@@ -590,7 +585,7 @@ def compute_nbins(max_diff, binsize,
     # But don't exceed max number of bins
     # (if passed as a parameter)
     if max_nbins:
-        ngrid = min(max_nbins, ngrid)
+        ngrid = min(int(max_nbins), ngrid)
 
     return ngrid             
                      
@@ -679,65 +674,67 @@ def gridlink_sphere(thetamax,
     --------
 
     >>> from Corrfunc.utils import gridlink_sphere
+    >>> import numpy as np
+    >>> np.set_printoptions(precision=8)
     >>> thetamax=30
-    >>> gridlink_sphere(thetamax)
-    array([([-1.57079633, -1.04719755], [ 0.        ,  3.14159265]),
-       ([-1.57079633, -1.04719755], [ 3.14159265,  6.28318531]),
-       ([-1.04719755, -0.52359878], [ 0.        ,  3.14159265]),
-       ([-1.04719755, -0.52359878], [ 3.14159265,  6.28318531]),
-       ([-0.52359878,  0.        ], [ 0.        ,  1.25663706]),
-       ([-0.52359878,  0.        ], [ 1.25663706,  2.51327412]),
-       ([-0.52359878,  0.        ], [ 2.51327412,  3.76991118]),
-       ([-0.52359878,  0.        ], [ 3.76991118,  5.02654825]),
-       ([-0.52359878,  0.        ], [ 5.02654825,  6.28318531]),
-       ([ 0.        ,  0.52359878], [ 0.        ,  1.25663706]),
-       ([ 0.        ,  0.52359878], [ 1.25663706,  2.51327412]),
-       ([ 0.        ,  0.52359878], [ 2.51327412,  3.76991118]),
-       ([ 0.        ,  0.52359878], [ 3.76991118,  5.02654825]),
-       ([ 0.        ,  0.52359878], [ 5.02654825,  6.28318531]),
-       ([ 0.52359878,  1.04719755], [ 0.        ,  3.14159265]),
-       ([ 0.52359878,  1.04719755], [ 3.14159265,  6.28318531]),
-       ([ 1.04719755,  1.57079633], [ 0.        ,  3.14159265]),
-       ([ 1.04719755,  1.57079633], [ 3.14159265,  6.28318531])], 
-      dtype=[(u'dec_limit', '<f8', (2,)), (u'ra_limit', '<f8', (2,))])
-    >>> gridlink_sphere(60, dec_refine_factor=3, ra_refine_factor=2)
-    array([([-1.57079633, -1.22173048], [ 0.        ,  1.57079633]),
-           ([-1.57079633, -1.22173048], [ 1.57079633,  3.14159265]),
-           ([-1.57079633, -1.22173048], [ 3.14159265,  4.71238898]),
-           ([-1.57079633, -1.22173048], [ 4.71238898,  6.28318531]),
-           ([-1.22173048, -0.87266463], [ 0.        ,  1.57079633]),
-           ([-1.22173048, -0.87266463], [ 1.57079633,  3.14159265]),
-           ([-1.22173048, -0.87266463], [ 3.14159265,  4.71238898]),
-           ([-1.22173048, -0.87266463], [ 4.71238898,  6.28318531]),
-           ([-0.87266463, -0.52359878], [ 0.        ,  1.57079633]),
-           ([-0.87266463, -0.52359878], [ 1.57079633,  3.14159265]),
-           ([-0.87266463, -0.52359878], [ 3.14159265,  4.71238898]),
-           ([-0.87266463, -0.52359878], [ 4.71238898,  6.28318531]),
-           ([-0.52359878, -0.17453293], [ 0.        ,  1.57079633]),
-           ([-0.52359878, -0.17453293], [ 1.57079633,  3.14159265]),
-           ([-0.52359878, -0.17453293], [ 3.14159265,  4.71238898]),
-           ([-0.52359878, -0.17453293], [ 4.71238898,  6.28318531]),
-           ([-0.17453293,  0.17453293], [ 0.        ,  1.57079633]),
-           ([-0.17453293,  0.17453293], [ 1.57079633,  3.14159265]),
-           ([-0.17453293,  0.17453293], [ 3.14159265,  4.71238898]),
-           ([-0.17453293,  0.17453293], [ 4.71238898,  6.28318531]),
-           ([ 0.17453293,  0.52359878], [ 0.        ,  1.57079633]),
-           ([ 0.17453293,  0.52359878], [ 1.57079633,  3.14159265]),
-           ([ 0.17453293,  0.52359878], [ 3.14159265,  4.71238898]),
-           ([ 0.17453293,  0.52359878], [ 4.71238898,  6.28318531]),
-           ([ 0.52359878,  0.87266463], [ 0.        ,  1.57079633]),
-           ([ 0.52359878,  0.87266463], [ 1.57079633,  3.14159265]),
-           ([ 0.52359878,  0.87266463], [ 3.14159265,  4.71238898]),
-           ([ 0.52359878,  0.87266463], [ 4.71238898,  6.28318531]),
-           ([ 0.87266463,  1.22173048], [ 0.        ,  1.57079633]),
-           ([ 0.87266463,  1.22173048], [ 1.57079633,  3.14159265]),
-           ([ 0.87266463,  1.22173048], [ 3.14159265,  4.71238898]),
-           ([ 0.87266463,  1.22173048], [ 4.71238898,  6.28318531]),
-           ([ 1.22173048,  1.57079633], [ 0.        ,  1.57079633]),
-           ([ 1.22173048,  1.57079633], [ 1.57079633,  3.14159265]),
-           ([ 1.22173048,  1.57079633], [ 3.14159265,  4.71238898]),
-           ([ 1.22173048,  1.57079633], [ 4.71238898,  6.28318531])], 
-          dtype=[(u'dec_limit', '<f8', (2,)), (u'ra_limit', '<f8', (2,))])
+    >>> grid = gridlink_sphere(thetamax)
+    >>> print(grid)  # doctest: +NORMALIZE_WHITESPACE
+    [([-1.57079633, -1.04719755], [ 0.        ,  3.14159265])
+     ([-1.57079633, -1.04719755], [ 3.14159265,  6.28318531])
+     ([-1.04719755, -0.52359878], [ 0.        ,  3.14159265])
+     ([-1.04719755, -0.52359878], [ 3.14159265,  6.28318531])
+     ([-0.52359878,  0.        ], [ 0.        ,  1.25663706])
+     ([-0.52359878,  0.        ], [ 1.25663706,  2.51327412])
+     ([-0.52359878,  0.        ], [ 2.51327412,  3.76991118])
+     ([-0.52359878,  0.        ], [ 3.76991118,  5.02654825])
+     ([-0.52359878,  0.        ], [ 5.02654825,  6.28318531])
+     ([ 0.        ,  0.52359878], [ 0.        ,  1.25663706])
+     ([ 0.        ,  0.52359878], [ 1.25663706,  2.51327412])
+     ([ 0.        ,  0.52359878], [ 2.51327412,  3.76991118])
+     ([ 0.        ,  0.52359878], [ 3.76991118,  5.02654825])
+     ([ 0.        ,  0.52359878], [ 5.02654825,  6.28318531])
+     ([ 0.52359878,  1.04719755], [ 0.        ,  3.14159265])
+     ([ 0.52359878,  1.04719755], [ 3.14159265,  6.28318531])
+     ([ 1.04719755,  1.57079633], [ 0.        ,  3.14159265])
+     ([ 1.04719755,  1.57079633], [ 3.14159265,  6.28318531])]
+    >>> grid = gridlink_sphere(60, dec_refine_factor=3, ra_refine_factor=2)
+    >>> print(grid)  # doctest: +NORMALIZE_WHITESPACE
+    [([-1.57079633, -1.22173048], [ 0.        ,  1.57079633])
+     ([-1.57079633, -1.22173048], [ 1.57079633,  3.14159265])
+     ([-1.57079633, -1.22173048], [ 3.14159265,  4.71238898])
+     ([-1.57079633, -1.22173048], [ 4.71238898,  6.28318531])
+     ([-1.22173048, -0.87266463], [ 0.        ,  1.57079633])
+     ([-1.22173048, -0.87266463], [ 1.57079633,  3.14159265])
+     ([-1.22173048, -0.87266463], [ 3.14159265,  4.71238898])
+     ([-1.22173048, -0.87266463], [ 4.71238898,  6.28318531])
+     ([-0.87266463, -0.52359878], [ 0.        ,  1.57079633])
+     ([-0.87266463, -0.52359878], [ 1.57079633,  3.14159265])
+     ([-0.87266463, -0.52359878], [ 3.14159265,  4.71238898])
+     ([-0.87266463, -0.52359878], [ 4.71238898,  6.28318531])
+     ([-0.52359878, -0.17453293], [ 0.        ,  1.57079633])
+     ([-0.52359878, -0.17453293], [ 1.57079633,  3.14159265])
+     ([-0.52359878, -0.17453293], [ 3.14159265,  4.71238898])
+     ([-0.52359878, -0.17453293], [ 4.71238898,  6.28318531])
+     ([-0.17453293,  0.17453293], [ 0.        ,  1.57079633])
+     ([-0.17453293,  0.17453293], [ 1.57079633,  3.14159265])
+     ([-0.17453293,  0.17453293], [ 3.14159265,  4.71238898])
+     ([-0.17453293,  0.17453293], [ 4.71238898,  6.28318531])
+     ([ 0.17453293,  0.52359878], [ 0.        ,  1.57079633])
+     ([ 0.17453293,  0.52359878], [ 1.57079633,  3.14159265])
+     ([ 0.17453293,  0.52359878], [ 3.14159265,  4.71238898])
+     ([ 0.17453293,  0.52359878], [ 4.71238898,  6.28318531])
+     ([ 0.52359878,  0.87266463], [ 0.        ,  1.57079633])
+     ([ 0.52359878,  0.87266463], [ 1.57079633,  3.14159265])
+     ([ 0.52359878,  0.87266463], [ 3.14159265,  4.71238898])
+     ([ 0.52359878,  0.87266463], [ 4.71238898,  6.28318531])
+     ([ 0.87266463,  1.22173048], [ 0.        ,  1.57079633])
+     ([ 0.87266463,  1.22173048], [ 1.57079633,  3.14159265])
+     ([ 0.87266463,  1.22173048], [ 3.14159265,  4.71238898])
+     ([ 0.87266463,  1.22173048], [ 4.71238898,  6.28318531])
+     ([ 1.22173048,  1.57079633], [ 0.        ,  1.57079633])
+     ([ 1.22173048,  1.57079633], [ 1.57079633,  3.14159265])
+     ([ 1.22173048,  1.57079633], [ 3.14159265,  4.71238898])
+     ([ 1.22173048,  1.57079633], [ 4.71238898,  6.28318531])]
 
     """
 
diff --git a/README.rst b/README.rst
index 9f712f64..f7287df6 100644
--- a/README.rst
+++ b/README.rst
@@ -350,7 +350,6 @@ with the code, including using it in commercial application.
 Project URL
 ===========
 
--  website (https://manodeep.github.io/Corrfunc/)
 -  documentation (http://corrfunc.rtfd.io/)   
 -  version control (https://github.com/manodeep/Corrfunc)
 
diff --git a/common.mk b/common.mk
index e1f93d8c..455e2187 100644
--- a/common.mk
+++ b/common.mk
@@ -40,8 +40,8 @@ OPT += -DUSE_OMP
 ### You should NOT edit below this line
 DISTNAME:=Corrfunc
 MAJOR:=2
-MINOR:=0
-PATCHLEVEL:=1
+MINOR:=1
+PATCHLEVEL:=0
 VERSION:=$(MAJOR).$(MINOR).$(PATCHLEVEL)
 ABI_COMPAT_VERSION:=$(MAJOR).0
 # Whenever conda needs to be checked again
@@ -212,7 +212,7 @@ ifeq ($(DO_CHECKS), 1)
     CFLAGS += -Werror -Wno-unknown-warning-option
   endif
 
-  GSL_FOUND := $(shell gsl-config --version)
+  GSL_FOUND := $(shell gsl-config --version 2>/dev/null)
   ifndef GSL_FOUND
     $(error $(ccred)Error:$(ccreset) GSL not found in path - please install GSL before installing $(DISTNAME).$(VERSION) $(ccreset))
   endif
diff --git a/docs/source/all-interfaces.rst b/docs/source/all-interfaces.rst
index 418abb29..dd81782a 100644
--- a/docs/source/all-interfaces.rst
+++ b/docs/source/all-interfaces.rst
@@ -28,7 +28,8 @@ associated with each type of clustering statistic:
 Clustering Statistic      Python Interface                  Static library                            Command-line  (executable name)
 ======================    ================================  ========================================  ====================================
 :math:`\xi(r)`            :py:mod:`Corrfunc.theory.DD`       ``theory/DD/libcountpairs.a``            ``theory/DD/DD``             
-:math:`\xi(r_p,\pi)`      :py:mod:`Corrfunc.theory.DDrppi`   ``theory/DDrppi/libcountpairs_rp_pi.a``   ``theory/DDrppi/DDrppi``        
+:math:`\xi(r_p,\pi)`      :py:mod:`Corrfunc.theory.DDrppi`   ``theory/DDrppi/libcountpairs_rp_pi.a``  ``theory/DDrppi/DDrppi``        
+:math:`\xi(s,\mu)`        :py:mod:`Corrfunc.theory.DDsmu`    ``theory/DDsmu/libcountpairs_s_mu.a``    ``theory/DDsmu/DDsmu``        
 :math:`w_p(r_p)`          :py:mod:`Corrfunc.theory.wp`       ``theory/wp/libcountpairs_wp.a``         ``theory/wp/wp``         
 :math:`\xi(r)`            :py:mod:`Corrfunc.theory.xi`       ``theory/xi/libcountpairs_xi.a``         ``theory/xi/xi``         
 :math:`pN(n)`             :py:mod:`Corrfunc.theory.vpf`      ``theory/vpf/libcountspheres.a``         ``theory/vpf/vpf``       
@@ -44,6 +45,7 @@ command-line executables are:
 Clustering Statistic     Python Interface                        Static library                                           Command-line (executable name)
 ======================   ======================================  =====================================================    =====================================
 :math:`\xi(r_p,\pi)`     :py:mod:`Corrfunc.mocks.DDrppi_mocks`    ``mocks/DDrppi_mocks/libcountpairs_rp_pi_mocks.a``      ``mocks/DDrppi_mocks/DDrppi_mocks``  
+:math:`\xi(s,\mu)`       :py:mod:`Corrfunc.mocks.DDsmu_mocks`     ``mocks/DDsmu_mocks/libcountpairs_s_mu_mocks.a``        ``mocks/DDsmu_mocks/DDsmu_mocks``  
 :math:`\omega(\theta)`   :py:mod:`Corrfunc.mocks.DDtheta_mocks`   ``mocks/DDtheta_mocks/libcountpairs_theta_mocks.a``     ``mocks/DDtheta_mocks/DDtheta_mocks``
 :math:`pN(n)`            :py:mod:`Corrfunc.mocks.vpf_mocks`       ``mocks/vpf_mocks/libcountspheres_mocks``               ``mocks/vpf_mocks/vpf_mocks``        
 ======================   ======================================  =====================================================    =====================================
diff --git a/docs/source/api/Corrfunc.mocks.rst b/docs/source/api/Corrfunc.mocks.rst
index e8a40caa..633c5cf0 100644
--- a/docs/source/api/Corrfunc.mocks.rst
+++ b/docs/source/api/Corrfunc.mocks.rst
@@ -1,5 +1,5 @@
-Corrfunc.mocks package
-======================
+Corrfunc\.mocks package
+=======================
 
 .. automodule:: Corrfunc.mocks
     :members:
@@ -9,24 +9,32 @@ Corrfunc.mocks package
 Submodules
 ----------
 
-Corrfunc.mocks.DDrppi_mocks module
-----------------------------------
+Corrfunc\.mocks\.DDrppi\_mocks module
+-------------------------------------
 
 .. automodule:: Corrfunc.mocks.DDrppi_mocks
     :members:
     :undoc-members:
     :show-inheritance:
 
-Corrfunc.mocks.DDtheta_mocks module
------------------------------------
+Corrfunc\.mocks\.DDsmu\_mocks module
+------------------------------------
+
+.. automodule:: Corrfunc.mocks.DDsmu_mocks
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+Corrfunc\.mocks\.DDtheta\_mocks module
+--------------------------------------
 
 .. automodule:: Corrfunc.mocks.DDtheta_mocks
     :members:
     :undoc-members:
     :show-inheritance:
 
-Corrfunc.mocks.vpf_mocks module
--------------------------------
+Corrfunc\.mocks\.vpf\_mocks module
+----------------------------------
 
 .. automodule:: Corrfunc.mocks.vpf_mocks
     :members:
diff --git a/docs/source/api/Corrfunc.rst b/docs/source/api/Corrfunc.rst
index c75f401d..818832da 100644
--- a/docs/source/api/Corrfunc.rst
+++ b/docs/source/api/Corrfunc.rst
@@ -17,40 +17,40 @@ Subpackages
 Submodules
 ----------
 
-Corrfunc.call_correlation_functions module
-------------------------------------------
+Corrfunc\.call\_correlation\_functions module
+---------------------------------------------
 
 .. automodule:: Corrfunc.call_correlation_functions
     :members:
     :undoc-members:
     :show-inheritance:
 
-Corrfunc.call_correlation_functions_mocks module
-------------------------------------------------
+Corrfunc\.call\_correlation\_functions\_mocks module
+----------------------------------------------------
 
 .. automodule:: Corrfunc.call_correlation_functions_mocks
     :members:
     :undoc-members:
     :show-inheritance:
 
-Corrfunc.io module
-------------------
+Corrfunc\.io module
+-------------------
 
 .. automodule:: Corrfunc.io
     :members:
     :undoc-members:
     :show-inheritance:
 
-Corrfunc.tests module
----------------------
+Corrfunc\.tests module
+----------------------
 
 .. automodule:: Corrfunc.tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-Corrfunc.utils module
----------------------
+Corrfunc\.utils module
+----------------------
 
 .. automodule:: Corrfunc.utils
     :members:
diff --git a/docs/source/api/Corrfunc.theory.rst b/docs/source/api/Corrfunc.theory.rst
index 445767b7..cef35db5 100644
--- a/docs/source/api/Corrfunc.theory.rst
+++ b/docs/source/api/Corrfunc.theory.rst
@@ -1,5 +1,5 @@
-Corrfunc.theory package
-=======================
+Corrfunc\.theory package
+========================
 
 .. automodule:: Corrfunc.theory
     :members:
@@ -9,40 +9,48 @@ Corrfunc.theory package
 Submodules
 ----------
 
-Corrfunc.theory.DD module
--------------------------
+Corrfunc\.theory\.DD module
+---------------------------
 
 .. automodule:: Corrfunc.theory.DD
     :members:
     :undoc-members:
     :show-inheritance:
 
-Corrfunc.theory.DDrppi module
------------------------------
+Corrfunc\.theory\.DDrppi module
+-------------------------------
 
 .. automodule:: Corrfunc.theory.DDrppi
     :members:
     :undoc-members:
     :show-inheritance:
 
-Corrfunc.theory.vpf module
---------------------------
+Corrfunc\.theory\.DDsmu module
+------------------------------
+
+.. automodule:: Corrfunc.theory.DDsmu
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+Corrfunc\.theory\.vpf module
+----------------------------
 
 .. automodule:: Corrfunc.theory.vpf
     :members:
     :undoc-members:
     :show-inheritance:
 
-Corrfunc.theory.wp module
--------------------------
+Corrfunc\.theory\.wp module
+---------------------------
 
 .. automodule:: Corrfunc.theory.wp
     :members:
     :undoc-members:
     :show-inheritance:
 
-Corrfunc.theory.xi module
--------------------------
+Corrfunc\.theory\.xi module
+---------------------------
 
 .. automodule:: Corrfunc.theory.xi
     :members:
diff --git a/docs/source/modules/converting_3d_counts.rst b/docs/source/modules/converting_3d_counts.rst
index b9a20420..23695d49 100644
--- a/docs/source/modules/converting_3d_counts.rst
+++ b/docs/source/modules/converting_3d_counts.rst
@@ -18,6 +18,7 @@ wrapper :py:mod:`Corrfunc.theory.DD`
 
           >>> # Read the supplied galaxies on a periodic box
           >>> X, Y, Z = read_catalog()
+          >>> N = len(X)
           >>> boxsize = 420.0
           >>> nthreads = 2
 
diff --git a/docs/source/modules/converting_ddtheta_mocks.rst b/docs/source/modules/converting_ddtheta_mocks.rst
index 9e60abad..4dd4f559 100644
--- a/docs/source/modules/converting_ddtheta_mocks.rst
+++ b/docs/source/modules/converting_ddtheta_mocks.rst
@@ -12,6 +12,7 @@ wrapper :py:mod:`Corrfunc.mocks.DDtheta_mocks`
 .. code-block:: python
 
           >>> from os.path import dirname, abspath, join as pjoin
+          >>> import numpy as np
           >>> import Corrfunc
           >>> from Corrfunc.mocks.DDtheta_mocks import DDtheta_mocks
           >>> from Corrfunc.io import read_catalog
@@ -28,11 +29,15 @@ wrapper :py:mod:`Corrfunc.mocks.DDtheta_mocks`
           >>> random_catalog=pjoin(dirname(abspath(Corrfunc.__file__)),
           ...                     "../mocks/tests/data", "Mr19_randoms_northonly.rdcz.ff")
           >>> rand_RA, rand_DEC, _ = read_catalog(random_catalog)
-          
+          >>> rand_N = len(rand_RA)
+
           # Setup the bins
           >>> nbins = 10
           >>> bins = np.linspace(0.1, 10.0, nbins + 1) # note the +1 to nbins
 
+          # Number of threads to use
+          >>> nthreads = 2
+          
           # Auto pair counts in DD
           >>> autocorr=1
           >>> DD_counts = DDtheta_mocks(autocorr, nthreads, bins,
diff --git a/docs/source/modules/converting_rp_pi_counts.rst b/docs/source/modules/converting_rp_pi_counts.rst
index 77cc492d..0450e597 100644
--- a/docs/source/modules/converting_rp_pi_counts.rst
+++ b/docs/source/modules/converting_rp_pi_counts.rst
@@ -8,12 +8,14 @@ by using the helper function :py:mod:`Corrfunc.utils.convert_rp_pi_counts_to_wp`
 
 .. code-block:: python
 
+          >>> import numpy as np
           >>> from Corrfunc.theory import DDrppi
           >>> from Corrfunc.io import read_catalog
           >>> from Corrfunc.utils import convert_rp_pi_counts_to_wp
           
           # Read the supplied galaxies on a periodic box          
           >>> X, Y, Z = read_catalog()
+          >>> N = len(X)
           >>> boxsize = 420.0
 
           # Generate randoms on the box          
diff --git a/docs/source/modules/mock_functions.rst b/docs/source/modules/mock_functions.rst
index 8307f9ba..443fce49 100644
--- a/docs/source/modules/mock_functions.rst
+++ b/docs/source/modules/mock_functions.rst
@@ -11,6 +11,7 @@ Clustering in 2-D
 -------------------
 
 * Pair counts (auto or cross) correlations for :math:`\xi(rp, \pi)` -- :py:mod:`Corrfunc.mocks.DDrppi_mocks`
+* Pair counts (auto or cross) correlations for :math:`\xi(s, \mu)` -- :py:mod:`Corrfunc.mocks.DDsmu_mocks`     
 
 Angular clustering
 --------------------
diff --git a/docs/source/modules/theory_functions.rst b/docs/source/modules/theory_functions.rst
index 05ca039a..b08efc94 100644
--- a/docs/source/modules/theory_functions.rst
+++ b/docs/source/modules/theory_functions.rst
@@ -16,7 +16,8 @@ Clustering in 3-D
 Clustering in 2-D
 ------------------
 
-* Pair counts (auto or cross) correlations for :math:`\xi(rp, \pi)` -- :py:mod:`Corrfunc.theory.DDrppi`
+* Pair counts (auto or cross) correlations for :math:`\xi(rp, \pi)` --  :py:mod:`Corrfunc.theory.DDrppi`
+* Pair counts (auto or cross) correlations for :math:`\xi(s, \mu)` -- :py:mod:`Corrfunc.theory.DDsmu`     
 * Projected auto-correlation function, :math:`wp(rp)` --  :py:mod:`Corrfunc.theory.wp`
 
 Counts-in-cells
diff --git a/docs/source/modules/which_corrfunc.rst b/docs/source/modules/which_corrfunc.rst
index 8a710e51..3ce2a6dd 100644
--- a/docs/source/modules/which_corrfunc.rst
+++ b/docs/source/modules/which_corrfunc.rst
@@ -20,8 +20,12 @@ type of data, **and** the desired correlation function you want, the following t
 | X, Y, Z           | True or False | Arbitrary       | :math:`\xi(r)`                          | Pair-counts in 3-D real-space |:py:mod:`Corrfunc.theory.DD`           |
 |                   |               |                 +-----------------------------------------+-------------------------------+---------------------------------------+
 |                   |               |                 | :math:`\xi(r_p, \pi)`                   | Pair-counts in 2-D            |:py:mod:`Corrfunc.theory.DDrppi`       |
+|                   |               |                 +-----------------------------------------+-------------------------------+---------------------------------------+
+|                   |               |                 | :math:`\xi(s, \mu)`                     | Pair-counts in 2-D            |:py:mod:`Corrfunc.theory.DDsmu`        |
 +-------------------+---------------+-----------------+-----------------------------------------+-------------------------------+---------------------------------------+
 | ra, dec, cz       | False         | Arbitrary       | :math:`\xi(r_p, \pi)`                   | Pair-counts in 2-D            |:py:mod:`Corrfunc.mocks.DDrppi_mocks`  |
+|                   |               |                 +-----------------------------------------+-------------------------------+---------------------------------------+
+|                   |               |                 | :math:`\xi(s, \mu)`                     | Pair-counts in 2-D            |:py:mod:`Corrfunc.mocks.DDsmu_mocks`   |
 +-------------------+---------------+-----------------+-----------------------------------------+-------------------------------+---------------------------------------+
 | ra, dec           | False         | Arbitrary       | :math:`\omega(\theta)`                  | Pair-counts in angular space  |:py:mod:`Corrfunc.mocks.DDtheta_mocks` |
 +-------------------+---------------+-----------------+-----------------------------------------+-------------------------------+---------------------------------------+
diff --git a/docs/source/python-interface.rst b/docs/source/python-interface.rst
index 1ef04079..8e0a28b2 100644
--- a/docs/source/python-interface.rst
+++ b/docs/source/python-interface.rst
@@ -104,6 +104,13 @@ clustering functions:
           
                # Specify the distance to integrate along line of sight
                >>> pimax = 40.0
+
+               # Specify the max. of the cosine of the angle to the LOS for
+               # DD(s, mu) 
+               >>> mu_max = 1.0
+
+               # Specify the number of linear bins in `mu`
+               >>> nmu_bins = 20
                
                # Specify that an autocorrelation is wanted
                >>> autocorr = 1
@@ -120,6 +127,8 @@ clustering functions:
           nbins = 20
           rbins = np.logspace(np.log10(rmin), np.log10(rmax), nbins + 1)
           pimax = 40.0
+          mu_max = 1.0
+          nmu_bins = 20
           autocorr = 1
 
           
@@ -169,6 +178,7 @@ bin is ``0.0`` for an autocorrelation, then the self-pairs *will* be counted.
 
 Calculating 2-D pair-counts (``Corrfunc.theory.DDrppi``)
 --------------------------------------------------------
+
 Corrfunc can return the pair counts in 2-D real-space for a set of arrays. The
 calculation can be either auto or cross-correlation, *and* with or without periodic
 boundaries. The projected separation, :math:`r_p` is calculated in the X-Y plane while the
@@ -182,6 +192,31 @@ bin is ``0.0`` for an autocorrelation, then the self-pairs *will* be counted.
           from Corrfunc.theory.DDrppi import DDrppi
           results_DDrppi = DDrppi(autocorr, nthreads, pimax, rbins, X, Y, Z, boxsize=boxsize)
 
+Calculating 2-D pair-counts (``Corrfunc.theory.DDsmu``)
+--------------------------------------------------------
+
+Corrfunc can return the pair counts in 2-D real-space for a set of arrays. The
+calculation can be either auto or cross-correlation, *and* with or without periodic
+boundaries. The spatial separation, :math:`s` is calculated in 3-D while 
+:math:`\mu` is the cosine of the angle to the line-of-sight and is calculated
+assuming that the Z-axis is the line-of-sight.
+
+.. math::
+   
+   \mathbf{s}  &= \mathbf{v_1} - \mathbf{v_2}, \\
+   {\mu} &= \frac{\left(z_1 - z_2 \right)}{\Vert\mathbf{s}\Vert}
+
+where, :math:`\mathbf{v_1}:=(x_1, y_1, z_1)` and :math:`\mathbf{v_2}:=(x_2, y_2, z_2)` are the vectors for the
+two points under consideration, and, :math:`\Vert\mathbf{s}\Vert=\sqrt{(x_1 - x_2)^2 + (y_1 - y_2)^2 + (z_1 - z_2)^2}`
+
+The pairs are always double-counted. Additionally, if the smallest
+bin is ``0.0`` for an autocorrelation, then the self-pairs *will* be counted.
+
+.. testcode:: theory
+
+          from Corrfunc.theory.DDsmu import DDsmu
+          results_DDsmu = DDsmu(autocorr, nthreads, rbins, mu_max, nmu_bins, X, Y, Z, boxsize=boxsize)
+              
 
 Calculating the Counts-in-Cells (``Corrfunc.theory.vpf``)
 ---------------------------------------------------------
@@ -249,6 +284,13 @@ sets up the default arrays and parameters for the actual clustering calculations
           # Specify the distance to integrate along line of sight
           pimax = 40.0
 
+          # Specify the max. of the cosine of the angle to the LOS
+          # for DD(s, mu)
+          mu_max = 1.0
+
+          # Specify the number of linear bins in `mu`
+          nmu_bins = 20
+
           # Specify that an autocorrelation is wanted
           autocorr = 1
 
@@ -270,10 +312,11 @@ sets up the default arrays and parameters for the actual clustering calculations
           nbins = 20
           rbins = np.logspace(np.log10(rmin), np.log10(rmax), nbins + 1)
           pimax = 40.0
+          mu_max = 1.0
+          nmu_bins = 20
           autocorr = 1
 
           
-
 Calculating 2-D pair counts (``Corrfunc.mocks.DDrppi_mocks``)
 -------------------------------------------------------------
 Corrfunc can calculate pair counts for mock catalogs. The input positions are
@@ -291,19 +334,46 @@ equations from `Zehavi et al. 2002 <http://adsabs.harvard.edu/abs/2002ApJ...571.
    \mathbf{l} &= \frac{1}{2}\left(\mathbf{v_1} + \mathbf{v_2}\right), \\
    \pi &= \left(\mathbf{s} \cdot \mathbf{l}\right)/\Vert\mathbf{l}\Vert, \\
    r_p^2 &= \mathbf{s} \cdot \mathbf{s} - \pi^2
-   
-where, :math:`\mathbf{v_1}` and :math:`\mathbf{v_2}` are the vectors for the
-two points under consideration. 
-   
+
+where, :math:`\mathbf{v_1}:=(x_1, y_1, z_1)` and :math:`\mathbf{v_2}:=(x_2, y_2, z_2)` are the vectors for the
+two points under consideration, and, :math:`\Vert\mathbf{s}\Vert=\sqrt{(x_1 - x_2)^2 + (y_1 - y_2)^2 + (z_1 - z_2)^2}`.
+
 Here is the python code to call ``Corrfunc.mocks.DDrppi_mocks``:
 
 .. testcode:: mocks
 
           from Corrfunc.mocks.DDrppi_mocks import DDrppi_mocks
-          results_DDrppi_mocks = DDrppi_mocks(autocorr, cosmology, nthreads,
-          pimax, rbins, RA, DEC, CZ)
+          results_DDrppi_mocks = DDrppi_mocks(autocorr, cosmology, nthreads, pimax, rbins, RA, DEC, CZ)
   
 
+Calculating 2-D pair counts (``Corrfunc.mocks.DDsmu_mocks``)
+-------------------------------------------------------------
+Corrfunc can calculate pair counts for mock catalogs. The input positions are
+expected to be ``Right Ascension``, ``Declination`` and ``CZ`` (speed of light
+times redshift, in ``Mpc/h``). Cosmology has to be specified since ``CZ`` needs
+to be converted into co-moving distance. If you want to calculate in arbitrary
+cosmology, then convert ``CZ`` into co-moving distance, and then pass the
+converted array while setting the option ``is_comoving_dist=True``. The
+separation, :math:`s`, and the cosine of the angle to the line of sight, :math:`\mu`, are calculated using the following
+equations from `Zehavi et al. 2002 <http://adsabs.harvard.edu/abs/2002ApJ...571..172Z>`_
+
+.. math::
+   
+   \mathbf{s} &= \mathbf{v_1} - \mathbf{v_2}, \\
+   \mathbf{l} &= \frac{1}{2}\left(\mathbf{v_1} + \mathbf{v_2}\right), \\
+   \mu &= \left(\mathbf{s} \cdot \mathbf{l}\right)/\left(\Vert\mathbf{l}\Vert \Vert\mathbf{s}\Vert \right)
+   
+where, :math:`\mathbf{v_1}:=(x_1, y_1, z_1)` and :math:`\mathbf{v_2}:=(x_2, y_2, z_2)` are the vectors for the
+two points under consideration, and, :math:`\Vert\mathbf{s}\Vert=\sqrt{(x_1 - x_2)^2 + (y_1 - y_2)^2 + (z_1 - z_2)^2}`
+   
+Here is the python code to call ``Corrfunc.mocks.DDsmu_mocks``:
+
+.. testcode:: mocks
+
+          from Corrfunc.mocks.DDsmu_mocks import DDsmu_mocks
+          results_DDsmu_mocks = DDsmu_mocks(autocorr, cosmology, nthreads, mu_max, nmu_bins, rbins, RA, DEC, CZ)
+  
+              
 Calculating angular pair-counts (``Corrfunc.mocks.DDtheta_mocks``)
 -------------------------------------------------------------------
 Corrfunc can compute angular pair counts for mock catalogs. The input positions
diff --git a/index.md b/index.md
index 6d5fe276..c568c70a 100644
--- a/index.md
+++ b/index.md
@@ -337,10 +337,8 @@ with the code including using it in commercial application.
 
 Project URL
 ===========
-
--  website (https://manodeep.github.io/Corrfunc/)
+-  website & version control (https://github.com/manodeep/Corrfunc)
 -  documentation (http://corrfunc.rtfd.io/)   
--  version control (https://github.com/manodeep/Corrfunc)
 
 .. |Release| image:: https://img.shields.io/github/release/manodeep/Corrfunc.svg
    :target: https://github.com/manodeep/Corrfunc/releases/latest
diff --git a/meta.yaml b/meta.yaml
index 22131649..d66332a5 100644
--- a/meta.yaml
+++ b/meta.yaml
@@ -48,7 +48,7 @@ test:
     - python -c "import Corrfunc.call_correlation_functions_mocks as m; m.main()"
 
 about:
-  home: http://manodeep.github.io/Corrfunc/
+  home: https://github.com/manodeep/Corrfunc
   license: MIT
   license_file: LICENSE
   summary: Blazing fast correlation functions on the CPU
diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_kernels.c.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_kernels.c.src
index 99bddc12..dde05f57 100644
--- a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_kernels.c.src
+++ b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_kernels.c.src
@@ -372,8 +372,8 @@ static inline int countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE(const int64_t N0,
                 pair.pary.d = pary;
                 pair.parz.d = parz;
 
-                pairweight = fallback_weight_func(&pair);
-            }
+                pairweight = fallback_weight_func(&pair); 
+           }
 
             for(int kbin=nbin-1;kbin>=1;kbin--) {
                 if(sqr_Dperp >= rupp_sqr[kbin-1]) {
@@ -705,8 +705,7 @@ static inline int countpairs_rp_pi_mocks_sse_intrinsics_DOUBLE(const int64_t N0,
             const DOUBLE sqr_s = perpx*perpx + perpy*perpy + perpz*perpz;
             if(sqr_s >= sqr_max_sep) continue;
 
-            /* const DOUBLE dot_product  = (parx*perpx+pary*perpy+parz*perpz); */
-            const DOUBLE norm = (parx*parx+pary*pary+parz*parz);
+            const DOUBLE norm = (parx*parx + pary*pary + parz*parz);
             const DOUBLE tmp = dot_product * dot_product;
             if(tmp >= sqr_pimax * norm) continue;
             const DOUBLE sqr_Dpar = (dot_product * dot_product)/norm;
@@ -724,6 +723,10 @@ static inline int countpairs_rp_pi_mocks_sse_intrinsics_DOUBLE(const int64_t N0,
                 pair.dy.d = perpy;
                 pair.dz.d = perpz;
 
+                pair.parx.d = parx;
+                pair.pary.d = pary;
+                pair.parz.d = parz;
+
                 pairweight = fallback_weight_func(&pair);
             }
 
@@ -895,9 +898,9 @@ static inline int countpairs_rp_pi_mocks_fallback_DOUBLE(const int64_t N0, DOUBL
                 pair.dy.d = perpy;
                 pair.dz.d = perpz;
                 
-                pair.dx.d = parx;
-                pair.dy.d = pary;
-                pair.dz.d = parz;
+                pair.parx.d = parx;
+                pair.pary.d = pary;
+                pair.parz.d = parz;
                 
                 pairweight = weight_func(&pair);
             }
diff --git a/mocks/DDsmu_mocks/DDsmu_mocks.c b/mocks/DDsmu_mocks/DDsmu_mocks.c
new file mode 100644
index 00000000..d24cb00b
--- /dev/null
+++ b/mocks/DDsmu_mocks/DDsmu_mocks.c
@@ -0,0 +1,335 @@
+/* File: DDsmu_mocks.c */
+/*
+  This file is a part of the Corrfunc package
+  Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com)
+  License: MIT LICENSE. See LICENSE file under the top-level
+  directory at https://github.com/manodeep/Corrfunc/
+*/
+
+/* PROGRAM DDsmu
+
+--- DDsmu file1 format1 file2 format2 sbinfile mu_max nmu_bins cosmology numthreads [weight_method weights_file1 weights_format1 [weights_file2 weights_format2]] > DDfile
+--- Measure the cross-correlation function xi(s,mu) for two different
+   data files (or autocorrelation if file1=file2).
+ * file1         = name of first data file
+ * format1       = format of first data file  (a=ascii, c=csv, f=fast-food)
+ * file2         = name of second data file
+ * format2       = format of second data file (a=ascii, c=csv, f=fast-food)
+ * sbinfile      = name of ascii file containing the s-bins (smin smax for each bin)
+ * mu_max        = maximum mu value (>0 and <= 1.0)
+ * nmu_bins      = number of mu bins
+ * cosmology     = flag to pick-up the cosmology combination to use (set as an array of combinations in ../utils/cosmology_params.c)
+ * numthreads    = number of threads to use
+--- OPTIONAL ARGS:
+ * weight_method = the type of pair weighting to apply.  Options are: 'pair_product', 'none'.  Default: 'none'.
+ * weights_file1 = name of file containing the weights corresponding to the first data file
+ * weights_format1 = format of file containing the weights corresponding to the first data file
+ * weights_file2 = name of file containing the weights corresponding to the second data file
+ * weights_format2 = format of file containing the weights corresponding to the second data file
+---OUTPUT:
+ > DDfile        = name of output file. Contains <npairs savg logs mu weightavg>
+
+*/
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <inttypes.h>
+
+#include "defs.h" //for basic API + all macros
+#include "function_precision.h" //definition of DOUBLE
+#include "countpairs_s_mu_mocks.h" //function proto-type for countpairs
+#include "io.h" //function proto-type for file input
+#include "utils.h" //general utilities
+
+
+void Printhelp(void);
+
+int main(int argc, char *argv[])
+{
+    /* Command-line driver: parses the arguments, reads the catalogue(s) and the optional weights,
+       calls countpairs_mocks_s_mu() and prints one line per (s-bin, mu-bin) to stdout */
+    /*---Arguments-------------------------*/
+    char *file1=NULL,*file2=NULL, *weights_file1=NULL,*weights_file2=NULL;
+    char *fileformat1=NULL,*fileformat2=NULL, *weights_fileformat1=NULL,*weights_fileformat2=NULL;
+    char *sbinfile=NULL;
+    char *weight_method_str=NULL;
+    int nmu_bins;
+    DOUBLE mu_max;
+
+    weight_method_t weight_method = NONE;
+    int num_weights = 0;
+
+    /*---Data-variables--------------------*/
+    int64_t ND1,ND2 ;
+
+    DOUBLE *thetaD1,*phiD1,*czD1, *weights1[MAX_NUM_WEIGHTS]={NULL};
+    DOUBLE *thetaD2,*phiD2,*czD2, *weights2[MAX_NUM_WEIGHTS]={NULL};
+
+    struct timeval t_end,t_start,t0,t1;
+    double read_time=0.0;
+    gettimeofday(&t_start,NULL);
+    int nthreads=1;
+
+    /*---Corrfunc-variables----------------*/
+    /* Same guard as the numthreads parsing below: numthreads is only required when it is actually read */
+#if defined(USE_OMP) && defined(_OPENMP)
+    const char argnames[][30]={"file1","format1","file2","format2","sbinfile","mu_max","nmu_bins","cosmology flag","numthreads"};
+#else
+    const char argnames[][30]={"file1","format1","file2","format2","sbinfile","mu_max","nmu_bins","cosmology flag"};
+#endif
+    const char optargnames[][30]={"weight_method", "weights_file1","weights_format1","weights_file2","weights_format2"};
+
+    int nargs=sizeof(argnames)/(sizeof(char)*30);
+    int noptargs=sizeof(optargnames)/(sizeof(char)*30);
+
+    int cosmology=1;
+
+    /*---Read-arguments-----------------------------------*/
+    if(argc< (nargs+1)) {
+        Printhelp() ;
+        fprintf(stderr,"\nFound: %d parameters\n ",argc-1);
+        int i;
+        for(i=1;i<argc;i++) {
+            if(i <= nargs)
+                fprintf(stderr,"\t\t %s = `%s' \n",argnames[i-1],argv[i]);
+            else if(i <= nargs + noptargs)
+                fprintf(stderr,"\t\t %s = `%s' \n",optargnames[i-1-nargs],argv[i]);
+            else
+                fprintf(stderr,"\t\t <> = `%s' \n",argv[i]);
+        }
+        fprintf(stderr,"\nMissing required parameters \n");
+        for(i=argc;i<=nargs;i++)
+            fprintf(stderr,"\t\t %s = `?'\n",argnames[i-1]);
+        return EXIT_FAILURE;
+    }
+
+    /* Validate optional arguments */
+    int noptargs_given = argc - (nargs + 1);
+    if(noptargs_given != 0 && noptargs_given != 3 && noptargs_given != 5){
+        Printhelp();
+        fprintf(stderr,"\nFound: %d optional arguments; must be 0 (no weights), 3 (for one set of weights) or 5 (for two sets)\n ", noptargs_given);
+        int i;
+        for(i=nargs+1;i<argc;i++) {
+            if(i <= nargs + noptargs)
+                fprintf(stderr,"\t\t %s = `%s' \n",optargnames[i-nargs-1],argv[i]);
+            else
+                fprintf(stderr,"\t\t <> = `%s' \n",argv[i]);
+        }
+        return EXIT_FAILURE;
+    }
+
+    file1=argv[1];
+    fileformat1=argv[2];
+    file2=argv[3];
+    fileformat2=argv[4];
+    sbinfile=argv[5];
+
+    mu_max=1.0;//kept if the sscanf below converts nothing
+    sscanf(argv[6],"%"REAL_FORMAT,&mu_max) ;
+    nmu_bins=-10;//stays negative if the sscanf below converts nothing
+    sscanf(argv[7],"%d",&nmu_bins) ;
+    cosmology = atoi(argv[8]);
+
+#if defined(USE_OMP) && defined(_OPENMP)
+    nthreads=atoi(argv[9]);
+    assert(nthreads >= 1 && "Number of threads must be at least 1");
+#endif
+
+    if(noptargs_given >= 3){
+       weight_method_str = argv[nargs + 1];
+       int wstatus = get_weight_method_by_name(weight_method_str, &weight_method);
+       if(wstatus != EXIT_SUCCESS){
+         fprintf(stderr, "Error: Unknown weight method \"%s\"\n", weight_method_str);
+         return EXIT_FAILURE;
+       }
+       num_weights = get_num_weights_by_method(weight_method);
+
+       weights_file1 = argv[nargs + 2];
+       weights_fileformat1 = argv[nargs + 3];
+    }
+    if(noptargs_given >= 5){
+       weights_file2 = argv[nargs + 4];
+       weights_fileformat2 = argv[nargs + 5];
+    }
+
+    /* Identical file names select the auto-correlation: file2 is then never read */
+    int autocorr=0;
+    if(strcmp(file1,file2)==0) {
+        autocorr=1;
+    }
+
+    fprintf(stderr,"Running `%s' with the parameters \n",argv[0]);
+    fprintf(stderr,"\n\t\t -------------------------------------\n");
+    for(int i=1;i<argc;i++) {
+        if(i <= nargs) {
+            fprintf(stderr,"\t\t %-10s = %s \n",argnames[i-1],argv[i]);
+        } else if(i <= nargs + noptargs){
+            fprintf(stderr,"\t\t %-10s = %s \n",optargnames[i-nargs-1],argv[i]);
+        } else {
+            fprintf(stderr,"\t\t <> = `%s' \n",argv[i]);
+        }
+    }
+    fprintf(stderr,"\t\t -------------------------------------\n");
+
+    /*---Read-data1-file----------------------------------*/
+    gettimeofday(&t0,NULL);
+    ND1=read_positions(file1,fileformat1,sizeof(DOUBLE), 3, &phiD1, &thetaD1, &czD1);
+    gettimeofday(&t1,NULL);
+    read_time += ADD_DIFF_TIME(t0,t1);
+
+    /* Read weights file 1 */
+    if(weights_file1 != NULL){
+        gettimeofday(&t0,NULL);
+        int64_t wND1 = read_columns_into_array(weights_file1,weights_fileformat1, sizeof(DOUBLE), num_weights, (void **) weights1);
+        gettimeofday(&t1,NULL);
+        read_time += ADD_DIFF_TIME(t0,t1);
+
+        if(wND1 != ND1){
+          fprintf(stderr, "Error: read %"PRId64" lines from %s, but read %"PRId64" from %s\n", wND1, weights_file1, ND1, file1);
+          return EXIT_FAILURE;
+        }
+    }
+
+    if (autocorr==0) {
+        /*---Read-data2-file----------------------------------*/
+        gettimeofday(&t0,NULL);//restart the timer here so the weights-file-1 read is not counted twice
+        ND2=read_positions(file2,fileformat2,sizeof(DOUBLE), 3, &phiD2, &thetaD2, &czD2);
+        gettimeofday(&t1,NULL);
+        read_time += ADD_DIFF_TIME(t0,t1);
+
+        if(weights_file2 != NULL){
+            gettimeofday(&t0,NULL);
+            int64_t wND2 = read_columns_into_array(weights_file2,weights_fileformat2, sizeof(DOUBLE), num_weights, (void **) weights2);
+            gettimeofday(&t1,NULL);
+            read_time += ADD_DIFF_TIME(t0,t1);
+
+            if(wND2 != ND2){
+              fprintf(stderr, "Error: read %"PRId64" lines from %s, but read %"PRId64" from %s\n", wND2, weights_file2, ND2, file2);
+              return EXIT_FAILURE;
+            }
+        }
+    } else {
+        //None of these are required. But I prefer to preserve the possibility
+        ND2 = ND1;
+        thetaD2 = thetaD1;
+        phiD2 = phiD1;
+        czD2 = czD1;
+        for(int w = 0; w < MAX_NUM_WEIGHTS; w++){
+          weights2[w] = weights1[w];
+        }
+    }
+
+    /*---Count-pairs--------------------------------------*/
+    results_countpairs_mocks_s_mu results;
+    struct config_options options = get_config_options();
+
+    /* Pack weights into extra options */
+    struct extra_options extra = get_extra_options(weight_method);
+    for(int w = 0; w < num_weights; w++){
+        extra.weights0.weights[w] = (void *) weights1[w];
+        extra.weights1.weights[w] = (void *) weights2[w];
+    }
+
+    int status = countpairs_mocks_s_mu(ND1,phiD1,thetaD1,czD1,
+                                  ND2,phiD2,thetaD2,czD2,
+                                  nthreads,
+                                  autocorr,
+                                  sbinfile,
+                                  mu_max,
+                                  nmu_bins,
+                                  cosmology,
+                                  &results,
+                                  &options,
+                                  &extra);
+
+    /* The input arrays are released whatever the status; sample 2 aliases sample 1 for an auto-correlation */
+    free(phiD1);free(thetaD1);free(czD1);
+    for(int w = 0; w < num_weights; w++){
+        free(weights1[w]);
+    }
+    if(autocorr == 0) {
+        free(phiD2);free(thetaD2);free(czD2);
+        for(int w = 0; w < num_weights; w++){
+          free(weights2[w]);
+        }
+    }
+
+    if(status != EXIT_SUCCESS) {
+        return status;
+    }
+
+    const DOUBLE dmu = mu_max/(DOUBLE)results.nmu_bins ;
+    const int nmubin = results.nmu_bins;
+    for(int i=1;i<results.nsbin;i++) {//s-bin 0 is not printed
+        const double log_supp = LOG10(results.supp[i]);
+        for(int j=0;j<nmubin;j++) {
+            const int index = i*(nmubin+1) + j;//consecutive s-bins are (nmu_bins+1) entries apart
+            fprintf(stdout,"%10"PRIu64" %20.8lf %20.8lf  %20.8lf %20.8lf \n",results.npairs[index],results.savg[index],log_supp,(j+1)*dmu, results.weightavg[index]);
+        }
+    }
+
+    free_results_mocks_s_mu(&results);
+    gettimeofday(&t_end,NULL);
+    fprintf(stderr,"DDsmu> Done -  ND1=%"PRId64" ND2=%"PRId64". Time taken = %6.2lf seconds, read-in time = %6.2lf seconds \n",ND1,ND2,ADD_DIFF_TIME(t_start,t_end),read_time);
+    return EXIT_SUCCESS;
+}
+
+/*---Print-help-information: usage text and compile-time options, all written to stderr.
+     The argument order shown must match the argv[] parsing in main() (mu_max before nmu_bins) ---*/
+void Printhelp(void)
+{
+    fprintf(stderr,"=========================================================================\n") ;
+#if defined(USE_OMP) && defined(_OPENMP)
+    fprintf(stderr,"   --- DDsmu file1 format1 file2 format2 sbinfile mu_max nmu_bins cosmology numthreads [weight_method weights_file1 weights_format1 [weights_file2 weights_format2]] > DDfile\n") ;
+#else
+    fprintf(stderr,"   --- DDsmu file1 format1 file2 format2 sbinfile mu_max nmu_bins cosmology [weight_method weights_file1 weights_format1 [weights_file2 weights_format2]] > DDfile\n") ;
+#endif
+    fprintf(stderr,"   --- Measure the cross-correlation function xi(s,mu) for two different\n") ;
+    fprintf(stderr,"       data files (or autocorrelation if file1=file2).\n") ;
+    fprintf(stderr,"     * file1         = name of first data file\n") ;
+    fprintf(stderr,"     * format1       = format of first data file  (a=ascii, c=csv, f=fast-food)\n") ;
+    fprintf(stderr,"     * file2         = name of second data file\n") ;
+    fprintf(stderr,"     * format2       = format of second data file (a=ascii, c=csv, f=fast-food)\n") ;
+    fprintf(stderr,"     * sbinfile      = name of ascii file containing the s-bins (smin smax for each bin)\n") ;
+    fprintf(stderr,"     * mu_max        = maximum mu value (>0 and <= 1.0)\n") ;
+    fprintf(stderr,"     * nmu_bins      = number of mu bins\n") ;
+    fprintf(stderr,"     * cosmology     = flag to pick-up the cosmology combination to use (set as an array of combinations in ../utils/cosmology_params.c)\n") ;
+#if defined(USE_OMP) && defined(_OPENMP)
+    fprintf(stderr,"     * numthreads    = number of threads to use\n");
+#endif
+    fprintf(stderr,"   --- OPTIONAL ARGS:\n");
+    fprintf(stderr,"     * weight_method = the type of pair weighting to apply.  Options are: 'pair_product', 'none'.  Default: 'none'.\n");
+    fprintf(stderr,"     * weights_file1 = name of file containing the weights corresponding to the first data file\n");
+    fprintf(stderr,"     * weights_format1 = format of file containing the weights corresponding to the first data file\n");
+    fprintf(stderr,"     * weights_file2 = name of file containing the weights corresponding to the second data file\n");
+    fprintf(stderr,"     * weights_format2 = format of file containing the weights corresponding to the second data file\n");
+    fprintf(stderr,"   ---OUTPUT:\n") ;
+    fprintf(stderr,"     > DDfile        = name of output file. Contains <npairs savg logs mu weightavg>\n") ;
+
+    fprintf(stderr,"\n\tCompile options: \n");
+#ifdef OUTPUT_SAVG
+    fprintf(stderr,"Output SAVG = True\n");
+#else
+    fprintf(stderr,"Output SAVG = False\n");
+#endif
+
+#ifdef DOUBLE_PREC
+    fprintf(stderr,"Precision = double\n");
+#else
+    fprintf(stderr,"Precision = float\n");
+#endif
+
+#if defined(USE_AVX) && defined(__AVX__)
+    fprintf(stderr,"Use AVX = True\n");
+#else
+    fprintf(stderr,"Use AVX = False\n");
+#endif
+
+#if defined(USE_OMP) && defined(_OPENMP)
+    fprintf(stderr,"Use OMP = True\n");
+#else
+    fprintf(stderr,"Use OMP = False\n");
+#endif
+
+    fprintf(stderr,"=========================================================================\n") ;
+}
diff --git a/mocks/DDsmu_mocks/Makefile b/mocks/DDsmu_mocks/Makefile
new file mode 100644
index 00000000..22571ea3
--- /dev/null
+++ b/mocks/DDsmu_mocks/Makefile
@@ -0,0 +1,62 @@
+ROOT_DIR := ../..
+INSTALL_HEADERS_DIR := $(ROOT_DIR)/include
+INSTALL_LIB_DIR := $(ROOT_DIR)/lib
+INSTALL_BIN_DIR := $(ROOT_DIR)/bin
+UTILS_DIR := $(ROOT_DIR)/utils
+IO_DIR := $(ROOT_DIR)/io
+
+include $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk
+# Static library: the s-mu mocks pair counter plus the shared utilities it is built from
+LIBNAME := countpairs_s_mu_mocks
+LIBRARY := lib$(LIBNAME).a
+LIBSRC  := countpairs_s_mu_mocks.c countpairs_s_mu_mocks_impl_double.c countpairs_s_mu_mocks_impl_float.c \
+           $(UTILS_DIR)/gridlink_mocks_impl_float.c $(UTILS_DIR)/gridlink_mocks_impl_double.c \
+           $(UTILS_DIR)/utils.c $(UTILS_DIR)/progressbar.c $(UTILS_DIR)/cpu_features.c \
+           $(UTILS_DIR)/set_cosmo_dist.c $(UTILS_DIR)/cosmology_params.c
+LIBRARY_HEADERS := $(LIBNAME).h
+# Command-line executable; INCL lists the headers and templates the objects depend on
+TARGET   := DDsmu_mocks
+TARGETS  := $(TARGET)
+TARGETSRC:= $(TARGET).c $(IO_DIR)/io.c $(IO_DIR)/ftread.c $(LIBSRC)
+INCL     := countpairs_s_mu_mocks_kernels_float.c countpairs_s_mu_mocks_kernels_double.c countpairs_s_mu_mocks_kernels.c.src \
+            countpairs_s_mu_mocks_impl.c.src countpairs_s_mu_mocks_impl.h.src countpairs_s_mu_mocks_impl_double.h countpairs_s_mu_mocks_impl_float.h \
+            countpairs_s_mu_mocks.h \
+            $(IO_DIR)/io.h $(IO_DIR)/ftread.h \
+            $(UTILS_DIR)/gridlink_mocks_impl_double.h $(UTILS_DIR)/gridlink_mocks_impl_float.h $(UTILS_DIR)/gridlink_mocks_impl.h.src \
+            $(UTILS_DIR)/cellarray_mocks_float.h $(UTILS_DIR)/cellarray_mocks_double.h $(UTILS_DIR)/cellarray_mocks.h.src \
+            $(UTILS_DIR)/set_cosmo_dist.h $(UTILS_DIR)/cosmology_params.h $(UTILS_DIR)/progressbar.h $(UTILS_DIR)/cpu_features.h \
+            $(UTILS_DIR)/utils.h $(UTILS_DIR)/function_precision.h $(UTILS_DIR)/avx_calls.h $(UTILS_DIR)/defs.h \
+            $(UTILS_DIR)/weight_functions_double.h $(UTILS_DIR)/weight_functions_float.h $(UTILS_DIR)/weight_functions.h.src \
+            $(UTILS_DIR)/weight_defs_double.h $(UTILS_DIR)/weight_defs_float.h $(UTILS_DIR)/weight_defs.h.src
+
+TARGETOBJS:=$(TARGETSRC:.c=.o)
+LIBOBJS:=$(LIBSRC:.c=.o)
+
+all: $(TARGETS) $(TARGETSRC) $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk Makefile
+
+EXTRA_INCL:=$(GSL_CFLAGS)
+EXTRA_LINK:=$(GSL_LINK)
+# Extra prerequisites of the per-precision objects (no recipes here; presumably supplied by rules.mk)
+countpairs_s_mu_mocks_impl_double.o:countpairs_s_mu_mocks_impl_double.c countpairs_s_mu_mocks_impl_double.h countpairs_s_mu_mocks_kernels_double.c $(UTILS_DIR)/gridlink_mocks_impl_double.h $(UTILS_DIR)/cellarray_mocks_double.h
+countpairs_s_mu_mocks_impl_float.o:countpairs_s_mu_mocks_impl_float.c countpairs_s_mu_mocks_impl_float.h countpairs_s_mu_mocks_kernels_float.c $(UTILS_DIR)/gridlink_mocks_impl_float.h $(UTILS_DIR)/cellarray_mocks_float.h
+countpairs_s_mu_mocks.o:countpairs_s_mu_mocks.c countpairs_s_mu_mocks_impl_double.h countpairs_s_mu_mocks_impl_float.h $(INCL)
+
+
+libs: lib
+lib: $(LIBRARY)
+
+install: $(INSTALL_BIN_DIR)/$(TARGET) $(INSTALL_LIB_DIR)/$(LIBRARY) $(INSTALL_HEADERS_DIR)/$(LIBRARY_HEADERS)
+# Delegates to the DDsmu_mocks target of the Makefile in ../tests
+tests:
+	$(MAKE) -C ../tests DDsmu_mocks
+# Removes the build products, including the per-precision _float/_double sources and headers
+clean:
+	$(RM) $(TARGETS) $(TARGETOBJS) $(LIBRARY) countpairs_s_mu_mocks_impl_float.[ch] countpairs_s_mu_mocks_impl_double.[ch] countpairs_s_mu_mocks_kernels_double.c countpairs_s_mu_mocks_kernels_float.c
+	$(RM) -R *.dSYM
+# Additionally removes the installed library, header and executable
+distclean:clean | $(INSTALL_LIB_DIR) $(INSTALL_HEADERS_DIR) $(INSTALL_BIN_DIR)
+	cd $(INSTALL_LIB_DIR) && $(RM) $(LIBRARY)
+	cd $(INSTALL_HEADERS_DIR) && $(RM) $(LIBRARY_HEADERS)
+	cd $(INSTALL_BIN_DIR) && $(RM) $(TARGETS)
+
+include $(ROOT_DIR)/rules.mk
diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks.c b/mocks/DDsmu_mocks/countpairs_s_mu_mocks.c
new file mode 100644
index 00000000..3197cf9f
--- /dev/null
+++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks.c
@@ -0,0 +1,77 @@
+/* File: countpairs_s_mu_mocks.c */
+/*
+  This file is a part of the Corrfunc package
+  Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com)
+  License: MIT LICENSE. See LICENSE file under the top-level
+  directory at https://github.com/manodeep/Corrfunc/
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "countpairs_s_mu_mocks.h" //function proto-type for API
+#include "countpairs_s_mu_mocks_impl_double.h"//actual implementations for double
+#include "countpairs_s_mu_mocks_impl_float.h"//actual implementations for float
+
+/* Free the arrays owned by `results` and reset each pointer to NULL, so that a repeated
+   call cannot double-free. The struct itself is not freed; a NULL `results` is a no-op. */
+void free_results_mocks_s_mu(results_countpairs_mocks_s_mu *results)
+{
+    if(results==NULL) return;
+    free(results->npairs);    results->npairs = NULL;
+    free(results->supp);      results->supp = NULL;
+    free(results->savg);      results->savg = NULL;
+    free(results->weightavg); results->weightavg = NULL;
+}
+
+
+int countpairs_mocks_s_mu(const int64_t ND1, void *phi1, void *theta1, void *czD1,
+                          const int64_t ND2, void *phi2, void *theta2, void *czD2,
+                          const int numthreads,
+                          const int autocorr,
+                          const char *sbinfile,
+                          const double mu_max,
+                          const int nmu_bins,
+                          const int cosmology,
+                          results_countpairs_mocks_s_mu *results,
+                          struct config_options *options,
+                          struct extra_options *extra)
+{
+    /* Precision-agnostic entry point: checks the element size and the API version recorded in
+       `options`, then hands the (void *) arrays to the float or the double implementation */
+    const int is_single = (options->float_type == sizeof(float));
+    const int is_double = (options->float_type == sizeof(double));
+    if( !is_single && !is_double ) {
+        fprintf(stderr,"ERROR: In %s> Can only handle doubles or floats. Got an array of size = %zu\n",
+                __FUNCTION__, options->float_type);
+        return EXIT_FAILURE;
+    }
+
+    /* Refuse an options struct whose version string differs from the compiled-in VERSION */
+    const int version_differs = strncmp(options->version, STR(VERSION), sizeof(options->version)/sizeof(char)-1) != 0;
+    if(version_differs) {
+        fprintf(stderr,"Error: Do not know this API version = `%s'. Expected version = `%s'\n", options->version, STR(VERSION));
+        return EXIT_FAILURE;
+    }
+
+    /* Single precision: every array is reinterpreted as float */
+    if(is_single) {
+        return countpairs_mocks_s_mu_float(ND1, (float *) phi1, (float *) theta1, (float *) czD1,
+                                           ND2, (float *) phi2, (float *) theta2, (float *) czD2,
+                                           numthreads, autocorr,
+                                           sbinfile,
+                                           mu_max, nmu_bins,
+                                           cosmology,
+                                           results, options, extra);
+    }
+
+    /* Otherwise the element size was validated above to be sizeof(double) */
+    return countpairs_mocks_s_mu_double(ND1, (double *) phi1, (double *) theta1, (double *) czD1,
+                                        ND2, (double *) phi2, (double *) theta2, (double *) czD2,
+                                        numthreads, autocorr,
+                                        sbinfile,
+                                        mu_max, nmu_bins,
+                                        cosmology,
+                                        results, options, extra);
+}
diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks.h b/mocks/DDsmu_mocks/countpairs_s_mu_mocks.h
new file mode 100644
index 00000000..2dc7d11d
--- /dev/null
+++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks.h
@@ -0,0 +1,46 @@
+/* File: countpairs_s_mu_mocks.h */
+/*
+  This file is a part of the Corrfunc package
+  Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com)
+  License: MIT LICENSE. See LICENSE file under the top-level
+  directory at https://github.com/manodeep/Corrfunc/
+*/
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "defs.h"
+#include <stdint.h> //for uint64_t
+
+    //define the results structure (the arrays are released by free_results_mocks_s_mu)
+    typedef struct{
+        uint64_t *npairs;//pair counts; the DDsmu_mocks driver reads entry i*(nmu_bins+1)+j for s-bin i, mu-bin j
+        double *supp;//one value per s-bin, printed as log10 by the driver; presumably the upper bin edges -- TODO confirm
+        double *savg;//read with the same index as npairs; presumably the average separation per bin -- TODO confirm
+        double mu_max;//presumably the mu_max passed to countpairs_mocks_s_mu -- TODO confirm
+        double mu_min;//not used -> assumed to be 0.0
+        double *weightavg;//read with the same index as npairs
+        int nsbin;//the driver loops over s-bins 1..nsbin-1
+        int nmu_bins;//number of mu bins; the driver derives the mu bin width as mu_max/nmu_bins
+    } results_countpairs_mocks_s_mu;
+
+    /* Counts pairs in (s, mu) bins for mock catalogues. The arrays are passed as void * and cast
+       according to options->float_type; phi comes before theta, as in the definition in countpairs_s_mu_mocks.c */
+    int countpairs_mocks_s_mu(const int64_t ND1, void *phi1, void *theta1, void *czD1,
+                              const int64_t ND2, void *phi2, void *theta2, void *czD2,
+                              const int numthreads, const int autocorr,
+                              const char *sbinfile,
+                              const double mu_max, const int nmu_bins,
+                              const int cosmology,
+                              results_countpairs_mocks_s_mu *results,
+                              struct config_options *options,
+                              struct extra_options *extra);
+
+    void free_results_mocks_s_mu(results_countpairs_mocks_s_mu *results);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src
new file mode 100644
index 00000000..b4fbf26a
--- /dev/null
+++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src
@@ -0,0 +1,796 @@
+// # -*- mode: c -*-
+/* File: countpairs_s_mu_mocks_impl.c.src */
+/*
+  This file is a part of the Corrfunc package
+  Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com)
+  License: MIT LICENSE. See LICENSE file under the top-level
+  directory at https://github.com/manodeep/Corrfunc/
+*/
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <gsl/gsl_interp.h>
+
+
+#include "countpairs_s_mu_mocks_impl_DOUBLE.h"
+#include "countpairs_s_mu_mocks_kernels_DOUBLE.c"
+#include "cellarray_mocks_DOUBLE.h"
+#include "gridlink_mocks_impl_DOUBLE.h"
+
+#include "defs.h"
+#include "utils.h"
+#include "cosmology_params.h"
+#include "set_cosmo_dist.h"
+#include "cpu_features.h"
+#include "progressbar.h"
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+int interrupt_status_DDsmu_mocks_DOUBLE=EXIT_SUCCESS;//set to EXIT_FAILURE by the handler below
+/* Signal handler: reports the received signal on stderr and flags the run as failed (it does not exit itself) */
+void interrupt_handler_countpairs_s_mu_mocks_DOUBLE(int signo)
+{
+    fprintf(stderr,"Received signal = `%s' (signo = %d). Aborting \n",strsignal(signo), signo);//NOTE(review): fprintf/strsignal are not async-signal-safe -- confirm this is acceptable
+    interrupt_status_DDsmu_mocks_DOUBLE = EXIT_FAILURE;
+}
+
+
+int check_ra_dec_cz_s_mu_DOUBLE(const int64_t N, DOUBLE *phi, DOUBLE *theta, DOUBLE *cz)
+{
+    /* Validates the N input points and rescales them IN PLACE when needed: any phi < 0 shifts all phi by +180,
+       any theta > 90 shifts all theta by -90, and max(cz) < 10 multiplies all cz by SPEED_OF_LIGHT.
+       Returns EXIT_FAILURE for NULL arrays (with N > 0) or any theta > 180, EXIT_SUCCESS otherwise. */
+    if(N==0) return EXIT_SUCCESS;
+    if(phi == NULL || theta == NULL || cz == NULL) {
+        fprintf(stderr,"Input arrays can not be NULL. Have RA = %p DEC = %p cz = %p\n", (void *) phi, (void *) theta, (void *) cz);//%p requires void *
+        return EXIT_FAILURE;
+    }
+
+    int fix_cz  = 0;
+    int fix_ra  = 0;
+    int fix_dec = 0;
+
+    const DOUBLE max_cz_threshold = 10.0;//if I find that max cz is smaller than this threshold, then I will assume z has been supplied rather than cz
+    DOUBLE max_cz = 0.0;
+    //Check input cz -> ensure that cz contains cz and not z
+    for(int64_t i=0;i<N;i++) {
+        if(cz[i] > max_cz) max_cz = cz[i];
+        if(phi[i] < 0.0) {
+            fix_ra = 1;
+        }
+        if(theta[i] > 90.0) {
+            fix_dec = 1;
+        }
+        if(theta[i] > 180) {
+            fprintf(stderr,"theta[%"PRId64"] = %"REAL_FORMAT" should be less than 180 deg\n", i, theta[i]);
+            return EXIT_FAILURE;
+        }
+    }
+    if(max_cz < max_cz_threshold) fix_cz = 1;
+
+    //Only run the loop if something needs to be fixed
+    if(fix_cz==1 || fix_ra == 1 || fix_dec == 1) {
+        if(fix_ra == 1) {
+            fprintf(stderr,"%s> Out of range values found for ra. Expected ra to be in the range [0.0,360.0]. Found ra values in [-180,180] -- fixing that\n", __FUNCTION__);
+        }
+        if(fix_dec == 1) {
+            fprintf(stderr,"%s> Out of range values found for dec. Expected dec to be in the range [-90.0,90.0]. Found dec values in [0,180] -- fixing that\n", __FUNCTION__);
+        }
+        if(fix_cz == 1)  {
+            fprintf(stderr,"%s> Out of range values found for cz. Expected input to be `cz' but found `z' instead. max_cz (found in input) = %"REAL_FORMAT" threshold "
+                    "= %"REAL_FORMAT"\n",__FUNCTION__,max_cz,max_cz_threshold);
+        }
+        //A single out-of-range value triggers the correction for the whole array
+        for(int64_t i=0;i<N;i++) {
+            if(fix_ra==1) {
+                phi[i] += (DOUBLE) 180.0;
+            }
+            if(fix_dec==1) {
+                theta[i] -= (DOUBLE) 90.0;
+            }
+            if(fix_cz == 1) {
+                cz[i] *= (DOUBLE) SPEED_OF_LIGHT;//input was z -> convert to cz
+            }
+        }
+    }
+
+    return EXIT_SUCCESS;
+}
+
+
+countpairs_mocks_func_ptr_DOUBLE countpairs_s_mu_mocks_driver_DOUBLE(const struct config_options *options)
+{
+    /* Returns the pair-counting kernel for options->instruction_set, or NULL on an internal error; the choice is cached in statics */
+    static countpairs_mocks_func_ptr_DOUBLE function = NULL;
+    static isa old_isa=-1;
+    /* old_isa starts at -1: without the NULL test a first call with instruction_set == -1 would return the initial NULL */
+    if(function != NULL && old_isa == options->instruction_set) {
+        return function;
+    }
+
+    /* Array of function pointers */
+    countpairs_mocks_func_ptr_DOUBLE allfunctions[] = {
+#ifdef __AVX__
+        countpairs_s_mu_mocks_avx_intrinsics_DOUBLE,
+#endif
+#ifdef __SSE4_2__
+        countpairs_s_mu_mocks_sse_intrinsics_DOUBLE,
+#endif
+        countpairs_s_mu_mocks_fallback_DOUBLE
+    };
+
+    const int num_functions = sizeof(allfunctions)/sizeof(allfunctions[0]);//element size of the array itself, not of void *
+    const int fallback_offset = num_functions - 1;
+#if defined(__AVX__) || defined __SSE4_2__
+    const int highest_isa = instrset_detect();
+#endif
+    int curr_offset = 0;
+
+    /* Now check if AVX is supported by the CPU */
+    int avx_offset = fallback_offset;
+#ifdef __AVX__
+    avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset;
+    curr_offset++;
+#endif
+
+    /* Is the SSE function supported at runtime and enabled at compile-time?*/
+    int sse_offset = fallback_offset;
+#ifdef __SSE4_2__
+    sse_offset = highest_isa >= 6 ? curr_offset:fallback_offset;
+    curr_offset++;
+#endif
+    if( curr_offset != fallback_offset) {
+      fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n",
+              curr_offset, fallback_offset);
+      return NULL;
+    }
+
+    int function_dispatch=0;
+    /* Explicit request: map it to a kernel (the offsets already alias to the fallback when unsupported); a negative value keeps index 0 */
+    if(options->instruction_set >= 0) {
+        switch(options->instruction_set) {
+        case(AVX512F):
+        case(AVX2):
+        case(AVX):function_dispatch=avx_offset;break;
+        case(SSE42): function_dispatch=sse_offset;break;
+        default:function_dispatch=fallback_offset;break;
+        }
+    }
+
+    if(function_dispatch >= num_functions) {
+      fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n",
+              __FUNCTION__, function_dispatch, num_functions);
+      return NULL;
+    }
+    function = allfunctions[function_dispatch];
+    old_isa = options->instruction_set;
+
+    if(options->verbose){
+        // This must be first (AVX/SSE may be aliased to fallback)
+        if(function_dispatch == fallback_offset){
+            fprintf(stderr,"Using fallback kernel\n");
+        } else if(function_dispatch == avx_offset){
+            fprintf(stderr,"Using AVX kernel\n");
+        } else if(function_dispatch == sse_offset){
+            fprintf(stderr,"Using SSE kernel\n");
+        } else {
+            fprintf(stderr,"Unknown kernel!\n");//stderr like the other diagnostics, keeping stdout free of messages
+        }
+    }
+    return function;
+}
+
+
+int countpairs_mocks_s_mu_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, DOUBLE *czD1,
+                                 const int64_t ND2, DOUBLE *ra2, DOUBLE *dec2, DOUBLE *czD2,
+                                 const int numthreads,
+                                 const int autocorr,
+                                 const char *sbinfile,
+                                 const double max_mu,
+                                 const int nmu_bins,
+                                 const int cosmology,
+                                 results_countpairs_mocks_s_mu *results,
+                                 struct config_options *options, struct extra_options *extra)
+{
+
+    if(options->float_type != sizeof(DOUBLE)) {
+        fprintf(stderr,"ERROR: In %s> Can only handle arrays of size=%zu. Got an array of size = %zu\n",
+                __FUNCTION__, sizeof(DOUBLE), options->float_type);
+        return EXIT_FAILURE;
+    }
+
+    // If no extra options were passed, create dummy options
+    // This allows us to pass arguments like "extra->weights0" below;
+    // they'll just be NULLs, which is the correct behavior
+    struct extra_options dummy_extra;
+    if(extra == NULL){
+      weight_method_t dummy_method = NONE;
+      dummy_extra = get_extra_options(dummy_method);
+      extra = &dummy_extra;
+    }
+
+    int need_weightavg = extra->weight_method != NONE;
+
+    options->sort_on_z = 1;
+    struct timeval t0;
+    if(options->c_api_timer) {
+        gettimeofday(&t0, NULL);
+    }
+
+    //Check inputs
+    if(ND1 == 0 || (autocorr == 0 && ND2 == 0)) {
+        return EXIT_SUCCESS;
+    }
+
+    //Check inputs
+    int status1 = check_ra_dec_cz_s_mu_DOUBLE(ND1, ra1, dec1, czD1);
+    if(status1 != EXIT_SUCCESS) {
+        return status1;
+    }
+    if(autocorr==0) {
+        int status2 = check_ra_dec_cz_s_mu_DOUBLE(ND2, ra2, dec2, czD2);
+        if(status2 != EXIT_SUCCESS) {
+            return status2;
+        }
+    }
+
+#if defined(_OPENMP)
+    omp_set_num_threads(numthreads);
+#else
+    (void) numthreads;
+#endif
+
+    if(options->max_cells_per_dim == 0) {
+        fprintf(stderr,"Warning: Max. cells per dimension is set to 0 - resetting to `NLATMAX' = %d\n", NLATMAX);
+        options->max_cells_per_dim = NLATMAX;
+    }
+    for(int i=0;i<3;i++) {
+        if(options->bin_refine_factors[i] < 1) {
+            fprintf(stderr,"Warning: bin refine factor along axis = %d *must* be >=1. Instead found bin refine factor =%d\n",
+                    i, options->bin_refine_factors[i]);
+            reset_bin_refine_factors(options);
+            break;/* all factors have been reset -> no point continuing with the loop */
+        }
+    }
+
+    /* setup interrupt handler -> mostly useful during the python execution.
+       Lets Ctrl-C abort the extension  */
+    SETUP_INTERRUPT_HANDLERS(interrupt_handler_countpairs_s_mu_mocks_DOUBLE);
+
+    //Try to initialize cosmology - code will exit if cosmology is not implemented.
+    //Putting in a different scope so I can call the variable status
+    {
+        int status = init_cosmology(cosmology);
+        if(status != EXIT_SUCCESS) {
+            return status;
+        }
+    }
+
+    /***********************
+     *initializing the  bins
+     ************************/
+    double *supp;
+    int nsbin;
+    double smin,smax;
+    setup_bins(sbinfile,&smin,&smax,&nsbin,&supp);
+    if( ! (smin > 0.0 && smax > 0.0 && smin < smax && nsbin > 0)) {
+        fprintf(stderr,"Error: Could not setup with S bins correctly. (smin = %lf, smax = %lf, with nbins = %d). Expected non-zero smin/smax with smax > smin and nbins >=1 \n",
+                smin, smax, nsbin);
+        return EXIT_FAILURE;
+    }
+
+
+    if(max_mu <= 0.0 || max_mu > 1.0) {
+        fprintf(stderr,"Error: max_mu (max. value for the cosine of the angle with line of sight) must be greater than 0 and at most 1).\n"
+                "The passed value is max_mu = %lf. Please change it to be > 0 and <= 1.0\n", max_mu);
+        return EXIT_FAILURE;
+    }
+
+    if(nmu_bins < 1 ) {
+        fprintf(stderr,"Error: Number of mu bins = %d must be at least 1\n", nmu_bins);
+        return EXIT_FAILURE;
+    }
+
+    //Change cz into co-moving distance
+    DOUBLE *D1 = NULL, *D2 = NULL;
+    if(options->is_comoving_dist == 0) {
+        D1 = my_malloc(sizeof(*D1),ND1);
+        D2 = autocorr == 0 ? my_malloc(sizeof(*D2),ND2):D1;
+    } else {
+        D1 = czD1;
+        D2 = autocorr == 0 ? czD2:czD1;
+    }
+
+    if(D1 == NULL || D2 == NULL) {
+        free(D1);free(D2);
+        return EXIT_FAILURE;
+    }
+
+
+    if(options->is_comoving_dist == 0) {
+        //Setup variables to do the cz->comoving distance
+        DOUBLE czmax = 0.0;
+        const DOUBLE inv_speed_of_light = 1.0/SPEED_OF_LIGHT;
+        get_max_DOUBLE(ND1, czD1, &czmax);
+        if(autocorr == 0) {
+            get_max_DOUBLE(ND2, czD2, &czmax);
+        }
+        const double zmax = czmax * inv_speed_of_light + 0.01;
+
+        const int workspace_size = 10000;
+        double *interp_redshift  = my_calloc(sizeof(*interp_redshift), workspace_size);//the interpolation is done in 'z' and not in 'cz'
+        double *interp_comoving_dist = my_calloc(sizeof(*interp_comoving_dist),workspace_size);
+        int Nzdc = set_cosmo_dist(zmax, workspace_size, interp_redshift, interp_comoving_dist, cosmology);
+        if(Nzdc < 0) {
+            free(interp_redshift);free(interp_comoving_dist);
+            return EXIT_FAILURE;
+        }
+
+        gsl_interp *interpolation;
+        gsl_interp_accel *accelerator;
+        accelerator =  gsl_interp_accel_alloc();
+        interpolation = gsl_interp_alloc (gsl_interp_linear,Nzdc);
+        gsl_interp_init(interpolation, interp_redshift, interp_comoving_dist, Nzdc);
+        for(int64_t i=0;i<ND1;i++) {
+            D1[i] = gsl_interp_eval(interpolation, interp_redshift, interp_comoving_dist, czD1[i]*inv_speed_of_light, accelerator);
+        }
+
+        if(autocorr==0) {
+            for(int64_t i=0;i<ND2;i++) {
+                D2[i] = gsl_interp_eval(interpolation, interp_redshift, interp_comoving_dist, czD2[i]*inv_speed_of_light, accelerator);
+            }
+        }
+        free(interp_redshift);free(interp_comoving_dist);
+        gsl_interp_free(interpolation);
+        gsl_interp_accel_free(accelerator);
+    }
+
+    DOUBLE *X1 = my_malloc(sizeof(*X1), ND1);
+    DOUBLE *Y1 = my_malloc(sizeof(*Y1), ND1);
+    DOUBLE *Z1 = my_malloc(sizeof(*Z1), ND1);
+    if(X1 == NULL || Y1 == NULL || Z1 == NULL) {
+        free(X1);free(Y1);free(Z1);
+        return EXIT_FAILURE;
+    }
+    for(int64_t i=0;i<ND1;i++) {
+        X1[i] = D1[i]*COSD(dec1[i])*COSD(ra1[i]);
+        Y1[i] = D1[i]*COSD(dec1[i])*SIND(ra1[i]);
+        Z1[i] = D1[i]*SIND(dec1[i]);
+    }
+
+    DOUBLE *X2,*Y2,*Z2;
+    if(autocorr==0) {
+        X2 = my_malloc(sizeof(*X2), ND2);
+        Y2 = my_malloc(sizeof(*Y2), ND2);
+        Z2 = my_malloc(sizeof(*Z2), ND2);
+        for(int64_t i=0;i<ND2;i++) {
+            X2[i] = D2[i]*COSD(dec2[i])*COSD(ra2[i]);
+            Y2[i] = D2[i]*COSD(dec2[i])*SIND(ra2[i]);
+            Z2[i] = D2[i]*SIND(dec2[i]);
+        }
+    } else {
+        X2 = X1;
+        Y2 = Y1;
+        Z2 = Z1;
+    }
+
+    DOUBLE supp_sqr[nsbin];
+    for(int i=0; i < nsbin;i++) {
+        supp_sqr[i] = supp[i]*supp[i];
+    }
+    const DOUBLE mu_max = (DOUBLE) max_mu;
+
+    DOUBLE xmin=1e10,ymin=1e10,zmin=1e10;
+    DOUBLE xmax=-1e10,ymax=-1e10,zmax=-1e10;
+    get_max_min_data_DOUBLE(ND1, X1, Y1, Z1, &xmin, &ymin, &zmin, &xmax, &ymax, &zmax);
+
+    if(autocorr==0) {
+        get_max_min_data_DOUBLE(ND2, X2, Y2, Z2, &xmin, &ymin, &zmin, &xmax, &ymax, &zmax);
+    }
+
+    const DOUBLE xdiff = xmax-xmin;
+    const DOUBLE ydiff = ymax-ymin;
+    const DOUBLE zdiff = zmax-zmin;
+    if(get_bin_refine_scheme(options) == BINNING_DFL) {
+        if(smax < 0.05*xdiff) {
+            options->bin_refine_factors[0] = 1;
+      }
+        if(smax < 0.05*ydiff) {
+            options->bin_refine_factors[1] = 1;
+        }
+        if(smax < 0.05*zdiff) {
+            options->bin_refine_factors[2] = 1;
+        }
+    }
+
+    /*---Create 3-D lattice--------------------------------------*/
+    int nmesh_x=0,nmesh_y=0,nmesh_z=0;
+    cellarray_mocks_index_particles_DOUBLE *lattice1 = gridlink_mocks_index_particles_DOUBLE(ND1, X1, Y1, Z1, D1, &(extra->weights0),
+                                                                                             xmin, xmax, ymin, ymax, zmin, zmax,
+                                                                                             smax, smax, smax,
+                                                                                             options->bin_refine_factors[0],
+                                                                                             options->bin_refine_factors[1],
+                                                                                             options->bin_refine_factors[2],
+                                                                                             &nmesh_x, &nmesh_y, &nmesh_z,
+                                                                                             options);
+    if(lattice1 == NULL) {
+        return EXIT_FAILURE;
+    }
+
+    /* If there are too few cells (BOOST_CELL_THRESH is ~10), and the number of cells can be increased, then boost bin refine factor by ~1*/
+    const double avg_np = ((double)ND1)/(nmesh_x*nmesh_y*nmesh_z);
+    const int8_t max_nmesh = fmax(nmesh_x, fmax(nmesh_y, nmesh_z));
+    if((max_nmesh <= BOOST_CELL_THRESH || avg_np >= BOOST_NUMPART_THRESH)
+        && max_nmesh < options->max_cells_per_dim) {
+      fprintf(stderr,"%s> gridlink seems inefficient. nmesh = (%d, %d, %d); avg_np = %.3g. ", __FUNCTION__, nmesh_x, nmesh_y, nmesh_z, avg_np);
+      if(get_bin_refine_scheme(options) == BINNING_DFL) {
+            fprintf(stderr,"Boosting bin refine factor - should lead to better performance\n");
+            // Only boost the first two dimensions.  Prevents excessive refinement.
+            for(int i=0;i<2;i++) {
+              options->bin_refine_factors[i] += BOOST_BIN_REF;
+            }
+
+            free_cellarray_mocks_index_particles_DOUBLE(lattice1, nmesh_x * (int64_t) nmesh_y * nmesh_z);
+            lattice1 = gridlink_mocks_index_particles_DOUBLE(ND1, X1, Y1, Z1, D1, &(extra->weights0),
+                                                             xmin, xmax, ymin, ymax, zmin, zmax,
+                                                             smax, smax, smax,
+                                                             options->bin_refine_factors[0],
+                                                             options->bin_refine_factors[1],
+                                                             options->bin_refine_factors[2],
+                                                             &nmesh_x, &nmesh_y, &nmesh_z,
+                                                             options);
+            if(lattice1 == NULL) {
+                return EXIT_FAILURE;
+            }
+        } else {
+            fprintf(stderr,"Boosting bin refine factor could have helped. However, since custom bin refine factors "
+                  "= (%d, %d, %d) are being used - continuing with inefficient mesh\n", options->bin_refine_factors[0],
+                  options->bin_refine_factors[1], options->bin_refine_factors[2]);
+
+        }
+    }
+
+    cellarray_mocks_index_particles_DOUBLE *lattice2 = NULL;
+    if(autocorr==0) {
+        int ngrid2_x=0,ngrid2_y=0,ngrid2_z=0;
+        lattice2 = gridlink_mocks_index_particles_DOUBLE(ND2, X2, Y2, Z2, D2, &(extra->weights1),
+                                                         xmin, xmax,
+                                                         ymin, ymax,
+                                                         zmin, zmax,
+                                                         smax, smax, smax,
+                                                         options->bin_refine_factors[0],
+                                                         options->bin_refine_factors[1],
+                                                         options->bin_refine_factors[2],
+                                                         &ngrid2_x, &ngrid2_y, &ngrid2_z, options);
+        if(lattice2 == NULL) {
+            return EXIT_FAILURE;
+        }
+        if( ! (nmesh_x == ngrid2_x && nmesh_y == ngrid2_y && nmesh_z == ngrid2_z) ) {
+            fprintf(stderr,"Error: The two sets of 3-D lattices do not have identical bins. First has dims (%d, %d, %d) while second has (%d, %d, %d)\n",
+                    nmesh_x, nmesh_y, nmesh_z, ngrid2_x, ngrid2_y, ngrid2_z);
+            return EXIT_FAILURE;
+        }
+    } else {
+        lattice2 = lattice1;
+    }
+    free(X1);free(Y1);free(Z1);
+    if(autocorr == 0) {
+        free(X2);free(Y2);free(Z2);
+    }
+
+    if(options->is_comoving_dist == 0) {
+        free(D1);
+        if(autocorr == 0) {
+            free(D2);
+        }
+    }
+
+
+
+    const int64_t totncells = (int64_t) nmesh_x * (int64_t) nmesh_y * (int64_t) nmesh_z;
+    {
+        int status = assign_ngb_cells_mocks_index_particles_DOUBLE(lattice1, lattice2, totncells,
+                                                                   options->bin_refine_factors[0], options->bin_refine_factors[1], options->bin_refine_factors[2],
+                                                                   nmesh_x, nmesh_y, nmesh_z,
+                                                                   autocorr);
+        if(status != EXIT_SUCCESS) {
+            free_cellarray_mocks_index_particles_DOUBLE(lattice1, totncells);
+            if(autocorr == 0) {
+                free_cellarray_mocks_index_particles_DOUBLE(lattice2, totncells);
+            }
+            free(supp);
+            return EXIT_FAILURE;
+        }
+    }
+    /*---Gridlink-variables----------------*/
+    const int totnbins = (nmu_bins+1)*(nsbin+1);
+#if defined(_OPENMP)
+    uint64_t **all_npairs = (uint64_t **) matrix_calloc(sizeof(uint64_t), numthreads, totnbins);
+    DOUBLE **all_savg = NULL;
+    if(options->need_avg_sep){
+        all_savg = (DOUBLE **) matrix_calloc(sizeof(DOUBLE),numthreads,totnbins);
+    }
+    DOUBLE **all_weightavg = NULL;
+    if(need_weightavg) {
+      all_weightavg = (DOUBLE **) matrix_calloc(sizeof(DOUBLE),numthreads,totnbins);
+    }
+
+#else //USE_OMP
+    uint64_t npairs[totnbins];
+    DOUBLE savg[totnbins], weightavg[totnbins];
+
+    for(int i=0; i <totnbins;i++) {
+        npairs[i] = 0;
+        if(options->need_avg_sep) {
+            savg[i] = ZERO;
+        }
+        if(need_weightavg) {
+            weightavg[i] = ZERO;
+        }
+    }
+#endif //USE_OMP
+
+    /* runtime dispatch - get the function pointer */
+    countpairs_mocks_func_ptr_DOUBLE countpairs_s_mu_mocks_function_DOUBLE = countpairs_s_mu_mocks_driver_DOUBLE(options);
+    if(countpairs_s_mu_mocks_function_DOUBLE == NULL) {
+        return EXIT_FAILURE;
+    }
+
+    int interrupted=0,numdone=0, abort_status=EXIT_SUCCESS;
+    if(options->verbose) {
+        init_my_progressbar(totncells,&interrupted);
+    }
+
+
+#if defined(_OPENMP)
+#pragma omp parallel shared(numdone, abort_status, interrupt_status_DDsmu_mocks_DOUBLE)
+    {
+        const int tid = omp_get_thread_num();
+        uint64_t npairs[totnbins];
+        DOUBLE savg[totnbins], weightavg[totnbins];
+        for(int i=0;i<totnbins;i++) {
+            npairs[i] = 0;
+            if(options->need_avg_sep) {
+                savg[i] = ZERO;
+            }
+            if(need_weightavg) {
+                weightavg[i] = ZERO;
+            }
+        }
+
+#pragma omp for  schedule(dynamic)
+#endif//USE_OMP
+
+        /*---Loop-over-cells-in-lattice1------------------*/
+        for(int64_t index1=0;index1<totncells;index1++) {
+
+#if defined(_OPENMP)
+#pragma omp flush (abort_status, interrupt_status_DDsmu_mocks_DOUBLE)
+#endif
+            if(abort_status == EXIT_SUCCESS && interrupt_status_DDsmu_mocks_DOUBLE == EXIT_SUCCESS) {
+                //omp cancel was introduced in omp 4.0 - so this is my way of checking if loop needs to be cancelled
+                /* If the verbose option is not enabled, avoid outputting anything unnecessary*/
+                if(options->verbose) {
+#if defined(_OPENMP)
+                    if (omp_get_thread_num() == 0)
+#endif
+                        my_progressbar(numdone,&interrupted);
+
+
+#if defined(_OPENMP)
+#pragma omp atomic
+#endif
+                    numdone++;
+                }
+
+                const cellarray_mocks_index_particles_DOUBLE *first  = &(lattice1[index1]);
+                if(first->nelements == 0) {
+                    continue;
+                }
+                DOUBLE *x1 = first->x;
+                DOUBLE *y1 = first->y;
+                DOUBLE *z1 = first->z;
+                DOUBLE *d1 = first->cz;
+                const weight_struct_DOUBLE *weights1 = &(first->weights);
+                const int64_t N1 = first->nelements;
+
+                if(autocorr == 1) {
+                    int same_cell = 1;
+                    DOUBLE *this_savg = options->need_avg_sep ? &(savg[0]):NULL;
+                    DOUBLE *this_weightavg = need_weightavg ? weightavg:NULL;
+                    const int status = countpairs_s_mu_mocks_function_DOUBLE(N1, x1, y1, z1, d1, weights1,
+                                                                             N1, x1, y1, z1, d1, weights1,
+                                                                             same_cell,
+                                                                             options->fast_divide,
+                                                                             smax, smin, nsbin,
+                                                                             nmu_bins, supp_sqr, mu_max,
+                                                                             this_savg, npairs,
+                                                                             this_weightavg, extra->weight_method);
+                    /* This actually causes a race condition under OpenMP - but mostly
+                       I care that an error occurred - rather than the exact value of
+                       the error status */
+                    abort_status |= status;
+                }
+
+                for(int64_t ngb=0;ngb<first->num_ngb;ngb++){
+                    const cellarray_mocks_index_particles_DOUBLE *second = first->ngb_cells[ngb];
+                    if(second->nelements == 0) {
+                        continue;
+                    }
+                    const int same_cell = 0;
+                    DOUBLE *x2 = second->x;
+                    DOUBLE *y2 = second->y;
+                    DOUBLE *z2 = second->z;
+                    DOUBLE *d2 = second->cz;
+                    const weight_struct_DOUBLE *weights2 = &(second->weights);
+                    const int64_t N2 = second->nelements;
+                    DOUBLE *this_savg = options->need_avg_sep ? &(savg[0]):NULL;
+                    DOUBLE *this_weightavg = need_weightavg ? weightavg:NULL;
+                    const int status = countpairs_s_mu_mocks_function_DOUBLE(N1, x1, y1, z1, d1, weights1,
+                                                                             N2, x2, y2, z2, d2, weights2,
+                                                                             same_cell,
+                                                                             options->fast_divide,
+                                                                             smax, smin, nsbin,
+                                                                             nmu_bins, supp_sqr, mu_max,
+                                                                             this_savg, npairs,
+                                                                             this_weightavg, extra->weight_method);
+                    /* This actually causes a race condition under OpenMP - but mostly
+                       I care that an error occurred - rather than the exact value of
+                       the error status */
+                    abort_status |= status;
+                }//loop over ngb cells
+            }//abort_status check
+        }//i loop over ND1 particles
+#if defined(_OPENMP)
+        for(int i=0;i<totnbins;i++) {
+            all_npairs[tid][i] = npairs[i];
+            if(options->need_avg_sep) {
+                all_savg[tid][i] = savg[i];
+            }
+            if(need_weightavg) {
+                all_weightavg[tid][i] = weightavg[i];
+            }
+        }
+    }//close the omp parallel region
+#endif//USE_OMP
+
+    free_cellarray_mocks_index_particles_DOUBLE(lattice1,totncells);
+    if(autocorr == 0) {
+        free_cellarray_mocks_index_particles_DOUBLE(lattice2,totncells);
+    }
+
+    if(abort_status != EXIT_SUCCESS || interrupt_status_DDsmu_mocks_DOUBLE != EXIT_SUCCESS) {
+        /* Cleanup memory here if aborting */
+        free(supp);
+#if defined(_OPENMP)
+        matrix_free((void **) all_npairs, numthreads);
+        if(options->need_avg_sep) {
+            matrix_free((void **) all_savg, numthreads);
+        }
+        if(need_weightavg) {
+            matrix_free((void **) all_weightavg, numthreads);
+        }
+#endif
+        return EXIT_FAILURE;
+    }
+
+    if(options->verbose) {
+        finish_myprogressbar(&interrupted);
+    }
+
+
+
+#if defined(_OPENMP)
+    uint64_t npairs[totnbins];
+    DOUBLE savg[totnbins], weightavg[totnbins];
+    for(int i=0;i<totnbins;i++) {
+        npairs[i] = 0;
+        if(options->need_avg_sep) {
+            savg[i] = ZERO;
+        }
+        if(need_weightavg) {
+            weightavg[i] = ZERO;
+        }
+    }
+
+    for(int i=0;i<numthreads;i++) {
+        for(int j=0;j<totnbins;j++) {
+            npairs[j] += all_npairs[i][j];
+            if(options->need_avg_sep) {
+                savg[j] += all_savg[i][j];
+            }
+            if(need_weightavg) {
+                weightavg[j] += all_weightavg[i][j];
+            }
+        }
+    }
+    matrix_free((void **) all_npairs, numthreads);
+    if(options->need_avg_sep) {
+        matrix_free((void **) all_savg, numthreads);
+    }
+    if(need_weightavg) {
+        matrix_free((void **) all_weightavg, numthreads);
+    }
+#endif //USE_OMP
+
+    //The code does not double count for autocorrelations
+    //which means the npairs and savg values need to be doubled;
+    if(autocorr == 1) {
+        const uint64_t int_fac = 2;
+        const DOUBLE dbl_fac = (DOUBLE) 2.0;
+        for(int i=0;i<totnbins;i++) {
+            npairs[i] *= int_fac;
+            if(options->need_avg_sep) {
+                savg[i] *= dbl_fac;
+            }
+            if(need_weightavg) {
+                weightavg[i] *= dbl_fac;
+            }
+        }
+    }
+
+    for(int i=0;i<totnbins;i++) {
+        if(npairs[i] > 0) {
+            if(options->need_avg_sep) {
+                savg[i] /= (DOUBLE) npairs[i] ;
+            }
+            if(need_weightavg) {
+                weightavg[i] /= (DOUBLE) npairs[i];
+            }
+        }
+    }
+
+    results->nsbin   = nsbin;
+    results->nmu_bins = nmu_bins;
+    results->mu_max = max_mu;//NOTE max_mu which is double and not mu_max (which might be float)
+    results->mu_min = ZERO;
+    results->npairs = my_malloc(sizeof(*(results->npairs)), totnbins);
+    results->supp   = my_malloc(sizeof(*(results->supp))  , nsbin);
+    results->savg  = my_malloc(sizeof(*(results->savg)) , totnbins);
+    results->weightavg  = my_calloc(sizeof(double)  , totnbins);
+    if(results->npairs == NULL || results->supp == NULL || results->savg == NULL || results->weightavg == NULL) {
+        free_results_mocks_s_mu(results);
+        free(supp);
+        return EXIT_FAILURE;
+    }
+
+    for(int i=0;i<nsbin;i++) {
+        results->supp[i] = supp[i];
+        for(int j=0;j<nmu_bins;j++) {
+            const int index = i*(nmu_bins+1) + j;
+            if( index >= totnbins ) {
+                fprintf(stderr, "ERROR: In %s> index = %d must be in range [0, %d)\n", __FUNCTION__, index, totnbins);
+                free_results_mocks_s_mu(results);
+                free(supp);
+                return EXIT_FAILURE;
+            }
+            results->npairs[index] = npairs[index];
+            results->savg[index] = ZERO;
+            results->weightavg[index] = ZERO;
+            if(options->need_avg_sep) {
+                results->savg[index] = savg[index];
+            }
+            if(need_weightavg) {
+                results->weightavg[index] = weightavg[index];
+            }
+        }
+    }
+    free(supp);
+
+    /* reset interrupt handlers to default */
+    RESET_INTERRUPT_HANDLERS();
+    reset_bin_refine_factors(options);
+
+    if(options->c_api_timer) {
+        struct timeval t1;
+        gettimeofday(&t1, NULL);
+        options->c_api_time = ADD_DIFF_TIME(t0, t1);
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.h.src b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.h.src
new file mode 100644
index 00000000..acf0b6bb
--- /dev/null
+++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.h.src
@@ -0,0 +1,51 @@
+// # -*- mode: c -*-
+/* File: countpairs_s_mu_mocks_impl.h.src */
+/*
+  This file is a part of the Corrfunc package
+  Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com)
+  License: MIT LICENSE. See LICENSE file under the top-level
+  directory at https://github.com/manodeep/Corrfunc/
+*/
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "defs.h" //for struct config_options
+#include "weight_defs_DOUBLE.h"
+#include <inttypes.h> //for uint64_t
+
+#include "countpairs_s_mu_mocks.h" //for definition of results_countpairs_mocks_s_mu
+
+    extern void interrupt_handler_countpairs_s_mu_mocks_DOUBLE(int signo); // signal handler; the pair-counting code in this patch registers it through SETUP_INTERRUPT_HANDLERS so Ctrl-C can abort the run
+
+    typedef int (*countpairs_mocks_func_ptr_DOUBLE)(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, DOUBLE *d0, const weight_struct_DOUBLE *weights0, // pointer to a pair-counting kernel; first cell: particle count, x/y/z arrays, d array (callers pass the cell's cz member), weights
                                                    const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, DOUBLE *d1, const weight_struct_DOUBLE *weights1, // second cell, same layout as the first
+                                                    const int same_cell, // callers pass 1 when both argument sets are the same cell, 0 when the second is a neighbouring cell
+                                                    const int fast_divide, // forwarded unchanged from options->fast_divide by the caller
+                                                    const DOUBLE smax, const DOUBLE smin, const int nsbin, // s limits and bin count that the caller obtained from setup_bins
+                                                    const int nmu_bins, const DOUBLE *supp_sqr, // number of mu bins; callers pass an nsbin-long array holding supp[i]*supp[i]
+                                                    const DOUBLE mu_max, // callers validate 0 < mu_max <= 1 before invoking the kernel
+                                                    DOUBLE *src_savg, uint64_t *src_npairs, // output arrays of (nmu_bins+1)*(nsbin+1) entries; callers pass NULL for src_savg when options->need_avg_sep is off
+                                                    DOUBLE *src_weightavg, const weight_method_t weight_method); // callers pass NULL for src_weightavg when weight_method is NONE; the returned status is compared against EXIT_SUCCESS
+
+    extern countpairs_mocks_func_ptr_DOUBLE countpairs_s_mu_mocks_driver_DOUBLE(const struct config_options *options) __attribute__((warn_unused_result)); // runtime dispatch: returns the kernel function pointer to use for these options; callers treat a NULL return as failure
+
+    extern int countpairs_mocks_s_mu_DOUBLE(const int64_t ND1, DOUBLE *theta1, DOUBLE *phi1, DOUBLE *czD1, // s-mu pair-counting entry point (notes here follow the implementation body in this patch, presumed to be this function - confirm); first dataset: two angle arrays plus cz, which is converted to comoving distance unless options->is_comoving_dist is non-zero
+                                            const int64_t ND2, DOUBLE *theta2, DOUBLE *phi2, DOUBLE *czD2, // second dataset, only read when autocorr == 0; NOTE(review): that body names the angle arrays ra/dec and hands them to check_ra_dec_cz_s_mu_DOUBLE as (phi, theta), so the theta/phi names here look swapped - confirm
+                                            const int numthreads, // given to omp_set_num_threads when compiled with OpenMP, otherwise unused
+                                            const int autocorr, // 0: pairs between dataset 1 and dataset 2; 1: pairs within dataset 1 (counts are doubled at the end)
+                                            const char *sbinfile, // passed to setup_bins to obtain the s bins
+                                            const double mu_max, // must satisfy 0 < mu_max <= 1, otherwise EXIT_FAILURE is returned
+                                            const int nmu_bins, // must be >= 1, otherwise EXIT_FAILURE is returned
+                                            const int cosmology, // passed to init_cosmology
+                                            results_countpairs_mocks_s_mu *results, // output: nsbin, nmu_bins, mu_min, mu_max, npairs, supp, savg and weightavg are set on success
+                                            struct config_options *options, struct extra_options *extra); // options is modified (e.g. sort_on_z is set to 1); extra may be NULL, which is handled as weight method NONE; returns EXIT_SUCCESS on success and a failure status otherwise
+
+    extern int check_ra_dec_cz_s_mu_DOUBLE(const int64_t N, DOUBLE *phi, DOUBLE *theta, DOUBLE *cz); // input check on one dataset; the caller in this patch passes (N, ra, dec, cz) and returns early on any status other than EXIT_SUCCESS
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_kernels.c.src b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_kernels.c.src
new file mode 100644
index 00000000..077a76e9
--- /dev/null
+++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_kernels.c.src
@@ -0,0 +1,884 @@
+// # -*- mode: c -*-
+/* File: countpairs_s_mu_mocks_kernels.c.src */
+/*
+  This file is a part of the Corrfunc package
+  Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com)
+  License: MIT LICENSE. See LICENSE file under the top-level
+  directory at https://github.com/manodeep/Corrfunc/
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include "defs.h"
+#include "function_precision.h"
+#include "utils.h"
+
+#include "weight_functions_DOUBLE.h"
+
+#if defined(__AVX__)
+#include "avx_calls.h"
+
+static inline int countpairs_s_mu_mocks_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, DOUBLE *d0, const weight_struct_DOUBLE *weights0,
+                                                              const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, DOUBLE *d1, const weight_struct_DOUBLE *weights1,
+                                                              const int same_cell,
+                                                              const int fast_divide,
+                                                              const DOUBLE smax, const DOUBLE smin, const int nsbin,const int nmu_bins,
+                                                              const DOUBLE *supp_sqr, const DOUBLE mu_max,
+                                                              DOUBLE *src_savg,
+                                                              uint64_t *src_npairs, DOUBLE *src_weightavg, const weight_method_t weight_method)
+{
+    if(N0 == 0 || N1 == 0) {
+        return EXIT_SUCCESS;
+    }
+
+    if(src_npairs == NULL) {
+        return EXIT_FAILURE;
+    }
+
+    const int32_t need_savg = src_savg != NULL;
+    const int32_t need_weightavg = src_weightavg != NULL;
+
+    const int64_t totnbins = (nmu_bins+1)*(nsbin+1);
+    const DOUBLE sqr_mumax = mu_max*mu_max;
+    const DOUBLE sqr_smax  = smax*smax;
+    const DOUBLE sqr_smin  = smin*smin;
+
+    AVX_FLOATS m_supp_sqr[nsbin];
+    AVX_FLOATS m_kbin[nsbin];
+    for(int i=0;i<nsbin;i++) {
+        m_supp_sqr[i] = AVX_SET_FLOAT(supp_sqr[i]);
+        m_kbin[i] = AVX_SET_FLOAT((DOUBLE) i);
+    }
+
+    uint64_t npairs[totnbins];
+    const DOUBLE dmu = mu_max/(DOUBLE) nmu_bins;
+    const DOUBLE inv_dmu = 1.0/dmu;
+    DOUBLE savg[totnbins], weightavg[totnbins];
+    for(int i=0;i<totnbins;i++) {
+        npairs[i] = ZERO;
+        if(need_savg) {
+            savg[i] = ZERO;
+        }
+        if(need_weightavg){
+            weightavg[i] = ZERO;
+        }
+    }
+
+    // A copy whose pointers we can advance
+    weight_struct_DOUBLE local_w0 = {.weights={NULL}, .num_weights=0},
+                         local_w1 = {.weights={NULL}, .num_weights=0};
+    pair_struct_DOUBLE pair = {.num_weights=0};
+    avx_weight_func_t_DOUBLE avx_weight_func = NULL;
+    weight_func_t_DOUBLE fallback_weight_func = NULL;
+    if(need_weightavg){
+        // Same particle list, new copy of num_weights pointers into that list
+        local_w0 = *weights0;
+        local_w1 = *weights1;
+
+        pair.num_weights = local_w0.num_weights;
+
+        avx_weight_func = get_avx_weight_func_by_method_DOUBLE(weight_method);
+        fallback_weight_func = get_weight_func_by_method_DOUBLE(weight_method);
+    }
+
+    int64_t prev_j = 0, n_off = 0;
+    for(int64_t i=0;i<N0;i++) {
+        const DOUBLE xpos = *x0++;
+        const DOUBLE ypos = *y0++;
+        const DOUBLE zpos = *z0++;
+        const DOUBLE dpos = *d0++;
+        for(int w = 0; w < pair.num_weights; w++){
+            // local_w0.weights[w] is a pointer to a float in the particle list of weights,
+            // just as x0 is a pointer into the list of x-positions.
+            // The advancement of the local_w0.weights[w] pointer should always mirror x0.
+            pair.weights0[w].a = AVX_SET_FLOAT(*(local_w0.weights[w])++);
+        }
+
+        int64_t j;
+        if(same_cell == 1) {
+            d1++; n_off++;
+            j = i+1;
+        } else {
+            for(;prev_j<N1;prev_j++) {
+                const DOUBLE dz = *d1 - dpos;
+                if(dz > -smax) break;
+                d1++; n_off++;
+            }
+            if(prev_j == N1) {
+                break;
+            }
+            j = prev_j;
+        }
+        DOUBLE *locald1 = d1;
+        DOUBLE *localx1 = x1 + n_off;
+        DOUBLE *localy1 = y1 + n_off;
+        DOUBLE *localz1 = z1 + n_off;
+        for(int w = 0; w < local_w1.num_weights; w++){
+            local_w1.weights[w] = weights1->weights[w] + n_off;
+        }
+
+        AVX_FLOATS m_xpos = AVX_SET_FLOAT(xpos);
+        AVX_FLOATS m_ypos = AVX_SET_FLOAT(ypos);
+        AVX_FLOATS m_zpos = AVX_SET_FLOAT(zpos);
+        AVX_FLOATS m_dpos = AVX_SET_FLOAT(dpos);
+        union int8 {
+            AVX_INTS m_ibin;
+            int ibin[AVX_NVEC];
+        };
+
+        union float8{
+            AVX_FLOATS m_sep;
+            DOUBLE sep[AVX_NVEC];
+        };
+
+        const AVX_FLOATS m_sqr_smax = AVX_SET_FLOAT(sqr_smax);
+        const AVX_FLOATS m_sqr_smin = AVX_SET_FLOAT(sqr_smin);
+        const AVX_FLOATS m_sqr_mumax = AVX_SET_FLOAT(sqr_mumax);
+        const AVX_FLOATS m_inv_dmu = AVX_SET_FLOAT(inv_dmu);
+        const AVX_FLOATS m_nmu_bins = AVX_SET_FLOAT((DOUBLE) nmu_bins);
+        const AVX_FLOATS m_zero = AVX_SET_FLOAT(ZERO);
+        const AVX_FLOATS m_one = AVX_SET_FLOAT((DOUBLE) 1);
+
+        for(;j<=(N1-AVX_NVEC);j+=AVX_NVEC){
+            const AVX_FLOATS m_x2 = AVX_LOAD_FLOATS_UNALIGNED(localx1);
+            const AVX_FLOATS m_y2 = AVX_LOAD_FLOATS_UNALIGNED(localy1);
+            const AVX_FLOATS m_z2 = AVX_LOAD_FLOATS_UNALIGNED(localz1);
+            const AVX_FLOATS m_d2 = AVX_LOAD_FLOATS_UNALIGNED(locald1);
+
+            localx1 += AVX_NVEC;
+            localy1 += AVX_NVEC;
+            localz1 += AVX_NVEC;
+            locald1 += AVX_NVEC;
+
+            for(int w = 0; w < pair.num_weights; w++){
+                pair.weights1[w].a = AVX_LOAD_FLOATS_UNALIGNED(local_w1.weights[w]);
+                local_w1.weights[w] += AVX_NVEC;
+            }
+
+            union float8_weights{
+                AVX_FLOATS m_weights;
+                DOUBLE weights[NVEC];
+            };
+            union float8_weights union_mweight;
+
+            const AVX_FLOATS m_perpx = AVX_SUBTRACT_FLOATS(m_xpos, m_x2);
+            const AVX_FLOATS m_perpy = AVX_SUBTRACT_FLOATS(m_ypos, m_y2);
+            const AVX_FLOATS m_perpz = AVX_SUBTRACT_FLOATS(m_zpos, m_z2);
+
+            const AVX_FLOATS m_parx = AVX_ADD_FLOATS(m_x2, m_xpos);
+            const AVX_FLOATS m_pary = AVX_ADD_FLOATS(m_y2, m_ypos);
+            const AVX_FLOATS m_parz = AVX_ADD_FLOATS(m_z2, m_zpos);
+
+            AVX_FLOATS m_sqr_mu, m_sqr_s;
+            {
+                /*
+                  //Technically l := 1/2 (v1 + v2) but the factor of 1/2 occurs both in numerator and denominator
+                  and cancels out.
+
+                  s \dot l := (parx*perpx + pary*perpy + parz*perp)
+                           := (x1 + x2)*(x1 - x2) + (y1 + y2)*(y1 - y2) + (z1 + z2)*(z1 - z2)
+                           := (x1^2 + y1^2 + z1^2) - (x2^2 + y2^2 + z2^2)
+                           := d1^2 - d2^2
+                */
+                const AVX_FLOATS m_s_dot_l =  AVX_SUBTRACT_FLOATS(AVX_SQUARE_FLOAT(m_d2), AVX_SQUARE_FLOAT(m_dpos));
+                const AVX_FLOATS m_sqr_s_dot_l = AVX_SQUARE_FLOAT(m_s_dot_l);// numerator := |s.l|^2
+                const AVX_FLOATS m_sqr_perpx = AVX_SQUARE_FLOAT(m_perpx);
+                const AVX_FLOATS m_sqr_perpy = AVX_SQUARE_FLOAT(m_perpy);
+                const AVX_FLOATS m_sqr_perpz = AVX_SQUARE_FLOAT(m_perpz);
+                m_sqr_s = AVX_ADD_FLOATS(m_sqr_perpx, AVX_ADD_FLOATS(m_sqr_perpy, m_sqr_perpz));//3-d separation
+
+                //Create a mask where s^2 < smax^2
+                const AVX_FLOATS m_mask_3d_sep = AVX_COMPARE_FLOATS(m_sqr_s, m_sqr_smax, _CMP_LT_OQ);
+                if(AVX_TEST_COMPARISON(m_mask_3d_sep) == 0) {
+                    continue;
+                }
+                const AVX_FLOATS m_sqr_norm_l = AVX_ADD_FLOATS(AVX_SQUARE_FLOAT(m_parx),
+                                                               AVX_ADD_FLOATS(AVX_SQUARE_FLOAT(m_pary),
+                                                                              AVX_SQUARE_FLOAT(m_parz)));
+
+                // \mu^2 := cos^2(\theta_between_s_and_l) = |s.l|^2 / (|s|^2 * |l|^2)
+                const AVX_FLOATS m_sqr_norm_l_norm_s = AVX_MULTIPLY_FLOATS(m_sqr_norm_l, m_sqr_s);
+                if (fast_divide == 0) {
+                    m_sqr_mu = AVX_DIVIDE_FLOATS(m_sqr_s_dot_l, m_sqr_norm_l_norm_s);
+                    //The divide is the actual operation we need
+                    // but divides are about 10x slower than multiplies. So, I am replacing it
+                    //with a approximate reciprocal in floating point
+                    // + 2 iterations of newton-raphson in case of DOUBLE
+                } else {
+                    //following blocks do an approximate reciprocal followed by two iterations of Newton-Raphson
+
+#ifndef DOUBLE_PREC
+                    //Taken from Intel's site: https://software.intel.com/en-us/articles/wiener-filtering-using-intel-advanced-vector-extensions
+                    // (which has bugs in it, just FYI). Plus, https://techblog.lankes.org/2014/06/16/avx-isnt-always-faster-then-see/
+                    __m256 rc  = _mm256_rcp_ps(m_sqr_norm_l_norm_s);
+#else
+                    //we have to do this for doubles now.
+                    //if the vrcpps instruction is not generated, there will
+                    //be a ~70 cycle performance hit from switching between
+                    //AVX and SSE modes.
+                    __m128 float_tmp1 =  _mm256_cvtpd_ps(m_sqr_norm_l_norm_s);
+                    __m128 float_inv_tmp1 = _mm_rcp_ps(float_tmp1);
+                    AVX_FLOATS rc = _mm256_cvtps_pd(float_inv_tmp1);
+#endif//DOUBLE_PREC
+
+                  //We have the double->float->approx. reciprocal->double process done.
+                  //Now improve the accuracy of the divide with newton-raphson.
+
+                  //Ist iteration of NewtonRaphson
+                  AVX_FLOATS two = AVX_SET_FLOAT((DOUBLE) 2.0);
+                  AVX_FLOATS rc1 = AVX_MULTIPLY_FLOATS(rc,
+                                                       AVX_SUBTRACT_FLOATS(two,
+                                                                           AVX_MULTIPLY_FLOATS(m_sqr_norm_l_norm_s,rc)));
+                  //2nd iteration of NewtonRaphson
+                  AVX_FLOATS rc2 = AVX_MULTIPLY_FLOATS(rc1,
+                                                       AVX_SUBTRACT_FLOATS(two,
+                                                                           AVX_MULTIPLY_FLOATS(m_sqr_norm_l_norm_s,rc1)));
+                  m_sqr_mu = AVX_MULTIPLY_FLOATS(m_sqr_s_dot_l,rc2);
+                } //end of FAST_DIVIDE
+            }
+
+            const AVX_FLOATS m_mu = AVX_SQRT_FLOAT(m_sqr_mu);
+
+            AVX_FLOATS m_mask_left;
+            //Do the mask filters in a separate scope
+            {
+                const AVX_FLOATS m_mask_mumax = AVX_COMPARE_FLOATS(m_sqr_mu,m_sqr_mumax,_CMP_LT_OQ);
+                const AVX_FLOATS m_smax_mask = AVX_COMPARE_FLOATS(m_sqr_s, m_sqr_smax, _CMP_LT_OQ);
+                const AVX_FLOATS m_smin_mask = AVX_COMPARE_FLOATS(m_sqr_s, m_sqr_smin, _CMP_GE_OQ);
+                const AVX_FLOATS m_s_mask = AVX_BITWISE_AND(m_smax_mask, m_smin_mask);
+
+                m_mask_left = AVX_BITWISE_AND(m_mask_mumax, m_s_mask);
+                if(AVX_TEST_COMPARISON(m_mask_left)==0) {
+                    continue;
+                }
+                m_sqr_s = AVX_BLEND_FLOATS_WITH_MASK(m_zero,m_sqr_s,m_mask_left);
+                m_sqr_mu  = AVX_BLEND_FLOATS_WITH_MASK(m_sqr_mumax,m_sqr_mu,m_mask_left);
+            }
+
+            union float8 union_msep;
+            if(need_savg) {
+                union_msep.m_sep = AVX_SQRT_FLOAT(m_sqr_s);
+            }
+            if(need_weightavg){
+                pair.dx.a = m_perpx;
+                pair.dy.a = m_perpy;
+                pair.dz.a = m_perpz;
+
+                pair.parx.a = m_parx;
+                pair.pary.a = m_pary;
+                pair.parz.a = m_parz;
+
+                union_mweight.m_weights = avx_weight_func(&pair);
+            }
+
+            const AVX_FLOATS m_mask = m_mask_left;
+            AVX_FLOATS m_sbin = AVX_SET_FLOAT((DOUBLE) 0);
+            for(int kbin=nsbin-1;kbin>=1;kbin--) {
+                const AVX_FLOATS m_mask_low = AVX_COMPARE_FLOATS(m_sqr_s,m_supp_sqr[kbin-1],_CMP_GE_OQ);
+                const AVX_FLOATS m_bin_mask = AVX_BITWISE_AND(m_mask_low,m_mask_left);
+                m_sbin = AVX_BLEND_FLOATS_WITH_MASK(m_sbin,m_kbin[kbin], m_bin_mask);
+                m_mask_left = AVX_COMPARE_FLOATS(m_sqr_s, m_supp_sqr[kbin-1],_CMP_LT_OQ);
+                if(AVX_TEST_COMPARISON(m_mask_left) == 0) {
+                    break;
+                }
+            }
+
+            /* Compute the 1-D index to the [sbin, mubin] := sbin*(nmu_bins+1) + mubin */
+            const AVX_FLOATS m_tmp2 = AVX_MULTIPLY_FLOATS(m_mu,m_inv_dmu);
+            const AVX_FLOATS m_mubin = AVX_BLEND_FLOATS_WITH_MASK(m_nmu_bins, m_tmp2, m_mask);
+            const AVX_FLOATS m_nmu_bins_p1 = AVX_ADD_FLOATS(m_nmu_bins,m_one);
+            const AVX_FLOATS m_binproduct = AVX_ADD_FLOATS(AVX_MULTIPLY_FLOATS(m_sbin,m_nmu_bins_p1),m_mubin);
+            union int8 union_finalbin;
+            union_finalbin.m_ibin = AVX_TRUNCATE_FLOAT_TO_INT(m_binproduct);
+
+#if  __INTEL_COMPILER
+#pragma unroll(AVX_NVEC)
+#endif
+            for(int jj=0;jj<AVX_NVEC;jj++) {
+                const int ibin=union_finalbin.ibin[jj];
+
+                npairs[ibin]++;
+                if(need_savg) {
+                    savg[ibin] += union_msep.sep[jj];
+                }
+                if(need_weightavg){
+                    const DOUBLE weight = union_mweight.weights[jj];
+                    weightavg[ibin] += weight;
+                }
+            }
+        }//AVX j loop
+
+        //Take care of the remainder
+        for(;j<N1;j++) {
+            const DOUBLE parx = xpos + *localx1;
+            const DOUBLE pary = ypos + *localy1;
+            const DOUBLE parz = zpos + *localz1;
+
+            const DOUBLE perpx = xpos - *localx1;
+            const DOUBLE perpy = ypos - *localy1;
+            const DOUBLE perpz = zpos - *localz1;
+            /*
+              s := (perpx, perpy, perpz)
+              l := 1/2 (parx, pary, parz)  //ignoring the factor 1/2 since it cancels out in both numerator and denominator
+              
+              s \dot l := (parx*perpx + pary*perpy + parz*perpz)
+                       := (x1 + x2)*(x1 - x2) + (y1 + y2)*(y1 - y2) + (z1 + z2)*(z1 - z2)
+                       := (x1^2 + y1^2 + z1^2) - (x2^2 + y2^2 + z2^2)
+                       := d1^2 - d2^2
+            */
+            const DOUBLE s_dot_l = dpos*dpos - (*locald1) * (*locald1);// s \dot l
+
+            localx1++;localy1++;localz1++;locald1++;
+
+            for(int w = 0; w < pair.num_weights; w++){
+                pair.weights1[w].d = *local_w1.weights[w]++;
+            }
+
+
+            const DOUBLE sqr_s = perpx*perpx + perpy*perpy + perpz*perpz;
+            if(sqr_s >= sqr_smax || sqr_s < sqr_smin) continue;
+
+            const DOUBLE norm_l = (parx*parx + pary*pary + parz*parz);// := |l|^2
+            const DOUBLE sqr_s_dot_l = s_dot_l * s_dot_l;
+            const DOUBLE sqr_mu = sqr_s_dot_l/(norm_l * sqr_s);
+            const int mubin  = (sqr_mu >= sqr_mumax) ? nmu_bins:(int) (SQRT(sqr_mu)*inv_dmu);
+            DOUBLE s, pairweight;
+            if(need_savg) {
+                s = SQRT(sqr_s);
+            }
+            if(need_weightavg){
+                pair.dx.d = perpx;
+                pair.dy.d = perpy;
+                pair.dz.d = perpz;
+
+                pair.parx.d = parx;
+                pair.pary.d = pary;
+                pair.parz.d = parz;
+
+                pairweight = fallback_weight_func(&pair);
+            }
+
+            for(int kbin=nsbin-1;kbin>=1;kbin--) {
+                if(sqr_s >= supp_sqr[kbin-1]) {
+                    const int ibin = kbin*(nmu_bins+1) + mubin;
+                    npairs[ibin]++;
+                    if(need_savg) {
+                        savg[ibin] += s;
+                    }
+                    if(need_weightavg){
+                        weightavg[ibin] += pairweight;
+                    }
+                    break;
+                }
+            }
+        }//remainder jloop
+    }//i-loop
+
+    for(int i=0;i<totnbins;i++) {
+        src_npairs[i] += npairs[i];
+        if(need_savg) {
+            src_savg[i] += savg[i];
+        }
+        if(need_weightavg) {
+            src_weightavg[i] += weightavg[i];
+        }
+    }
+    return EXIT_SUCCESS;
+}
+#endif //AVX
+
+
+#if defined(__SSE4_2__)
+#include "sse_calls.h"
+
+static inline int countpairs_s_mu_mocks_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, DOUBLE *d0, const weight_struct_DOUBLE *weights0,
+                                                              const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, DOUBLE *d1, const weight_struct_DOUBLE *weights1,
+                                                              const int same_cell, /* when 1, particle i is only paired with j > i */
+                                                              const int fast_divide, /* ignored by this kernel */
+                                                              const DOUBLE smax, const DOUBLE smin, const int nsbin,
+                                                              const int nmu_bins, const DOUBLE *supp_sqr, const DOUBLE mu_max,
+                                                              DOUBLE *src_savg, uint64_t *src_npairs,
+                                                              DOUBLE *src_weightavg, const weight_method_t weight_method)
+{
+    if(N0 == 0 || N1 == 0) {
+        return EXIT_SUCCESS;
+    }
+    if(src_npairs == NULL) {
+        return EXIT_FAILURE;
+    }
+    /* src_savg and src_weightavg are optional outputs: a NULL pointer switches that accumulation off */
+    const int32_t need_savg = src_savg != NULL;
+    const int32_t need_weightavg = src_weightavg != NULL;
+    (void) fast_divide; //unused
+    /* Histogram is flattened as sbin*(nmu_bins+1) + mubin; mubin == nmu_bins is the overflow slot for pairs failing the mu cut */
+    const int64_t totnbins = (nmu_bins+1)*(nsbin+1);
+    const DOUBLE sqr_mumax = mu_max*mu_max;
+    const DOUBLE sqr_smax  = smax*smax;
+    const DOUBLE sqr_smin  = smin*smin;
+    /* Broadcast each squared bin edge and each bin index once; reused by the vectorised bin search below */
+    SSE_FLOATS m_supp_sqr[nsbin];
+    SSE_FLOATS m_kbin[nsbin];
+    for(int i=0;i<nsbin;i++) {
+        m_supp_sqr[i] = SSE_SET_FLOAT(supp_sqr[i]);
+        m_kbin[i] = SSE_SET_FLOAT((DOUBLE) i);
+    }
+    /* Call-local accumulators; they are added into the src_* arrays just before returning */
+    uint64_t npairs[totnbins];
+    const DOUBLE dmu = mu_max/(DOUBLE) nmu_bins;
+    const DOUBLE inv_dmu = 1.0/dmu;
+    DOUBLE savg[totnbins], weightavg[totnbins];
+    for(int64_t i=0;i<totnbins;i++) {
+        npairs[i] = ZERO;
+        if (need_savg) {
+            savg[i] = ZERO;
+        }
+        if(need_weightavg){
+            weightavg[i] = ZERO;
+        }
+    }
+
+    // A copy whose pointers we can advance
+    weight_struct_DOUBLE local_w0 = {.weights={NULL}, .num_weights=0},
+                         local_w1 = {.weights={NULL}, .num_weights=0};
+    pair_struct_DOUBLE pair = {.num_weights=0};
+    sse_weight_func_t_DOUBLE sse_weight_func = NULL;
+    weight_func_t_DOUBLE fallback_weight_func = NULL;
+    if(need_weightavg){
+      // Same particle list, new copy of num_weights pointers into that list
+      local_w0 = *weights0;
+      local_w1 = *weights1;
+
+      pair.num_weights = local_w0.num_weights;
+
+      sse_weight_func = get_sse_weight_func_by_method_DOUBLE(weight_method);
+      fallback_weight_func = get_weight_func_by_method_DOUBLE(weight_method);
+    }
+    /* prev_j and n_off persist across i-iterations: they record how far into the second list we have already skipped */
+    int64_t prev_j=0, n_off = 0;
+    for(int64_t i=0;i<N0;i++) {
+        const DOUBLE xpos = *x0++;
+        const DOUBLE ypos = *y0++;
+        const DOUBLE zpos = *z0++;
+        const DOUBLE dpos = *d0++;
+        for(int w = 0; w < pair.num_weights; w++){
+            // local_w0.weights[w] is a pointer to a float in the particle list of weights,
+            // just as x0 is a pointer into the list of x-positions.
+            // The advancement of the local_w0.weights[w] pointer should always mirror x0.
+            pair.weights0[w].s = SSE_SET_FLOAT(*local_w0.weights[w]++);
+        }
+        /* Pick the first index j of the second list that still has to be visited for this i */
+        int64_t j;
+        if(same_cell == 1) {
+            d1++; n_off++;
+            j = i+1;
+        } else {
+            for(;prev_j<N1;prev_j++) {
+                const DOUBLE dz = *d1 - dpos;
+                if(dz > -smax) break;
+                d1++; n_off++;
+            }
+            if(prev_j == N1) {
+                break;
+            }
+            j = prev_j;
+        }
+        DOUBLE *locald1 = d1;
+        DOUBLE *localx1 = x1 + n_off;
+        DOUBLE *localy1 = y1 + n_off;
+        DOUBLE *localz1 = z1 + n_off;
+        for(int w = 0; w < local_w1.num_weights; w++){
+            local_w1.weights[w] = weights1->weights[w] + n_off;
+        }
+
+        const SSE_FLOATS m_xpos = SSE_SET_FLOAT(xpos);
+        const SSE_FLOATS m_ypos = SSE_SET_FLOAT(ypos);
+        const SSE_FLOATS m_zpos = SSE_SET_FLOAT(zpos);
+        const SSE_FLOATS m_dpos = SSE_SET_FLOAT(dpos);
+        /* Unions used to read the individual lanes of a SIMD register as plain scalars */
+        union int8 {
+            SSE_INTS m_ibin;
+            int ibin[SSE_NVEC];
+        };
+
+
+        union float8{
+            SSE_FLOATS m_sep;
+            DOUBLE sep[SSE_NVEC];
+        };
+
+        const SSE_FLOATS m_sqr_smax = SSE_SET_FLOAT(sqr_smax);
+        const SSE_FLOATS m_sqr_smin = SSE_SET_FLOAT(sqr_smin);
+        const SSE_FLOATS m_sqr_mumax = SSE_SET_FLOAT(sqr_mumax);
+        const SSE_FLOATS m_inv_dmu = SSE_SET_FLOAT(inv_dmu);
+        const SSE_FLOATS m_nmu_bins = SSE_SET_FLOAT((DOUBLE) nmu_bins);
+        const SSE_FLOATS m_zero = SSE_SET_FLOAT(ZERO);
+        const SSE_FLOATS m_one = SSE_SET_FLOAT((DOUBLE) 1);
+        /* Vectorised part: SSE_NVEC particles of the second list per iteration */
+        for(;j<=(N1-SSE_NVEC);j+=SSE_NVEC){
+            const SSE_FLOATS m_x2 = SSE_LOAD_FLOATS_UNALIGNED(localx1);
+            const SSE_FLOATS m_y2 = SSE_LOAD_FLOATS_UNALIGNED(localy1);
+            const SSE_FLOATS m_z2 = SSE_LOAD_FLOATS_UNALIGNED(localz1);
+            const SSE_FLOATS m_d2 = SSE_LOAD_FLOATS_UNALIGNED(locald1);
+
+            localx1 += SSE_NVEC;
+            localy1 += SSE_NVEC;
+            localz1 += SSE_NVEC;
+            locald1 += SSE_NVEC;
+
+            for(int w = 0; w < pair.num_weights; w++){
+                pair.weights1[w].s = SSE_LOAD_FLOATS_UNALIGNED(local_w1.weights[w]);
+                local_w1.weights[w] += SSE_NVEC;
+            }
+
+            union float4_weights{
+                SSE_FLOATS m_weights;
+                DOUBLE weights[SSE_NVEC];
+            };
+            union float4_weights union_mweight;
+            /* s := v1 - v2 (separation vector) */
+            const SSE_FLOATS m_perpx = SSE_SUBTRACT_FLOATS(m_xpos, m_x2);
+            const SSE_FLOATS m_perpy = SSE_SUBTRACT_FLOATS(m_ypos, m_y2);
+            const SSE_FLOATS m_perpz = SSE_SUBTRACT_FLOATS(m_zpos, m_z2);
+            /* l := v1 + v2 (the factor 1/2 of the mid-point vector cancels in mu) */
+            const SSE_FLOATS m_parx = SSE_ADD_FLOATS(m_x2, m_xpos);
+            const SSE_FLOATS m_pary = SSE_ADD_FLOATS(m_y2, m_ypos);
+            const SSE_FLOATS m_parz = SSE_ADD_FLOATS(m_z2, m_zpos);
+
+            SSE_FLOATS m_sqr_s, m_sqr_mu;
+            {
+                const SSE_FLOATS m_s_dot_l =  SSE_SUBTRACT_FLOATS(SSE_SQUARE_FLOAT(m_d2), SSE_SQUARE_FLOAT(m_dpos));
+                /* sign of s.l is opposite to the scalar loop (d2^2 - d1^2) but only its square is used */
+                const SSE_FLOATS m_sqr_s_dot_l = SSE_SQUARE_FLOAT(m_s_dot_l);
+                const SSE_FLOATS m_sqr_perpx = SSE_SQUARE_FLOAT(m_perpx);
+                const SSE_FLOATS m_sqr_perpy = SSE_SQUARE_FLOAT(m_perpy);
+                const SSE_FLOATS m_sqr_perpz = SSE_SQUARE_FLOAT(m_perpz);
+                m_sqr_s = SSE_ADD_FLOATS(m_sqr_perpx, SSE_ADD_FLOATS(m_sqr_perpy, m_sqr_perpz));//3-d separation
+
+                const SSE_FLOATS m_mask_3d_sep = SSE_COMPARE_FLOATS_LT(m_sqr_s, m_sqr_smax);
+                const SSE_FLOATS m_sqr_norm_l = SSE_ADD_FLOATS(SSE_SQUARE_FLOAT(m_parx), SSE_ADD_FLOATS(SSE_SQUARE_FLOAT(m_pary), SSE_SQUARE_FLOAT(m_parz)));
+                /* Skip the whole vector when no lane is closer than smax */
+                if(SSE_TEST_COMPARISON(m_mask_3d_sep)==0) {
+                    continue;
+                }
+
+                // \mu^2 := cos^2(angle between s and l) = |s.l|^2 / (|s|^2 * |l|^2)
+                const SSE_FLOATS m_sqr_norm_l_norm_s = SSE_MULTIPLY_FLOATS(m_sqr_norm_l, m_sqr_s);
+                m_sqr_mu = SSE_DIVIDE_FLOATS(m_sqr_s_dot_l,m_sqr_norm_l_norm_s);
+            }
+
+
+            const SSE_FLOATS m_mu = SSE_SQRT_FLOAT(m_sqr_mu);
+
+            SSE_FLOATS m_mask_left;
+            //Do the mask filters in a separate scope
+            {
+                const SSE_FLOATS m_mask_mumax = SSE_COMPARE_FLOATS_LT(m_sqr_mu,m_sqr_mumax);
+                const SSE_FLOATS m_smax_mask = SSE_COMPARE_FLOATS_LT(m_sqr_s, m_sqr_smax);
+                const SSE_FLOATS m_smin_mask = SSE_COMPARE_FLOATS_GE(m_sqr_s, m_sqr_smin);
+                const SSE_FLOATS m_s_mask = SSE_BITWISE_AND(m_smax_mask,m_smin_mask);
+
+                m_mask_left = SSE_BITWISE_AND(m_mask_mumax, m_s_mask);
+                if(SSE_TEST_COMPARISON(m_mask_left)==0) {
+                    continue;
+                }
+                /* NOTE(review): the blend presumably keeps its 2nd argument where the mask is set, i.e. rejected lanes get s^2=0, mu^2=mumax^2 -- confirm in sse_calls.h */
+                m_sqr_s = SSE_BLEND_FLOATS_WITH_MASK(m_zero,m_sqr_s,m_mask_left);
+                m_sqr_mu  = SSE_BLEND_FLOATS_WITH_MASK(m_sqr_mumax,m_sqr_mu,m_mask_left);
+            }
+            union float8 union_msep;
+            if(need_savg) {
+                union_msep.m_sep = SSE_SQRT_FLOAT(m_sqr_s);
+            }
+            if(need_weightavg){
+                pair.dx.s = m_perpx;
+                pair.dy.s = m_perpy;
+                pair.dz.s = m_perpz;
+
+                pair.parx.s = m_parx;
+                pair.pary.s = m_pary;
+                pair.parz.s = m_parz;
+
+                union_mweight.m_weights = sse_weight_func(&pair);
+            }
+            /* Vectorised s-bin search, walking from the outermost bin inwards; m_mask keeps the original accept mask */
+            const SSE_FLOATS m_mask = m_mask_left;
+            SSE_FLOATS m_sbin = SSE_SET_FLOAT((DOUBLE) 0);
+            for(int kbin=nsbin-1;kbin>=1;kbin--) {
+                const SSE_FLOATS m_mask_low = SSE_COMPARE_FLOATS_GE(m_sqr_s,m_supp_sqr[kbin-1]);
+                const SSE_FLOATS m_bin_mask = SSE_BITWISE_AND(m_mask_low,m_mask_left);
+                m_sbin = SSE_BLEND_FLOATS_WITH_MASK(m_sbin,m_kbin[kbin], m_bin_mask);
+                m_mask_left = SSE_COMPARE_FLOATS_LT(m_sqr_s, m_supp_sqr[kbin-1]);
+                if(SSE_TEST_COMPARISON(m_mask_left) == 0) {
+                    break;
+                }
+            }
+
+            /* Compute the 1-D index to the [sbin, mubin] := sbin*(nmu_bins+1) + mubin */
+            const SSE_FLOATS m_tmp2 = SSE_MULTIPLY_FLOATS(m_mu,m_inv_dmu);
+            const SSE_FLOATS m_mubin = SSE_BLEND_FLOATS_WITH_MASK(m_nmu_bins, m_tmp2, m_mask);
+            const SSE_FLOATS m_nmu_bins_p1 = SSE_ADD_FLOATS(m_nmu_bins,m_one);
+            const SSE_FLOATS m_binproduct = SSE_ADD_FLOATS(SSE_MULTIPLY_FLOATS(m_sbin,m_nmu_bins_p1),m_mubin);
+            union int8 union_finalbin;
+            union_finalbin.m_ibin = SSE_TRUNCATE_FLOAT_TO_INT(m_binproduct);
+            /* Scatter the SSE_NVEC lanes into the histogram one by one */
+#if  __INTEL_COMPILER
+#pragma unroll(SSE_NVEC)
+#endif
+            for(int jj=0;jj<SSE_NVEC;jj++) {
+                const int ibin=union_finalbin.ibin[jj];
+
+                npairs[ibin]++;
+                if(need_savg) {
+                    savg[ibin] += union_msep.sep[jj];
+                }
+                if(need_weightavg){
+                    const DOUBLE weight = union_mweight.weights[jj];
+                    weightavg[ibin] += weight;
+                }
+            }
+        }//SSE j loop
+
+        //Take care of the remainder
+        for(;j<N1;j++) {
+            const DOUBLE parx = xpos + *localx1;
+            const DOUBLE pary = ypos + *localy1;
+            const DOUBLE parz = zpos + *localz1;
+
+            const DOUBLE perpx = xpos - *localx1;
+            const DOUBLE perpy = ypos - *localy1;
+            const DOUBLE perpz = zpos - *localz1;
+
+            //parx*perpx + pary*perpy + parz*perpz == (x1^2 + y1^2 + z1^2) - (x2^2 + y2^2 + z2^2) == d1^2 - d2^2
+            const DOUBLE s_dot_l = dpos*dpos - (*locald1) * (*locald1);
+            localx1++;localy1++;localz1++;locald1++;
+
+            for(int w = 0; w < pair.num_weights; w++){
+                pair.weights1[w].d = *local_w1.weights[w]++;
+            }
+
+            const DOUBLE sqr_s = perpx*perpx + perpy*perpy + perpz*perpz;
+            if(sqr_s >= sqr_smax || sqr_s < sqr_smin) continue;
+            /* sqr_s may be 0 here when smin == 0 (coincident points): sqr_mu is then 0/0 = NaN, handled by the '<' test below */
+            const DOUBLE norm_l = (parx*parx + pary*pary + parz*parz);
+            const DOUBLE sqr_s_dot_l = s_dot_l * s_dot_l;
+            const DOUBLE sqr_mu = sqr_s_dot_l/(norm_l * sqr_s);
+            const int mubin  = (sqr_mu < sqr_mumax) ? (int) (SQRT(sqr_mu)*inv_dmu):nmu_bins;// '<' is false for NaN, so NaN goes to the overflow bin instead of an undefined NaN->int cast
+            DOUBLE s = ZERO, pairweight = ZERO;
+            if(need_savg) {
+                s = SQRT(sqr_s);
+            }
+            if(need_weightavg){
+                pair.dx.d = perpx;
+                pair.dy.d = perpy;
+                pair.dz.d = perpz;
+
+                pair.parx.d = parx;
+                pair.pary.d = pary;
+                pair.parz.d = parz;
+
+                pairweight = fallback_weight_func(&pair);
+            }
+
+
+            for(int kbin=nsbin-1;kbin>=1;kbin--) {
+                if(sqr_s >= supp_sqr[kbin-1]) {
+                    const int ibin = kbin*(nmu_bins+1) + mubin;
+                    npairs[ibin]++;
+                    if(need_savg){
+                        savg[ibin] += s;
+                    }
+                    if(need_weightavg){
+                        weightavg[ibin] += pairweight;
+                    }
+                    break;
+                }
+            }
+        }//remainder jloop
+    }//i-loop
+
+    for(int64_t i=0;i<totnbins;i++) {
+        src_npairs[i] += npairs[i];
+        if(need_savg) {
+            src_savg[i] += savg[i];
+        }
+        if(need_weightavg) {
+            src_weightavg[i] += weightavg[i];
+        }
+    }
+
+    return EXIT_SUCCESS;
+}
+#endif //SSE4.2 defined
+
+
+
+static inline int countpairs_s_mu_mocks_fallback_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, DOUBLE *d0, const weight_struct_DOUBLE *weights0,
+                                                        const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, DOUBLE *d1, const weight_struct_DOUBLE *weights1,
+                                                        const int same_cell, /* when 1, particle i is only paired with j > i */
+                                                        const int fast_divide, /* ignored by this kernel */
+                                                        const DOUBLE smax, const DOUBLE smin, const int nsbin,
+                                                        const int nmu_bins, const DOUBLE *supp_sqr, const DOUBLE mu_max,
+                                                        DOUBLE *src_savg, uint64_t *src_npairs,
+                                                        DOUBLE *src_weightavg, const weight_method_t weight_method)
+{
+    if(N0 == 0 || N1 == 0) {
+        return EXIT_SUCCESS;
+    }
+    /* The pair-count output is mandatory; the other two outputs are optional */
+    if(src_npairs == NULL) {
+        return EXIT_FAILURE;
+    }
+    /* A NULL src_savg / src_weightavg switches the corresponding accumulation off */
+    const int32_t need_savg = src_savg != NULL;
+    const int32_t need_weightavg = src_weightavg != NULL;
+
+    (void) fast_divide;//unused parameter but required to keep the same function signature amongst the kernels
+
+    const DOUBLE sqr_smax  = smax*smax;
+    const DOUBLE sqr_smin  = smin*smin;
+    const DOUBLE sqr_mumax = mu_max*mu_max;
+
+    /*----------------- FALLBACK CODE --------------------*/
+    const int64_t totnbins = (nmu_bins+1)*(nsbin+1);
+    /* Histogram is flattened as sbin*(nmu_bins+1) + mubin; mubin == nmu_bins is the overflow slot for pairs failing the mu cut */
+    uint64_t npairs[totnbins];
+    DOUBLE savg[totnbins], weightavg[totnbins];
+    for(int64_t i=0;i<totnbins;i++) {
+        npairs[i] = ZERO;
+        if(need_savg) {
+            savg[i]=ZERO;
+        }
+        if(need_weightavg){
+            weightavg[i]=ZERO;
+        }
+    }
+
+    // A copy whose pointers we can advance
+    weight_struct_DOUBLE local_w0 = {.weights={NULL}, .num_weights=0},
+                         local_w1 = {.weights={NULL}, .num_weights=0};
+    pair_struct_DOUBLE pair = {.num_weights=0};
+    weight_func_t_DOUBLE weight_func = NULL;
+    if(need_weightavg){
+        // Same particle list, new copy of num_weights pointers into that list
+        local_w0 = *weights0;
+        local_w1 = *weights1;
+        pair.num_weights = local_w0.num_weights;
+        weight_func = get_weight_func_by_method_DOUBLE(weight_method);
+    }
+
+    const DOUBLE dmu = mu_max/(DOUBLE) nmu_bins;
+    const DOUBLE inv_dmu = 1.0/dmu;
+    /* nleft / n_off persist across i-iterations: particles still to visit in, and already skipped from, the second list */
+    int64_t nleft=N1, n_off = 0;
+    for(int64_t i=0;i<N0;i++) {
+        const DOUBLE xpos = *x0++;
+        const DOUBLE ypos = *y0++;
+        const DOUBLE zpos = *z0++;
+        const DOUBLE dpos = *d0++;//d is the co-moving distance
+        for(int w = 0; w < pair.num_weights; w++){
+            pair.weights0[w].d = *local_w0.weights[w]++;
+        }
+
+        /* If in the same cell, unique pairs are guaranteed by not including the current particle */
+        if(same_cell == 1) {
+            d1++; n_off++;
+            nleft--;
+        } else {
+            /* For a different cell, all pairs are unique pairs, since two cells are only opened for pairs once (accounted for in the assign_ngb_cells function)*/
+            while(nleft > 0) {
+                /*Particles are sorted on 'd', in increasing order */
+                const DOUBLE dz = *d1 - dpos;
+                if(dz > -smax) break;
+                d1++; n_off++;
+                nleft--;
+            }
+            /*If no particle in the second cell satisfies distance constraints on 'dz' for the current 'i'th particle in first cell,
+              then there can be no more pairs from any particles in the first cell (since the first cell is also sorted in increasing order in 'd')
+             */
+            if(nleft == 0) {
+                i=N0;
+                break;
+            }
+        }
+        /* Cursors into the second list, starting at the first particle not yet skipped */
+        DOUBLE *localx1 = x1 + n_off;
+        DOUBLE *localy1 = y1 + n_off;
+        DOUBLE *localz1 = z1 + n_off;
+        DOUBLE *locald1 = d1;
+        for(int w = 0; w < pair.num_weights; w++){
+            local_w1.weights[w] = weights1->weights[w] + n_off;
+        }
+
+        for(int64_t j=0;j<nleft;j++){
+            const DOUBLE parx = xpos + *localx1;
+            const DOUBLE pary = ypos + *localy1;
+            const DOUBLE parz = zpos + *localz1;
+
+            const DOUBLE perpx = xpos - *localx1;
+            const DOUBLE perpy = ypos - *localy1;
+            const DOUBLE perpz = zpos - *localz1;
+
+            //parx*perpx + pary*perpy + parz*perpz == (x1^2 + y1^2 + z1^2) - (x2^2 + y2^2 + z2^2) == d1^2 - d2^2
+            const DOUBLE s_dot_l = dpos*dpos - (*locald1) * (*locald1);
+            localx1++;localy1++;localz1++;locald1++;
+
+            for(int w = 0; w < pair.num_weights; w++){
+                pair.weights1[w].d = *local_w1.weights[w]++;
+            }
+
+            const DOUBLE sqr_s = perpx*perpx + perpy*perpy + perpz*perpz;
+            if(sqr_s >= sqr_smax || sqr_s < sqr_smin) continue;
+            /* sqr_s may be 0 here when smin == 0 (coincident points): sqr_mu is then 0/0 = NaN, handled by the '<' test below */
+            const DOUBLE sqr_l = (parx*parx + pary*pary + parz*parz);
+            const DOUBLE sqr_s_dot_l = s_dot_l * s_dot_l;
+            const DOUBLE sqr_mu = sqr_s_dot_l/(sqr_l * sqr_s);
+            const int mubin  = (sqr_mu < sqr_mumax) ? (int) (SQRT(sqr_mu)*inv_dmu):nmu_bins;// '<' is false for NaN, so NaN goes to the overflow bin instead of an undefined NaN->int cast
+            DOUBLE s = ZERO, pairweight = ZERO;
+            if(need_savg) {
+                s = SQRT(sqr_s);
+            }
+            if(need_weightavg){
+                pair.dx.d = perpx;
+                pair.dy.d = perpy;
+                pair.dz.d = perpz;
+
+                pair.parx.d = parx;
+                pair.pary.d = pary;
+                pair.parz.d = parz;
+
+                pairweight = weight_func(&pair);
+            }
+            /* Search the s-bins from the outermost inwards; the first lower edge at or below sqr_s wins */
+            for(int kbin=nsbin-1;kbin>=1;kbin--) {
+                if(sqr_s >= supp_sqr[kbin-1]) {
+                    const int ibin = kbin*(nmu_bins+1) + mubin;
+                    npairs[ibin]++;
+                    if(need_savg) {
+                        savg[ibin]+=s;
+                    }
+                    if(need_weightavg){
+                        weightavg[ibin] += pairweight;
+                    }
+                    break;
+                }
+            }//finding kbin
+        }//j loop over second set of particles
+    }//i loop over first set of particles
+    /* Merge the call-local histograms into the caller's accumulators */
+    for(int64_t i=0;i<totnbins;i++) {
+        src_npairs[i] += npairs[i];
+        if(need_savg) {
+            src_savg[i] += savg[i];
+        }
+        if(need_weightavg){
+            src_weightavg[i] += weightavg[i];
+        }
+    }
+
+    return EXIT_SUCCESS;
+}//end of fallback code
diff --git a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src
index de1e610a..1d0f5dfc 100644
--- a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src
+++ b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src
@@ -489,7 +489,8 @@ int countpairs_theta_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1,
         options->bin_refine_factors[1]=numthreads;
     }
 #endif
-    for(int i=0;i<3;i++) {
+    /* Only check the ra and dec bin refine factors (not all 3 bin refs)*/
+    for(int i=0;i<2;i++) {
         if(options->bin_refine_factors[i] < 1) {
             fprintf(stderr,"Warning: bin refine factor along axis = %d *must* be >=1. Instead found bin refine factor =%d\n",
                     i, options->bin_refine_factors[i]);
diff --git a/mocks/Makefile b/mocks/Makefile
index 177b3c6d..44853a5a 100644
--- a/mocks/Makefile
+++ b/mocks/Makefile
@@ -1,28 +1,28 @@
 include ../mocks.options ../common.mk
 
-TARGETS:= dirs DDrppi_mocks DDtheta_mocks vpf_mocks examples
+TARGETS:= dirs DDrppi_mocks DDtheta_mocks DDsmu_mocks vpf_mocks examples
 ifneq ($(COMPILE_PYTHON_EXT), 0)
   TARGETS += python_bindings
 else
   $(warning $(ccmagenta) Not compiling C extensions for mocks. Either python or numpy not available $(ccreset))
 endif
 
-all: $(TARGETS) 
+all: $(TARGETS)
 
 dirs: | ../lib ../bin ../include
 
 ../lib ../bin ../include:
 	mkdir -p $@
 
-.PHONY: clean celna clena celan $(TARGETS) tests distclean realclean distclena realclena dirs test  python_bindings libs all 
+.PHONY: clean celna clena celan $(TARGETS) tests distclean realclean distclena realclena dirs test  python_bindings libs all
 
-DDrppi_mocks DDtheta_mocks vpf_mocks:
+DDrppi_mocks DDtheta_mocks vpf_mocks DDsmu_mocks:
 	$(MAKE) -C $@
 
 examples: libs
 	$(MAKE) -C examples
 
-python_bindings: libs 
+python_bindings: libs
 	$(MAKE) -C $@
 
 distclean:realclean
@@ -32,6 +32,7 @@ realclena:realclean
 realclean:
 	$(MAKE) clean
 	$(MAKE) -C DDrppi_mocks distclean
+	$(MAKE) -C DDsmu_mocks distclean
 	$(MAKE) -C DDtheta_mocks distclean
 	$(MAKE) -C vpf_mocks distclean
 	$(MAKE) -C python_bindings distclean
@@ -40,6 +41,7 @@ realclean:
 
 clean:
 	$(MAKE) -C DDrppi_mocks clean
+	$(MAKE) -C DDsmu_mocks clean
 	$(MAKE) -C DDtheta_mocks clean
 	$(MAKE) -C vpf_mocks clean
 	$(MAKE) -C examples clean
@@ -50,18 +52,19 @@ clena: clean
 celan: clean
 celna: clean
 
-install: examples | dirs 
+install: examples | dirs
 	$(MAKE) -C DDrppi_mocks install
+	$(MAKE) -C DDsmu_mocks install
 	$(MAKE) -C DDtheta_mocks install
 	$(MAKE) -C vpf_mocks install
 	$(MAKE) -C python_bindings install
 
 libs:  | dirs
 	$(MAKE) -C DDrppi_mocks lib
+	$(MAKE) -C DDsmu_mocks lib
 	$(MAKE) -C DDtheta_mocks lib
 	$(MAKE) -C vpf_mocks lib
 
 test: tests
 tests:
 	$(MAKE) -C tests
-
diff --git a/mocks/examples/Makefile b/mocks/examples/Makefile
index 3047cfbf..230a7d16 100644
--- a/mocks/examples/Makefile
+++ b/mocks/examples/Makefile
@@ -7,10 +7,12 @@ DATA_DIR := ../tests/data
 
 MOCKS_DIR := $(ROOT_DIR)/mocks
 DDrppi_mocks_DIR := $(MOCKS_DIR)/DDrppi_mocks
+DDsmu_mocks_DIR := $(MOCKS_DIR)/DDsmu_mocks
 DDTHETA_mocks_DIR := $(MOCKS_DIR)/DDtheta_mocks
 VPF_mocks_DIR := $(MOCKS_DIR)/vpf_mocks
 
 DDrppi_mocks_LIB := countpairs_rp_pi_mocks
+DDsmu_mocks_LIB := countpairs_s_mu_mocks
 DDTHETA_mocks_LIB := countpairs_theta_mocks
 VPF_mocks_LIB := countspheres_mocks
 
@@ -20,14 +22,16 @@ TARGET := run_correlations_mocks
 TARGETSRC   := $(TARGET).c $(IO_DIR)/ftread.c $(IO_DIR)/io.c  $(UTILS_DIR)/utils.c $(UTILS_DIR)/progressbar.c \
                $(UTILS_DIR)/cosmology_params.c 
 TARGETOBJS  := $(TARGETSRC:.c=.o)
-C_LIBRARIES := $(DDrppi_mocks_DIR)/lib$(DDrppi_mocks_LIB).a $(DDTHETA_mocks_DIR)/lib$(DDTHETA_mocks_LIB).a $(VPF_mocks_DIR)/lib$(VPF_mocks_LIB).a
-INCL :=	$(DDrppi_mocks_DIR)/$(DDrppi_mocks_LIB).h $(DDTHETA_mocks_DIR)/$(DDTHETA_mocks_LIB).h $(VPF_mocks_DIR)/$(VPF_mocks_LIB).h \
+C_LIBRARIES := $(DDrppi_mocks_DIR)/lib$(DDrppi_mocks_LIB).a $(DDsmu_mocks_DIR)/lib$(DDsmu_mocks_LIB).a \
+               $(DDTHETA_mocks_DIR)/lib$(DDTHETA_mocks_LIB).a $(VPF_mocks_DIR)/lib$(VPF_mocks_LIB).a 
+INCL :=	$(DDrppi_mocks_DIR)/$(DDrppi_mocks_LIB).h $(DDsmu_mocks_DIR)/$(DDsmu_mocks_LIB).h \
+        $(DDTHETA_mocks_DIR)/$(DDTHETA_mocks_LIB).h $(VPF_mocks_DIR)/$(VPF_mocks_LIB).h \
         $(UTILS_DIR)/defs.h $(IO_DIR)/io.h $(IO_DIR)/ftread.h \
         $(UTILS_DIR)/utils.h $(UTILS_DIR)/gridlink_mocks_impl_double.h $(UTILS_DIR)/gridlink_mocks_impl_float.h \
 	$(UTILS_DIR)/function_precision.h $(UTILS_DIR)/cellarray_mocks_double.h $(UTILS_DIR)/cellarray_mocks_float.h \
         $(UTILS_DIR)/progressbar.h $(UTILS_DIR)/cosmology_params.h 
 
-LIBRARY_INCL := -I$(DDrppi_mocks_DIR) -I$(DDTHETA_mocks_DIR) -I$(VPF_mocks_DIR)
+LIBRARY_INCL := -I$(DDrppi_mocks_DIR) -I$(DDsmu_mocks_DIR) -I$(DDTHETA_mocks_DIR) -I$(VPF_mocks_DIR)
 
 all: $(TARGET) $(TARGETSRC) $(C_LIBRARIES) $(INCL) $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk Makefile
 
@@ -39,6 +43,9 @@ $(TARGET):$(C_LIBRARIES)
 $(DDrppi_mocks_DIR)/lib$(DDrppi_mocks_LIB).a: $(DDrppi_mocks_DIR)/*.c $(DDrppi_mocks_DIR)/*.c.src $(DDrppi_mocks_DIR)/*.h.src $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk
 	$(MAKE) -C $(DDrppi_mocks_DIR) libs
 
+$(DDsmu_mocks_DIR)/lib$(DDsmu_mocks_LIB).a: $(DDsmu_mocks_DIR)/*.c $(DDsmu_mocks_DIR)/*.c.src $(DDsmu_mocks_DIR)/*.h.src $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk
+	$(MAKE) -C $(DDsmu_mocks_DIR) libs
+
 $(DDTHETA_mocks_DIR)/lib$(DDTHETA_mocks_LIB).a: $(DDTHETA_mocks_DIR)/*.c $(DDTHETA_mocks_DIR)/*.c.src $(DDTHETA_mocks_DIR)/*.h.src $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk
 	$(MAKE) -C $(DDTHETA_mocks_DIR) libs
 
diff --git a/mocks/examples/run_correlations_mocks.c b/mocks/examples/run_correlations_mocks.c
index a0930b6b..0c9248cf 100644
--- a/mocks/examples/run_correlations_mocks.c
+++ b/mocks/examples/run_correlations_mocks.c
@@ -26,6 +26,7 @@
 
 /* Library proto-types + struct definitions in the ../..//include directory */
 #include "countpairs_rp_pi_mocks.h"
+#include "countpairs_s_mu_mocks.h"
 #include "countpairs_theta_mocks.h"
 #include "countspheres_mocks.h"
 
@@ -45,6 +46,8 @@ void Printhelp(void)
     fprintf(stderr,"     * binfile      = name of ascii file containing the r-bins (rmin rmax for each bin)\n") ;
     fprintf(stderr,"     * pimax        = pimax   (in same units as X/Y/Z of the data)\n");
     fprintf(stderr,"     * cosmology    = flag to pick-up the cosmology combination to use (set as an array of combinations in ../utils/cosmology_params.c)\n");
+    fprintf(stderr,"     * mu_max       = Max. value of the cosine of the angle to the LOS (must be within [0.0, 1.0])\n");
+    fprintf(stderr,"     * nmu_bins     = Number of linear bins to create (the bins themselves range from [0.0, mu_max])\n");
 #if defined(USE_OMP) && defined(_OPENMP)
     fprintf(stderr,"     * numthreads   = number of threads to use\n");
 #endif
@@ -61,7 +64,9 @@ int main(int argc, char **argv)
     DOUBLE pimax;
     int cosmology=1;
     int nthreads=1;
-
+    int nmu_bins;
+    DOUBLE mu_max;
+        
     struct config_options options = get_config_options();
     options.verbose=1;
     options.periodic=0;
@@ -70,9 +75,9 @@ int main(int argc, char **argv)
     
 #if defined(_OPENMP)
     nthreads=4;//default to 4 threads
-    const char argnames[][30]={"file","format","binfile","pimax","cosmology","Nthreads"};
+    const char argnames[][30]={"file","format","binfile","pimax","cosmology","mu_max", "nmu_bins", "Nthreads"};
 #else
-    const char argnames[][30]={"file","format","binfile","pimax","cosmology"};
+    const char argnames[][30]={"file","format","binfile","pimax","cosmology", "mu_max", "nmu_bins"};
 #endif
     int nargs=sizeof(argnames)/(sizeof(char)*30);
 
@@ -89,8 +94,10 @@ int main(int argc, char **argv)
             my_snprintf(binfile,MAXLEN,"%s",argv[3]);
             pimax=atof(argv[4]);
             cosmology=atoi(argv[5]);
+            mu_max=atof(argv[6]);
+            nmu_bins=atoi(argv[7]);
 #if defined(_OPENMP)
-            nthreads = atoi(argv[6]);
+            nthreads = atoi(argv[8]);
 #endif
         }
     } else {
@@ -99,6 +106,8 @@ int main(int argc, char **argv)
         my_snprintf(binfile, MAXLEN,"%s","../tests/bins");
         pimax=40.0;
         cosmology=1;
+        mu_max=1.0;
+        nmu_bins=10;
     }
 
     fprintf(stderr,ANSI_COLOR_BLUE  "Running `%s' with the parameters \n",argv[0]);
@@ -108,8 +117,10 @@ int main(int argc, char **argv)
     fprintf(stderr,"\t\t %-10s = %s \n",argnames[2],binfile);
     fprintf(stderr,"\t\t %-10s = %10.4lf\n",argnames[3],pimax);
     fprintf(stderr,"\t\t %-10s = %d\n",argnames[4],cosmology);
+    fprintf(stderr,"\t\t %-10s = %10.4lf\n",argnames[5],mu_max);
+    fprintf(stderr,"\t\t %-10s = %d\n",argnames[6],nmu_bins); /* nmu_bins is an int */
 #if defined(_OPENMP)
-    fprintf(stderr,"\t\t %-10s = %d\n",argnames[5],nthreads);
+    fprintf(stderr,"\t\t %-10s = %d\n",argnames[7],nthreads);
 #endif
     fprintf(stderr,"\t\t -------------------------------------" ANSI_COLOR_RESET "\n");
 
@@ -135,10 +146,10 @@ int main(int argc, char **argv)
         gettimeofday(&t0,NULL);
 #if defined(_OPENMP)
         fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent DD(rp,pi) calculation would be:\n `%s %s %s %s %s %s %lf %d %d'" ANSI_COLOR_RESET "\n",
-                "../DDrppi/DDrppi_mocks",file,fileformat,file,fileformat,binfile,pimax,cosmology,nthreads);
+                "../DDrppi_mocks/DDrppi_mocks",file,fileformat,file,fileformat,binfile,pimax,cosmology,nthreads);
 #else
         fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent DD(rp,pi) calculation would be:\n `%s %s %s %s %s %s %lf %d'" ANSI_COLOR_RESET "\n",
-                "../DDrppi/DDrppi_mocks",file,fileformat,file,fileformat,binfile,pimax,cosmology);
+                "../DDrppi_mocks/DDrppi_mocks",file,fileformat,file,fileformat,binfile,pimax,cosmology);
 #endif
 
         results_countpairs_mocks results;
@@ -177,15 +188,62 @@ int main(int argc, char **argv)
 
 
 
-    //Do the w(theta) counts
+    //Do the DD(s, mu) counts
+    {
+        gettimeofday(&t0,NULL);
+#if defined(_OPENMP)
+        fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent DD(s,mu) calculation would be:\n `%s %s %s %s %s %s %lf %d %d %d'"ANSI_COLOR_RESET"\n",
+                "../DDsmu_mocks/DDsmu_mocks",file,fileformat,file,fileformat,binfile,mu_max,nmu_bins,cosmology,nthreads);
+#else
+        fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent DD(s,mu) calculation would be:\n `%s %s %s %s %s %s %lf %d %d'"ANSI_COLOR_RESET"\n",
+                "../DDsmu_mocks/DDsmu_mocks",file,fileformat,file,fileformat,binfile,mu_max,nmu_bins,cosmology);
+#endif
+
+        results_countpairs_mocks_s_mu results;
+        int status = countpairs_mocks_s_mu(ND1,ra1,dec1,cz1,
+                                           ND2,ra2,dec2,cz2,
+                                           nthreads,
+                                           autocorr,
+                                           binfile,
+                                           mu_max,
+                                           nmu_bins,
+                                           cosmology,
+                                           &results,
+                                           &options, NULL);
+        if(status != EXIT_SUCCESS) {
+            return status;
+        }
+
+        gettimeofday(&t1,NULL);
+        double pair_time = ADD_DIFF_TIME(t0,t1);
+#if 0
+        const DOUBLE dmu = mu_max/(DOUBLE)results.nmu_bins ;
+        const int nmubin = results.nmu_bins;
+        for(int i=1;i<results.nsbin;i++) {
+            const double log_supp = LOG10(results.supp[i]);
+            for(int j=0;j<nmubin;j++) {
+                const int index = i*(nmubin+1) + j;
+                fprintf(stdout,"%10"PRIu64" %20.8lf %20.8lf  %20.8lf %20.8lf \n",results.npairs[index],results.savg[index],log_supp,(j+1)*dmu);
+            }
+        }
+
+#endif
+        fprintf(stderr,ANSI_COLOR_GREEN "Done DD(s,mu) auto-correlation. Ngalaxies = %12"PRId64" Time taken = %8.2lf seconds "ANSI_COLOR_RESET"\n", ND1, pair_time);
+
+        //free the result structure
+        free_results_mocks_s_mu(&results);
+    }
+    
+
+    //Do the DD(theta) counts
     {
         gettimeofday(&t0,NULL);
 #if defined(_OPENMP)
         fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent w(theta) calculation would be:\n `%s %s %s %s %s %s %d'" ANSI_COLOR_RESET "\n",
-                "../wtheta/DDtheta_mocks",file,fileformat,file,fileformat,binfile,nthreads);
+                "../DDtheta_mocks/DDtheta_mocks",file,fileformat,file,fileformat,binfile,nthreads);
 #else
         fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent w(theta) calculation would be:\n `%s %s %s %s %s %s '" ANSI_COLOR_RESET "\n",
-                "../wtheta/DDtheta_mocks",file,fileformat,file,fileformat,binfile);
+                "../DDtheta_mocks/DDtheta_mocks",file,fileformat,file,fileformat,binfile);
 #endif
 
         results_countpairs_theta results;
@@ -229,8 +287,9 @@ int main(int argc, char **argv)
         DOUBLE *xran=NULL,*yran=NULL,*zran=NULL;
         const int threshold_neighbors=1;
         const char centers_file[]="../tests/data/Mr19_centers_xyz_forVPF_rmax_10Mpc.txt";
-        fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent w(theta) calculation would be:\n `%s %lf %d %d %d %lf %s %s %s %s %s %d'" ANSI_COLOR_RESET "\n",
-                "../vpf/vpf_mocks",rmax,nbin,nc,num_pN,0.0,file,fileformat,"junk","junkformat",centers_file,cosmology);
+        fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent vpf calculation would be:\n"
+                "`%s %lf %d %d %d %lf %s %s %s %s %s %d'"ANSI_COLOR_RESET "\n",
+                "../vpf_mocks/vpf_mocks",rmax,nbin,nc,num_pN,0.0,file,fileformat,"junk","junkformat",centers_file,cosmology);
 
         results_countspheres_mocks results;
         int status = countspheres_mocks(ND1, ra1, dec1, cz1,
diff --git a/mocks/python_bindings/Makefile b/mocks/python_bindings/Makefile
index 8c7e8df7..6a609921 100644
--- a/mocks/python_bindings/Makefile
+++ b/mocks/python_bindings/Makefile
@@ -10,9 +10,11 @@ INSTALL_BIN_DIR := $(ROOT_DIR)/bin
 MOCKS_DIR := $(ROOT_DIR)/mocks
 DDrppi_mocks_DIR := $(MOCKS_DIR)/DDrppi_mocks
 DDtheta_mocks_DIR := $(MOCKS_DIR)/DDtheta_mocks
+DDsmu_mocks_DIR := $(MOCKS_DIR)/DDsmu_mocks
 VPF_mocks_DIR := $(MOCKS_DIR)/vpf_mocks
 
 DDrppi_mocks_LIB := countpairs_rp_pi_mocks
+DDsmu_mocks_LIB := countpairs_s_mu_mocks
 DDtheta_mocks_LIB := countpairs_theta_mocks
 VPF_mocks_LIB := countspheres_mocks
 
@@ -22,17 +24,18 @@ PROJECT := _countpairs_mocks
 PYTHON_EXTN := $(PROJECT).so.$(MAJOR).$(MINOR).$(PATCHLEVEL)
 SOURCES := $(PROJECT).c
 OBJECTS := $(SOURCES:.c=.o)
-C_LIBRARIES := $(DDrppi_mocks_DIR)/lib$(DDrppi_mocks_LIB).a $(DDtheta_mocks_DIR)/lib$(DDtheta_mocks_LIB).a $(VPF_mocks_DIR)/lib$(VPF_mocks_LIB).a
+C_LIBRARIES := $(DDrppi_mocks_DIR)/lib$(DDrppi_mocks_LIB).a $(DDtheta_mocks_DIR)/lib$(DDtheta_mocks_LIB).a $(VPF_mocks_DIR)/lib$(VPF_mocks_LIB).a $(DDsmu_mocks_DIR)/lib$(DDsmu_mocks_LIB).a
 INCL :=	$(DDrppi_mocks_DIR)/$(DDrppi_mocks_LIB).h $(DDtheta_mocks_DIR)/$(DDtheta_mocks_LIB).h $(VPF_mocks_DIR)/$(VPF_mocks_LIB).h \
         $(UTILS_DIR)/defs.h $(IO_DIR)/io.h $(IO_DIR)/ftread.h \
         $(UTILS_DIR)/utils.h \
 	$(UTILS_DIR)/function_precision.h \
         $(UTILS_DIR)/progressbar.h $(UTILS_DIR)/cosmology_params.h \
-        $(UTILS_DIR)/cpu_features.h $(UTILS_DIR)/macros.h
+        $(UTILS_DIR)/cpu_features.h $(UTILS_DIR)/macros.h \
+				$(DDsmu_mocks_DIR)/$(DDsmu_mocks_LIB).h
 
-LIB_INCLUDE:=-I$(DDrppi_mocks_DIR) -I$(DDtheta_mocks_DIR) -I$(VPF_mocks_DIR)
+LIB_INCLUDE:=-I$(DDrppi_mocks_DIR) -I$(DDtheta_mocks_DIR) -I$(VPF_mocks_DIR) -I$(DDsmu_mocks_DIR)
 
-all: sharedlib $(LIBRARY) $(SOURCES) $(C_LIBRARIES) $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk Makefile 
+all: sharedlib $(LIBRARY) $(SOURCES) $(C_LIBRARIES) $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk Makefile
 
 $(PROJECT).so: $(PYTHON_EXTN)
 	$(RM) $(PROJECT).so
@@ -44,12 +47,15 @@ $(PROJECT).o: $(PROJECT).c $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk Makef
 tests: sharedlib $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk Makefile $(C_LIBRARIES) $(OBJECTS) | $(ROOT_DIR)/lib
 	python call_correlation_functions_mocks.py
 
-$(PYTHON_EXTN): $(OBJECTS) $(C_LIBRARIES) $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk Makefile 
+$(PYTHON_EXTN): $(OBJECTS) $(C_LIBRARIES) $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk Makefile
 	$(CC) $(OBJECTS) $(C_LIBRARIES) $(CLINK) $(GSL_LINK) $(PYTHON_LINK) -shared -o $@
 
 $(DDrppi_mocks_DIR)/lib$(DDrppi_mocks_LIB).a: $(DDrppi_mocks_DIR)/*.c $(DDrppi_mocks_DIR)/*.c.src $(DDrppi_mocks_DIR)/*.h.src $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk
 	$(MAKE) -C $(DDrppi_mocks_DIR) libs
 
+$(DDsmu_mocks_DIR)/lib$(DDsmu_mocks_LIB).a: $(DDsmu_mocks_DIR)/*.c $(DDsmu_mocks_DIR)/*.c.src $(DDsmu_mocks_DIR)/*.h.src $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk
+	$(MAKE) -C $(DDsmu_mocks_DIR) libs
+
 $(DDtheta_mocks_DIR)/lib$(DDtheta_mocks_LIB).a: $(DDtheta_mocks_DIR)/*.c $(DDtheta_mocks_DIR)/*.c.src $(DDtheta_mocks_DIR)/*.h.src $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk
 	$(MAKE) -C $(DDtheta_mocks_DIR) libs
 
@@ -68,7 +74,7 @@ ifeq ($(FIX_PYTHON_LINK), 1)
 	}
 endif
 
-install: sharedlib $(LIB_DIR)/$(PYTHON_EXTN) 
+install: sharedlib $(LIB_DIR)/$(PYTHON_EXTN)
 
 .PHONY: sharedlib
 
diff --git a/mocks/python_bindings/_countpairs_mocks.c b/mocks/python_bindings/_countpairs_mocks.c
index 207b0ace..2d34bcc4 100644
--- a/mocks/python_bindings/_countpairs_mocks.c
+++ b/mocks/python_bindings/_countpairs_mocks.c
@@ -14,6 +14,7 @@
 
 //for correlation functions
 #include "countpairs_rp_pi_mocks.h"
+#include "countpairs_s_mu_mocks.h"
 #include "countpairs_theta_mocks.h"
 
 //for the vpf
@@ -62,6 +63,7 @@ static char module_docstring[] =    "Python extensions for calculating clusterin
 
 /* function proto-type*/
 static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *args, PyObject *kwargs);
+static PyObject *countpairs_countpairs_s_mu_mocks(PyObject *self, PyObject *args, PyObject *kwargs);
 static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *args, PyObject *kwargs);
 static PyObject *countpairs_countspheres_vpf_mocks(PyObject *self, PyObject *args, PyObject *kwargs);
 static PyObject *countpairs_mocks_error_out(PyObject *module, const char *msg);
@@ -142,22 +144,22 @@ static PyMethodDef module_methods[] = {
      "    max z is less than that threshold. If you really want to change the speed\n"
      "    of light, then edit the macro in `ROOT/utils/set_cosmo_dist.h`.\n"
      "\n"
-     
+
      "weights1 : array-like, real (float/double), shape (n_particles,) or (n_weights_per_particle,n_particles), optional\n"
      "   Weights for computing a weighted pair count.\n\n"
-     
+
      "weight_type : str, optional\n"
      "   The type of pair weighting to apply.\n"
-     "   Options: \"pair_product\", None\n" 
+     "   Options: \"pair_product\", None\n"
      "   Default: None.\n\n"
-     
+
      "RA2/DEC2/CZ2: float/double (default double)\n"
      "    Same as for RA1/DEC1/CZ1\n"
      "\n"
-     
+
      "weights2\n : array-like, real (float/double), shape (n_particles,) or (n_weights_per_particle,n_particles), optional\n"
      "   Weights for computing a weighted pair count."
-     
+
      "is_comoving_dist: boolean (default false)\n"
      "   Boolean flag to indicate that ``cz`` values have already been\n"
      "   converted into co-moving distances. This flag allows arbitrary\n"
@@ -198,7 +200,7 @@ static PyMethodDef module_methods[] = {
      "  set on the current computer. However, if you set ``isa`` to, say,\n"
      "  ``AVX`` and ``AVX`` is not available on the computer, then the code will\n"
      "  revert to using ``FALLBACK`` (even though ``SSE42`` might be available).\n\n"
-       
+
      "  Unless you are benchmarking the different instruction sets, you should\n"
      "  always leave ``isa`` to the default value. And if you *are* benchmarking,\n"
      "  then the integer values correspond to the ``enum`` for the instruction set\n"
@@ -234,6 +236,162 @@ static PyMethodDef module_methods[] = {
      "                                            verbose=True)\n"
      "\n"
     },
+    {"countpairs_s_mu_mocks"       ,(PyCFunction) countpairs_countpairs_s_mu_mocks ,METH_VARARGS | METH_KEYWORDS,
+         "countpairs_s_mu_mocks(autocorr, cosmology, nthreads, mu_max, nmu_bins, binfile,\n"
+         "                       RA1, DEC1, CZ1, weights1=None, weight_type=None,\n"
+         "                       RA2=None, DEC2=None, CZ2=None, weights2=None,\n"
+         "                       is_comoving_dist=False,\n"
+         "                       verbose=False, output_savg=False,\n"
+         "                       fast_divide=False, xbin_refine_factor=2, \n"
+         "                       ybin_refine_factor=2, zbin_refine_factor=1, \n"
+         "                       max_cells_per_dim=100, \n"
+         "                       c_api_timer=False, isa=-1)\n"
+         "\n"
+         "Calculate the 2-D pair-counts, "XI_CHAR"(s, "MU_CHAR"), auto/cross-correlation function given two\n"
+         "sets of RA1/DEC1/CZ1 and RA2/DEC2/CZ2 arrays. This module is suitable for mock catalogs that have been\n"
+         "created by carving out a survey footprint from simulated data. The module can also be used for actual\n"
+         "observed galaxies, but you probably want to attach weights to the points to account for completeness etc.\n"
+         "\n"
+         UNICODE_WARNING
+         "\n"
+         "Parameters\n"
+         "----------\n"
+         "Every parameter can be passed as a keyword of the corresponding name.\n"
+         "\n"
+         "autocorr: boolean\n"
+         "    Flag for auto/cross-correlation. If autocorr is not 0, the RA2/DEC2/CZ2 arrays\n"
+         "    are not used (but must still be passed, as RA1/DEC1/CZ1).\n"
+         "\n"
+         "cosmology: integer\n"
+         "    Integer to select cosmology. Pre-set values for (1,2) \n"
+         "    1 -> LasDamas cosmology. Om=0.25,  Ol=0.75  (other values are not used)\n"
+         "    2 -> Planck   cosmology. Om=0.302, Ol=0.698 \n"
+         "    To setup a new cosmology, add an entry to the function, `init_cosmology` in \n"
+         "    `ROOT/utils/cosmology_params.c` and recompile the package.\n"
+         "\n"
+         "nthreads: integer\n"
+         "    The number of OpenMP threads to use. Has no effect if OpenMP was not used\n"
+         "    during library compilation. \n"
+         "\n"
+         "mu_max: double \n"
+         "    The maximum mu value to use; must be > 0 and <= 1.0\n"
+         "\n"
+         "nmu_bins: int \n"
+         "    The number of "MU_CHAR" bins to use, binning from [0.0, mu_max)\n"
+         "\n"
+         "binfile: filename\n"
+         "    Filename containing the radial bins for the correlation function. The file\n"
+         "    is expected to contain white-space separated ``smin  smax`` with the bin\n"
+         "    edges.  Units must be Mpc/h (see the ``bins`` file in the tests directory\n"
+         "    for a sample). For usual logarithmic bins, ``logbins`` in the root directory\n"
+         "    of this package will create a compatible ``binfile``.\n"
+         "\n"
+         "RA1: array-like, float/double (default double)\n"
+         "    The right-ascension of the galaxy, in the range [0, 360]. If there are\n"
+         "    negative RA's in the supplied array (input RA in the range [-180, 180]),\n"
+         "    then the code will shift the entire array by 180 to put RA's in the\n"
+         "    [0, 360] range.\n"
+         "\n"
+         "DEC1: array-like, float/double (default double)\n"
+         "    The declination of the galaxy, in the range [-90, 90]. If there are\n"
+         "    declinations > 90 in the supplied array (input dec in the range [0, 180]),\n"
+         "    then the code will shift the entire array by -90 to put declinations in\n"
+         "    the [-90, 90] range. If the code finds declinations more than 180, then\n"
+         "    it assumes RA and DEC have been swapped and aborts with that message.\n"
+         "\n"
+         "CZ1: array-like, float/double (default double)\n"
+         "    The redshift multiplied by speed of light for the galaxies. The code\n"
+         "    checks that cz has been supplied by comparing with a threshold (currently\n"
+         "    set to 10, defined in function check_ra_dec_cz in file\n"
+         "    `DDrppi/countpairs_rp_pi_mocks_impl.c.src`) and multiplies by the speed of light if\n"
+         "    max z is less than that threshold. If you really want to change the speed\n"
+         "    of light, then edit the macro in `ROOT/utils/set_cosmo_dist.h`.\n"
+         "\n"
+
+         "weights1 : array-like, real (float/double), shape (n_particles,) or (n_weights_per_particle,n_particles), optional\n"
+         "   Weights for computing a weighted pair count.\n\n"
+
+         "weight_type : str, optional\n"
+         "   The type of pair weighting to apply.\n"
+         "   Options: \"pair_product\", None\n"
+         "   Default: None.\n\n"
+
+         "RA2/DEC2/CZ2: float/double (default double)\n"
+         "    Same as for RA1/DEC1/CZ1\n"
+         "\n"
+
+         "weights2 : array-like, real (float/double), shape (n_particles,) or (n_weights_per_particle,n_particles), optional\n"
+         "   Weights for computing a weighted pair count.\n\n"
+
+         "is_comoving_dist: boolean (default false)\n"
+         "   Boolean flag to indicate that ``cz`` values have already been\n"
+         "   converted into co-moving distances. This flag allows arbitrary\n"
+         "   cosmologies to be used in ``Corrfunc``.\n"
+         "\n"
+         "verbose : boolean (default false)\n"
+         "   Boolean flag to control output of informational messages\n"
+         "\n"
+         "output_savg : boolean (default false)\n"
+         "   Boolean flag to output the average ``s`` for each bin. Code will\n"
+         "   run slightly slower if you set this flag. Also, note, if you are calculating\n"
+         "   in single-precision, ``savg`` will suffer from numerical loss of\n"
+         "   precision and can not be trusted. If you need accurate ``savg``\n"
+         "   values, then pass in double precision arrays for the particle positions.\n"
+         "\n"
+         "fast_divide: boolean (default false)\n"
+         "   Boolean flag to replace the division in ``AVX`` implementation with an\n"
+         "   approximate reciprocal, followed by a Newton-Raphson step. Improves\n"
+         "   runtime by ~15-20%. Loss of precision is at the 5-6th decimal place.\n"
+         "\n"
+         "(xyz)bin_refine_factor: integer (default (2,2,1) typical values in [1-3]) \n"
+         "   Controls the refinement on the cell sizes. Can have up to a 20% impact \n"
+         "   on runtime. \n"
+         "\n"
+         "max_cells_per_dim: integer (default 100, typical values in [50-300]) \n"
+         "   Controls the maximum number of cells per dimension. Total number of cells \n"
+         "   can be up to (max_cells_per_dim)^3. Only increase if ``rmax`` is too small \n"
+         "   relative to the boxsize (and increasing helps the runtime). \n"
+         "\n"
+         "c_api_timer : boolean (default false)\n"
+         "   Boolean flag to measure actual time spent in the C libraries. Here\n"
+         "   to allow for benchmarking and scaling studies.\n"
+         "\n"
+         "isa : integer (default -1)\n"
+         "  Controls the runtime dispatch for the instruction set to use. Possible\n"
+         "  options are: [-1, AVX, SSE42, FALLBACK]\n\n"
+         "  Setting isa to -1 will pick the fastest available instruction\n"
+         "  set on the current computer. However, if you set ``isa`` to, say,\n"
+         "  ``AVX`` and ``AVX`` is not available on the computer, then the code will\n"
+         "  revert to using ``FALLBACK`` (even though ``SSE42`` might be available).\n\n"
+
+         "  Unless you are benchmarking the different instruction sets, you should\n"
+         "  always leave ``isa`` to the default value. And if you *are* benchmarking,\n"
+         "  then the integer values correspond to the ``enum`` for the instruction set\n"
+         "  defined in ``utils/defs.h``.\n"
+         "\n"
+         "Returns\n"
+         "--------\n"
+         "\n"
+         "a Python list containing [smin, smax, savg, "MU_CHAR", npairs, weightavg] \n"
+         "for each "MU_CHAR"-bin (up to 1.0) for each radial bin specified in\n"
+         "the ``binfile``.\n"
+         "\n"
+         "Example\n"
+         "-------\n"
+         ">>> import numpy as np\n"
+         ">>> from Corrfunc._countpairs_mocks import countpairs_s_mu_mocks\n"
+         ">>> ra,dec,cz = np.genfromtxt('../mocks/tests/data/Mr19_mock_northonly.rdcz.dat',dtype=np.float,unpack=True)\n"
+         ">>> cosmology=1\n"
+         ">>> autocorr=1\n"
+         ">>> nthreads=4\n"
+         ">>> binfile='../mocks/tests/bins'\n"
+         ">>> nmu_bins=10\n"
+         ">>> mu_max=1.0\n"
+         ">>> (DDsmu, time) = countpairs_s_mu_mocks(autocorr, cosmology, nthreads, mu_max, nmu_bins, binfile,\n"
+         "                                            ra,dec,cz,ra,dec,cz,\n"
+         "                                            verbose=True)\n"
+         "\n"
+        },
     {"countpairs_theta_mocks"       ,(PyCFunction) countpairs_countpairs_theta_mocks ,METH_VARARGS | METH_KEYWORDS,
      "countpairs_theta_mocks(autocorr, nthreads, binfile,\n"
      "                       RA1, DEC1, weights1=None, weight_type=None,\n"
@@ -282,21 +440,21 @@ static PyMethodDef module_methods[] = {
      "    then the code will shift the entire array by -90 to put declinations in\n"
      "    the [-90, 90] range. If the code finds declinations more than 180, then\n"
      "    it assumes RA and DEC have been swapped and aborts with that message.\n"
-     
+
      "weights1 : array-like, real (float/double), shape (n_particles,) or (n_weights_per_particle,n_particles), optional\n"
      "   Weights for computing a weighted pair count.\n\n"
-     
+
      "weight_type : str, optional\n"
      "   The type of pair weighting to apply.\n"
-     "   Options: \"pair_product\", None\n" 
+     "   Options: \"pair_product\", None\n"
      "   Default: None.\n\n"
-     
+
      "RA2/DEC2: float/double (default double)\n"
      "    Same as for RA1/DEC1\n"
-     
+
      "weights2\n : array-like, real (float/double), shape (n_particles,) or (n_weights_per_particle,n_particles), optional\n"
      "   Weights for computing a weighted pair count."
-     
+
      "verbose : boolean (default false)\n"
      "   Boolean flag to control output of informational messages\n"
      "\n"
@@ -554,7 +712,7 @@ static PyMethodDef module_methods[] = {
      "   a sphere of radius ``rmax`` contains *exactly* ``N`` galaxies. For \n"
      "   example, pN[0] (p0, the void probibility function) is the probability\n"
      "   that a sphere of radius ``rmax`` contains 0 galaxies.\n"
-     "\n" 
+     "\n"
     "time : double\n"
     "   if ``c_api_timer`` is set, then the return value contains the time spent\n"
     "   in the API; otherwise time is set to 0.0\n"
@@ -593,7 +751,7 @@ static PyMethodDef module_methods[] = {
     "                        RA, DEC, CZ,\n"
     "                        verbose=True,\n"
     "                        is_comoving_dist=True)\n"
-    "\n" 
+    "\n"
     },
     {NULL, NULL, 0, NULL}
 };
@@ -604,7 +762,7 @@ static PyObject *countpairs_mocks_error_out(PyObject *module, const char *msg)
 #if PY_MAJOR_VERSION < 3
     (void) module;//to avoid unused warning with python2
 #endif
-    
+
     struct module_state *st = GETSTATE(module);
     PyErr_SetString(st->error, msg);
     PyErr_Print();
@@ -667,7 +825,7 @@ PyObject *PyInit__countpairs_mocks(void)
     import_array();
 
     highest_isa_mocks = instrset_detect();
-    
+
 #if PY_MAJOR_VERSION >= 3
     return module;
 #endif
@@ -677,18 +835,18 @@ PyObject *PyInit__countpairs_mocks(void)
 static int print_kwlist_into_msg(char *msg, const size_t totsize, size_t len, char *kwlist[], const size_t nitems)
 {
     for(size_t i=0;i<nitems;i++) {
-        
+
         if(len+strlen(kwlist[i]) >= totsize-2) {
             return EXIT_FAILURE;
         }
-        
+
         memcpy(msg+len, kwlist[i], strlen(kwlist[i]));
         len += strlen(kwlist[i]);
         msg[len] = ',';
         msg[len+1] = ' ';
         len += 2;
     }
-    
+
     msg[len]='\0';
     return EXIT_SUCCESS;
 }
@@ -699,24 +857,24 @@ static int print_kwlist_into_msg(char *msg, const size_t totsize, size_t len, ch
 static int64_t check_dims_and_datatype(PyObject *module, PyArrayObject *x1_obj, PyArrayObject *y1_obj, PyArrayObject *z1_obj, PyArrayObject *weights1_obj, size_t *element_size)
 {
     char msg[1024];
-    
+
     const int check_weights = weights1_obj != NULL;
 
     /* All the position arrays should be 1-D*/
     const int nxdims = PyArray_NDIM(x1_obj);
     const int nydims = PyArray_NDIM(y1_obj);
     const int nzdims = PyArray_NDIM(z1_obj);
-    
+
     if(nxdims != 1 || nydims != 1 || nzdims != 1) {
         snprintf(msg, 1024, "ERROR: Expected 1-D numpy arrays.\nFound (nxdims, nydims, nzdims) = (%d, %d, %d) instead",
                  nxdims, nydims, nzdims);
         countpairs_mocks_error_out(module, msg);
         return -1;
     }
-    
+
     /* The weights array can be 1-D or 2-D of shape (n_weights, n_particles) */
     const int n_weight_dims = check_weights ? PyArray_NDIM(weights1_obj) : 1;
-    
+
     if(n_weight_dims != 1 && n_weight_dims != 2) {
         snprintf(msg, 1024, "ERROR: Expected 1-D or 2-D weight array.\nFound n_weight_dims = %d instead", n_weight_dims);
         countpairs_mocks_error_out(module, msg);
@@ -750,7 +908,7 @@ static int64_t check_dims_and_datatype(PyObject *module, PyArrayObject *x1_obj,
         countpairs_mocks_error_out(module, msg);
         return -1;
     }
-    
+
     // Current version of the code only supports weights of the same dtype as positions
     if( x_type != y_type || y_type != z_type || (check_weights && z_type != weights_type)) {
         PyArray_Descr *x_descr = PyArray_DescrFromType(x_type);
@@ -770,12 +928,12 @@ static int64_t check_dims_and_datatype(PyObject *module, PyArrayObject *x1_obj,
         countpairs_mocks_error_out(module, msg);
         return -1;
     }
-    
+
     /* Check if the number of elements in the 3 Python arrays are identical */
     const int64_t nx1 = (int64_t)PyArray_SIZE(x1_obj);
     const int64_t ny1 = (int64_t)PyArray_SIZE(y1_obj);
     const int64_t nz1 = (int64_t)PyArray_SIZE(z1_obj);
-    
+
     if(nx1 != ny1 || ny1 != nz1) {
       snprintf(msg, 1024, "ERROR: Expected arrays to have the same number of elements in all 3-dimensions.\nFound (nx, ny, nz) = (%"PRId64", %"PRId64", %"PRId64") instead",
                nx1, ny1, nz1);
@@ -800,7 +958,7 @@ static int64_t check_dims_and_datatype(PyObject *module, PyArrayObject *x1_obj,
     } else {
       *element_size = sizeof(double);
     }
-    
+
     return nx1;
 }
 
@@ -859,7 +1017,7 @@ static int64_t check_dims_and_datatype_ra_dec(PyObject *module, PyArrayObject *x
         countpairs_mocks_error_out(module, msg);
         return -1;
     }
-    
+
     /* Check if the number of elements in the 3 Python arrays are identical */
     const int64_t nx1 = (int64_t)PyArray_SIZE(x1_obj);
     const int64_t ny1 = (int64_t)PyArray_SIZE(y1_obj);
@@ -878,7 +1036,7 @@ static int64_t check_dims_and_datatype_ra_dec(PyObject *module, PyArrayObject *x
     } else {
       *element_size = sizeof(double);
     }
-    
+
     return nx1;
 }
 
@@ -888,11 +1046,11 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg
     //Error-handling is global in python2 -> stored in struct module_state _struct declared at the top of this file
 #if PY_MAJOR_VERSION < 3
     (void) self;
-    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash. 
+    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash.
 #else
     //In python3, self is simply the module object that was returned earlier by init
     PyObject *module = self;
-#endif    
+#endif
 
     //x1->ra (phi), y1-> declination (theta1), z1->cz (cz1)
     //x2->ra (ph2), y2-> declination (theta2), z2->cz (cz2)
@@ -971,16 +1129,16 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg
 
         char msg[1024];
         int len=snprintf(msg, 1024,"ArgumentError: In DDrppi_mocks> Could not parse the arguments. Input parameters are: \n");
-        
+
         /* How many keywords do we have? Subtract 1 because of the last NULL */
         const size_t nitems = sizeof(kwlist)/sizeof(*kwlist) - 1;
         int status = print_kwlist_into_msg(msg, 1024, len, kwlist, nitems);
         if(status != EXIT_SUCCESS) {
             fprintf(stderr,"Error message does not contain all of the keywords\n");
         }
-        
+
         countpairs_mocks_error_out(module,msg);
-        
+
         Py_RETURN_NONE;
     }
 
@@ -998,16 +1156,16 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg
     }
 
 
-    
+
     /* We have numpy arrays and all the required inputs*/
     /* How many data points are there? And are they all of floating point type */
     size_t element_size;
     const int64_t ND1 = check_dims_and_datatype(module, x1_obj, y1_obj, z1_obj, weights1_obj, &element_size);
     if(ND1 == -1) {
-        //Error has already been set -> simply return 
+        //Error has already been set -> simply return
         Py_RETURN_NONE;
     }
-    
+
     /* Ensure the weights are of the right shape (n_weights, n_particles) */
     if(weights1_obj != NULL){
         // A numpy dimension of length -1 will be expanded to n_weights
@@ -1015,7 +1173,7 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg
         PyArray_Dims pdims = {.ptr = &(dims[0]), .len = 2};
         weights1_obj = (PyArrayObject *) PyArray_Newshape(weights1_obj, &pdims, NPY_CORDER);
     }
-    
+
     /* Validate the user's choice of weighting method */
     weight_method_t weighting_method;
     int wstatus = get_weight_method_by_name(weighting_method_str, &weighting_method);
@@ -1034,7 +1192,7 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg
         countpairs_mocks_error_out(module, msg);
         Py_RETURN_NONE;
     }
-    
+
     if(extra.weights0.num_weights > 0 && found_weights > MAX_NUM_WEIGHTS){
         char msg[1024];
         snprintf(msg, 1024, "ValueError: In %s: Provided %d weights-per-particle, but the code was compiled with MAX_NUM_WEIGHTS=%d.\n",
@@ -1058,11 +1216,11 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg
             countpairs_mocks_error_out(module, msg);
             Py_RETURN_NONE;
         }
-        
+
         size_t element_size2;
         ND2 = check_dims_and_datatype(module, x2_obj, y2_obj, z2_obj, weights2_obj, &element_size2);
         if(ND2 == -1) {
-            //Error has already been set -> simply return 
+            //Error has already been set -> simply return
             Py_RETURN_NONE;
         }
         /* Ensure the weights are of the right shape (n_weights, n_particles) */
@@ -1071,7 +1229,7 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg
             PyArray_Dims pdims = {.ptr = &(dims[0]), .len = 2};
             weights2_obj = (PyArrayObject *) PyArray_Newshape(weights2_obj, &pdims, NPY_CORDER);
         }
-        
+
         if(element_size != element_size2) {
             snprintf(msg, 1024, "TypeError: In %s: The two arrays must have the same data-type. First array is of type %s while second array is of type %s\n",
                      __FUNCTION__, element_size == 4 ? "floats":"doubles", element_size2 == 4 ? "floats":"doubles");
@@ -1138,7 +1296,7 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg
         }
     }
     options.float_type = element_size;
-    
+
     /* Pack the weights into extra_options */
     for(int64_t w = 0; w < extra.weights0.num_weights; w++){
         extra.weights0.weights[w] = (char *) weights1 + w*ND1*element_size;
@@ -1146,7 +1304,7 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg
             extra.weights1.weights[w] = (char *) weights2 + w*ND2*element_size;
         }
     }
-    
+
     NPY_BEGIN_THREADS_DEF;
     NPY_BEGIN_THREADS;
 
@@ -1166,7 +1324,7 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg
         c_api_time = options.c_api_time;
     }
     NPY_END_THREADS;
-    
+
     /* Clean up. */
     Py_DECREF(x1_array);Py_DECREF(y1_array);Py_DECREF(z1_array);Py_XDECREF(weights1_array);//x1 should absolutely not be NULL
     Py_XDECREF(x2_array);Py_XDECREF(y2_array);Py_XDECREF(z2_array);Py_XDECREF(weights2_array);//x2 might be NULL depending on value of autocorr
@@ -1174,8 +1332,8 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg
     if(status != EXIT_SUCCESS) {
         Py_RETURN_NONE;
     }
-    
-    
+
+
 #if 0
     /* Output pairs*/
     for(int i=1;i<results.nbin;i++) {
@@ -1209,16 +1367,342 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg
     return Py_BuildValue("(Od)", ret, c_api_time);
 }
 
+/*
+ * Python entry point for DDsmu_mocks. Parses the keywords listed in kwlist,
+ * validates the (RA, DEC, CZ, weights) numpy arrays and runs
+ * countpairs_mocks_s_mu() between NPY_BEGIN_THREADS and NPY_END_THREADS.
+ * Returns the tuple (list, c_api_time); every list item is
+ * (smin, smax, savg, upper mu edge of the bin, npairs, weightavg).
+ * Returns None on failure, or NULL if the result list cannot be allocated.
+ */
+static PyObject *countpairs_countpairs_s_mu_mocks(PyObject *self, PyObject *args, PyObject *kwargs)
+{
+    //Error-handling is global in python2 -> stored in struct module_state _struct declared at the top of this file
+#if PY_MAJOR_VERSION < 3
+    (void) self;
+    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash.
+#else
+    //In python3, self is simply the module object that was returned earlier by init
+    PyObject *module = self;
+#endif
+
+    //x1->ra (phi), y1-> declination (theta1), z1->cz (cz1)
+    //x2->ra (ph2), y2-> declination (theta2), z2->cz (cz2)
+    PyArrayObject *x1_obj=NULL, *y1_obj=NULL, *z1_obj=NULL, *weights1_obj=NULL;
+    PyArrayObject *x2_obj=NULL, *y2_obj=NULL, *z2_obj=NULL, *weights2_obj=NULL;
+
+    struct config_options options = get_config_options();
+    options.is_comoving_dist = 0;
+    options.verbose = 0;
+    options.instruction_set = -1;
+    options.periodic = 0;
+    options.fast_divide=0;
+    options.c_api_timer = 0;
+    int8_t xbin_ref=options.bin_refine_factors[0],
+        ybin_ref=options.bin_refine_factors[1],
+        zbin_ref=options.bin_refine_factors[2];
+
+    int autocorr=1;
+    int nthreads=4;
+    int cosmology=1;
+    int nmu_bins=10;
+    double mu_max=1.0;
+    char *binfile, *weighting_method_str = NULL;
+
+    static char *kwlist[] = {
+        "autocorr",
+        "cosmology",
+        "nthreads",
+        "mu_max",
+        "nmu_bins",
+        "binfile",
+        "RA1",
+        "DEC1",
+        "CZ1",
+        "weights1",
+        "RA2",
+        "DEC2",
+        "CZ2",
+        "weights2",
+        "is_comoving_dist",
+        "verbose", /* keyword verbose -> print extra info at runtime + progressbar */
+        "output_savg",
+        "fast_divide",
+        "xbin_refine_factor",
+        "ybin_refine_factor",
+        "zbin_refine_factor",
+        "max_cells_per_dim",
+        "c_api_timer",
+        "isa",/* instruction set to use of type enum isa; valid values are AVX, SSE, FALLBACK (enum) */
+        "weight_type",
+        NULL
+    };
+
+    if ( ! PyArg_ParseTupleAndKeywords(args, kwargs, "iiidisO!O!O!|O!O!O!O!O!bbbbbbbhbis", kwlist,
+                                       &autocorr,&cosmology,&nthreads,&mu_max,&nmu_bins,&binfile,
+                                       &PyArray_Type,&x1_obj,
+                                       &PyArray_Type,&y1_obj,
+                                       &PyArray_Type,&z1_obj,
+                                       &PyArray_Type,&weights1_obj,
+                                       &PyArray_Type,&x2_obj,//optional parameters -> if autocorr == 1, not checked; required if autocorr=0
+                                       &PyArray_Type,&y2_obj,
+                                       &PyArray_Type,&z2_obj,
+                                       &PyArray_Type,&weights2_obj,
+                                       &(options.is_comoving_dist),
+                                       &(options.verbose),
+                                       &(options.need_avg_sep),
+                                       &(options.fast_divide),
+                                       &xbin_ref, &ybin_ref, &zbin_ref,
+                                       &(options.max_cells_per_dim),
+                                       &(options.c_api_timer),
+                                       &(options.instruction_set),
+                                       &weighting_method_str)
+
+         ) {
+
+        PyObject_Print(kwargs, stdout, 0);
+        fprintf(stdout, "\n");
+
+        char msg[1024];
+        int len=snprintf(msg, 1024,"ArgumentError: In DDsmu_mocks> Could not parse the arguments. Input parameters are: \n");
+
+        /* How many keywords do we have? Subtract 1 because of the last NULL */
+        const size_t nitems = sizeof(kwlist)/sizeof(*kwlist) - 1;
+        int status = print_kwlist_into_msg(msg, 1024, len, kwlist, nitems);
+        if(status != EXIT_SUCCESS) {
+            fprintf(stderr,"Error message does not contain all of the keywords\n");
+        }
+
+        countpairs_mocks_error_out(module,msg);
+
+        Py_RETURN_NONE;
+    }
+
+    /*This is for the fastest isa */
+    if(options.instruction_set == -1) {
+        options.instruction_set = highest_isa_mocks;
+    }
+    if(xbin_ref != options.bin_refine_factors[0] ||
+       ybin_ref != options.bin_refine_factors[1] ||
+       zbin_ref != options.bin_refine_factors[2]) {
+        options.bin_refine_factors[0] = xbin_ref;
+        options.bin_refine_factors[1] = ybin_ref;
+        options.bin_refine_factors[2] = zbin_ref;
+        set_bin_refine_scheme(&options, BINNING_CUST);//custom binning -> code will honor requested binning scheme
+    }
+    /* We have numpy arrays and all the required inputs*/
+    /* How many data points are there? And are they all of floating point type */
+    size_t element_size;
+    const int64_t ND1 = check_dims_and_datatype(module, x1_obj, y1_obj, z1_obj, weights1_obj, &element_size);
+    if(ND1 == -1) {
+        //Error has already been set -> simply return
+        Py_RETURN_NONE;
+    }
+
+    /* Ensure the weights are of the right shape (n_weights, n_particles) */
+    if(weights1_obj != NULL){
+        // A numpy dimension of length -1 will be expanded to n_weights
+        npy_intp dims[2] = {-1, ND1};
+        PyArray_Dims pdims = {.ptr = &(dims[0]), .len = 2};
+        weights1_obj = (PyArrayObject *) PyArray_Newshape(weights1_obj, &pdims, NPY_CORDER);
+    }
+
+    /* Validate the user's choice of weighting method */
+    weight_method_t weighting_method;
+    int wstatus = get_weight_method_by_name(weighting_method_str, &weighting_method);
+    if(wstatus != EXIT_SUCCESS){
+        char msg[1024];
+        snprintf(msg, 1024, "ValueError: In %s: unknown weight_type %s!", __FUNCTION__, weighting_method_str);
+        countpairs_mocks_error_out(module, msg);
+        Py_RETURN_NONE;
+    }
+    int found_weights = weights1_obj == NULL ? 0 : PyArray_SHAPE(weights1_obj)[0];
+    struct extra_options extra = get_extra_options(weighting_method);
+    if(extra.weights0.num_weights > 0 && extra.weights0.num_weights != found_weights){
+        char msg[1024];
+        snprintf(msg, 1024, "ValueError: In %s: specified weighting method %s which requires %"PRId64" weight(s)-per-particle, but found %d weight(s) instead!\n",
+                 __FUNCTION__, weighting_method_str, extra.weights0.num_weights, found_weights);
+        countpairs_mocks_error_out(module, msg);
+        Py_RETURN_NONE;
+    }
+
+    if(extra.weights0.num_weights > 0 && found_weights > MAX_NUM_WEIGHTS){
+        char msg[1024];
+        snprintf(msg, 1024, "ValueError: In %s: Provided %d weights-per-particle, but the code was compiled with MAX_NUM_WEIGHTS=%d.\n",
+                 __FUNCTION__, found_weights, MAX_NUM_WEIGHTS);
+        countpairs_mocks_error_out(module, msg);
+        Py_RETURN_NONE;
+    }
+
+    int64_t ND2 = ND1;
+    if(autocorr == 0) {
+        char msg[1024];
+        if(x2_obj == NULL || y2_obj == NULL || z2_obj == NULL) {
+            snprintf(msg, 1024, "ValueError: In %s: If autocorr is 0, need to pass the second set of positions (RA2=numpy array, DEC2=numpy array, CZ2=numpy array).\n",
+                     __FUNCTION__);
+            countpairs_mocks_error_out(module, msg);
+            Py_RETURN_NONE;
+        }
+        if((weights1_obj == NULL) != (weights2_obj == NULL)){
+            snprintf(msg, 1024, "ValueError: In %s: If autocorr is 0, must pass either zero or two sets of weights.\n",
+                     __FUNCTION__);
+            countpairs_mocks_error_out(module, msg);
+            Py_RETURN_NONE;
+        }
+
+        size_t element_size2;
+        ND2 = check_dims_and_datatype(module, x2_obj, y2_obj, z2_obj, weights2_obj, &element_size2);
+        if(ND2 == -1) {
+            //Error has already been set -> simply return
+            Py_RETURN_NONE;
+        }
+        /* Ensure the weights are of the right shape (n_weights, n_particles) */
+        if(weights2_obj != NULL){
+            npy_intp dims[2] = {-1, ND2};
+            PyArray_Dims pdims = {.ptr = &(dims[0]), .len = 2};
+            weights2_obj = (PyArrayObject *) PyArray_Newshape(weights2_obj, &pdims, NPY_CORDER);
+        }
+
+        if(element_size != element_size2) {
+            snprintf(msg, 1024, "TypeError: In %s: The two arrays must have the same data-type. First array is of type %s while second array is of type %s\n",
+                     __FUNCTION__, element_size == 4 ? "floats":"doubles", element_size2 == 4 ? "floats":"doubles");
+            countpairs_mocks_error_out(module, msg);
+            Py_RETURN_NONE;
+        }
+    }
+
+    /* Interpret the input objects as numpy arrays. */
+    const int requirements = NPY_ARRAY_IN_ARRAY;
+    PyObject *x1_array = NULL, *y1_array = NULL, *z1_array = NULL, *weights1_array = NULL;
+    PyObject *x2_array = NULL, *y2_array = NULL, *z2_array = NULL, *weights2_array = NULL;
+    x1_array = PyArray_FromArray(x1_obj, NOTYPE_DESCR, requirements);
+    y1_array = PyArray_FromArray(y1_obj, NOTYPE_DESCR, requirements);
+    z1_array = PyArray_FromArray(z1_obj, NOTYPE_DESCR, requirements);
+    if(weights1_obj != NULL){
+        weights1_array = PyArray_FromArray(weights1_obj, NOTYPE_DESCR, requirements);
+    }
+
+    if(autocorr == 0) {
+        x2_array = PyArray_FromArray(x2_obj, NOTYPE_DESCR, requirements);
+        y2_array = PyArray_FromArray(y2_obj, NOTYPE_DESCR, requirements);
+        z2_array = PyArray_FromArray(z2_obj, NOTYPE_DESCR, requirements);
+        if(weights2_obj != NULL){
+            weights2_array = PyArray_FromArray(weights2_obj, NOTYPE_DESCR, requirements);
+        }
+    }
+
+    if (x1_array == NULL || y1_array == NULL || z1_array == NULL ||
+        (autocorr == 0 && (x2_array == NULL || y2_array == NULL || z2_array == NULL))) {
+        Py_XDECREF(x1_array);
+        Py_XDECREF(y1_array);
+        Py_XDECREF(z1_array);
+        Py_XDECREF(weights1_array);
+
+        Py_XDECREF(x2_array);
+        Py_XDECREF(y2_array);
+        Py_XDECREF(z2_array);
+        Py_XDECREF(weights2_array);
+        char msg[1024];
+        snprintf(msg, 1024, "TypeError: In %s: Could not convert input to arrays of allowed floating point types (doubles or floats). Are you passing numpy arrays?",
+                 __FUNCTION__);
+        countpairs_mocks_error_out(module, msg);
+        Py_RETURN_NONE;
+    }
+
+    /* Get pointers to the data as C-types. */
+    void *phiD1=NULL, *thetaD1=NULL, *czD1=NULL, *weights1=NULL;
+    void *phiD2=NULL, *thetaD2=NULL, *czD2=NULL, *weights2=NULL;
+
+    phiD1   = PyArray_DATA((PyArrayObject *)x1_array);
+    thetaD1 = PyArray_DATA((PyArrayObject *)y1_array);
+    czD1    = PyArray_DATA((PyArrayObject *)z1_array);
+    if(weights1_array != NULL){
+        weights1 = PyArray_DATA((PyArrayObject *) weights1_array);
+    }
+
+    if(autocorr == 0) {
+        phiD2   = PyArray_DATA((PyArrayObject *) x2_array);
+        thetaD2 = PyArray_DATA((PyArrayObject *) y2_array);
+        czD2    = PyArray_DATA((PyArrayObject *) z2_array);
+        if(weights2_array != NULL){
+            weights2 = PyArray_DATA((PyArrayObject *) weights2_array);
+        }
+    }
+    options.float_type = element_size;
+
+    /* Pack the weights into extra_options */
+    for(int64_t w = 0; w < extra.weights0.num_weights; w++){
+        extra.weights0.weights[w] = (char *) weights1 + w*ND1*element_size;
+        if(autocorr == 0){
+            extra.weights1.weights[w] = (char *) weights2 + w*ND2*element_size;
+        }
+    }
+
+    NPY_BEGIN_THREADS_DEF;
+    NPY_BEGIN_THREADS;
+
+    results_countpairs_mocks_s_mu results;
+    double c_api_time = 0.0;
+    int status = countpairs_mocks_s_mu(ND1,phiD1,thetaD1,czD1,
+                                       ND2,phiD2,thetaD2,czD2,
+                                       nthreads,
+                                       autocorr,
+                                       binfile,
+                                       mu_max,
+                                       nmu_bins,
+                                       cosmology,
+                                       &results,
+                                       &options,
+                                       &extra);
+    if(options.c_api_timer) {
+        c_api_time = options.c_api_time;
+    }
+    NPY_END_THREADS;
+
+    /* Clean up. */
+    Py_DECREF(x1_array);Py_DECREF(y1_array);Py_DECREF(z1_array);Py_XDECREF(weights1_array);//x1 should absolutely not be NULL
+    Py_XDECREF(x2_array);Py_XDECREF(y2_array);Py_XDECREF(z2_array);Py_XDECREF(weights2_array);//x2 might be NULL depending on value of autocorr
+
+    if(status != EXIT_SUCCESS) {
+        Py_RETURN_NONE;
+    }
+
+    /* Build the output list */
+    PyObject *ret = PyList_New(0);//create an empty list
+    if(ret == NULL) {
+        free_results_mocks_s_mu(&results);
+        return NULL;
+    }
+    double rlow=results.supp[0];
+    const double dmu = mu_max/(double)results.nmu_bins ;
+
+    for(int i=1;i<results.nsbin;i++) {
+        for(int j=0;j<results.nmu_bins;j++) {
+            const int bin_index = i*(results.nmu_bins + 1) + j;
+            PyObject *item = NULL;
+            const double savg = results.savg[bin_index];
+            const double weight_avg = results.weightavg[bin_index];
+            item = Py_BuildValue("(ddddKd)", rlow,results.supp[i],savg,(j+1)*dmu,(unsigned long long) results.npairs[bin_index], weight_avg);//"K" + cast stays correct where long is 32 bits
+            PyList_Append(ret, item);
+            Py_XDECREF(item);
+        }
+        rlow=results.supp[i];
+    }
+    free_results_mocks_s_mu(&results);
+    /* "N" hands our only reference to the tuple; "O" would add one and leak the list */
+    return Py_BuildValue("(Nd)", ret, c_api_time);
+}
+
 static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *args, PyObject *kwargs)
 {
     //Error-handling is global in python2 -> stored in struct module_state _struct declared at the top of this file
 #if PY_MAJOR_VERSION < 3
     (void) self;
-    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash. 
+    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash.
 #else
     //In python3, self is simply the module object that was returned earlier by init
     PyObject *module = self;
-#endif    
+#endif
 
     PyArrayObject *x1_obj=NULL, *y1_obj=NULL, *weights1_obj=NULL;
     PyArrayObject *x2_obj=NULL, *y2_obj=NULL, *weights2_obj=NULL;
@@ -1272,7 +1756,7 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
                                        &(options.verbose),
                                        &(options.need_avg_sep),
                                        &(options.fast_acos),
-                                       &ra_bin_ref, &dec_bin_ref, 
+                                       &ra_bin_ref, &dec_bin_ref,
                                        &(options.max_cells_per_dim),
                                        &(options.c_api_timer),
                                        &(options.instruction_set),
@@ -1284,7 +1768,7 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
 
         char msg[1024];
         int len=snprintf(msg, 1024,"ArgumentError: In DDtheta_mocks> Could not parse the arguments. Input parameters are: \n");
-        
+
         /* How many keywords do we have? Subtract 1 because of the last NULL */
         const size_t nitems = sizeof(kwlist)/sizeof(*kwlist) - 1;
         int status = print_kwlist_into_msg(msg, 1024, len, kwlist, nitems);
@@ -1308,7 +1792,7 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
         set_bin_refine_scheme(&options, BINNING_CUST);//custom binning -> code will honor requested binning scheme
     }
 
-    
+
     size_t element_size;
     /* We have numpy arrays and all the required inputs*/
     /* How many data points are there? And are they all of floating point type */
@@ -1317,7 +1801,7 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
         //Error has already been set -> simply return
         Py_RETURN_NONE;
     }
-    
+
     /* Ensure the weights are of the right shape (n_weights, n_particles) */
     if(weights1_obj != NULL){
         // A numpy dimension of length -1 will be expanded to n_weights
@@ -1325,7 +1809,7 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
         PyArray_Dims pdims = {.ptr = &(dims[0]), .len = 2};
         weights1_obj = (PyArrayObject *) PyArray_Newshape(weights1_obj, &pdims, NPY_CORDER);
     }
-    
+
     /* Validate the user's choice of weighting method */
     weight_method_t weighting_method;
     int wstatus = get_weight_method_by_name(weighting_method_str, &weighting_method);
@@ -1344,7 +1828,7 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
         countpairs_mocks_error_out(module, msg);
         Py_RETURN_NONE;
     }
-    
+
     if(extra.weights0.num_weights > 0 && found_weights > MAX_NUM_WEIGHTS){
         char msg[1024];
         snprintf(msg, 1024, "ValueError: In %s: Provided %d weights-per-particle, but the code was compiled with MAX_NUM_WEIGHTS=%d.\n",
@@ -1368,11 +1852,11 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
             countpairs_mocks_error_out(module, msg);
             Py_RETURN_NONE;
         }
-        
+
         size_t element_size2;
         ND2 = check_dims_and_datatype_ra_dec(module, x2_obj, y2_obj,&element_size2);
         if(ND2 == -1) {
-            //Error has already been set -> simply return 
+            //Error has already been set -> simply return
             Py_RETURN_NONE;
         }
         /* Ensure the weights are of the right shape (n_weights, n_particles) */
@@ -1381,7 +1865,7 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
             PyArray_Dims pdims = {.ptr = &(dims[0]), .len = 2};
             weights2_obj = (PyArrayObject *) PyArray_Newshape(weights2_obj, &pdims, NPY_CORDER);
         }
-        
+
         if(element_size != element_size2) {
             snprintf(msg, 1024, "TypeError: In %s: The two arrays must have the same data-type. First array is of type %s while second array is of type %s\n",
                      __FUNCTION__, element_size == 4 ? "floats":"doubles", element_size2 == 4 ? "floats":"doubles");
@@ -1389,7 +1873,7 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
             Py_RETURN_NONE;
         }
     }
-    
+
     /* Interpret the input objects as numpy arrays. */
     const int requirements = NPY_ARRAY_IN_ARRAY;
     PyObject *x1_array = NULL, *y1_array = NULL, *weights1_array = NULL;
@@ -1408,7 +1892,7 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
         }
     }
 
-    if (x1_array == NULL || y1_array == NULL || 
+    if (x1_array == NULL || y1_array == NULL ||
         (autocorr == 0 && (x2_array == NULL || y2_array == NULL))) {
         Py_XDECREF(x1_array);
         Py_XDECREF(y1_array);
@@ -1427,7 +1911,7 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
     /* Get pointers to the data as C-types. */
     void *phiD1 = NULL, *thetaD1 = NULL, *weights1=NULL;
     void *phiD2 = NULL, *thetaD2 = NULL, *weights2=NULL;
-    phiD1   = PyArray_DATA((PyArrayObject *) x1_array); 
+    phiD1   = PyArray_DATA((PyArrayObject *) x1_array);
     thetaD1 = PyArray_DATA((PyArrayObject *) y1_array);
     if(weights1_array != NULL){
         weights1 = PyArray_DATA((PyArrayObject *) weights1_array);
@@ -1440,7 +1924,7 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
             weights2 = PyArray_DATA((PyArrayObject *) weights2_array);
         }
     }
-    
+
     /* Pack the weights into extra_options */
     for(int64_t w = 0; w < extra.weights0.num_weights; w++){
         extra.weights0.weights[w] = (char *) weights1 + w*ND1*element_size;
@@ -1468,7 +1952,7 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
     }
     NPY_END_THREADS;
 
-    
+
     /* Clean up. */
     Py_DECREF(x1_array);Py_DECREF(y1_array);Py_XDECREF(weights1_array);//x1/y1 (representing ra1,dec1) should not be NULL
     Py_XDECREF(x2_array);Py_XDECREF(y2_array);Py_XDECREF(weights2_array);//x2/y2 may be NULL (in case of autocorr)
@@ -1476,7 +1960,7 @@ static PyObject *countpairs_countpairs_theta_mocks(PyObject *self, PyObject *arg
     if(status != EXIT_SUCCESS) {
         Py_RETURN_NONE;
     }
-    
+
 #if 0
     /*---Output-Pairs-------------------------------------*/
     double theta_low = results.theta_upp[0];
@@ -1509,11 +1993,11 @@ static PyObject *countpairs_countspheres_vpf_mocks(PyObject *self, PyObject *arg
     //Error-handling is global in python2 -> stored in struct module_state _struct declared at the top of this file
 #if PY_MAJOR_VERSION < 3
     (void) self;
-    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash. 
+    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash.
 #else
     //In python3, self is simply the module object that was returned earlier by init
     PyObject *module = self;
-#endif    
+#endif
 
     //x1->ra (phi), y1-> declination (theta1), z1->cz (cz1)
     //x2->ra (ph2), y2-> declination (theta2), z2->cz (cz2)
@@ -1536,7 +2020,7 @@ static PyObject *countpairs_countspheres_vpf_mocks(PyObject *self, PyObject *arg
     int8_t xbin_ref=options.bin_refine_factors[0],
         ybin_ref=options.bin_refine_factors[1],
         zbin_ref=options.bin_refine_factors[2];
-    
+
     static char *kwlist[] = {
         "rmax",
         "nbins",
@@ -1585,7 +2069,7 @@ static PyObject *countpairs_countspheres_vpf_mocks(PyObject *self, PyObject *arg
 
         char msg[1024];
         int len=snprintf(msg, 1024,"ArgumentError: In vpf_mocks> Could not parse the arguments. Input parameters are: \n");
-        
+
         /* How many keywords do we have? Subtract 1 because of the last NULL */
         const size_t nitems = sizeof(kwlist)/sizeof(*kwlist) - 1;
         int status = print_kwlist_into_msg(msg, 1024, len, kwlist, nitems);
@@ -1593,7 +2077,7 @@ static PyObject *countpairs_countspheres_vpf_mocks(PyObject *self, PyObject *arg
             fprintf(stderr,"Error message does not contain all of the keywords\n");
         }
         countpairs_mocks_error_out(module,msg);
-        
+
         Py_RETURN_NONE;
     }
     /*This is for the fastest isa */
@@ -1609,20 +2093,20 @@ static PyObject *countpairs_countspheres_vpf_mocks(PyObject *self, PyObject *arg
         options.bin_refine_factors[2] = zbin_ref;
         set_bin_refine_scheme(&options, BINNING_CUST);//custom binning -> code will honor requested binning scheme
     }
-    
+
     size_t element_size;
     /* We have numpy arrays and all the required inputs*/
     /* How many data points are there? And are they all of floating point type */
     const int64_t ND1 = check_dims_and_datatype(module, x1_obj, y1_obj, z1_obj, NULL, &element_size);
     if(ND1 == -1) {
-        //Error has already been set -> simply return 
+        //Error has already been set -> simply return
         Py_RETURN_NONE;
     }
 
     size_t element_size2;
     const int64_t ND2 = check_dims_and_datatype(module, x2_obj, y2_obj, z2_obj, NULL, &element_size2);
     if(ND2 == -1) {
-        //Error has already been set -> simply return 
+        //Error has already been set -> simply return
         Py_RETURN_NONE;
     }
 
@@ -1634,7 +2118,7 @@ static PyObject *countpairs_countspheres_vpf_mocks(PyObject *self, PyObject *arg
         Py_RETURN_NONE;
     }
 
-    
+
     /* Interpret the input objects as numpy arrays. */
     const int requirements = NPY_ARRAY_IN_ARRAY;
     PyObject *x1_array = NULL, *y1_array = NULL, *z1_array = NULL;
@@ -1666,7 +2150,7 @@ static PyObject *countpairs_countspheres_vpf_mocks(PyObject *self, PyObject *arg
     /* Get pointers to the data as C-types. */
     void *phiD1=NULL, *thetaD1=NULL,*czD1=NULL;
     void *phiD2=NULL, *thetaD2=NULL,*czD2=NULL;
-    
+
     phiD1   = PyArray_DATA((PyArrayObject *) x1_array);
     thetaD1 = PyArray_DATA((PyArrayObject *) y1_array);
     czD1    = PyArray_DATA((PyArrayObject *) z1_array);
@@ -1677,7 +2161,7 @@ static PyObject *countpairs_countspheres_vpf_mocks(PyObject *self, PyObject *arg
 
     NPY_BEGIN_THREADS_DEF;
     NPY_BEGIN_THREADS;
-    
+
     results_countspheres_mocks results;
     options.float_type = element_size;
     double c_api_time = 0.0;
@@ -1702,7 +2186,7 @@ static PyObject *countpairs_countspheres_vpf_mocks(PyObject *self, PyObject *arg
     if(status != EXIT_SUCCESS) {
         Py_RETURN_NONE;
     }
-    
+
 #if 0
     // Output the results
     const double rstep = rmax/(double)nbin ;
diff --git a/mocks/python_bindings/call_correlation_functions_mocks.py b/mocks/python_bindings/call_correlation_functions_mocks.py
index 5fe5d344..ede6fa47 100644
--- a/mocks/python_bindings/call_correlation_functions_mocks.py
+++ b/mocks/python_bindings/call_correlation_functions_mocks.py
@@ -15,7 +15,8 @@
 from _countpairs_mocks import \
     countpairs_rp_pi_mocks as rp_pi_mocks,\
     countpairs_theta_mocks as theta_mocks,\
-    countspheres_vpf_mocks as vpf_mocks
+    countspheres_vpf_mocks as vpf_mocks, \
+    countpairs_s_mu_mocks as s_mu_mocks
 
 
 try:
@@ -80,7 +81,7 @@ def main():
     autocorr = 1
     numbins_to_print = 5
     cosmology = 1
-    
+
     print("\nRunning 2-D correlation function xi(rp,pi)")
     results_DDrppi, _ = rp_pi_mocks(autocorr, cosmology, nthreads,
                                     pimax, binfile,
@@ -115,9 +116,29 @@ def main():
         items = results_DDrppi[ibin]
         print("{0:12.4f} {1:12.4f} {2:10.4f} {3:10.1f} {4:10d}"
               .format(items[0], items[1], items[2], items[3], items[4]))
-    
+
     print("-----------------------------------------------------------")
-    
+
+    nmu_bins = 10
+    mu_max = 1.0
+
+    print("\nRunning 2-D correlation function xi(s,mu)")
+    results_DDsmu, _ = s_mu_mocks(autocorr, cosmology, nthreads,
+                                  mu_max, nmu_bins, binfile,
+                                  ra, dec, cz, weights1=weights,
+                                  output_savg=True, verbose=True,
+                                  weight_type='pair_product')
+    print("\n#            ****** DD(s,mu): first {0} bins  *******      "
+          .format(numbins_to_print))
+    print("#      smin        smax       savg     mu_upper    npairs     weight_avg")
+    print("##########################################################################")
+    for ibin in range(numbins_to_print):
+        items = results_DDsmu[ibin]
+        print("{0:12.4f} {1:12.4f} {2:10.4f} {3:10.1f} {4:10d} {5:12.4f}"
+              .format(items[0], items[1], items[2], items[3], items[4], items[5]))
+
+    print("--------------------------------------------------------------------------")
+
     binfile = pjoin(dirname(abspath(__file__)),
                     "../tests/", "angular_bins")
     print("\nRunning angular correlation function w(theta)")
diff --git a/mocks/tests/Makefile b/mocks/tests/Makefile
index 6300a11c..05c04d7d 100644
--- a/mocks/tests/Makefile
+++ b/mocks/tests/Makefile
@@ -8,10 +8,12 @@ IO_DIR := $(ROOT_DIR)/io
 
 MOCKS_DIR := $(ROOT_DIR)/mocks
 DDrppi_mocks_DIR := $(MOCKS_DIR)/DDrppi_mocks
+DDsmu_mocks_DIR := $(MOCKS_DIR)/DDsmu_mocks
 DDtheta_mocks_DIR := $(MOCKS_DIR)/DDtheta_mocks
 VPF_mocks_DIR := $(MOCKS_DIR)/vpf_mocks
 
 DDrppi_mocks_LIB := countpairs_rp_pi_mocks
+DDsmu_mocks_LIB := countpairs_s_mu_mocks
 DDtheta_mocks_LIB := countpairs_theta_mocks
 VPF_mocks_LIB := countspheres_mocks
 
@@ -28,14 +30,15 @@ endif
 TARGETSRC   := tests_mocks.c $(IO_DIR)/io.c $(IO_DIR)/ftread.c $(UTILS_DIR)/utils.c $(UTILS_DIR)/cosmology_params.c
 TARGETOBJS  := $(TARGETSRC:.c=.o)
 C_LIBRARIES := $(DDrppi_mocks_DIR)/lib$(DDrppi_mocks_LIB).a $(DDtheta_mocks_DIR)/lib$(DDtheta_mocks_LIB).a \
-             $(VPF_mocks_DIR)/lib$(VPF_mocks_LIB).a
-INCL   := $(IO_DIR)/io.h $(IO_DIR)/ftread.h $(UTILS_DIR)/utils.h \
-          $(DDrppi_mocks_DIR)/$(DDrppi_mocks_LIB).h $(DDtheta_mocks_DIR)/$(DDtheta_mocks_LIB).h $(VPF_mocks_DIR)/$(VPF_mocks_LIB).h
+             $(VPF_mocks_DIR)/lib$(VPF_mocks_LIB).a $(DDsmu_mocks_DIR)/lib$(DDsmu_mocks_LIB).a
+INCL   := $(IO_DIR)/io.h $(IO_DIR)/ftread.h $(UTILS_DIR)/utils.h $(UTILS_DIR)/tests_common.h \
+          $(DDrppi_mocks_DIR)/$(DDrppi_mocks_LIB).h $(DDtheta_mocks_DIR)/$(DDtheta_mocks_LIB).h $(VPF_mocks_DIR)/$(VPF_mocks_LIB).h \
+	$(DDsmu_mocks_DIR)/$(DDsmu_mocks_LIB).h
 
-EXTRA_INCL:=-DDOUBLE_PREC -I$(DDrppi_mocks_DIR) -I$(DDtheta_mocks_DIR) -I$(VPF_mocks_DIR) $(GSL_CFLAGS)
+EXTRA_INCL:=-DDOUBLE_PREC -I$(DDrppi_mocks_DIR) -I$(DDtheta_mocks_DIR) -I$(VPF_mocks_DIR) -I$(DDsmu_mocks_DIR)  $(GSL_CFLAGS)
 EXTRA_LINK := $(GSL_LINK)
 
-OPT := 
+OPT :=
 
 all: tests $(TARGETS) $(INCL) uncompress $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk Makefile
 
@@ -45,6 +48,9 @@ UTILS_SRC := $(UTILS_DIR)/*.[ch] $(UTILS_DIR)/*.c.src $(UTILS_DIR)/*.h.src
 $(DDrppi_mocks_DIR)/lib$(DDrppi_mocks_LIB).a: $(DDrppi_mocks_DIR)/*.c $(DDrppi_mocks_DIR)/*.c.src $(DDrppi_mocks_DIR)/*.h.src $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk $(UTILS_SRC)
 	$(MAKE) -C $(DDrppi_mocks_DIR) libs
 
+$(DDsmu_mocks_DIR)/lib$(DDsmu_mocks_LIB).a: $(DDsmu_mocks_DIR)/*.c $(DDsmu_mocks_DIR)/*.c.src $(DDsmu_mocks_DIR)/*.h.src $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk $(UTILS_SRC)
+	$(MAKE) -C $(DDsmu_mocks_DIR) libs
+
 $(DDtheta_mocks_DIR)/lib$(DDtheta_mocks_LIB).a: $(DDtheta_mocks_DIR)/*.c $(DDtheta_mocks_DIR)/*.c.src $(DDtheta_mocks_DIR)/*.h.src $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk $(UTILS_SRC)
 	$(MAKE) -C $(DDtheta_mocks_DIR) libs
 
@@ -52,9 +58,9 @@ $(VPF_mocks_DIR)/lib$(VPF_mocks_LIB).a: $(VPF_mocks_DIR)/*.c $(VPF_mocks_DIR)/*.
 	$(MAKE) -C $(VPF_mocks_DIR) libs
 
 python_lib: tests $(TARGETOBJS) $(INCL) $(ROOT_DIR)/mocks.options $(ROOT_DIR)/common.mk Makefile | $(ROOT_DIR)/lib
-	@echo 
+	@echo
 	@echo "All MOCKS tests are done. Now checking that the C extensions work."
-	@echo 
+	@echo
 	$(MAKE) -C ../python_bindings tests
 
 tests: $(TARGET)
@@ -69,14 +75,17 @@ uncompress: | data
 		cd ..; \
 	}
 DDrppi_mocks: $(TARGET)
-	./$(TARGET) 0 3  
+	./$(TARGET) 0 3
 
 DDtheta_mocks: tests_mocks
-	./$(TARGET) 1 4 
+	./$(TARGET) 1 4
 
 vpf_mocks: tests_mocks
 	./$(TARGET) 2 5
 
+DDsmu_mocks: tests_mocks
+	./$(TARGET) 6 7
+
 clean:
 	$(RM) $(TARGETS) $(TARGETOBJS)
 	$(RM) -R *.dSYM
diff --git a/mocks/tests/Mr19_mock_DDsmu.DR b/mocks/tests/Mr19_mock_DDsmu.DR
new file mode 100644
index 00000000..5dc7f136
--- /dev/null
+++ b/mocks/tests/Mr19_mock_DDsmu.DR
@@ -0,0 +1,140 @@
+        16           0.12731059          -0.82871857            0.10000000           0.20633790 
+        11           0.12515353          -0.82871857            0.20000000           0.15051830 
+        18           0.12961133          -0.82871857            0.30000000           0.29915851 
+        11           0.12919907          -0.82871857            0.40000000           0.36693781 
+         8           0.13127004          -0.82871857            0.50000000           0.18077128 
+        11           0.12755128          -0.82871857            0.60000000           0.19729862 
+        15           0.12823310          -0.82871857            0.70000000           0.30941661 
+         8           0.13026533          -0.82871857            0.80000000           0.38308526 
+        11           0.12652695          -0.82871857            0.90000000           0.27457505 
+        11           0.12088876          -0.82871857            1.00000000           0.27030041 
+        44           0.18988933          -0.65743714            0.10000000           0.25046043 
+        40           0.18934067          -0.65743714            0.20000000           0.25819683 
+        46           0.18927409          -0.65743714            0.30000000           0.19757136 
+        23           0.18814162          -0.65743714            0.40000000           0.28705438 
+        44           0.18967457          -0.65743714            0.50000000           0.23617978 
+        37           0.18986317          -0.65743714            0.60000000           0.22119408 
+        33           0.18751866          -0.65743714            0.70000000           0.21886382 
+        49           0.19007635          -0.65743714            0.80000000           0.21821130 
+        54           0.18674075          -0.65743714            0.90000000           0.27237927 
+        36           0.19187276          -0.65743714            1.00000000           0.24887600 
+       144           0.27963534          -0.48615571            0.10000000           0.25038225 
+       155           0.28616186          -0.48615571            0.20000000           0.23707988 
+       163           0.28191365          -0.48615571            0.30000000           0.26649402 
+       120           0.27891034          -0.48615571            0.40000000           0.24197924 
+       135           0.28152441          -0.48615571            0.50000000           0.27956477 
+       166           0.28448131          -0.48615571            0.60000000           0.25379995 
+       124           0.27656690          -0.48615571            0.70000000           0.22831096 
+       149           0.27609419          -0.48615571            0.80000000           0.24063340 
+       128           0.27998106          -0.48615571            0.90000000           0.27801332 
+       143           0.27705729          -0.48615571            1.00000000           0.26451608 
+       508           0.41566032          -0.31487428            0.10000000           0.25759224 
+       445           0.41393342          -0.31487428            0.20000000           0.25868560 
+       444           0.41368517          -0.31487428            0.30000000           0.25914849 
+       474           0.41476120          -0.31487428            0.40000000           0.24918414 
+       447           0.41821551          -0.31487428            0.50000000           0.24930410 
+       458           0.41612023          -0.31487428            0.60000000           0.26149722 
+       401           0.41942983          -0.31487428            0.70000000           0.24346060 
+       476           0.41491533          -0.31487428            0.80000000           0.26839799 
+       450           0.41631213          -0.31487428            0.90000000           0.25158405 
+       456           0.41624403          -0.31487428            1.00000000           0.25803716 
+      1505           0.61603708          -0.14359285            0.10000000           0.25254030 
+      1504           0.61662759          -0.14359285            0.20000000           0.25054681 
+      1440           0.61864632          -0.14359285            0.30000000           0.25218452 
+      1475           0.61395625          -0.14359285            0.40000000           0.24692343 
+      1464           0.61714052          -0.14359285            0.50000000           0.25008302 
+      1358           0.61534298          -0.14359285            0.60000000           0.25381702 
+      1460           0.61634313          -0.14359285            0.70000000           0.25965402 
+      1463           0.61748002          -0.14359285            0.80000000           0.24329196 
+      1486           0.61420989          -0.14359285            0.90000000           0.25247011 
+      1535           0.61364964          -0.14359285            1.00000000           0.25148934 
+      4840           0.91599811           0.02768858            0.10000000           0.25299581 
+      4778           0.91322204           0.02768858            0.20000000           0.24907554 
+      4892           0.91413760           0.02768858            0.30000000           0.25405492 
+      4755           0.91489691           0.02768858            0.40000000           0.24244488 
+      4861           0.91404203           0.02768858            0.50000000           0.24703680 
+      4849           0.91502518           0.02768858            0.60000000           0.25107696 
+      4905           0.91465004           0.02768858            0.70000000           0.25486661 
+      4808           0.91361794           0.02768858            0.80000000           0.24922259 
+      4731           0.91424215           0.02768858            0.90000000           0.25047345 
+      4779           0.91412279           0.02768858            1.00000000           0.25095504 
+     15677           1.35610305           0.19897000            0.10000000           0.24670153 
+     15526           1.35514701           0.19897000            0.20000000           0.24716871 
+     15417           1.35534215           0.19897000            0.30000000           0.24755408 
+     15465           1.35677595           0.19897000            0.40000000           0.25058873 
+     15516           1.35614667           0.19897000            0.50000000           0.25110275 
+     15343           1.35248044           0.19897000            0.60000000           0.24583133 
+     15448           1.35441951           0.19897000            0.70000000           0.24819576 
+     15314           1.35850655           0.19897000            0.80000000           0.24877819 
+     15363           1.35733116           0.19897000            0.90000000           0.25047418 
+     15323           1.35480003           0.19897000            1.00000000           0.24727734 
+     50173           2.01189660           0.37025143            0.10000000           0.24871628 
+     50333           2.01226591           0.37025143            0.20000000           0.25029286 
+     49672           2.00924038           0.37025143            0.30000000           0.24951873 
+     50105           2.01148289           0.37025143            0.40000000           0.25016451 
+     50514           2.01265806           0.37025143            0.50000000           0.24985593 
+     50145           2.01417889           0.37025143            0.60000000           0.24999501 
+     49984           2.01375145           0.37025143            0.70000000           0.25029335 
+     50498           2.01440817           0.37025143            0.80000000           0.25006703 
+     50131           2.01264557           0.37025143            0.90000000           0.24950389 
+     50712           2.01214364           0.37025143            1.00000000           0.24930654 
+    163798           2.98425196           0.54153286            0.10000000           0.24966735 
+    163542           2.98354034           0.54153286            0.20000000           0.24918066 
+    162483           2.98380717           0.54153286            0.30000000           0.25002588 
+    163085           2.98434898           0.54153286            0.40000000           0.24952674 
+    162497           2.98289432           0.54153286            0.50000000           0.24977827 
+    161895           2.98420528           0.54153286            0.60000000           0.24968099 
+    161978           2.98355442           0.54153286            0.70000000           0.25013319 
+    162020           2.98299090           0.54153286            0.80000000           0.25021899 
+    162310           2.98442359           0.54153286            0.90000000           0.24973058 
+    162560           2.98411141           0.54153286            1.00000000           0.24988413 
+    528585           4.42722780           0.71281429            0.10000000           0.25005917 
+    526245           4.42650446           0.71281429            0.20000000           0.24960208 
+    523747           4.42770388           0.71281429            0.30000000           0.25009919 
+    524827           4.42799042           0.71281429            0.40000000           0.24995922 
+    523306           4.42656072           0.71281429            0.50000000           0.24934124 
+    520819           4.42697410           0.71281429            0.60000000           0.24973925 
+    522375           4.42750903           0.71281429            0.70000000           0.24927569 
+    519933           4.42624285           0.71281429            0.80000000           0.24964784 
+    519286           4.42590936           0.71281429            0.90000000           0.24948366 
+    523998           4.42625008           0.71281429            1.00000000           0.24982650 
+   1701198           6.56722610           0.88409572            0.10000000           0.24981610 
+   1695386           6.56615839           0.88409572            0.20000000           0.24980648 
+   1689614           6.56582680           0.88409572            0.30000000           0.24970345 
+   1680288           6.56634252           0.88409572            0.40000000           0.24980833 
+   1676873           6.56555037           0.88409572            0.50000000           0.24982931 
+   1672186           6.56526366           0.88409572            0.60000000           0.25029098 
+   1669435           6.56508268           0.88409572            0.70000000           0.24980584 
+   1667573           6.56444316           0.88409572            0.80000000           0.24978917 
+   1666891           6.56512622           0.88409572            0.90000000           0.24970372 
+   1681090           6.56461210           0.88409572            1.00000000           0.24942779 
+   5469907           9.73975928           1.05537715            0.10000000           0.24980874 
+   5427364           9.73869893           1.05537715            0.20000000           0.25010323 
+   5392991           9.73928456           1.05537715            0.30000000           0.24976474 
+   5362761           9.73865074           1.05537715            0.40000000           0.25003968 
+   5336546           9.73801205           1.05537715            0.50000000           0.24989101 
+   5309251           9.73660850           1.05537715            0.60000000           0.24982844 
+   5290896           9.73708529           1.05537715            0.70000000           0.25010795 
+   5281317           9.73624504           1.05537715            0.80000000           0.24995278 
+   5275222           9.73499203           1.05537715            0.90000000           0.24979796 
+   5314662           9.73553057           1.05537715            1.00000000           0.24971061 
+  17319173          14.44207574           1.22665858            0.10000000           0.24983025 
+  17152369          14.43989969           1.22665858            0.20000000           0.24972279 
+  16982951          14.43981743           1.22665858            0.30000000           0.24975144 
+  16852909          14.43816283           1.22665858            0.40000000           0.24973681 
+  16728133          14.43656638           1.22665858            0.50000000           0.24975790 
+  16615986          14.43514019           1.22665858            0.60000000           0.24969757 
+  16520907          14.43442137           1.22665858            0.70000000           0.24973680 
+  16439429          14.43367852           1.22665858            0.80000000           0.24977122 
+  16409485          14.43252612           1.22665858            0.90000000           0.24970919 
+  16544115          14.43317964           1.22665858            1.00000000           0.24992302 
+  53981727          21.41207942           1.39794001            0.10000000           0.24972265 
+  53169238          21.40834689           1.39794001            0.20000000           0.24970470 
+  52467621          21.40406125           1.39794001            0.30000000           0.24966018 
+  51795544          21.40083803           1.39794001            0.40000000           0.24964408 
+  51210464          21.39821490           1.39794001            0.50000000           0.24965978 
+  50699567          21.39559861           1.39794001            0.60000000           0.24966266 
+  50268928          21.39291448           1.39794001            0.70000000           0.24956103 
+  49994143          21.39486900           1.39794001            0.80000000           0.24963318 
+  49913169          21.39462559           1.39794001            0.90000000           0.24972569 
+  50385465          21.39529041           1.39794001            1.00000000           0.24966321 
diff --git a/mocks/tests/Mr19_mock_DDsmu.RR b/mocks/tests/Mr19_mock_DDsmu.RR
new file mode 100644
index 00000000..89c3e16e
--- /dev/null
+++ b/mocks/tests/Mr19_mock_DDsmu.RR
@@ -0,0 +1,140 @@
+       160           0.12628979          -0.82871857            0.10000000           0.21743459 
+       128           0.12837521          -0.82871857            0.20000000           0.22917674 
+       132           0.12957688          -0.82871857            0.30000000           0.24310458 
+       192           0.12604276          -0.82871857            0.40000000           0.22955525 
+       168           0.13041355          -0.82871857            0.50000000           0.24816036 
+       144           0.12665805          -0.82871857            0.60000000           0.25716712 
+       150           0.12622404          -0.82871857            0.70000000           0.24221825 
+       150           0.12508100          -0.82871857            0.80000000           0.24376147 
+       146           0.12730893          -0.82871857            0.90000000           0.23014688 
+       176           0.12536130          -0.82871857            1.00000000           0.27060385 
+       482           0.18942787          -0.65743714            0.10000000           0.23813889 
+       474           0.18732210          -0.65743714            0.20000000           0.24467089 
+       490           0.18882176          -0.65743714            0.30000000           0.24940495 
+       454           0.18995518          -0.65743714            0.40000000           0.22977090 
+       442           0.19047925          -0.65743714            0.50000000           0.25555259 
+       488           0.19030407          -0.65743714            0.60000000           0.23769962 
+       428           0.18830197          -0.65743714            0.70000000           0.26006574 
+       422           0.18943326          -0.65743714            0.80000000           0.25545760 
+       478           0.19054173          -0.65743714            0.90000000           0.25111605 
+       474           0.18991154          -0.65743714            1.00000000           0.25105800 
+      1484           0.27946766          -0.48615571            0.10000000           0.25082409 
+      1438           0.28084465          -0.48615571            0.20000000           0.24232445 
+      1460           0.27912751          -0.48615571            0.30000000           0.25310631 
+      1506           0.27976848          -0.48615571            0.40000000           0.25017242 
+      1490           0.28071458          -0.48615571            0.50000000           0.25109175 
+      1546           0.28270309          -0.48615571            0.60000000           0.24910759 
+      1468           0.28041406          -0.48615571            0.70000000           0.23917503 
+      1490           0.28144473          -0.48615571            0.80000000           0.24959235 
+      1492           0.28068602          -0.48615571            0.90000000           0.24560202 
+      1352           0.27975949          -0.48615571            1.00000000           0.26223559 
+      4768           0.41541954          -0.31487428            0.10000000           0.24492959 
+      4956           0.41445371          -0.31487428            0.20000000           0.24907868 
+      4932           0.41585921          -0.31487428            0.30000000           0.24676711 
+      4808           0.41517653          -0.31487428            0.40000000           0.24578592 
+      4978           0.41643178          -0.31487428            0.50000000           0.25628214 
+      4816           0.41488549          -0.31487428            0.60000000           0.24999633 
+      4854           0.41539651          -0.31487428            0.70000000           0.25658033 
+      4898           0.41408620          -0.31487428            0.80000000           0.24757377 
+      4838           0.41570765          -0.31487428            0.90000000           0.24527180 
+      4902           0.41619938          -0.31487428            1.00000000           0.24264299 
+     15732           0.61643753          -0.14359285            0.10000000           0.25292104 
+     15504           0.61570594          -0.14359285            0.20000000           0.25387814 
+     15904           0.61623943          -0.14359285            0.30000000           0.24902701 
+     15700           0.61557887          -0.14359285            0.40000000           0.25024467 
+     15624           0.61583671          -0.14359285            0.50000000           0.25071424 
+     15570           0.61572044          -0.14359285            0.60000000           0.24510102 
+     15746           0.61665016          -0.14359285            0.70000000           0.24785056 
+     15804           0.61562741          -0.14359285            0.80000000           0.25135713 
+     15830           0.61613368          -0.14359285            0.90000000           0.25063374 
+     15718           0.61683780          -0.14359285            1.00000000           0.25100202 
+     51812           0.91488229           0.02768858            0.10000000           0.25160493 
+     51628           0.91454455           0.02768858            0.20000000           0.25105392 
+     51392           0.91383275           0.02768858            0.30000000           0.25054706 
+     51458           0.91474181           0.02768858            0.40000000           0.25359633 
+     51510           0.91360431           0.02768858            0.50000000           0.24926571 
+     51262           0.91525552           0.02768858            0.60000000           0.25006126 
+     51450           0.91358300           0.02768858            0.70000000           0.24833332 
+     51728           0.91458395           0.02768858            0.80000000           0.25058864 
+     51704           0.91366245           0.02768858            0.90000000           0.24943424 
+     51702           0.91548296           0.02768858            1.00000000           0.25058458 
+    167810           1.35672520           0.19897000            0.10000000           0.24946337 
+    167526           1.35600111           0.19897000            0.20000000           0.25016147 
+    166978           1.35523003           0.19897000            0.30000000           0.25021493 
+    167926           1.35605349           0.19897000            0.40000000           0.24952661 
+    166768           1.35587951           0.19897000            0.50000000           0.24994968 
+    168016           1.35681133           0.19897000            0.60000000           0.25027993 
+    167298           1.35605243           0.19897000            0.70000000           0.25044787 
+    167312           1.35605190           0.19897000            0.80000000           0.25107847 
+    167906           1.35544064           0.19897000            0.90000000           0.25092785 
+    168506           1.35656696           0.19897000            1.00000000           0.24894104 
+    548394           2.01171852           0.37025143            0.10000000           0.24953462 
+    546334           2.01199335           0.37025143            0.20000000           0.25077123 
+    545538           2.01137662           0.37025143            0.30000000           0.25072295 
+    543310           2.01199925           0.37025143            0.40000000           0.24916730 
+    543920           2.01212113           0.37025143            0.50000000           0.25095361 
+    542280           2.01140194           0.37025143            0.60000000           0.25080692 
+    541306           2.01175064           0.37025143            0.70000000           0.25040784 
+    541920           2.01151853           0.37025143            0.80000000           0.25023523 
+    542020           2.01248167           0.37025143            0.90000000           0.24994036 
+    545146           2.01203497           0.37025143            1.00000000           0.25020849 
+   1768270           2.98415267           0.54153286            0.10000000           0.25006886 
+   1762308           2.98429495           0.54153286            0.20000000           0.24985701 
+   1763662           2.98471507           0.54153286            0.30000000           0.25041549 
+   1757430           2.98413600           0.54153286            0.40000000           0.24999589 
+   1751516           2.98453668           0.54153286            0.50000000           0.25001992 
+   1749338           2.98420092           0.54153286            0.60000000           0.24972907 
+   1748874           2.98467465           0.54153286            0.70000000           0.25019810 
+   1749772           2.98389924           0.54153286            0.80000000           0.25010485 
+   1750794           2.98361099           0.54153286            0.90000000           0.24981918 
+   1757564           2.98452942           0.54153286            1.00000000           0.24969792 
+   5707512           4.42654360           0.71281429            0.10000000           0.25031193 
+   5690564           4.42654992           0.71281429            0.20000000           0.25020124 
+   5669400           4.42686268           0.71281429            0.30000000           0.24997916 
+   5653936           4.42656855           0.71281429            0.40000000           0.24999317 
+   5639318           4.42601444           0.71281429            0.50000000           0.24992769 
+   5628352           4.42607913           0.71281429            0.60000000           0.25015125 
+   5620152           4.42638127           0.71281429            0.70000000           0.24997355 
+   5618500           4.42564675           0.71281429            0.80000000           0.24994699 
+   5625284           4.42559179           0.71281429            0.90000000           0.24989487 
+   5649710           4.42599173           0.71281429            1.00000000           0.24991835 
+  18367070           6.56656610           0.88409572            0.10000000           0.25007834 
+  18265436           6.56639756           0.88409572            0.20000000           0.24998823 
+  18187802           6.56562711           0.88409572            0.30000000           0.25012303 
+  18108114           6.56540420           0.88409572            0.40000000           0.25021503 
+  18036148           6.56545726           0.88409572            0.50000000           0.25010607 
+  17968856           6.56496478           0.88409572            0.60000000           0.24997142 
+  17937552           6.56470001           0.88409572            0.70000000           0.25005255 
+  17909324           6.56448033           0.88409572            0.80000000           0.25011287 
+  17917216           6.56436568           0.88409572            0.90000000           0.24998481 
+  18012276           6.56435986           0.88409572            1.00000000           0.24995248 
+  58768132           9.73925834           1.05537715            0.10000000           0.25010973 
+  58317582           9.73862576           1.05537715            0.20000000           0.25010661 
+  57885266           9.73757417           1.05537715            0.30000000           0.25005669 
+  57491968           9.73672770           1.05537715            0.40000000           0.25001275 
+  57177014           9.73627117           1.05537715            0.50000000           0.25006929 
+  56896508           9.73533655           1.05537715            0.60000000           0.25009712 
+  56640136           9.73511776           1.05537715            0.70000000           0.25001196 
+  56498546           9.73470899           1.05537715            0.80000000           0.25000736 
+  56459498           9.73377809           1.05537715            0.90000000           0.24999043 
+  56883826           9.73435806           1.05537715            1.00000000           0.25002170 
+ 186432122          14.44310499           1.22665858            0.10000000           0.25004933 
+ 184233678          14.44135371           1.22665858            0.20000000           0.25002489 
+ 182290138          14.43972295           1.22665858            0.30000000           0.24994850 
+ 180433056          14.43783328           1.22665858            0.40000000           0.24998143 
+ 178833330          14.43601495           1.22665858            0.50000000           0.25003258 
+ 177435368          14.43435874           1.22665858            0.60000000           0.25001640 
+ 176203780          14.43298254           1.22665858            0.70000000           0.25001632 
+ 175369724          14.43202719           1.22665858            0.80000000           0.25000012 
+ 175082112          14.43180162           1.22665858            0.90000000           0.24996348 
+ 176546470          14.43210484           1.22665858            1.00000000           0.24994666 
+ 582460056          21.41432667           1.39794001            0.10000000           0.25001249 
+ 572504788          21.41001671           1.39794001            0.20000000           0.24998329 
+ 563314804          21.40574110           1.39794001            0.30000000           0.25001709 
+ 554941466          21.40174330           1.39794001            0.40000000           0.25000411 
+ 547346358          21.39784700           1.39794001            0.50000000           0.24997044 
+ 540582506          21.39487204           1.39794001            0.60000000           0.24997578 
+ 535047050          21.39250791           1.39794001            0.70000000           0.24997567 
+ 530980264          21.39004662           1.39794001            0.80000000           0.24997634 
+ 529103668          21.38854786           1.39794001            0.90000000           0.25001065 
+ 534167788          21.38951334           1.39794001            1.00000000           0.25001138 
diff --git a/mocks/tests/tests_mocks.c b/mocks/tests/tests_mocks.c
index f86770d4..2f5a15cd 100644
--- a/mocks/tests/tests_mocks.c
+++ b/mocks/tests/tests_mocks.c
@@ -6,36 +6,27 @@
   directory at https://github.com/manodeep/Corrfunc/
 */
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <string.h>
-#include <sys/time.h>
-#include <inttypes.h>
-
-#ifndef MAXLEN
-#define MAXLEN 500
-#endif
+#include "tests_common.h"
+#include "io.h"
+#include "utils.h"
+#include "cosmology_params.h"
 
 #if !(defined(__INTEL_COMPILER)) && defined(USE_AVX)
 #warning Test suite for mocks are faster with Intel compiler, icc, AVX libraries.
 #endif
 
 
-#include "defs.h"
-#include "io.h"
-#include "utils.h"
-#include "cosmology_params.h"
-
 #include "../DDrppi_mocks/countpairs_rp_pi_mocks.h"
+#include "../DDsmu_mocks/countpairs_s_mu_mocks.h"
 #include "../DDtheta_mocks/countpairs_theta_mocks.h"
 #include "../vpf_mocks/countspheres_mocks.h"
 
 char tmpoutputfile[]="../tests/tests_mocks_output.txt";
 
 int test_DDrppi_mocks(const char *correct_outputfile);
-int test_wtheta_mocks(const char *correct_outputfile);
+int test_DDtheta_mocks(const char *correct_outputfile);
 int test_vpf_mocks(const char *correct_outputfile);
+int test_DDsmu_mocks(const char *correct_outputfile);
 
 void read_data_and_set_globals(const char *firstfilename, const char *firstformat,const char *secondfilename,const char *secondformat);
 
@@ -46,38 +37,19 @@ double *RA1=NULL,*DEC1=NULL,*CZ1=NULL,*weights1=NULL;
 int ND2;
 double *RA2=NULL,*DEC2=NULL,*CZ2=NULL,*weights2=NULL;
 
-char binfile[]="../tests/bins";
-char angular_binfile[]="../tests/angular_bins";
-double pimax=40.0;
-double boxsize=420.0;
-#if defined(_OPENMP)
-const int nthreads=4;
-#else
-const int nthreads=1;
-#endif
 const int cosmology_flag=1;
 char current_file1[MAXLEN],current_file2[MAXLEN];
 
-const double maxdiff = 1e-9;
-const double maxreldiff = 1e-6;
-
 struct config_options options;
-const isa instruction_sets[] = {FALLBACK
-#if defined(__SSE4_2__)                                
-                                , SSE42
-#endif
-#if defined(__AVX__)
-                                , AVX
-#endif                                
-};
-const int num_isets = sizeof(instruction_sets)/sizeof(instruction_sets[0]);
 //end of global variables
 
 int test_DDrppi_mocks(const char *correct_outputfile)
 {
+    results_countpairs_mocks results;
+    int ret = EXIT_FAILURE;
     assert(RA1 != NULL && DEC1 != NULL && CZ1 != NULL && "ERROR: In test suite for DDrppi ra/dec/cz can not be NULL pointers");
     int autocorr = (RA1==RA2) ? 1:0;
-    
+
     // Set up the weights pointers
     weight_method_t weight_method = PAIR_PRODUCT;
     struct extra_options extra = get_extra_options(weight_method);
@@ -85,66 +57,67 @@ int test_DDrppi_mocks(const char *correct_outputfile)
     extra.weights1.weights[0] = weights2;
 
     //Do DD(rp,pi) counts
-    results_countpairs_mocks results;
-    int status = countpairs_mocks(ND1,RA1,DEC1,CZ1,
-                                  ND2,RA2,DEC2,CZ2,
-                                  nthreads,
-                                  autocorr,
-                                  binfile,
-                                  pimax,
-                                  cosmology_flag,
-                                  &results,
-                                  &options,
-                                  &extra);
-    if(status != EXIT_SUCCESS) {
-        return status;
-    }
+    BEGIN_INTEGRATION_TEST_SECTION
+        int status = countpairs_mocks(ND1,RA1,DEC1,CZ1,
+                                      ND2,RA2,DEC2,CZ2,
+                                      nthreads,
+                                      autocorr,
+                                      binfile,
+                                      pimax,
+                                      cosmology_flag,
+                                      &results,
+                                      &options,
+                                      &extra);
+        if(status != EXIT_SUCCESS) {
+            return status;
+        }
 
-    int ret = EXIT_FAILURE;
-    FILE *fp=my_fopen(correct_outputfile,"r");
-    if(fp == NULL) {
-        free_results_mocks(&results);
-        return EXIT_FAILURE;
-    }
-    const double dpi = pimax/(double)results.npibin ;
-    const int npibin = results.npibin;
-    for(int i=1;i<results.nbin;i++) {
-        for(int j=0;j<npibin;j++) {
-            int index = i*(npibin+1) + j;
-            uint64_t npairs;
-            double rpavg, weightavg;
-            ret = EXIT_FAILURE;
-            int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf%*[^\n]", &npairs, &rpavg, &weightavg);
-            if(nitems != 3) {
-                ret = EXIT_FAILURE;//not required but showing intent
-                i = results.nbin;
-                break;
-            }
-            int floats_equal = AlmostEqualRelativeAndAbs_double(rpavg, results.rpavg[index], maxdiff, maxreldiff);
-            int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[index], maxdiff, maxreldiff);
-            
-            //Check for exact equality of npairs and float "equality" for rpavg
-            if(npairs == results.npairs[index] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
-                ret = EXIT_SUCCESS;
-            } else {
-                fprintf(stderr,"True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[index]);
-                fprintf(stderr,"True rpavg  = %20.12e Computed rpavg = %20.12e. floats_equal = %d\n", rpavg, results.rpavg[index], floats_equal);
-                fprintf(stderr,"True weightavg = %e Computed weightavg = %e. weights_equal = %d\n", weightavg, results.weightavg[index], weights_equal);
-                ret = EXIT_FAILURE;//not required but showing intent 
-                i = results.nbin;
-                break;
+        FILE *fp=my_fopen(correct_outputfile,"r");
+        if(fp == NULL) {
+            free_results_mocks(&results);
+            return EXIT_FAILURE;
+        }
+        const int npibin = results.npibin;
+        for(int i=1;i<results.nbin;i++) {
+            for(int j=0;j<npibin;j++) {
+                int index = i*(npibin+1) + j;
+                uint64_t npairs;
+                double rpavg, weightavg;
+                ret = EXIT_FAILURE;
+                int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf%*[^\n]", &npairs, &rpavg, &weightavg);
+                if(nitems != 3) {
+                    ret = EXIT_FAILURE;//not required but showing intent
+                    i = results.nbin;
+                    break;
+                }
+                int floats_equal = AlmostEqualRelativeAndAbs_double(rpavg, results.rpavg[index], maxdiff, maxreldiff);
+                int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[index], maxdiff, maxreldiff);
+                
+                //Check for exact equality of npairs and float "equality" for rpavg
+                if(npairs == results.npairs[index] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
+                    ret = EXIT_SUCCESS;
+                } else {
+                    fprintf(stderr,"True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[index]);
+                    fprintf(stderr,"True rpavg  = %20.12e Computed rpavg = %20.12e. floats_equal = %d\n", rpavg, results.rpavg[index], floats_equal);
+                    fprintf(stderr,"True weightavg = %e Computed weightavg = %e. weights_equal = %d\n", weightavg, results.weightavg[index], weights_equal);
+                    ret = EXIT_FAILURE;//not required but showing intent
+                    i = results.nbin;
+                    break;
+                }
             }
         }
-    }
-    fclose(fp);
-    
+        fclose(fp);
+    END_INTEGRATION_TEST_SECTION;
+        
     /* If the test failed, then write the output into a temporary file */
     if(ret != EXIT_SUCCESS) {
-        fp=my_fopen(tmpoutputfile,"w");
+        FILE *fp=my_fopen(tmpoutputfile,"w");
         if(fp == NULL) {
             free_results_mocks(&results);
             return EXIT_FAILURE;
         }
+        const double dpi = pimax/(double)results.npibin ;
+        const int npibin = results.npibin;
         for(int i=1;i<results.nbin;i++) {
             const double logrp = log10(results.rupp[i]);
             for(int j=0;j<npibin;j++) {
@@ -159,16 +132,111 @@ int test_DDrppi_mocks(const char *correct_outputfile)
     return ret;
 }
 
-int test_wtheta_mocks(const char *correct_outputfile)
+/* Run DD(s,mu) on the global mock catalogs and check npairs (exactly) plus savg and
+   weightavg (via AlmostEqualRelativeAndAbs_double) against `correct_outputfile`.
+   On a mismatch the computed bins are written to `tmpoutputfile` for inspection. */
+int test_DDsmu_mocks(const char *correct_outputfile)
+{
+    results_countpairs_mocks_s_mu results;
+    int ret = EXIT_FAILURE;
+    assert(RA1 != NULL && DEC1 != NULL && CZ1 != NULL && "ERROR: In test suite for DDsmu ra/dec/cz can not be NULL pointers");
+    int autocorr = (RA1==RA2) ? 1:0;
+
+    // Set up the weights pointers
+    weight_method_t weight_method = PAIR_PRODUCT;
+    struct extra_options extra = get_extra_options(weight_method);
+    extra.weights0.weights[0] = weights1;
+    extra.weights1.weights[0] = weights2;
+
+    BEGIN_INTEGRATION_TEST_SECTION
+        //Do DD(s,mu) counts
+        int status = countpairs_mocks_s_mu(ND1,RA1,DEC1,CZ1,
+                                           ND2,RA2,DEC2,CZ2,
+                                           nthreads,
+                                           autocorr,
+                                           binfile,
+                                           mocks_mu_max,
+                                           nmu_bins,
+                                           cosmology_flag,
+                                           &results,
+                                           &options,
+                                           &extra);
+        if(status != EXIT_SUCCESS) {
+            return status;
+        }
+
+        FILE *fp=my_fopen(correct_outputfile,"r");
+        if(fp == NULL) {
+            free_results_mocks_s_mu(&results);
+            return EXIT_FAILURE;
+        }
+        // Results are indexed with (nmubin+1) slots per s-bin; s-bin 0 is not compared.
+        const int nmubin = results.nmu_bins;
+        for(int i=1;i<results.nsbin;i++) {
+            for(int j=0;j<nmubin;j++) {
+                int index = i*(nmubin+1) + j;
+                uint64_t npairs;
+                double savg, weightavg;
+                ret = EXIT_FAILURE;
+                int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf%*[^\n]", &npairs, &savg, &weightavg);
+                if(nitems != 3) {
+                    i = results.nsbin;//reference row unreadable: ret is already EXIT_FAILURE, leave both loops
+                    break;
+                }
+                int floats_equal = AlmostEqualRelativeAndAbs_double(savg, results.savg[index], maxdiff, maxreldiff);
+                int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[index], maxdiff, maxreldiff);
+
+                //Check for exact equality of npairs and float "equality" for savg
+                if(npairs == results.npairs[index] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
+                    ret = EXIT_SUCCESS;
+                } else {
+                    fprintf(stderr,"True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[index]);
+                    fprintf(stderr,"True savg  = %20.12e Computed savg = %20.12e. floats_equal = %d\n", savg, results.savg[index], floats_equal);
+                    fprintf(stderr,"True weightavg = %e Computed weightavg = %e. weights_equal = %d\n", weightavg, results.weightavg[index], weights_equal);
+                    i = results.nsbin;//mismatch: ret is already EXIT_FAILURE, leave both loops
+                    break;
+                }
+            }
+        }
+        fclose(fp);
+    END_INTEGRATION_TEST_SECTION;
+
+    /* If the test failed, then write the output into a temporary file */
+    if(ret != EXIT_SUCCESS) {
+        FILE *fp=my_fopen(tmpoutputfile,"w");
+        if(fp == NULL) {
+            free_results_mocks_s_mu(&results);
+            return EXIT_FAILURE;
+        }
+        const double dmu = mocks_mu_max/(double)results.nmu_bins;//same mu_max as passed to the pair counter (was hard-coded 1.0)
+        const int nmubin = results.nmu_bins;
+        for(int i=1;i<results.nsbin;i++) {
+            const double log_supp = log10(results.supp[i]);
+            for(int j=0;j<nmubin;j++) {
+                int index = i*(nmubin+1) + j;
+                fprintf(fp,"%10"PRIu64" %20.8lf %20.8lf  %20.8lf %20.8lf \n",results.npairs[index],results.savg[index],log_supp,(j+1)*dmu, results.weightavg[index]);
+            }
+        }
+        fclose(fp);
+    }
+
+    free_results_mocks_s_mu(&results);
+    return ret;
+}
+
+int test_DDtheta_mocks(const char *correct_outputfile)
 {
     int autocorr = (RA1==RA2) ? 1:0;
     int ret = EXIT_FAILURE;
     results_countpairs_theta results;
-    const isa old_isa = options.instruction_set;
+
+    // Set up the weights pointers
+    weight_method_t weight_method = PAIR_PRODUCT;
+    struct extra_options extra = get_extra_options(weight_method);
+    extra.weights0.weights[0] = weights1;
+    extra.weights1.weights[0] = weights2;
     
-#ifdef DEVELOPER_TESTS
-    const int min_bin_ref = 1, max_bin_ref = 3;
-        
+#ifdef INTEGRATION_TESTS
     //wtheta has 3 implementations (brute-force, link-in-dec and link-in-dec + link-in-ra)
     //For developer testing, multiple bin refine factors are tested as well as the
     //all three of the linking logic.
@@ -176,109 +244,149 @@ int test_wtheta_mocks(const char *correct_outputfile)
     // (dec_link, ra_link) == (0, 0) -> brute-force
     // (dec_link, ra_link) == (1, 0) -> dec-linking only
     // (dec_link, ra_link) == (1, 1) -> dec + ra linking
-    for(int dec_link=0;dec_link<=1;dec_link++) {
-        for(int ra_link=0;ra_link <= dec_link; ra_link++) {
-            options.link_in_dec=dec_link;
-            options.link_in_ra=ra_link;
-            for(int bf=min_bin_ref;bf<=max_bin_ref;bf++) {
-                if((dec_link + ra_link) == 0 && bf > min_bin_ref) continue;//bin refine factor has no impact on brute-force
-                options.bin_refine_factors[0] = bf;
-                options.bin_refine_factors[1] = bf;
-                options.bin_refine_factors[2] = bf;
-
-                // Check the specific implementations for each instruction set
-                for(int iset=0;iset<num_isets;iset++) {
-                    options.instruction_set = instruction_sets[iset];
-                    struct timeval t0;
-                    gettimeofday(&t0, NULL);
+
+    /* The order of the for loop breaks the convention "RA before DEC"
+       -- This is because the binning in RA can only be done if the binning
+       in DEC is enabled. Therefore, it makes more sense to loop in RA *only*
+       after the DEC binning is decided.
+     */
+    struct timespec t0, t1;
+    const isa old_isa = options.instruction_set;
+    int dotest = 1;
+    // Check the specific implementations for each instruction set
+    for(int iset=0;iset<num_instructions;iset++) {
+        options.instruction_set = valid_instruction_sets[iset];
+        for(int dec_link=0;dec_link<=1;dec_link++) {
+            for(int ra_link=0;ra_link <= dec_link; ra_link++) {
+                int fastest_bin_ref[] = {1, 1, 1};
+                int fastest_isa = 0;
+                double fastest_time = 1e30;
+                for(int ra_bin_ref=min_bin_ref;ra_bin_ref<=max_bin_ref;ra_bin_ref++) {
+                    for(int dec_bin_ref=min_bin_ref;dec_bin_ref<=max_bin_ref;dec_bin_ref++) {
+
+                        if(dotest == 1) {
+                            if(dec_link == 0 && ra_link == 0) continue;//I have checked the brute force
+                            
+                            //bin refine factor has no impact on brute-force -> only check brute-force once
+                            if(dec_link == 0 && ra_link == 0 && (dec_bin_ref != min_bin_ref || ra_bin_ref != min_bin_ref)) continue;
+                            
+                            const int bf[] = {ra_bin_ref, dec_bin_ref, -1};
+                            set_custom_bin_refine_factors(&options, bf);
+                            
+                            options.link_in_dec=dec_link;
+                            options.link_in_ra=ra_link;
+
+                            fprintf(stderr,"Running with dec-linking = %d ra-linking = %d bin-ref = (%d, %d) and instruction set = %s ",
+                                    dec_link, ra_link,
+                                    options.bin_refine_factors[0],
+                                    options.bin_refine_factors[1],
+                                    isa_name[iset]);
+                            
+                            current_utc_time(&t0);
 #else
-                    options.link_in_dec = 1;
-                    options.link_in_ra = 1;
-#endif
-                
-                    // Set up the weights pointers
-                    weight_method_t weight_method = PAIR_PRODUCT;
-                    struct extra_options extra = get_extra_options(weight_method);
-                    extra.weights0.weights[0] = weights1;
-                    extra.weights1.weights[0] = weights2;
-                    int status = countpairs_theta_mocks(ND1,RA1,DEC1,
-                                                        ND2,RA2,DEC2,
-                                                        nthreads,
-                                                        autocorr,
-                                                        angular_binfile,
-                                                        &results,
-                                                        &options,
-                                                        &extra);
-#ifdef DEVELOPER_TESTS    
-                    struct timeval t1;
-                    gettimeofday(&t1, NULL);
-                    fprintf(stderr,"bf = %d dec = %d ra = %d (iset, isa) = (%d,%d) status = %d. Time taken = %0.3g sec\n",
-                            bf, dec_link, ra_link, iset, instruction_sets[iset], status, ADD_DIFF_TIME(t0, t1));
+   {
+       options.link_in_dec = 1;
+       options.link_in_ra = 1;
 #endif
-                    
-                    if(status != EXIT_SUCCESS) {
-                        return status;
-                    }
-                    
-                    /*---Output-Pairs-------------------------------------*/
-                    FILE *fp=my_fopen(correct_outputfile,"r");
-                    if(fp == NULL) {
-                        free_results_countpairs_theta(&results);
-                        return EXIT_FAILURE;
-                    }
-                    for(int i=1;i<results.nbin;i++) {
-                        uint64_t npairs;
-                        double theta_avg, weightavg;
-                        ret = EXIT_FAILURE;
-                        int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf%*[^\n]", &npairs, &theta_avg, &weightavg);
-                        if(nitems != 3) {
-                            ret = EXIT_FAILURE;//not required but showing intent
-                            break;
-                        }
-                        int floats_equal = AlmostEqualRelativeAndAbs_double(theta_avg, results.theta_avg[i], maxdiff, maxreldiff);
-                        int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[i], maxdiff, maxreldiff);
-                        
-                        //Check for exact equality of npairs and float "equality" for theta_avg
-                        if(npairs == results.npairs[i] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
-                            ret = EXIT_SUCCESS;
-                        } else {
-                            ret = EXIT_FAILURE;//not required but showing intent 
-                            fprintf(stderr,"Failed. True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[i]);
-                            fprintf(stderr,"Failed. True thetaavg = %e Computed thetaavg = %e. floats_equal = %d\n", theta_avg, results.theta_avg[i], floats_equal);
-                            fprintf(stderr,"Failed. True weightavg = %e Computed weightavg = %e. floats_equal = %d\n", weightavg, results.weightavg[i], weights_equal);
-                            break;
-                        }
-                    }
-                    fclose(fp);
-                    
-                    if(ret != EXIT_SUCCESS) {
-                        fp=my_fopen(tmpoutputfile,"w"); 
-                        double theta_low = results.theta_upp[0];
-                        for(int i=1;i<results.nbin;i++) {
-                            fprintf(fp,"%10"PRIu64" %20.8lf %20.8lf %20.8lf %20.8lf\n",
-                                    results.npairs[i],results.theta_avg[i],theta_low,results.theta_upp[i], results.weightavg[i]);
-                            theta_low=results.theta_upp[i];
-                        }
-                        fclose(fp);
-                    }
-#ifdef DEVELOPER_TESTS
+
+
+                            int status = countpairs_theta_mocks(ND1,RA1,DEC1,
+                                                                ND2,RA2,DEC2,
+                                                                nthreads,
+                                                                autocorr,
+                                                                angular_binfile,
+                                                                &results,
+                                                                &options,
+                                                                &extra);
+                            
+                            if(status != EXIT_SUCCESS) {
+                                return status;
+                            }
+                            
+                            /*---Output-Pairs-------------------------------------*/
+                            FILE *fp=my_fopen(correct_outputfile,"r");
+                            if(fp == NULL) {
+                                free_results_countpairs_theta(&results);
+                                return EXIT_FAILURE;
+                            }
+                            for(int i=1;i<results.nbin;i++) {
+                                uint64_t npairs;
+                                double theta_avg, weightavg;
+                                ret = EXIT_FAILURE;
+                                int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf%*[^\n]", &npairs, &theta_avg, &weightavg);
+                                if(nitems != 3) {
+                                    ret = EXIT_FAILURE;//not required but showing intent
+                                    break;
+                                }
+                                int floats_equal = AlmostEqualRelativeAndAbs_double(theta_avg, results.theta_avg[i], maxdiff, maxreldiff);
+                                int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[i], maxdiff, maxreldiff);
+                                
+                                //Check for exact equality of npairs and float "equality" for theta_avg
+                                if(npairs == results.npairs[i] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
+                                    ret = EXIT_SUCCESS;
+                                } else {
+                                    ret = EXIT_FAILURE;//not required but showing intent
+                                    fprintf(stderr,"Failed. True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[i]);
+                                    fprintf(stderr,"Failed. True thetaavg = %e Computed thetaavg = %e. floats_equal = %d\n", theta_avg, results.theta_avg[i], floats_equal);
+                                    fprintf(stderr,"Failed. True weightavg = %e Computed weightavg = %e. floats_equal = %d\n", weightavg, results.weightavg[i], weights_equal);
+                                    break;
+                                }
+                            }
+                            fclose(fp);
+
+#ifdef INTEGRATION_TESTS
+                            current_utc_time(&t1);                      
+                            double time_to_run = REALTIME_ELAPSED_NS(t0, t1); 
+                            if(time_to_run < fastest_time) {
+                                fastest_time = time_to_run;
+                                fastest_isa = iset;
+                                memcpy(&fastest_bin_ref, &bf, sizeof(bf));
+                            }
+                            if(ret != EXIT_SUCCESS) {                   
+                                fprintf(stderr, ANSI_COLOR_RED "FAILED"); 
+                                dotest = 0;                             
+                            } else {                                    
+                                fprintf(stderr,ANSI_COLOR_GREEN "PASSED"); 
+                            }
+                            fprintf(stderr, ANSI_COLOR_RESET ". Time taken = %8.2lf seconds \n", time_to_run * 1e-9);
+                        } //dotest if condition
+                    }//loop over declination bin refine factors
+                }//loop over ra bin refine factors
+                if(ret == EXIT_SUCCESS) {                                        
+                    fprintf(stderr, ANSI_COLOR_MAGENTA "Fastest time = %8.2lf seconds with bin-ref = {%d, %d} and instruction_set = %s" ANSI_COLOR_RESET "\n",
+                            fastest_time*1e-9,                                          
+                            fastest_bin_ref[0],                                         
+                            fastest_bin_ref[1],                                         
+                            isa_name[fastest_isa]);
                 }
-            }
-        }
+            }//loop over ra link
+        }//loop over dec link
+    }//iset loop (instruction sets)
+    options.instruction_set = old_isa;
+    reset_bin_refine_factors(&options);
+#else
     }
 #endif
-
-    options.instruction_set = old_isa;
     
+    if(ret != EXIT_SUCCESS) {
+        FILE *fp=my_fopen(tmpoutputfile,"w");
+        double theta_low = results.theta_upp[0];
+        for(int i=1;i<results.nbin;i++) {
+            fprintf(fp,"%10"PRIu64" %20.8lf %20.8lf %20.8lf %20.8lf\n",
+                    results.npairs[i],results.theta_avg[i],theta_low,results.theta_upp[i], results.weightavg[i]);
+            theta_low=results.theta_upp[i];
+        }
+        fclose(fp);
+    }
+
     //free the result structure
     free_results_countpairs_theta(&results);
     return ret;
-    
 }
-    
+
 int test_vpf_mocks(const char *correct_outputfile)
 {
-                const double rmax=10.0;
+    const double rmax=10.0;
     const int nbin=10;
     const int nc=10000;
     const int num_pN=6;
@@ -286,63 +394,66 @@ int test_vpf_mocks(const char *correct_outputfile)
     double *xran=NULL,*yran=NULL,*zran=NULL;
     const int threshold_neighbors=1;
     const char centers_file[]="../tests/data/Mr19_centers_xyz_forVPF_rmax_10Mpc.txt";
-
     results_countspheres_mocks results;
-    int status = countspheres_mocks(ND1, RA1, DEC1, CZ1,
-                                    Nran, xran, yran, zran,
-                                    threshold_neighbors,
-                                    rmax, nbin, nc,
-                                    num_pN,
-                                    centers_file,
-                                    cosmology_flag,
-                                    &results,
-                                    &options, NULL);
-    if(status != EXIT_SUCCESS) {
-        return status;
-    }
-
     int ret = EXIT_FAILURE;
-    //Output the results
-    FILE *fp=my_fopen(correct_outputfile,"r");
-    if(fp == NULL) {
-        free_results_countspheres_mocks(&results);
-        return EXIT_FAILURE;
-    }
-    const double rstep = rmax/(double)nbin ;
-    for(int ibin=0;ibin<results.nbin;ibin++) {
-        double r;
-        int nitems = fscanf(fp, "%lf", &r);
-        if(nitems != 1) {
+
+    BEGIN_INTEGRATION_TEST_SECTION
+        int status = countspheres_mocks(ND1, RA1, DEC1, CZ1,
+                                        Nran, xran, yran, zran,
+                                        threshold_neighbors,
+                                        rmax, nbin, nc,
+                                        num_pN,
+                                        centers_file,
+                                        cosmology_flag,
+                                        &results,
+                                        &options, NULL);
+        if(status != EXIT_SUCCESS) {
+            return status;
+        }
+
+
+        //Output the results
+        FILE *fp=my_fopen(correct_outputfile,"r");
+        if(fp == NULL) {
+            free_results_countspheres_mocks(&results);
             return EXIT_FAILURE;
         }
-        ret = EXIT_FAILURE;
-        for(int i=0;i<num_pN;i++) {
-            double pN;
-            nitems = fscanf(fp, " %lf ", &pN);
+        for(int ibin=0;ibin<results.nbin;ibin++) {
+            double r;
+            int nitems = fscanf(fp, "%lf", &r);
             if(nitems != 1) {
                 return EXIT_FAILURE;
             }
-
-            /* Not quite sure how this is working. The correct output columns only have 4 digits printed,
-               but I am comparing here with ~1e-9 in abs. diff. The only way the comparison should work is
-               if the conversion to 4 digits during printf, round-trips during scanf. But surely there must 
-               be a lot more doubles that can be fit within those missing digits of precision.
-
-               I would have thought the comparison would require maxdiff ~ 1e-4. -- MS
-             */
-            int floats_equal = AlmostEqualRelativeAndAbs_double(pN, (results.pN)[ibin][i], maxdiff, maxreldiff);
-            if(floats_equal != EXIT_SUCCESS) {
-                ibin=results.nbin;
-                ret=EXIT_FAILURE;
-                break;
+            ret = EXIT_FAILURE;
+            for(int i=0;i<num_pN;i++) {
+                double pN;
+                nitems = fscanf(fp, " %lf ", &pN);
+                if(nitems != 1) {
+                    return EXIT_FAILURE;
+                }
+                
+                /* Not quite sure how this is working. The correct output columns only have 4 digits printed,
+                   but I am comparing here with ~1e-9 in abs. diff. The only way the comparison should work is
+                   if the conversion to 4 digits during printf, round-trips during scanf. But surely there must
+                   be a lot more doubles that can be fit within those missing digits of precision.
+                   
+                   I would have thought the comparison would require maxdiff ~ 1e-4. -- MS
+                */
+                int floats_equal = AlmostEqualRelativeAndAbs_double(pN, (results.pN)[ibin][i], maxdiff, maxreldiff);
+                if(floats_equal != EXIT_SUCCESS) {
+                    ibin=results.nbin;
+                    ret=EXIT_FAILURE;
+                    break;
+                }
+                ret = EXIT_SUCCESS;
             }
-            ret = EXIT_SUCCESS;
         }
-    }
-    fclose(fp);
-
+        fclose(fp);
+    END_INTEGRATION_TEST_SECTION;
+        
     if(ret != EXIT_SUCCESS) {
-        fp=my_fopen(tmpoutputfile,"w");
+        FILE *fp=my_fopen(tmpoutputfile,"w");
+        const double rstep = rmax/(double)nbin ;
         for(int ibin=0;ibin<results.nbin;ibin++) {
             const double r=(ibin+1)*rstep;
             fprintf(fp,"%10.2lf ", r);
@@ -420,7 +531,7 @@ int main(int argc, char **argv)
     struct timeval tstart,t0,t1;
     char file[]="../tests/data/Mr19_mock_northonly.rdcz.dat";
     char fileformat[]="a";
-    
+
     options = get_config_options();
     options.need_avg_sep=1;
     options.verbose=0;
@@ -429,7 +540,7 @@ int main(int argc, char **argv)
     options.fast_divide=0;
     options.fast_acos=0;
     //options.instruction_set = FALLBACK;
-    
+
     int status = init_cosmology(cosmology_flag);
     if(status != EXIT_SUCCESS) {
         return EXIT_FAILURE;
@@ -438,7 +549,7 @@ int main(int argc, char **argv)
 
     //set the globals.
     ND1 = read_positions(file,fileformat, sizeof(double), 4, &RA1, &DEC1, &CZ1, &weights1);
-    
+
     ND2 = ND1;
     RA2 = RA1;
     DEC2 = DEC1;
@@ -448,12 +559,19 @@ int main(int argc, char **argv)
     strncpy(current_file1,file,MAXLEN);
     strncpy(current_file2,file,MAXLEN);
     reset_bin_refine_factors(&options);
-    
+
     int failed=0;
 
-    const char alltests_names[][MAXLEN] = {"Mr19 mocks DDrppi (DD)","Mr19 mocks wtheta (DD)","Mr19 mocks vpf (data)","Mr19 mocks DDrppi (DR)", "Mr19 mocks wtheta (DR)","Mr19 mocks vpf (randoms)"};
+    const char alltests_names[][MAXLEN] = {"Mr19 mocks DDrppi (DD)",
+                                           "Mr19 mocks wtheta (DD)",
+                                           "Mr19 mocks vpf (data)",
+                                           "Mr19 mocks DDrppi (DR)",
+                                           "Mr19 mocks wtheta (DR)",
+                                           "Mr19 mocks vpf (randoms)",
+                                           "Mr19 mocks DDsmu (RR)",
+                                           "Mr19 mocks DDsmu (DR)"};
     const int ntests = sizeof(alltests_names)/(sizeof(char)*MAXLEN);
-    const int function_pointer_index[] = {0,1,2,0,1,2};//0->DDrppi, 1->wtheta, 2->vpf
+    const int function_pointer_index[] = {0,1,2,0,1,2,3,3};//0->DDrppi, 1->wtheta, 2->vpf, 3->DDsmu
     assert(sizeof(function_pointer_index)/sizeof(int) == ntests && "Number of tests should equal the number of functions");
 
     const char correct_outputfiles[][MAXLEN] = {"../tests/Mr19_mock.DD", /* Test 0 Mr19 DD */
@@ -461,26 +579,32 @@ int main(int argc, char **argv)
                                                 "../tests/Mr19_mock_vpf", /* Test 2 Mr19 mocks vpf */
                                                 "../tests/Mr19_mock.DR", /* Test 3 Mr19 DR */
                                                 "../tests/Mr19_mock_wtheta.DR", /* Test 4 Mr19 wtheta DR */
-                                                "../tests/Mr19_randoms_vpf"}; /* Test 5 Mr19 randoms vpf */
+                                                "../tests/Mr19_randoms_vpf", /* Test 5 Mr19 randoms vpf */
+                                                "../tests/Mr19_mock_DDsmu.RR", /* Test 6 Mr19 RR smu */
+                                                "../tests/Mr19_mock_DDsmu.DR"}; /* Test 7 Mr19 DR smu */
     const char firstfilename[][MAXLEN] = {"../tests/data/Mr19_mock_northonly.rdcz.dat",
                                           "../tests/data/Mr19_mock_northonly.rdcz.dat",
                                           "../tests/data/Mr19_mock_northonly.rdcz.dat",
                                           "../tests/data/Mr19_randoms_northonly.rdcz.ff",
                                           "../tests/data/Mr19_randoms_northonly.rdcz.ff",
+                                          "../tests/data/Mr19_randoms_northonly.rdcz.ff",
+                                          "../tests/data/Mr19_randoms_northonly.rdcz.ff",
                                           "../tests/data/Mr19_randoms_northonly.rdcz.ff"};
-    const char firstfiletype[][MAXLEN]  = {"a","a","a","f","f","f"};
+    const char firstfiletype[][MAXLEN]  = {"a","a","a","f","f","f","f","f"};
     const char secondfilename[][MAXLEN] = {"../tests/data/Mr19_mock_northonly.rdcz.dat",
                                            "../tests/data/Mr19_mock_northonly.rdcz.dat",
                                            "../tests/data/Mr19_mock_northonly.rdcz.dat",
                                            "../tests/data/Mr19_mock_northonly.rdcz.dat",
                                            "../tests/data/Mr19_mock_northonly.rdcz.dat",
-                                           "../tests/data/Mr19_randoms_northonly.rdcz.ff"};
-    const char secondfiletype[][MAXLEN] = {"a","a","a","a","a","f"};
-    
-    const double allpimax[]             = {40.0,40.0,40.0,40.0,40.0,40.0};
+                                           "../tests/data/Mr19_randoms_northonly.rdcz.ff",
+                                           "../tests/data/Mr19_randoms_northonly.rdcz.ff",
+                                           "../tests/data/Mr19_mock_northonly.rdcz.dat"};
+    const char secondfiletype[][MAXLEN] = {"a","a","a","a","a","f","f","a"};
+
+    const double allpimax[]             = {40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0};
 
-    int (*allfunctions[]) (const char *) = {test_DDrppi_mocks,test_wtheta_mocks,test_vpf_mocks};
-    const int numfunctions=3;//3 functions total
+    int (*allfunctions[]) (const char *) = {test_DDrppi_mocks,test_DDtheta_mocks,test_vpf_mocks,test_DDsmu_mocks};
+    const int numfunctions=4;//4 functions total
 
     int total_tests=0,skipped=0;
 
@@ -508,7 +632,7 @@ int main(int argc, char **argv)
                 char execstring[MAXLEN];
                 my_snprintf(execstring,MAXLEN,"rm -f %s",tmpoutputfile);
                 run_system_call(execstring);//can ignore the status here
-                
+
             } else {
                 fprintf(stderr,ANSI_COLOR_RED "FAILED: " ANSI_COLOR_MAGENTA "%s" ANSI_COLOR_RED ". Time taken = %8.2lf seconds " ANSI_COLOR_RESET "\n", testname,pair_time);
                 failed++;
diff --git a/theory/DDsmu/DDsmu.c b/theory/DDsmu/DDsmu.c
new file mode 100644
index 00000000..edebb837
--- /dev/null
+++ b/theory/DDsmu/DDsmu.c
@@ -0,0 +1,346 @@
+/* File: DDsmu.c */
+/*
+  This file is a part of the Corrfunc package
+  Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com)
+  License: MIT LICENSE. See LICENSE file under the top-level
+  directory at https://github.com/manodeep/Corrfunc/
+*/
+
+/* PROGRAM DDsmu
+
+--- DDsmu file1 format1 file2 format2 binfile mu_max nmu_bins numthreads [weight_method weights_file1 weights_format1 [weights_file2 weights_format2]] > DDfile
+--- Measure the cross-correlation function xi(s, mu) for two different
+   data files (or autocorrelation if file1=file2).
+ * file1         = name of first data file
+ * format1       = format of first data file  (a=ascii, c=csv, f=fast-food)
+ * file2         = name of second data file
+ * format2       = format of second data file (a=ascii, c=csv, f=fast-food)
+ * binfile       = name of ascii file containing the s-bins (smin smax for each bin)
+ * mu_max        = maximum of the cosine of the angle to the line-of-sight (LOS is taken to be along the z-direction)
+ * nmu_bins      = number of bins for mu
+ * numthreads    = number of threads to use
+--- OPTIONAL ARGS:
+ * weight_method = the type of pair weighting to apply.  Options are: 'pair_product', 'none'.  Default: 'none'.
+ * weights_file1 = name of file containing the weights corresponding to the first data file
+ * weights_format1 = format of file containing the weights corresponding to the first data file
+ * weights_file2 = name of file containing the weights corresponding to the second data file
+ * weights_format2 = format of file containing the weights corresponding to the second data file
+---OUTPUT:
+ > DDfile        = name of output file <smin smax mu_max_per_s savg npairs weightavg>
+   ----------------------------------------------------------------------------------
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <inttypes.h>
+
+#include "defs.h" //for ADD_DIFF_TIME
+#include "function_precision.h" //definition of DOUBLE
+#include "countpairs_s_mu.h" //function proto-type for countpairs
+#include "io.h" //function proto-type for file input
+#include "utils.h" //general utilities
+
+
+void Printhelp(void);
+
+int main(int argc, char *argv[])
+{
+    /* Command-line driver: parse the arguments, read both catalogues (plus optional weights), call countpairs_s_mu() and print one line per (s, mu) bin to stdout. Returns EXIT_SUCCESS or EXIT_FAILURE/the countpairs status. */
+    /*---Arguments-------------------------*/
+    char *file1=NULL,*file2=NULL,*weights_file1=NULL,*weights_file2=NULL;
+    char *fileformat1=NULL,*fileformat2=NULL,*weights_fileformat1=NULL,*weights_fileformat2=NULL;
+    char *sbinfile=NULL;
+    double mu_max;
+    int nmu_bins;
+    char *weight_method_str=NULL;
+
+    weight_method_t weight_method = NONE;
+    int num_weights = 0;
+
+    /*---Data-variables--------------------*/
+    int64_t ND1=0,ND2=0;
+
+    DOUBLE *x1=NULL,*y1=NULL,*z1=NULL,*weights1[MAX_NUM_WEIGHTS]={NULL};
+    DOUBLE *x2=NULL,*y2=NULL,*z2=NULL,*weights2[MAX_NUM_WEIGHTS]={NULL};//will point to x1/y1/z1 in case of auto-corr
+
+    int nthreads=1;
+    /*---Corrfunc-variables----------------*/
+#if !(defined(USE_OMP) && defined(_OPENMP))
+    const char argnames[][30]={"file1","format1","file2","format2","sbinfile","mu_max", "nmu_bins"};
+#else
+    const char argnames[][30]={"file1","format1","file2","format2","sbinfile","mu_max", "nmu_bins","Nthreads"};
+#endif
+    const char optargnames[][30]={"weight_method", "weights_file1","weights_format1","weights_file2","weights_format2"};
+
+    int nargs=sizeof(argnames)/(sizeof(char)*30);
+    int noptargs=sizeof(optargnames)/(sizeof(char)*30);
+
+    struct timeval t_end,t_start,t0,t1;
+    double read_time=0.0;
+    gettimeofday(&t_start,NULL);
+
+    /*---Read-arguments-----------------------------------*/
+    if(argc< (nargs+1)) {
+        Printhelp() ;
+        fprintf(stderr,"\nFound: %d parameters\n ",argc-1);
+        int i;
+        for(i=1;i<argc;i++) {
+            if(i <= nargs)
+                fprintf(stderr,"\t\t %s = `%s' \n",argnames[i-1],argv[i]);
+            else if(i <= nargs + noptargs)
+                fprintf(stderr,"\t\t %s = `%s' \n",optargnames[i-1-nargs],argv[i]);
+            else
+                fprintf(stderr,"\t\t <> = `%s' \n",argv[i]);
+        }
+        fprintf(stderr,"\nMissing required parameters \n");
+        for(i=argc;i<=nargs;i++)
+            fprintf(stderr,"\t\t %s = `?'\n",argnames[i-1]);
+        return EXIT_FAILURE;
+    }
+
+    /* Validate optional arguments */
+    int noptargs_given = argc - (nargs + 1);
+    if(noptargs_given != 0 && noptargs_given != 3 && noptargs_given != 5){
+        Printhelp();
+        fprintf(stderr,"\nFound: %d optional arguments; must be 0 (no weights), 3 (for one set of weights) or 5 (for two sets)\n ", noptargs_given);
+        int i;
+        for(i=nargs+1;i<argc;i++) {
+            if(i <= nargs + noptargs)
+                fprintf(stderr,"\t\t %s = `%s' \n",optargnames[i-nargs-1],argv[i]);
+            else
+                fprintf(stderr,"\t\t <> = `%s' \n",argv[i]);
+        }
+        return EXIT_FAILURE;
+    }
+
+    file1=argv[1];
+    fileformat1=argv[2];
+    file2=argv[3];
+    fileformat2=argv[4];
+    sbinfile=argv[5];
+    if(sscanf(argv[6],"%lf",&mu_max) != 1) {fprintf(stderr,"Error: Could not parse mu_max = `%s' as a floating point number\n",argv[6]);return EXIT_FAILURE;}
+    nmu_bins=atoi(argv[7]);
+    /* Same guard as the argnames table above: argv[8] is only the thread count when "Nthreads" is a required argument */
+#if defined(USE_OMP) && defined(_OPENMP)
+    nthreads=atoi(argv[8]);
+    if(nthreads < 1 ) {
+        fprintf(stderr, "Nthreads = %d must be at least 1. Exiting...\n", nthreads);
+        return EXIT_FAILURE;
+    }
+#endif
+
+    if(noptargs_given >= 3){
+       weight_method_str = argv[nargs + 1];
+       int wstatus = get_weight_method_by_name(weight_method_str, &weight_method);
+       if(wstatus != EXIT_SUCCESS){
+         fprintf(stderr, "Error: Unknown weight method \"%s\"\n", weight_method_str);
+         return EXIT_FAILURE;
+       }
+       num_weights = get_num_weights_by_method(weight_method);
+
+       weights_file1 = argv[nargs + 2];
+       weights_fileformat1 = argv[nargs + 3];
+    }
+    if(noptargs_given >= 5){
+       weights_file2 = argv[nargs + 4];
+       weights_fileformat2 = argv[nargs + 5];
+    }
+    /* Identical file names are treated as an auto-correlation; file2 is then never read */
+    int autocorr=0;
+    if(strcmp(file1,file2)==0) {
+        autocorr=1;
+    }
+
+    fprintf(stderr,"Running `%s' with the parameters \n",argv[0]);
+    fprintf(stderr,"\n\t\t -------------------------------------\n");
+    for(int i=1;i<argc;i++) {
+        if(i <= nargs) {
+            fprintf(stderr,"\t\t %-10s = %s \n",argnames[i-1],argv[i]);
+        } else if(i <= nargs + noptargs){
+            fprintf(stderr,"\t\t %-10s = %s \n",optargnames[i-nargs-1],argv[i]);
+        } else {
+            fprintf(stderr,"\t\t <> = `%s' \n",argv[i]);
+        }
+    }
+    fprintf(stderr,"\t\t -------------------------------------\n");
+
+
+    gettimeofday(&t0,NULL);
+    /*---Read-data1-file----------------------------------*/
+    ND1=read_positions(file1,fileformat1,sizeof(DOUBLE), 3, &x1, &y1, &z1);
+    gettimeofday(&t1,NULL);
+    read_time += ADD_DIFF_TIME(t0,t1);
+    gettimeofday(&t0,NULL);
+
+    /* Read weights file 1 */
+    if(weights_file1 != NULL){
+        gettimeofday(&t0,NULL);
+        int64_t wND1 = read_columns_into_array(weights_file1,weights_fileformat1, sizeof(DOUBLE), num_weights, (void **) weights1);
+        gettimeofday(&t1,NULL);
+        read_time += ADD_DIFF_TIME(t0,t1);
+
+        if(wND1 != ND1){
+          fprintf(stderr, "Error: read %"PRId64" lines from %s, but read %"PRId64" from %s\n", wND1, weights_file1, ND1, file1);
+          return EXIT_FAILURE;
+        }
+    }
+
+    if (autocorr==0) {
+        /*---Read-data2-file----------------------------------*/
+        ND2=read_positions(file2,fileformat2,sizeof(DOUBLE), 3, &x2, &y2, &z2);
+        gettimeofday(&t1,NULL);
+        read_time += ADD_DIFF_TIME(t0,t1);
+
+        if(weights_file2 != NULL){
+            gettimeofday(&t0,NULL);
+            int64_t wND2 = read_columns_into_array(weights_file2,weights_fileformat2, sizeof(DOUBLE), num_weights, (void **) weights2);
+            gettimeofday(&t1,NULL);
+            read_time += ADD_DIFF_TIME(t0,t1);
+
+            if(wND2 != ND2){
+              fprintf(stderr, "Error: read %"PRId64" lines from %s, but read %"PRId64" from %s\n", wND2, weights_file2, ND2, file2);
+              return EXIT_FAILURE;
+            }
+        }
+    } else {
+        //None of these are required. But I prefer to preserve the possibility
+        ND2 = ND1;
+        x2 = x1;
+        y2 = y1;
+        z2 = z1;
+        for(int w = 0; w < MAX_NUM_WEIGHTS; w++){
+          weights2[w] = weights1[w];
+        }
+    }
+
+    /*---Count-pairs--------------------------------------*/
+    gettimeofday(&t0,NULL);
+    results_countpairs_s_mu results;
+    struct config_options options = get_config_options();
+
+    /* Pack weights into extra options */
+    struct extra_options extra = get_extra_options(weight_method);
+    for(int w = 0; w < num_weights; w++){
+        extra.weights0.weights[w] = (void *) weights1[w];
+        extra.weights1.weights[w] = (void *) weights2[w];
+    }
+
+    /* If you want to change the bin refine factors */
+    /* const int bf[] = {2, 2, 1}; */
+    /* set_bin_refine_factors(&options, bf); */
+    int status = countpairs_s_mu(ND1,x1,y1,z1,
+                                 ND2,x2,y2,z2,
+                                 nthreads,
+                                 autocorr,
+                                 sbinfile,
+                                 mu_max,
+                                 nmu_bins,
+                                 &results,
+                                 &options,
+                                 &extra);
+    /* In the auto-correlation case the second set aliases the first, so it must only be freed once */
+    free(x1);free(y1);free(z1);
+    for(int w = 0; w < num_weights; w++){
+        free(weights1[w]);
+    }
+    if(autocorr == 0) {
+        free(x2);free(y2);free(z2);
+        for(int w = 0; w < num_weights; w++){
+          free(weights2[w]);
+        }
+    }
+
+    if(status != EXIT_SUCCESS) {
+        return status;
+    }
+
+    gettimeofday(&t1,NULL);
+    double pair_time = ADD_DIFF_TIME(t0,t1);
+    double smin = results.supp[0];
+    const double dmu = mu_max/(double) nmu_bins;
+    for(int i=1;i<results.nsbin;i++) {
+        const double smax = results.supp[i];
+        for(int j=0;j<nmu_bins;j++) {
+            int index = i*(nmu_bins+1) + j;//each s-bin spans nmu_bins+1 consecutive entries of the result arrays
+            fprintf(stdout,"%e\t%e\t%e\t%12"PRIu64"\t%e\n", smin, smax, (j+1)*dmu, results.npairs[index], results.weightavg[index]);//NOTE(review): results.savg is not printed although the usage text lists an savg column -- confirm which is intended
+        }
+        smin = smax;
+    }
+
+    //free memory in results struct
+    free_results_s_mu(&results);
+
+    gettimeofday(&t_end,NULL);
+    fprintf(stderr,"DDsmu> Done -  ND1=%12"PRId64" ND2=%12"PRId64". Time taken = %6.2lf seconds. read-in time = %6.2lf seconds pair-counting time = %6.2lf sec\n",
+            ND1,ND2,ADD_DIFF_TIME(t_start,t_end),read_time,pair_time);
+    return EXIT_SUCCESS;
+}
+
+/* Print the usage message followed by the compile-time options of this executable to stderr */
+void Printhelp(void)
+{
+    fprintf(stderr,"=========================================================================\n") ;
+#if defined(USE_OMP) && defined(_OPENMP)
+    fprintf(stderr,"   --- DDsmu file1 format1 file2 format2 sbinfile mu_max nmu_bins numthreads [weight_method weights_file1 weights_format1 [weights_file2 weights_format2]] > DDfile\n");
+#else
+    fprintf(stderr,"   --- DDsmu file1 format1 file2 format2 sbinfile mu_max nmu_bins [weight_method weights_file1 weights_format1 [weights_file2 weights_format2]] > DDfile\n") ;
+#endif
+    /* Required arguments; the names used here match the argument names in the synopsis above */
+    fprintf(stderr,"   --- Measure the cross-correlation function xi(s, mu) for two different\n") ;
+    fprintf(stderr,"       data files (or autocorrelation if file1=file2).\n") ;
+    fprintf(stderr,"     * file1         = name of first data file\n") ;
+    fprintf(stderr,"     * format1       = format of first data file  (a=ascii, c=csv, f=fast-food)\n") ;
+    fprintf(stderr,"     * file2         = name of second data file\n") ;
+    fprintf(stderr,"     * format2       = format of second data file (a=ascii, c=csv, f=fast-food)\n") ;
+    fprintf(stderr,"     * sbinfile      = name of ascii file containing the s-bins (smin smax for each bin)\n") ;
+    fprintf(stderr,"     * mu_max        = maximum of the cosine of the angle to the line-of-sight (LOS is taken to be along the z-direction). Valid values are in: (0.0, 1.0]\n");
+    fprintf(stderr,"     * nmu_bins      = number of bins for mu (must be >= 1)\n");
+#if defined(USE_OMP) && defined(_OPENMP)
+    fprintf(stderr,"     * numthreads    = number of threads to use (must be >= 1)\n");
+#endif
+    fprintf(stderr,"   --- OPTIONAL ARGS:\n");
+    fprintf(stderr,"     * weight_method = the type of pair weighting to apply.  Options are: 'pair_product', 'none'.  Default: 'none'.\n");
+    fprintf(stderr,"     * weights_file1 = name of file containing the weights corresponding to the first data file\n");
+    fprintf(stderr,"     * weights_format1 = format of file containing the weights corresponding to the first data file\n");
+    fprintf(stderr,"     * weights_file2 = name of file containing the weights corresponding to the second data file\n");
+    fprintf(stderr,"     * weights_format2 = format of file containing the weights corresponding to the second data file\n");
+    fprintf(stderr,"   ---OUTPUT:\n") ;//NOTE(review): main() in this file prints five columns (no savg/0.0 column) -- confirm whether the text below or the output is intended
+#ifdef OUTPUT_RPAVG
+    fprintf(stderr,"     > DD(s, mu) file        = name of output file <smin smax mu_max_bin savg npairs weightavg>\n") ;
+#else
+    fprintf(stderr,"     > DD(s, mu) file        = name of output file <smin smax mu_max_bin  0.0 npairs weightavg>\n") ;
+#endif
+    fprintf(stderr,"\n\tCompile options: \n");
+#ifdef PERIODIC
+    fprintf(stderr,"\tPeriodic = True\n");
+#else
+    fprintf(stderr,"\tPeriodic = False\n");
+#endif
+    /* The remaining lines report which preprocessor macros were defined when this file was compiled */
+#ifdef OUTPUT_RPAVG
+    fprintf(stderr,"\tOutput SAVG = True\n");
+#else
+    fprintf(stderr,"\tOutput SAVG = False\n");
+#endif
+
+#ifdef DOUBLE_PREC
+    fprintf(stderr,"\tPrecision = double\n");
+#else
+    fprintf(stderr,"\tPrecision = float\n");
+#endif
+
+#if defined(__AVX__)
+    fprintf(stderr,"\tUse AVX = True\n");
+#else
+    fprintf(stderr,"\tUse AVX = False\n");
+#endif
+
+#if defined(USE_OMP) && defined(_OPENMP)
+    fprintf(stderr,"\tUse OMP = True\n");
+#else
+    fprintf(stderr,"\tUse OMP = False\n");
+#endif
+
+    fprintf(stderr,"=========================================================================\n") ;
+}
diff --git a/theory/DDsmu/Makefile b/theory/DDsmu/Makefile
new file mode 100644
index 00000000..ca63851a
--- /dev/null
+++ b/theory/DDsmu/Makefile
@@ -0,0 +1,56 @@
+ROOT_DIR := ../..
+INSTALL_HEADERS_DIR := $(ROOT_DIR)/include
+INSTALL_LIB_DIR := $(ROOT_DIR)/lib
+INSTALL_BIN_DIR := $(ROOT_DIR)/bin
+UTILS_DIR := $(ROOT_DIR)/utils
+IO_DIR := $(ROOT_DIR)/io
+# Shared build settings come from the two files in the top-level directory
+include $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk
+# Executable, static library, and the sources/header that make up the library
+TARGET := DDsmu
+TARGETS := $(TARGET) 
+LIBRARY := libcountpairs_s_mu.a
+LIBSRC  := countpairs_s_mu.c countpairs_s_mu_impl_double.c countpairs_s_mu_impl_float.c \
+         $(UTILS_DIR)/gridlink_impl_double.c $(UTILS_DIR)/gridlink_impl_float.c \
+         $(UTILS_DIR)/utils.c $(UTILS_DIR)/progressbar.c $(UTILS_DIR)/cpu_features.c
+LIBRARY_HEADERS := countpairs_s_mu.h
+# The executable is built from the driver, the I/O sources and all library sources
+TARGETSRC := DDsmu.c $(IO_DIR)/ftread.c $(IO_DIR)/io.c $(LIBSRC)
+# Headers, templates and per-precision sources listed as prerequisites of countpairs_s_mu.o below
+INCL   := countpairs_s_mu_kernels_float.c countpairs_s_mu_kernels_double.c countpairs_s_mu_kernels.c.src countpairs_s_mu_impl.c.src countpairs_s_mu_impl.h.src \
+          countpairs_s_mu.h countpairs_s_mu_impl_double.h countpairs_s_mu_impl_float.h \
+          $(UTILS_DIR)/gridlink_impl_float.h $(UTILS_DIR)/gridlink_impl_double.h $(UTILS_DIR)/gridlink_impl.h.src \
+          $(UTILS_DIR)/cellarray_float.h $(UTILS_DIR)/cellarray_double.h $(UTILS_DIR)/cellarray.h.src \
+          $(UTILS_DIR)/function_precision.h  $(UTILS_DIR)/avx_calls.h $(UTILS_DIR)/sse_calls.h \
+          $(UTILS_DIR)/defs.h $(UTILS_DIR)/cpu_features.h \
+          $(IO_DIR)/ftread.h $(IO_DIR)/io.h $(UTILS_DIR)/utils.h $(UTILS_DIR)/progressbar.h \
+          $(UTILS_DIR)/weight_functions_double.h $(UTILS_DIR)/weight_functions_float.h $(UTILS_DIR)/weight_functions.h.src \
+		  $(UTILS_DIR)/weight_defs_double.h $(UTILS_DIR)/weight_defs_float.h $(UTILS_DIR)/weight_defs.h.src
+# Object lists are derived from the source lists by replacing the .c suffix with .o
+TARGETOBJS  := $(TARGETSRC:.c=.o)
+LIBOBJS := $(LIBSRC:.c=.o)
+all: $(TARGETS) $(TARGETSRC) $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile 
+# Extra prerequisites only; presumably the *_double/*_float files are generated from the .src templates by rules in rules.mk -- TODO confirm
+countpairs_s_mu_impl_double.o:countpairs_s_mu_impl_double.c countpairs_s_mu_impl_double.h countpairs_s_mu_kernels_double.c $(UTILS_DIR)/gridlink_impl_double.h  $(UTILS_DIR)/cellarray_double.h
+countpairs_s_mu_impl_float.o:countpairs_s_mu_impl_float.c countpairs_s_mu_impl_float.h countpairs_s_mu_kernels_float.c $(UTILS_DIR)/gridlink_impl_float.h  $(UTILS_DIR)/cellarray_float.h
+countpairs_s_mu.o:countpairs_s_mu.c countpairs_s_mu_impl_double.h countpairs_s_mu_impl_float.h $(INCL)
+
+libs: lib
+lib:  $(LIBRARY)
+install:$(INSTALL_BIN_DIR)/$(TARGET) $(INSTALL_LIB_DIR)/$(LIBRARY) $(INSTALL_HEADERS_DIR)/$(LIBRARY_HEADERS) 
+# clean removes the build outputs and the per-precision kernel/impl files from this directory
+clean:
+	$(RM) $(TARGETOBJS) $(TARGET) $(LIBRARY) countpairs_s_mu_kernels_float.c countpairs_s_mu_kernels_double.c countpairs_s_mu_impl_double.[ch] countpairs_s_mu_impl_float.[ch]
+	$(RM) -R *.dSYM
+# distclean additionally removes the installed header, library and executable
+distclean:clean
+	cd $(INSTALL_HEADERS_DIR) && $(RM) $(LIBRARY_HEADERS)
+	cd $(INSTALL_LIB_DIR) && $(RM) $(LIBRARY)
+	cd $(INSTALL_BIN_DIR) && $(RM) $(TARGET)
+# Delegates to the DDsmu target of the Makefile in ../tests
+tests: 
+	$(MAKE) -C ../tests DDsmu
+# NOTE(review): rules.mk is not visible here; presumably it supplies the rules for the targets named above -- confirm
+include $(ROOT_DIR)/rules.mk
+
+
diff --git a/theory/DDsmu/countpairs_s_mu.c b/theory/DDsmu/countpairs_s_mu.c
new file mode 100644
index 00000000..b577a9ad
--- /dev/null
+++ b/theory/DDsmu/countpairs_s_mu.c
@@ -0,0 +1,74 @@
+/* File: countpairs_s_mu.c */
+/*
+  This file is a part of the Corrfunc package
+  Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com)
+  License: MIT LICENSE. See LICENSE file under the top-level
+  directory at https://github.com/manodeep/Corrfunc/
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "countpairs_s_mu.h" //function proto-type for API
+#include "countpairs_s_mu_impl_double.h"//actual implementations for double
+#include "countpairs_s_mu_impl_float.h"//actual implementations for float
+
+void free_results_s_mu(results_countpairs_s_mu *results)
+{   /* Release the four arrays owned by `results`; a NULL `results` is a no-op */
+    if(results==NULL)
+        return;
+    /* Each pointer is reset after being freed so that calling this function again on the same struct cannot double-free */
+    free(results->npairs);results->npairs = NULL;
+    free(results->supp);results->supp = NULL;
+    free(results->savg);results->savg = NULL;
+    free(results->weightavg);results->weightavg = NULL;
+}
+
+
+int countpairs_s_mu(const int64_t ND1, void *X1, void *Y1, void *Z1,
+                    const int64_t ND2, void *X2, void *Y2, void *Z2,
+                    const int numthreads,
+                    const int autocorr,
+                    const char *sbinfile,
+                    const double mu_max,
+                    const int nmu_bins, 
+                    results_countpairs_s_mu *results,
+                    struct config_options *options,
+                    struct extra_options *extra)
+{
+    /* Public entry point: check the options, then forward every argument to the implementation selected by options->float_type */
+    if(options->float_type != sizeof(float) && options->float_type != sizeof(double)) {
+        fprintf(stderr,"ERROR: In %s> Can only handle doubles or floats. Got an array of size = %zu\n",
+                __FUNCTION__, options->float_type);
+        return EXIT_FAILURE;
+    }
+    /* The version comparison covers one character less than the size of options->version */
+    const size_t version_len = sizeof(options->version)/sizeof(char) - 1;
+    if(strncmp(options->version, STR(VERSION), version_len) != 0) {
+        fprintf(stderr,"Error: Do not know this API version = `%s'. Expected version = `%s'\n", options->version, STR(VERSION));
+        return EXIT_FAILURE;
+    }
+    if(options->float_type == sizeof(float)) {
+        return countpairs_s_mu_float(ND1, (float *) X1, (float *) Y1, (float *) Z1,
+                                     ND2, (float *) X2, (float *) Y2, (float *) Z2,
+                                     numthreads,
+                                     autocorr,
+                                     sbinfile,
+                                     mu_max,
+                                     nmu_bins,
+                                     results,
+                                     options,
+                                     extra);
+    }
+    return countpairs_s_mu_double(ND1, (double *) X1, (double *) Y1, (double *) Z1,
+                                  ND2, (double *) X2, (double *) Y2, (double *) Z2,
+                                  numthreads,
+                                  autocorr,
+                                  sbinfile,
+                                  mu_max,
+                                  nmu_bins,
+                                  results,
+                                  options,
+                                  extra);
+}
diff --git a/theory/DDsmu/countpairs_s_mu.h b/theory/DDsmu/countpairs_s_mu.h
new file mode 100644
index 00000000..f1182a3d
--- /dev/null
+++ b/theory/DDsmu/countpairs_s_mu.h
@@ -0,0 +1,45 @@
+/* File: countpairs_s_mu.h */
+/*
+  This file is a part of the Corrfunc package
+  Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com)
+  License: MIT LICENSE. See LICENSE file under the top-level
+  directory at https://github.com/manodeep/Corrfunc/
+*/
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "defs.h" //for struct config_options 
+#include <stdint.h> //for uint64_t
+
+    //Results structure filled in by countpairs_s_mu and released with free_results_s_mu
+    typedef struct{
+        uint64_t *npairs;//pair counts; DDsmu.c reads entry i*(nmu_bins+1)+j for s-bin i and mu-bin j
+        double *supp;//s-bin edges; DDsmu.c uses supp[0] as the first lower edge and supp[i] as the upper edge of bin i
+        double *savg;//presumably the average separation per bin, indexed like npairs -- TODO confirm
+        double mu_max;//presumably the mu_max that was passed to countpairs_s_mu -- TODO confirm
+        double mu_min;//not used -> assumed to be 0.0
+        double *weightavg;//read with the same index as npairs in DDsmu.c; presumably the average pair weight -- TODO confirm
+        int nsbin;//upper bound of the loop over supp[] in DDsmu.c
+        int nmu_bins;//presumably the number of mu bins that was requested -- TODO confirm
+    } results_countpairs_s_mu;
+    //Validates options->float_type and options->version, then calls the float or double implementation; returns EXIT_FAILURE if validation fails
+    extern int countpairs_s_mu(const int64_t ND1, void *X1, void *Y1, void *Z1,
+                               const int64_t ND2, void *X2, void *Y2, void *Z2,
+                               const int numthreads,
+                               const int autocorr,
+                               const char *sbinfile,
+                               const double mu_max,
+                               const int nmu_bins, 
+                               results_countpairs_s_mu *results,
+                               struct config_options *options,
+                               struct extra_options *extra);
+    //Frees the npairs, supp, savg and weightavg arrays of *results; does nothing when results is NULL
+    extern void free_results_s_mu(results_countpairs_s_mu *results);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/theory/DDsmu/countpairs_s_mu_impl.c.src b/theory/DDsmu/countpairs_s_mu_impl.c.src
new file mode 100644
index 00000000..494a6aff
--- /dev/null
+++ b/theory/DDsmu/countpairs_s_mu_impl.c.src
@@ -0,0 +1,681 @@
+// # -*- mode: c -*-
+/* File: countpairs_s_mu_impl.c.src */
+/*
+  This file is a part of the Corrfunc package
+  Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com)
+  License: MIT LICENSE. See LICENSE file under the top-level
+  directory at https://github.com/manodeep/Corrfunc/
+*/
+
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include "countpairs_s_mu_impl_DOUBLE.h" //function proto-type
+#include "countpairs_s_mu_kernels_DOUBLE.c"
+
+#include "defs.h"
+#include "utils.h" //all of the utilities
+#include "progressbar.h" //for the progressbar
+#include "cpu_features.h" //prototype instrset_detect required for runtime dispatch
+
+#include "cellarray_DOUBLE.h" //definition of struct cellarray*
+#include "gridlink_impl_DOUBLE.h"//function proto-type for gridlink
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+int interrupt_status_DDsmu_DOUBLE=EXIT_SUCCESS;/* polled by the cell loop in countpairs_s_mu_DOUBLE; anything else makes that run return EXIT_FAILURE */
+/* Signal handler: reports the received signal on stderr and raises the failure flag above */
+void interrupt_handler_countpairs_s_mu_DOUBLE(int signo)
+{
+    fprintf(stderr,"Received signal = `%s' (signo = %d). Aborting \n",strsignal(signo), signo);/* NOTE(review): fprintf/strsignal are not on the POSIX async-signal-safe list */
+    interrupt_status_DDsmu_DOUBLE = EXIT_FAILURE;
+}    
+
+countpairs_s_mu_func_ptr_DOUBLE countpairs_s_mu_driver_DOUBLE(const struct config_options *options)
+{
+    /* Runtime dispatch: returns the pair-counting kernel for options->instruction_set. A requested AVX/SSE kernel
+       is replaced by the fallback when it is not compiled in or instrset_detect() reports a lower level; a negative
+       instruction_set selects the first compiled-in kernel. The choice is cached in statics. Returns NULL on error */
+    static countpairs_s_mu_func_ptr_DOUBLE function = NULL;
+    static isa old_isa=-1;
+    if(function != NULL && old_isa == options->instruction_set) {/* never hand back the still-empty cache on a first call */
+        return function;
+    }
+
+    /* Array of function pointers */
+    countpairs_s_mu_func_ptr_DOUBLE allfunctions[] = {
+#ifdef __AVX__
+      countpairs_s_mu_avx_intrinsics_DOUBLE,
+#endif			 
+#ifdef __SSE4_2__
+      countpairs_s_mu_sse_intrinsics_DOUBLE,
+#endif
+      countpairs_s_mu_fallback_DOUBLE
+    };
+
+    const int num_functions = sizeof(allfunctions)/sizeof(allfunctions[0]);/* element size, not sizeof(void *) */
+    const int fallback_offset = num_functions - 1;
+#if defined(__AVX__) || defined __SSE4_2__    
+    const int highest_isa = instrset_detect();
+#endif
+    int curr_offset = 0;
+    /* Now check if AVX is supported by the CPU */
+    int avx_offset = fallback_offset;
+#ifdef __AVX__
+    avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset;
+    curr_offset++;
+#endif        
+
+    /* Is the SSE function supported at runtime and enabled at compile-time?*/
+    int sse_offset = fallback_offset;
+#ifdef __SSE4_2__
+    sse_offset = highest_isa >= 6 ? curr_offset:fallback_offset;
+    curr_offset++;
+#endif
+    if( curr_offset != fallback_offset) {
+      fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n",
+              curr_offset, fallback_offset);
+      return NULL;
+    } 
+
+    int function_dispatch=0;
+    /* Check that cpu supports feature */
+    if(options->instruction_set >= 0) {
+        switch(options->instruction_set) {
+        case(AVX512F):
+        case(AVX2):
+        case(AVX):function_dispatch=avx_offset;break;
+        case(SSE42):function_dispatch=sse_offset;break;
+        default:function_dispatch=fallback_offset;break;
+        }
+    }
+
+    if(function_dispatch >= num_functions) {
+        fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n",
+              __FUNCTION__, function_dispatch, num_functions);
+      return NULL;
+    }
+    function = allfunctions[function_dispatch];
+    old_isa = options->instruction_set;
+
+    if(options->verbose){
+        // This must be first (AVX/SSE may be aliased to fallback)
+        if(function_dispatch == fallback_offset){
+            fprintf(stderr,"Using fallback kernel\n");
+        } else if(function_dispatch == avx_offset){
+            fprintf(stderr, "Using AVX kernel\n");
+        } else if(function_dispatch == sse_offset){
+            fprintf(stderr, "Using SSE kernel\n");
+        } else {
+            fprintf(stderr, "Unknown kernel!\n");
+        }
+    }
+    return function;
+}
+
+
+int countpairs_s_mu_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z1,
+                           const int64_t ND2, DOUBLE *X2, DOUBLE *Y2, DOUBLE *Z2,
+                           const int numthreads,
+                           const int autocorr,
+                           const char *sbinfile,
+                           const double max_mu,
+                           const int nmu_bins,
+                           results_countpairs_s_mu *results,
+                           struct config_options *options,
+                           struct extra_options *extra)
+{
+    /* Counts pairs in bins of separation s (edges read from `sbinfile`) and of mu (`nmu_bins`
+       bins, limited by `max_mu`) and packs counts/averages into `results`. The second dataset
+       is only read when autocorr == 0. Returns EXIT_SUCCESS; on invalid input, allocation
+       failure, kernel error or interrupt returns non-success after releasing local buffers */
+    if(options->float_type != sizeof(DOUBLE)) {
+        fprintf(stderr,"ERROR: In %s> Can only handle arrays of size=%zu. Got an array of size = %zu\n",
+                __FUNCTION__, sizeof(DOUBLE), options->float_type);
+        return EXIT_FAILURE;
+    }
+
+    // If no extra options were passed, create dummy options
+    // This allows us to pass arguments like "extra->weights0" below;
+    // they'll just be NULLs, which is the correct behavior
+    struct extra_options dummy_extra;
+    if(extra == NULL){
+      weight_method_t dummy_method = NONE;
+      dummy_extra = get_extra_options(dummy_method);
+      extra = &dummy_extra;
+    }
+
+    int need_weightavg = extra->weight_method != NONE;
+
+    struct timeval t0;
+    if(options->c_api_timer) {
+        gettimeofday(&t0, NULL);
+    }
+
+#if defined(_OPENMP)
+    omp_set_num_threads(numthreads);
+#else
+    (void) numthreads;
+#endif
+
+    options->sort_on_z = 1;
+    for(int i=0;i<3;i++) {
+        if(options->bin_refine_factors[i] < 1) {
+            fprintf(stderr,"Warning: bin refine factor along axis = %d *must* be >=1. Instead found bin refine factor =%d\n",
+                    i, options->bin_refine_factors[i]);
+            reset_bin_refine_factors(options);
+            break;/* all factors have been reset -> no point continuing with the loop */
+        }
+    }
+    if(options->max_cells_per_dim == 0) {
+        fprintf(stderr,"Warning: Max. cells per dimension is set to 0 - resetting to `NLATMAX' = %d\n", NLATMAX);
+        options->max_cells_per_dim = NLATMAX;
+    }
+
+    /* setup interrupt handler -> mostly useful during the python execution.
+       Let's Ctrl-C abort the extension (after clearing any status left by an earlier interrupted call) */
+    interrupt_status_DDsmu_DOUBLE = EXIT_SUCCESS;
+    SETUP_INTERRUPT_HANDLERS(interrupt_handler_countpairs_s_mu_DOUBLE);
+
+    /***********************
+     *initializing the  bins
+     ************************/
+    double *supp;
+    int nsbin;
+    double smin,smax;
+    setup_bins(sbinfile,&smin,&smax,&nsbin,&supp);
+    if( ! (smin >= 0.0 && smax > 0.0 && smin < smax && nsbin > 0)) {
+        fprintf(stderr,"Error: Could not setup with R bins correctly. (rmin = %lf, rmax = %lf, with nbins = %d). Expected non-zero rmin/rmax with rmax > rmin and nbins >=1 \n",
+                smin, smax, nsbin);
+        return EXIT_FAILURE;
+    }
+
+    if(max_mu <= 0.0 || max_mu > 1.0) {
+        fprintf(stderr,"Error: max_mu (max. value for the cosine of the angle with line of sight) must be greater than 0 and at most 1).\n"
+                "The passed value is max_mu = %lf. Please change it to be > 0 and <= 1.0\n", max_mu);
+        free(supp);/* the bins were set up successfully above -> release them */
+        return EXIT_FAILURE;
+    }
+
+    if(nmu_bins < 1 ) {
+        fprintf(stderr,"Error: Number of mu bins = %d must be at least 1\n", nmu_bins);
+        free(supp);
+        return EXIT_FAILURE;
+    }
+
+    DOUBLE supp_sqr[nsbin];
+    const int64_t totnbins = (nmu_bins+1)*(nsbin+1);
+    for(int i=0; i < nsbin;i++) {
+        supp_sqr[i] = supp[i]*supp[i];
+    }
+
+    const DOUBLE sqr_smax=supp_sqr[nsbin-1];
+    const DOUBLE sqr_smin=supp_sqr[0];
+    const DOUBLE mu_max = (DOUBLE) max_mu;
+    const DOUBLE pimax = smax*mu_max;
+
+    //Find the min/max of the data
+    DOUBLE xmin=1e10,ymin=1e10,zmin=1e10;
+    DOUBLE xmax=-1e10,ymax=-1e10,zmax=-1e10;
+    get_max_min_DOUBLE(ND1, X1, Y1, Z1, &xmin, &ymin, &zmin, &xmax, &ymax, &zmax);
+
+    if(autocorr==0) {
+        if(options->verbose) {
+            fprintf(stderr,"ND1 = %12"PRId64" [xmin,ymin,zmin] = [%lf,%lf,%lf], [xmax,ymax,zmax] = [%lf,%lf,%lf]\n",ND1,xmin,ymin,zmin,xmax,ymax,zmax);
+        }
+
+        get_max_min_DOUBLE(ND2, X2, Y2, Z2, &xmin, &ymin, &zmin, &xmax, &ymax, &zmax);
+        if(options->verbose) {
+            fprintf(stderr,"ND2 = %12"PRId64" [xmin,ymin,zmin] = [%lf,%lf,%lf], [xmax,ymax,zmax] = [%lf,%lf,%lf]\n",ND2,xmin,ymin,zmin,xmax,ymax,zmax);
+        }
+    }
+
+    const DOUBLE xdiff = options->boxsize > 0 ? options->boxsize:(xmax-xmin);
+    const DOUBLE ydiff = options->boxsize > 0 ? options->boxsize:(ymax-ymin);
+    const DOUBLE zdiff = options->boxsize > 0 ? options->boxsize:(zmax-zmin);
+    if(options->verbose && options->periodic) {
+        fprintf(stderr,"Running with points in [xmin,xmax] = %lf,%lf with periodic wrapping = %lf\n",xmin,xmax,xdiff);
+        fprintf(stderr,"Running with points in [ymin,ymax] = %lf,%lf with periodic wrapping = %lf\n",ymin,ymax,ydiff);
+        fprintf(stderr,"Running with points in [zmin,zmax] = %lf,%lf with periodic wrapping = %lf\n",zmin,zmax,zdiff);
+    }
+
+    if(get_bin_refine_scheme(options) == BINNING_DFL) {
+        if(smax < 0.05*xdiff) {
+            options->bin_refine_factors[0] = 1;
+        }
+        if(smax < 0.05*ydiff) {
+            options->bin_refine_factors[1] = 1;
+        }
+        if(pimax < 0.05*zdiff) {
+            options->bin_refine_factors[2] = 1;
+        }
+    }
+
+    /*---Create 3-D lattice--------------------------------------*/
+    int nmesh_x=0,nmesh_y=0,nmesh_z=0;
+    cellarray_index_particles_DOUBLE *lattice1 = gridlink_index_particles_DOUBLE(ND1, X1, Y1, Z1, &(extra->weights0),
+                                                                                 xmin, xmax, ymin, ymax, zmin, zmax,
+                                                                                 smax, smax, pimax, 
+                                                                                 options->bin_refine_factors[0], options->bin_refine_factors[1], options->bin_refine_factors[2],
+                                                                                 &nmesh_x, &nmesh_y, &nmesh_z, options);
+    if(lattice1 == NULL) {
+        free(supp);
+        return EXIT_FAILURE;
+    }
+
+    /* If there too few cells (BOOST_CELL_THRESH is ~10), and the number of cells can be increased, then boost bin refine factor (by 2x)*/
+    if(nmesh_x <= BOOST_CELL_THRESH && nmesh_y <= BOOST_CELL_THRESH && nmesh_z <= BOOST_CELL_THRESH && options->max_cells_per_dim >= BOOST_BIN_REF*BOOST_CELL_THRESH) {
+      if(get_bin_refine_scheme(options) == BINNING_DFL) {          
+          fprintf(stderr,"%s> gridlink seems inefficient nmesh = (%d, %d, %d). Boosting bin refine factor - should lead to better performance\n", __FUNCTION__, nmesh_x, nmesh_y, nmesh_z);
+          fprintf(stderr,"xmin = %lf xmax=%lf smax = %lf\n", xmin, xmax, smax);
+          free_cellarray_index_particles_DOUBLE(lattice1, nmesh_x * (int64_t) nmesh_y * nmesh_z);
+          for(int i=0;i<3;i++) {
+              options->bin_refine_factors[i] *= BOOST_BIN_REF;
+          }
+          lattice1 = gridlink_index_particles_DOUBLE(ND1, X1, Y1, Z1, &(extra->weights0),
+                                                     xmin, xmax, ymin, ymax, zmin, zmax,
+                                                     smax, smax, pimax, 
+                                                     options->bin_refine_factors[0], options->bin_refine_factors[1], options->bin_refine_factors[2],
+                                                     &nmesh_x, &nmesh_y, &nmesh_z, options);
+          if(lattice1 == NULL) {
+              free(supp);
+              return EXIT_FAILURE;
+          }
+        } else {
+            fprintf(stderr,"%s> gridlink seems inefficient nmesh = (%d, %d, %d), boosting bin refine factor could have helped. However, since custom bin refine factors "
+                    "= (%d, %d, %d) are being used - continuing with inefficient mesh\n", __FUNCTION__, nmesh_x, nmesh_y, nmesh_z, options->bin_refine_factors[0],
+                    options->bin_refine_factors[1], options->bin_refine_factors[2]);
+        }
+    }
+
+    cellarray_index_particles_DOUBLE *lattice2 = NULL;
+    if(autocorr==0) {
+        int ngrid2_x=0,ngrid2_y=0,ngrid2_z=0;
+        lattice2 = gridlink_index_particles_DOUBLE(ND2, X2, Y2, Z2, &(extra->weights1),
+                                                   xmin, xmax, ymin, ymax, zmin, zmax,
+                                                   smax, smax, pimax, 
+                                                   options->bin_refine_factors[0], options->bin_refine_factors[1], options->bin_refine_factors[2],
+                                                   &ngrid2_x, &ngrid2_y, &ngrid2_z, options);
+        if(lattice2 == NULL) {
+            free_cellarray_index_particles_DOUBLE(lattice1, nmesh_x * (int64_t) nmesh_y * nmesh_z);
+            free(supp);
+            return EXIT_FAILURE;
+        }
+        if( ! (nmesh_x == ngrid2_x && nmesh_y == ngrid2_y && nmesh_z == ngrid2_z) ) {
+            fprintf(stderr,"Error: The two sets of 3-D lattices do not have identical bins. First has dims (%d, %d, %d) while second has (%d, %d, %d)\n",
+                    nmesh_x, nmesh_y, nmesh_z, ngrid2_x, ngrid2_y, ngrid2_z);
+            free_cellarray_index_particles_DOUBLE(lattice1, nmesh_x * (int64_t) nmesh_y * nmesh_z);
+            free_cellarray_index_particles_DOUBLE(lattice2, ngrid2_x * (int64_t) ngrid2_y * ngrid2_z);/* each lattice is freed with its own cell count */
+            free(supp);
+            return EXIT_FAILURE;
+        }
+    } else {
+        lattice2 = lattice1;
+    }
+    const int64_t totncells = (int64_t) nmesh_x * (int64_t) nmesh_y * (int64_t) nmesh_z;
+
+    //Generate the unique set of neighbouring cells to count over.
+    {
+        int status = assign_ngb_cells_index_particles_DOUBLE(lattice1, lattice2, totncells,
+                                                             options->bin_refine_factors[0], options->bin_refine_factors[1], options->bin_refine_factors[2],
+                                                             nmesh_x, nmesh_y, nmesh_z, xdiff, ydiff, zdiff, autocorr, options->periodic);
+        if(status != EXIT_SUCCESS) {
+            free_cellarray_index_particles_DOUBLE(lattice1, totncells);
+            if(autocorr == 0) {
+                free_cellarray_index_particles_DOUBLE(lattice2, totncells);
+            }
+            free(supp);
+            return status;
+        }
+    }
+
+    /* runtime dispatch - get the function pointer */
+    countpairs_s_mu_func_ptr_DOUBLE countpairs_s_mu_function_DOUBLE = countpairs_s_mu_driver_DOUBLE(options);
+    if(countpairs_s_mu_function_DOUBLE == NULL) {
+        free_cellarray_index_particles_DOUBLE(lattice1, totncells);
+        if(autocorr == 0) {
+            free_cellarray_index_particles_DOUBLE(lattice2, totncells);
+        }
+        free(supp);
+        return EXIT_FAILURE;
+    }
+
+#if defined(_OPENMP)
+    uint64_t **all_npairs = (uint64_t **) matrix_calloc(sizeof(uint64_t), numthreads, totnbins);
+    DOUBLE **all_savg = NULL;
+    if(options->need_avg_sep) {
+        all_savg = (DOUBLE **) matrix_calloc(sizeof(DOUBLE),numthreads,totnbins);
+    }
+    DOUBLE **all_weightavg = NULL;
+    if(need_weightavg) {
+      all_weightavg = (DOUBLE **) matrix_calloc(sizeof(DOUBLE),numthreads,totnbins);
+    }
+
+    if(all_npairs == NULL ||
+       (options->need_avg_sep && all_savg == NULL) ||
+       (need_weightavg && all_weightavg == NULL)) {
+        free_cellarray_index_particles_DOUBLE(lattice1, totncells);
+        if(autocorr == 0) {
+            free_cellarray_index_particles_DOUBLE(lattice2, totncells);
+        }
+        matrix_free((void **)all_npairs, numthreads);
+        if(options->need_avg_sep) {
+            matrix_free((void **)all_savg, numthreads);
+        }
+        if(need_weightavg) {
+            matrix_free((void**) all_weightavg, numthreads);
+        }
+        free(supp);
+        return EXIT_FAILURE;
+    }
+#else
+    uint64_t npairs[totnbins];
+    DOUBLE savg[totnbins], weightavg[totnbins];
+    for(int ibin=0;ibin<totnbins;ibin++) {
+        npairs[ibin]=0;
+        if(options->need_avg_sep) {
+            savg[ibin] = ZERO;
+        }
+        if(need_weightavg) {
+            weightavg[ibin] = ZERO;
+        }
+    }
+#endif//OMP
+
+    int interrupted=0, abort_status = EXIT_SUCCESS;
+    int64_t numdone=0;
+    if(options->verbose) {
+        init_my_progressbar(totncells,&interrupted);
+    }
+
+#if defined(_OPENMP)
+#pragma omp parallel shared(numdone, abort_status, interrupt_status_DDsmu_DOUBLE)
+    {
+        const int tid = omp_get_thread_num();
+        uint64_t npairs[totnbins];
+        DOUBLE savg[totnbins], weightavg[totnbins];
+        for(int i=0;i<totnbins;i++) {
+            npairs[i] = 0;
+            if(options->need_avg_sep) {
+                savg[i] = ZERO;
+            }
+            if(need_weightavg) {
+                weightavg[i] = ZERO;
+            }
+        }
+
+#pragma omp for  schedule(dynamic) nowait
+#endif
+        /*---Loop-over-lattice1--------------------*/
+        for(int64_t index1=0;index1<totncells;index1++) {
+#if defined(_OPENMP)
+#pragma omp flush (abort_status, interrupt_status_DDsmu_DOUBLE)
+#endif
+            if(abort_status == EXIT_SUCCESS && interrupt_status_DDsmu_DOUBLE == EXIT_SUCCESS) {
+                //omp cancel was introduced in omp 4.0 - so this is my way of checking if loop needs to be cancelled
+                /* If the verbose option is not enabled, avoid outputting anything unnecessary*/
+                if(options->verbose) {
+#if defined(_OPENMP)
+                    if (omp_get_thread_num() == 0)
+#endif
+                        my_progressbar(numdone,&interrupted);
+
+#if defined(_OPENMP)
+#pragma omp atomic
+#endif
+                    numdone++;
+                }
+
+                /* Calculate over all ngb cells */
+                const cellarray_index_particles_DOUBLE *first  = &(lattice1[index1]);
+                if(first->nelements == 0) {
+                    continue;
+                }
+                DOUBLE *x1 = first->x;
+                DOUBLE *y1 = first->y;
+                DOUBLE *z1 = first->z;
+                const weight_struct_DOUBLE *weights1 = &(first->weights);
+                const int64_t N1 = first->nelements;
+                if(autocorr == 1) {
+                    int same_cell = 1;
+                    DOUBLE *this_savg = NULL;
+                    DOUBLE *this_weightavg = NULL;
+                    if(options->need_avg_sep) {
+                        this_savg = savg;
+                    }
+                    if(need_weightavg) {
+                        this_weightavg = weightavg;
+                    }
+                    const int status = countpairs_s_mu_function_DOUBLE(N1, x1, y1, z1, weights1,
+                                                                       N1, x1, y1, z1, weights1,
+                                                                       same_cell
+                                                                       ,sqr_smax, sqr_smin, nsbin, nmu_bins, supp_sqr, mu_max, pimax
+                                                                       ,ZERO, ZERO, ZERO
+                                                                       ,this_savg, npairs,
+                                                                       this_weightavg, extra->weight_method);
+                    /* This actually causes a race condition under OpenMP - but mostly
+                       I care that an error occurred - rather than the exact value of
+                       the error status */
+                    abort_status |= status;
+                }
+                for(int64_t ngb=0;ngb<first->num_ngb;ngb++){
+                    const cellarray_index_particles_DOUBLE *second = first->ngb_cells[ngb];
+                    if(second->nelements == 0) {
+                        continue;
+                    }
+                    const int same_cell = 0;
+                    DOUBLE *x2 = second->x;
+                    DOUBLE *y2 = second->y;
+                    DOUBLE *z2 = second->z;
+                    const weight_struct_DOUBLE *weights2 = &(second->weights);
+                    DOUBLE off_xwrap = 0.0, off_ywrap = 0.0, off_zwrap = 0.0;
+                    if(options->periodic) {
+                        off_xwrap = first->xwrap[ngb];
+                        off_ywrap = first->ywrap[ngb];
+                        off_zwrap = first->zwrap[ngb];
+                    }
+                    const int64_t N2 = second->nelements;
+                    DOUBLE *this_savg = NULL;
+                    DOUBLE *this_weightavg = NULL;
+                    if(options->need_avg_sep) {
+                        this_savg = savg;
+                    }
+                    if(need_weightavg) {
+                        this_weightavg = weightavg;
+                    }
+                    const int status = countpairs_s_mu_function_DOUBLE(N1, x1, y1, z1, weights1,
+                                                                       N2, x2, y2, z2, weights2, same_cell,
+                                                                       sqr_smax, sqr_smin, nsbin, nmu_bins, supp_sqr, mu_max, pimax, 
+                                                                       off_xwrap, off_ywrap, off_zwrap,
+                                                                       this_savg, npairs,
+                                                                       this_weightavg, extra->weight_method);
+                    /* This actually causes a race condition under OpenMP - but mostly
+                       I care that an error occurred - rather than the exact value of
+                       the error status */
+                    abort_status |= status;
+                }//loop over ngb cells
+            }
+        }//index1 loop over totncells
+#if defined(_OPENMP)
+        for(int i=0;i<totnbins;i++) {
+            all_npairs[tid][i] = npairs[i];
+            if(options->need_avg_sep) {
+                all_savg[tid][i] = savg[i];
+            }
+            if(need_weightavg) {
+                all_weightavg[tid][i] = weightavg[i];
+            }
+        }
+    }//close the omp parallel region
+#endif
+
+    free_cellarray_index_particles_DOUBLE(lattice1,totncells);
+    if(autocorr == 0) {
+        free_cellarray_index_particles_DOUBLE(lattice2,totncells);
+    }
+    if(abort_status != EXIT_SUCCESS || interrupt_status_DDsmu_DOUBLE != EXIT_SUCCESS) {
+        /* Cleanup memory here if aborting */
+        free(supp);
+#if defined(_OPENMP)
+        matrix_free((void **) all_npairs, numthreads);
+        if(options->need_avg_sep) {
+            matrix_free((void **) all_savg, numthreads);
+        }
+        if(need_weightavg) {
+            matrix_free((void **) all_weightavg, numthreads);
+        }
+#endif
+        return EXIT_FAILURE;
+    }
+
+    if(options->verbose) {
+        finish_myprogressbar(&interrupted);
+    }
+
+#if defined(_OPENMP)
+    uint64_t npairs[totnbins];
+    DOUBLE savg[totnbins];
+    DOUBLE weightavg[totnbins];
+    for(int i=0;i<totnbins;i++) {
+        npairs[i] = 0;
+        if(options->need_avg_sep) {
+            savg[i] = 0.0;
+        }
+        if(need_weightavg) {
+            weightavg[i] = 0.0;
+        }
+    }
+
+    for(int i=0;i<numthreads;i++) {
+        for(int j=0;j<totnbins;j++) {
+            npairs[j] += all_npairs[i][j];
+            if(options->need_avg_sep) {
+                savg[j] += all_savg[i][j];
+            }
+            if(need_weightavg) {
+                weightavg[j] += all_weightavg[i][j];
+            }
+        }
+    }
+    matrix_free((void **) all_npairs, numthreads);
+    if(options->need_avg_sep) {
+        matrix_free((void **) all_savg, numthreads);
+    }
+    if(need_weightavg) {
+      matrix_free((void **) all_weightavg, numthreads);
+    }
+#endif
+
+    //The code does not double count for autocorrelations
+    //which means the npairs and savg values need to be doubled;
+    if(autocorr == 1) {
+        const uint64_t int_fac = 2;
+        const DOUBLE dbl_fac = (DOUBLE) 2.0;
+        for(int i=0;i<totnbins;i++) {
+            npairs[i] *= int_fac;
+            if(options->need_avg_sep) {
+                savg[i] *= dbl_fac;
+            }
+            if(need_weightavg) {
+                weightavg[i] *= dbl_fac;
+            }
+        }
+
+        /* Is the min. requested separation 0.0 ?*/
+        /* The comparison is '<=' rather than '==' only to silence
+           the compiler  */
+        if(supp[0] <= 0.0) {
+            int index = (nmu_bins + 1);//first valid s bin (with 0-dpi depth in pi)
+            /* Then, add all the self-pairs. This ensures that 
+               a cross-correlation with two identical datasets 
+               produces the same result as the auto-correlation  */
+            npairs[index] += ND1;
+            // Increasing npairs affects savg and weightavg.
+            // We don't need to add anything to savg; all the self-pairs have 0 separation!
+            // The self-pairs have non-zero weight, though.  So, fix that here.
+            if(need_weightavg){
+                // Keep in mind this is an autocorrelation (i.e. only one particle set to consider)
+                weight_func_t_DOUBLE weight_func = get_weight_func_by_method_DOUBLE(extra->weight_method);
+                pair_struct_DOUBLE pair = {.num_weights = extra->weights0.num_weights,
+                                           .dx.d=0., .dy.d=0., .dz.d=0.,  // always 0 separation
+                                           .parx.d=0., .pary.d=0., .parz.d=0.};
+                for(int64_t j = 0; j < ND1; j++){// 64-bit counter to match ND1
+                    for(int w = 0; w < pair.num_weights; w++){
+                        pair.weights0[w].d = ((DOUBLE *) extra->weights0.weights[w])[j];
+                        pair.weights1[w].d = ((DOUBLE *) extra->weights0.weights[w])[j];
+                    }
+                    weightavg[index] += weight_func(&pair);// must be the same bin that received the self-pair counts
+                }
+            }
+        }
+    }
+
+    for(int i=0;i<totnbins;i++) {
+        if(npairs[i] > 0) {
+            if(options->need_avg_sep) {
+                savg[i] /= (DOUBLE) npairs[i] ;
+            }
+            if(need_weightavg) {
+                weightavg[i] /= (DOUBLE) npairs[i];
+            }
+        }
+    }
+
+    //Pack in the results
+    results->nsbin  = nsbin;
+    results->nmu_bins = nmu_bins;
+    results->mu_max = max_mu;//NOTE max_mu which is double and not mu_max (which might be float)
+    results->mu_min = ZERO;
+    results->npairs = my_calloc(sizeof(uint64_t), totnbins);//zeroed: the loop below never writes the i==nsbin / j==nmu_bins entries
+    results->supp   = my_malloc(sizeof(double)  , nsbin);
+    results->savg  = my_calloc(sizeof(double)  , totnbins);
+    results->weightavg  = my_calloc(sizeof(double)  , totnbins);
+    if(results->npairs == NULL || results->supp == NULL ||
+       results->savg == NULL || results->weightavg == NULL) {
+        free_results_s_mu(results);
+        free(supp);
+        return EXIT_FAILURE;
+    }
+
+    for(int i=0;i<nsbin;i++) {
+        results->supp[i] = supp[i];
+        for(int j=0;j<nmu_bins;j++) {
+            int index = i*((int64_t) nmu_bins+1) + j;
+            if(index < 0 || index >= totnbins) {
+                fprintf(stderr,"ERROR: In %s> Bin index = %d must lie within range [0, %"PRId64") (possible int overflow)\n",
+                        __FUNCTION__, index, totnbins);
+                free_results_s_mu(results);
+                free(supp);
+                return EXIT_FAILURE;
+            }
+            results->npairs[index] = npairs[index];
+            results->savg[index] = 0.0;
+            results->weightavg[index] = 0.0;
+            if(options->need_avg_sep){
+                results->savg[index] = savg[index];
+            }
+            if(need_weightavg) {
+                results->weightavg[index] = weightavg[index];
+            }
+        }
+    }
+    free(supp);
+
+    /* reset interrupt handlers to default */
+    RESET_INTERRUPT_HANDLERS();
+    reset_bin_refine_factors(options);
+
+    if(options->c_api_timer) {
+        struct timeval t1;
+        gettimeofday(&t1, NULL);
+        options->c_api_time = ADD_DIFF_TIME(t0, t1);
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/theory/DDsmu/countpairs_s_mu_impl.h.src b/theory/DDsmu/countpairs_s_mu_impl.h.src
new file mode 100644
index 00000000..7194b827
--- /dev/null
+++ b/theory/DDsmu/countpairs_s_mu_impl.h.src
@@ -0,0 +1,48 @@
+// # -*- mode: c -*-
+/* File: countpairs_s_mu_impl.h.src */
+/*
+  This file is a part of the Corrfunc package
+  Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com)
+  License: MIT LICENSE. See LICENSE file under the top-level
+  directory at https://github.com/manodeep/Corrfunc/
+*/
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "defs.h" //for struct config_options 
+#include "weight_defs_DOUBLE.h"
+#include <inttypes.h> //for uint64_t
+
+#include "countpairs_s_mu.h"//for struct results_countpairs_s_mu
+
+    extern void interrupt_handler_countpairs_s_mu_DOUBLE(int signo);
+    /* Signature shared by every pair-counting kernel the driver can return; src_savg / src_weightavg may be NULL when not needed */
+    typedef int (*countpairs_s_mu_func_ptr_DOUBLE)(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0,
+                                                   const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, const int same_cell,
+                                                   const DOUBLE sqr_smax, const DOUBLE sqr_smin, const int nsbin, const int nmu_bins,
+                                                   const DOUBLE *supp_sqr, const DOUBLE mu_max, const DOUBLE pimax,
+                                                   const DOUBLE off_xwrap, const DOUBLE off_ywrap, const DOUBLE off_zwrap,
+                                                   DOUBLE *src_savg, uint64_t *src_npairs,
+                                                   DOUBLE *src_weightavg, const weight_method_t weight_method);
+
+    /* Runtime dispatch: returns the kernel selected by options->instruction_set, or NULL on error */
+    extern countpairs_s_mu_func_ptr_DOUBLE countpairs_s_mu_driver_DOUBLE(const struct config_options *options) __attribute__((warn_unused_result));
+    /* Pair counter in (s, mu) bins for DOUBLE arrays; fills `results` and returns EXIT_SUCCESS, or a failure status otherwise */
+    extern int countpairs_s_mu_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z1,
+                                      const int64_t ND2, DOUBLE *X2, DOUBLE *Y2, DOUBLE *Z2,
+                                      const int numthreads,
+                                      const int autocorr,
+                                      const char *sbinfile,
+                                      const double mu_max,
+                                      const int nmu_bins, 
+                                      results_countpairs_s_mu *results,
+                                      struct config_options *options,
+                                      struct extra_options *extra);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/theory/DDsmu/countpairs_s_mu_kernels.c.src b/theory/DDsmu/countpairs_s_mu_kernels.c.src
new file mode 100644
index 00000000..72bf8044
--- /dev/null
+++ b/theory/DDsmu/countpairs_s_mu_kernels.c.src
@@ -0,0 +1,787 @@
+// # -*- mode: c -*-
+/* File: countpairs_s_mu_kernels.c.src */
+/*
+  This file is a part of the Corrfunc package
+  Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com)
+  License: MIT LICENSE. See LICENSE file under the top-level
+  directory at https://github.com/manodeep/Corrfunc/
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <inttypes.h>
+
+#include "function_precision.h"
+#include "utils.h"
+
+#include "weight_functions_DOUBLE.h"
+
+
+#if defined(__AVX__)
+#include "avx_calls.h"
+/* AVX kernel: adds the (s, mu)-binned pair counts between particle sets 0 and 1 -- plus summed separations / pair weights when src_savg / src_weightavg are non-NULL -- to the src_* arrays. Returns EXIT_SUCCESS, or EXIT_FAILURE when src_npairs is NULL. */
+static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0,
+                                                        const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, const int same_cell, // same_cell == 1: only pairs with j > i are visited
+                                                        const DOUBLE sqr_smax, const DOUBLE sqr_smin, const int nsbin, // sqr_smin/sqr_smax are read by the scalar remainder loop only
+                                                        const int nmu_bins, const DOUBLE *supp_sqr, const DOUBLE mu_max, const DOUBLE pimax, // supp_sqr: nsbin squared s-bin edges
+                                                        const DOUBLE off_xwrap, const DOUBLE off_ywrap, const DOUBLE off_zwrap, // added to every set-0 position
+                                                        DOUBLE *src_savg, uint64_t *src_npairs, // src_savg may be NULL; src_npairs must not be
+                                                        DOUBLE *src_weightavg, const weight_method_t weight_method) // src_weightavg may be NULL
+{
+    if(N0 == 0 || N1 == 0) {
+        return EXIT_SUCCESS;
+    }
+    // src_npairs is the only mandatory output array.
+    if(src_npairs == NULL) {
+        return EXIT_FAILURE;
+    }
+    // A NULL src_savg / src_weightavg switches the corresponding accumulation off.
+    const int32_t need_savg = src_savg != NULL;
+    const int32_t need_weightavg = src_weightavg != NULL;
+    // Local histograms: (nmu_bins+1) slots per s-bin row, flat index = sbin*(nmu_bins+1) + mubin.
+    const int64_t totnbins = (nmu_bins+1)*(nsbin+1);
+    uint64_t npairs[totnbins];
+    DOUBLE savg[totnbins], weightavg[totnbins];
+    for(int64_t i=0;i<totnbins;i++) {
+        npairs[i] = 0;
+        if(need_savg) {
+            savg[i] = ZERO;
+        }
+        if(need_weightavg){
+            weightavg[i] = ZERO;
+        }
+    }
+
+    // Broadcast every squared s-bin edge, and every s-bin number (as a float), into its own AVX register.
+    AVX_FLOATS m_supp_sqr[nsbin];
+    AVX_FLOATS m_kbin[nsbin];
+    for(int i=0;i<nsbin;i++) {
+        m_supp_sqr[i] = AVX_SET_FLOAT(supp_sqr[i]);
+        m_kbin[i] = AVX_SET_FLOAT((DOUBLE) i);
+    }
+
+    /* const AVX_FLOATS m_mumax = AVX_SET_FLOAT(mu_max); */
+    const DOUBLE sqr_mumax = mu_max*mu_max;
+    const DOUBLE dmu = mu_max/(DOUBLE) nmu_bins;
+    const DOUBLE inv_dmu = 1.0/dmu;
+
+    // A copy whose pointers we can advance
+    weight_struct_DOUBLE local_w0 = {.weights={NULL}, .num_weights=0},
+                         local_w1 = {.weights={NULL}, .num_weights=0};
+    pair_struct_DOUBLE pair = {.num_weights=0};
+    avx_weight_func_t_DOUBLE avx_weight_func = NULL;
+    weight_func_t_DOUBLE fallback_weight_func = NULL;
+    if(need_weightavg){
+        // Same particle list, new copy of num_weights pointers into that list
+        local_w0 = *weights0;
+        local_w1 = *weights1;
+
+        pair.num_weights = local_w0.num_weights;
+
+        avx_weight_func = get_avx_weight_func_by_method_DOUBLE(weight_method);
+        fallback_weight_func = get_weight_func_by_method_DOUBLE(weight_method);
+    }
+    // n_off counts the leading set-1 particles skipped so far (z1 is advanced by the same amount); prev_j mirrors it when same_cell != 1.
+    int64_t prev_j = 0, n_off = 0;
+    for(int64_t i=0;i<N0;i++) {
+        const DOUBLE xpos = *x0++ + off_xwrap;
+        const DOUBLE ypos = *y0++ + off_ywrap;
+        const DOUBLE zpos = *z0++ + off_zwrap;
+        for(int w = 0; w < pair.num_weights; w++){
+            // local_w0.weights[w] is a pointer to a float in the particle list of weights,
+            // just as x0 is a pointer into the list of x-positions.
+            // The advancement of the local_w0.weights[w] pointer should always mirror x0.
+            pair.weights0[w].a = AVX_SET_FLOAT(*(local_w0.weights[w])++);
+        }
+        // Choose the first set-1 particle (index j) that particle i is paired with.
+        int64_t j;
+        if(same_cell == 1) {
+            z1++; n_off++;
+            j = i+1; // start just past particle i
+        } else {
+            for(;prev_j<N1;prev_j++) {
+                const DOUBLE dz = *z1 - zpos;
+                if(dz > -pimax) break; // first set-1 particle that is not pimax or more below zpos
+                z1++; n_off++;
+            }
+            if(prev_j == N1) { // every set-1 particle has been skipped: end the i-loop
+                i = N0;
+                break;
+            }
+            j = prev_j;
+        }
+        DOUBLE *localz1 = z1;
+        DOUBLE *localx1 = x1 + n_off;
+        DOUBLE *localy1 = y1 + n_off;
+        for(int w = 0; w < local_w1.num_weights; w++){
+            local_w1.weights[w] = weights1->weights[w] + n_off;
+        }
+        // Vectorised loop: AVX_NVEC set-1 particles per iteration.
+        for(;j<=(N1 - AVX_NVEC);j+=AVX_NVEC) {
+            const AVX_FLOATS m_xpos    = AVX_SET_FLOAT(xpos);
+            const AVX_FLOATS m_ypos    = AVX_SET_FLOAT(ypos);
+            const AVX_FLOATS m_zpos    = AVX_SET_FLOAT(zpos);
+
+            union int8 {
+                AVX_INTS m_ibin;
+                int ibin[AVX_NVEC];
+            };
+            union int8 union_finalbin;
+            union float8{
+                AVX_FLOATS m_Dperp;
+                DOUBLE Dperp[AVX_NVEC];
+            };
+            union float8 union_mDperp;
+
+
+            const AVX_FLOATS m_x1 = AVX_LOAD_FLOATS_UNALIGNED(localx1);
+            const AVX_FLOATS m_y1 = AVX_LOAD_FLOATS_UNALIGNED(localy1);
+            const AVX_FLOATS m_z1 = AVX_LOAD_FLOATS_UNALIGNED(localz1);
+
+            localx1 += AVX_NVEC;//this might actually exceed the allocated range but we will never dereference that
+            localy1 += AVX_NVEC;
+            localz1 += AVX_NVEC;
+
+            for(int w = 0; w < pair.num_weights; w++){
+                pair.weights1[w].a = AVX_LOAD_FLOATS_UNALIGNED(local_w1.weights[w]);
+                local_w1.weights[w] += AVX_NVEC;
+            }
+
+            union float8_weights{
+                AVX_FLOATS m_weights;
+                DOUBLE weights[AVX_NVEC]; // one slot per AVX lane, same width as the other unions in this loop
+            };
+            union float8_weights union_mweight;
+
+            const AVX_FLOATS m_pimax = AVX_SET_FLOAT((DOUBLE) pimax);
+            const AVX_FLOATS m_sqr_smax = m_supp_sqr[nsbin-1]; // vector path takes the s-range from the bin edges,
+            const AVX_FLOATS m_sqr_smin = m_supp_sqr[0];       // not from the sqr_smax/sqr_smin arguments
+            const AVX_FLOATS m_inv_dmu    = AVX_SET_FLOAT(inv_dmu);
+            const AVX_FLOATS m_sqr_mumax = AVX_SET_FLOAT(sqr_mumax);
+
+            const AVX_FLOATS m_zero = AVX_SET_FLOAT(ZERO);
+            const AVX_FLOATS m_nmu_bins     = AVX_SET_FLOAT((DOUBLE) nmu_bins);
+            const AVX_FLOATS m_one    = AVX_SET_FLOAT((DOUBLE) 1);
+
+            const AVX_FLOATS m_xdiff = AVX_SUBTRACT_FLOATS(m_x1, m_xpos);  //(x[j] - x0)
+            const AVX_FLOATS m_ydiff = AVX_SUBTRACT_FLOATS(m_y1, m_ypos);  //(y[j] - y0)
+            AVX_FLOATS m_zdiff = AVX_SUBTRACT_FLOATS(m_z1, m_zpos);  //z2[j:j+NVEC-1] - z1
+
+            const AVX_FLOATS m_sqr_xdiff = AVX_SQUARE_FLOAT(m_xdiff);  //(x0 - x[j])^2
+            const AVX_FLOATS m_sqr_ydiff = AVX_SQUARE_FLOAT(m_ydiff);  //(y0 - y[j])^2
+            const AVX_FLOATS m_sqr_zdiff = AVX_SQUARE_FLOAT(m_zdiff);  //(z0 - z[j])^2
+
+            AVX_FLOATS s2  = AVX_ADD_FLOATS(m_sqr_zdiff, AVX_ADD_FLOATS(m_sqr_xdiff, m_sqr_ydiff));//s^2 = dz^2 + dx^2 + dy^2
+            m_zdiff = AVX_MAX_FLOATS(m_zdiff,AVX_SUBTRACT_FLOATS(m_zero,m_zdiff));//dz = fabs(dz) => dz = max(dz, -dz);
+
+            AVX_FLOATS m_mask_left;
+            AVX_FLOATS max_sqr_dz = AVX_MULTIPLY_FLOATS(s2, m_sqr_mumax);
+
+            //Do all the distance cuts using masks here in new scope
+            {
+                //the z2 arrays are sorted in increasing order. which means
+                //the z2 value will increase in any future iteration of j.
+                //that implies the zdiff values are also monotonically increasing
+                //Therefore, if none of the zdiff values are less than pimax, then
+                //no future iteration in j can produce a zdiff value less than pimax.
+                AVX_FLOATS m_mask_pimax = AVX_COMPARE_FLOATS(m_zdiff,m_pimax,_CMP_LT_OS);
+                if(AVX_TEST_COMPARISON(m_mask_pimax) == 0) {
+                    j=N1;
+                    break;
+                }
+
+                const AVX_FLOATS m_mu_mask = AVX_COMPARE_FLOATS(m_sqr_zdiff, max_sqr_dz, _CMP_LT_OS);
+                const AVX_FLOATS m_smax_mask = AVX_COMPARE_FLOATS(s2, m_sqr_smax, _CMP_LT_OS);//check for s2 < sqr_smax
+                const AVX_FLOATS m_smin_mask = AVX_COMPARE_FLOATS(s2, m_sqr_smin, _CMP_GE_OS);//check for s2 >= sqr_smin
+                const AVX_FLOATS m_s2_mask = AVX_BITWISE_AND(m_smax_mask,m_smin_mask);
+
+                //Create a combined mask by bitwise and of the mu mask and the s-range mask.
+                //This gives us the mask for all sqr_smin <= s2 < sqr_smax
+                // + dz^2 < s2 * mu_max^2 (i.e. mu < mu_max)
+                m_mask_left = AVX_BITWISE_AND(m_mu_mask, m_s2_mask);
+
+                //If no lane survives the cuts, continue with the next iteration of j-loop
+                if(AVX_TEST_COMPARISON(m_mask_left) == 0) {
+                    continue;
+                }
+
+            }
+
+            //There is some s2 that satisfies sqr_smin <= s2 < sqr_smax && mu < mu_max
+            s2 = AVX_BLEND_FLOATS_WITH_MASK(m_sqr_smax, s2, m_mask_left);
+            /* m_mu := sqrt(dz^2/s2) (with masked elements set to mu_max) */
+            const AVX_FLOATS m_mu = AVX_SQRT_FLOAT(AVX_BLEND_FLOATS_WITH_MASK(m_sqr_mumax, AVX_DIVIDE_FLOATS(m_sqr_zdiff, s2), m_mask_left));
+
+            if(need_savg) {
+                union_mDperp.m_Dperp = AVX_SQRT_FLOAT(s2);
+            }
+            if(need_weightavg){
+                pair.dx.a = m_xdiff;
+                pair.dy.a = m_ydiff;
+                pair.dz.a = m_zdiff;
+
+                union_mweight.m_weights = avx_weight_func(&pair);
+            }
+
+            const AVX_FLOATS m_mubin = AVX_MULTIPLY_FLOATS(m_mu,m_inv_dmu);
+            AVX_FLOATS m_sbin     = AVX_SET_FLOAT((DOUBLE) 0);
+            //AVX_FLOATS m_all_ones  = AVX_CAST_INT_TO_FLOAT(AVX_SET_INT(-1));
+            for(int kbin=nsbin-1;kbin>=1;kbin--) {
+                const AVX_FLOATS m_mask_low = AVX_COMPARE_FLOATS(s2,m_supp_sqr[kbin-1],_CMP_GE_OS);
+                const AVX_FLOATS m_bin_mask = AVX_BITWISE_AND(m_mask_low,m_mask_left);
+                m_sbin = AVX_BLEND_FLOATS_WITH_MASK(m_sbin,m_kbin[kbin], m_bin_mask);
+                m_mask_left = AVX_COMPARE_FLOATS(s2, m_supp_sqr[kbin-1],_CMP_LT_OS);
+                //m_mask_left = AVX_XOR_FLOATS(m_mask_low, m_all_ones);//XOR with 0xFFFF... gives the bins that are smaller than m_supp_sqr[kbin] (and is faster than cmp_p(s/d) in theory)
+                const int test = AVX_TEST_COMPARISON(m_mask_left);
+                if(test==0) {
+                    break;
+                }
+            }
+            const AVX_FLOATS m_nmu_bins_p1 = AVX_ADD_FLOATS(m_nmu_bins,m_one);
+            const AVX_FLOATS m_binproduct = AVX_ADD_FLOATS(AVX_MULTIPLY_FLOATS(m_sbin,m_nmu_bins_p1),m_mubin);
+            union_finalbin.m_ibin = AVX_TRUNCATE_FLOAT_TO_INT(m_binproduct);
+
+            //update the histograms (NOTE(review): lanes that failed the cuts appear to keep m_sbin == 0 and are tallied in row 0 -- confirm against the blend macro)
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+#pragma unroll(AVX_NVEC)
+#endif
+            for(int jj=0;jj<AVX_NVEC;jj++) {
+                int ibin = union_finalbin.ibin[jj];
+                npairs[ibin]++;
+                if(need_savg) {
+                    savg[ibin] += union_mDperp.Dperp[jj];
+                }
+                if(need_weightavg){
+                    const DOUBLE weight = union_mweight.weights[jj];
+                    weightavg[ibin] += weight;
+                }
+            }
+        }
+
+
+        //remainder loop: scalar pass over the set-1 particles the vector loop left over
+        for(;j<N1;j++){
+            const DOUBLE dz = FABS(*localz1++ - zpos);
+            const DOUBLE dx = *localx1++ - xpos;
+            const DOUBLE dy = *localy1++ - ypos;
+            for(int w = 0; w < pair.num_weights; w++){
+                pair.weights1[w].d = *local_w1.weights[w]++;
+            }
+
+            if(dz >= pimax) {
+                break;
+            }
+
+            const DOUBLE sqr_dx_dy = dx*dx + dy*dy;
+            const DOUBLE sqr_dz = dz*dz;
+            const DOUBLE s2 =  sqr_dx_dy + sqr_dz;
+            if(s2 >= sqr_smax || s2 < sqr_smin)
+                continue;
+            if(sqr_dz >= s2 * sqr_mumax) continue;
+            const DOUBLE mu = SQRT(sqr_dz/s2);
+
+            DOUBLE s = ZERO, pairweight = ZERO; // only assigned (and only read) when the matching need_* flag is set
+            if(need_savg) {
+                s = SQRT(s2);
+            }
+            if(need_weightavg){
+                pair.dx.d = dx;
+                pair.dy.d = dy;
+                pair.dz.d = dz;
+                pairweight = fallback_weight_func(&pair);
+            }
+
+            int mu_bin = (int) (mu*inv_dmu);
+            mu_bin = mu_bin > nmu_bins ? nmu_bins:mu_bin;
+            for(int kbin=nsbin-1;kbin>=1;kbin--) {
+                if(s2 >= supp_sqr[kbin-1]) {
+                    const int ibin = kbin*(nmu_bins+1) + mu_bin;
+                    npairs[ibin]++;
+                    if(need_savg) {
+                        savg[ibin] += s;
+                    }
+                    if(need_weightavg){
+                        weightavg[ibin] += pairweight;
+                    }
+                    break;
+                }
+            }
+        }//remainder loop over second set of particles
+    }//loop over first set of particles
+    // Add the local histograms to the caller's running totals.
+    for(int64_t i=0;i<totnbins;i++) {
+        src_npairs[i] += npairs[i];
+        if(need_savg) {
+            src_savg[i] += savg[i];
+        }
+        if(need_weightavg) {
+            src_weightavg[i] += weightavg[i];
+        }
+    }
+
+    return EXIT_SUCCESS;
+}
+#endif //__AVX__
+
+
+
+#if defined (__SSE4_2__)
+#include "sse_calls.h"
+/* SSE kernel: adds the (s, mu)-binned pair counts between particle sets 0 and 1 -- plus summed separations / pair weights when src_savg / src_weightavg are non-NULL -- to the src_* arrays. Returns EXIT_SUCCESS, or EXIT_FAILURE when src_npairs is NULL. */
+static inline int countpairs_s_mu_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0,
+                                                        const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, const int same_cell, // same_cell == 1: only pairs with j > i are visited
+                                                        const DOUBLE sqr_smax, const DOUBLE sqr_smin, const int nsbin, const int nmu_bins, // sqr_smin/sqr_smax are read by the scalar remainder loop only
+                                                        const DOUBLE *supp_sqr, const DOUBLE mu_max, const DOUBLE pimax, // supp_sqr: nsbin squared s-bin edges
+                                                        const DOUBLE off_xwrap, const DOUBLE off_ywrap, const DOUBLE off_zwrap, // added to every set-0 position
+                                                        DOUBLE *src_savg, uint64_t *src_npairs, // src_savg may be NULL; src_npairs must not be
+                                                        DOUBLE *src_weightavg, const weight_method_t weight_method) // src_weightavg may be NULL
+{
+    if(N0 == 0 || N1 == 0) {
+        return EXIT_SUCCESS;
+    }
+    // src_npairs is the only mandatory output array.
+    if(src_npairs == NULL) {
+        return EXIT_FAILURE;
+    }
+    // A NULL src_savg / src_weightavg switches the corresponding accumulation off.
+    const int32_t need_savg = src_savg != NULL;
+    const int32_t need_weightavg = src_weightavg != NULL;
+    const int64_t totnbins = (nmu_bins+1) * (nsbin+1); // (nmu_bins+1) slots per s-bin row
+    uint64_t npairs[totnbins];
+    DOUBLE savg[totnbins], weightavg[totnbins];
+    for(int64_t i=0;i<totnbins;i++) {
+        npairs[i] = 0;
+        if (need_savg) {
+            savg[i] = ZERO;
+        }
+        if(need_weightavg){
+            weightavg[i] = ZERO;
+        }
+    }
+    // Broadcast every s-bin number (as a float), and every squared s-bin edge, into its own SSE register.
+    SSE_FLOATS m_kbin[nsbin];
+    SSE_FLOATS m_supp_sqr[nsbin];
+    for(int i=0;i<nsbin;i++) {
+        m_kbin[i] = SSE_SET_FLOAT((DOUBLE) i);
+        m_supp_sqr[i] = SSE_SET_FLOAT(supp_sqr[i]);
+    }
+
+    const DOUBLE sqr_mumax = mu_max*mu_max;
+    const DOUBLE dmu = mu_max/(DOUBLE) nmu_bins;
+    const DOUBLE inv_dmu = 1.0/dmu;
+
+
+    // A copy whose pointers we can advance
+    weight_struct_DOUBLE local_w0 = {.weights={NULL}, .num_weights=0},
+                         local_w1 = {.weights={NULL}, .num_weights=0};
+    pair_struct_DOUBLE pair = {.num_weights=0};
+    sse_weight_func_t_DOUBLE sse_weight_func = NULL;
+    weight_func_t_DOUBLE fallback_weight_func = NULL;
+    if(need_weightavg){
+      // Same particle list, new copy of num_weights pointers into that list
+      local_w0 = *weights0;
+      local_w1 = *weights1;
+
+      pair.num_weights = local_w0.num_weights;
+
+      sse_weight_func = get_sse_weight_func_by_method_DOUBLE(weight_method);
+      fallback_weight_func = get_weight_func_by_method_DOUBLE(weight_method);
+    }
+    // n_off counts the leading set-1 particles skipped so far (z1 is advanced by the same amount); prev_j mirrors it when same_cell != 1.
+    int64_t prev_j = 0, n_off = 0;
+    for(int64_t i=0;i<N0;i++) {
+        const DOUBLE xpos = *x0++ + off_xwrap;
+        const DOUBLE ypos = *y0++ + off_ywrap;
+        const DOUBLE zpos = *z0++ + off_zwrap;
+        for(int w = 0; w < pair.num_weights; w++){
+            // local_w0.weights[w] is a pointer to a float in the particle list of weights,
+            // just as x0 is a pointer into the list of x-positions.
+            // The advancement of the local_w0.weights[w] pointer should always mirror x0.
+            pair.weights0[w].s = SSE_SET_FLOAT(*local_w0.weights[w]++);
+        }
+        // Choose the first set-1 particle (index j) that particle i is paired with.
+        int64_t j;
+        if(same_cell == 1) {
+            z1++; n_off++;
+            j = i+1; // start just past particle i
+        } else {
+            for(;prev_j<N1;prev_j++) {
+                const DOUBLE dz = *z1 - zpos;
+                if(dz > -pimax) break; // first set-1 particle that is not pimax or more below zpos
+                z1++; n_off++;
+            }
+            if(prev_j == N1) { // every set-1 particle has been skipped: end the i-loop
+                i = N0;
+                break;
+            }
+            j = prev_j;
+        }
+        DOUBLE *localz1 = z1;
+        DOUBLE *localx1 = x1 + n_off;
+        DOUBLE *localy1 = y1 + n_off;
+        for(int w = 0; w < local_w1.num_weights; w++){
+            local_w1.weights[w] = weights1->weights[w] + n_off;
+        }
+        // Vectorised loop: SSE_NVEC set-1 particles per iteration.
+        for(;j<=(N1 - SSE_NVEC);j+=SSE_NVEC){
+
+            union int4{
+                SSE_INTS m_ibin;
+                int ibin[SSE_NVEC];
+            };
+            union int4 union_finalbin;
+
+            union float4{
+                SSE_FLOATS m_Dperp;
+                DOUBLE Dperp[SSE_NVEC];
+            };
+            union float4 union_mDperp;
+
+            const SSE_FLOATS m_xpos = SSE_SET_FLOAT(xpos);
+            const SSE_FLOATS m_ypos = SSE_SET_FLOAT(ypos);
+            const SSE_FLOATS m_zpos = SSE_SET_FLOAT(zpos);
+
+            const SSE_FLOATS m_x1 = SSE_LOAD_FLOATS_UNALIGNED(localx1);
+            const SSE_FLOATS m_y1 = SSE_LOAD_FLOATS_UNALIGNED(localy1);
+            const SSE_FLOATS m_z1 = SSE_LOAD_FLOATS_UNALIGNED(localz1);
+
+            localx1 += SSE_NVEC;
+            localy1 += SSE_NVEC;
+            localz1 += SSE_NVEC;
+
+            for(int w = 0; w < pair.num_weights; w++){
+                pair.weights1[w].s = SSE_LOAD_FLOATS_UNALIGNED(local_w1.weights[w]);
+                local_w1.weights[w] += SSE_NVEC;
+            }
+
+            union float4_weights{
+                SSE_FLOATS m_weights;
+                DOUBLE weights[SSE_NVEC];
+            };
+            union float4_weights union_mweight;
+
+            const SSE_FLOATS m_pimax = SSE_SET_FLOAT((DOUBLE) pimax);
+            const SSE_FLOATS m_sqr_smax = m_supp_sqr[nsbin-1]; // vector path takes the s-range from the bin edges,
+            const SSE_FLOATS m_sqr_smin = m_supp_sqr[0];       // not from the sqr_smax/sqr_smin arguments
+            const SSE_FLOATS m_sqr_mumax = SSE_SET_FLOAT(sqr_mumax);
+            const SSE_FLOATS m_inv_dmu    = SSE_SET_FLOAT(inv_dmu);
+            const SSE_FLOATS m_zero = SSE_SET_FLOAT(ZERO);
+            const SSE_FLOATS m_nmu_bins     = SSE_SET_FLOAT((DOUBLE) nmu_bins);
+            const SSE_FLOATS m_one    = SSE_SET_FLOAT((DOUBLE) 1);
+
+            const SSE_FLOATS m_xdiff = SSE_SUBTRACT_FLOATS(m_x1, m_xpos);  //(x[j] - x0)
+            const SSE_FLOATS m_ydiff = SSE_SUBTRACT_FLOATS(m_y1, m_ypos);  //(y[j] - y0)
+            SSE_FLOATS m_zdiff = SSE_SUBTRACT_FLOATS(m_z1, m_zpos);  //z2[j:j+NVEC-1] - z1
+
+            const SSE_FLOATS m_sqr_xdiff = SSE_SQUARE_FLOAT(m_xdiff);
+            const SSE_FLOATS m_sqr_ydiff = SSE_SQUARE_FLOAT(m_ydiff);
+            const SSE_FLOATS m_sqr_zdiff = SSE_SQUARE_FLOAT(m_zdiff);
+
+            SSE_FLOATS s2  = SSE_ADD_FLOATS(m_sqr_zdiff, SSE_ADD_FLOATS(m_sqr_xdiff, m_sqr_ydiff));//s^2 = dx^2 + dy^2 + dz^2
+            m_zdiff = SSE_MAX_FLOATS(m_zdiff,SSE_SUBTRACT_FLOATS(m_zero,m_zdiff));//dz = fabs(dz) => dz = max(dz, -dz);
+
+            SSE_FLOATS m_mask_left;
+            SSE_FLOATS max_sqr_dz = SSE_MULTIPLY_FLOATS(s2, m_sqr_mumax);
+
+            //Do all the distance cuts using masks here in new scope
+            {
+                //the z2 arrays are sorted in increasing order. which means
+                //the z2 value will increase in any future iteration of j.
+                //that implies the zdiff values are also monotonically increasing
+                //Therefore, if none of the zdiff values are less than pimax, then
+                //no future iteration in j can produce a zdiff value less than pimax.
+                SSE_FLOATS m_mask_pimax = SSE_COMPARE_FLOATS_LT(m_zdiff,m_pimax);
+                if(SSE_TEST_COMPARISON(m_mask_pimax) == 0) {
+                    j=N1;
+                    break;
+                }
+
+                const SSE_FLOATS m_mu_mask = SSE_COMPARE_FLOATS_LT(m_sqr_zdiff, max_sqr_dz);
+                const SSE_FLOATS m_smax_mask = SSE_COMPARE_FLOATS_LT(s2, m_sqr_smax);
+                const SSE_FLOATS m_smin_mask = SSE_COMPARE_FLOATS_GE(s2, m_sqr_smin);
+                const SSE_FLOATS m_s2_mask = SSE_BITWISE_AND(m_smax_mask,m_smin_mask);
+
+                //Create a combined mask by bitwise and of the mu mask and the s-range mask.
+                //This gives us the mask for all sqr_smin <= s2 < sqr_smax
+                // + dz^2 < s2 * mu_max^2 (i.e. mu < mu_max)
+                m_mask_left = SSE_BITWISE_AND(m_mu_mask, m_s2_mask);
+
+                //If no lane survives the cuts, continue with the next iteration of j-loop
+                if(SSE_TEST_COMPARISON(m_mask_left) == 0) {
+                    continue;
+                }
+
+            }
+
+            //There is some s2 that satisfies sqr_smin <= s2 < sqr_smax && mu < mu_max
+            s2 = SSE_BLEND_FLOATS_WITH_MASK(m_sqr_smax, s2, m_mask_left);
+            const SSE_FLOATS m_mu = SSE_SQRT_FLOAT(SSE_BLEND_FLOATS_WITH_MASK(m_sqr_mumax, SSE_DIVIDE_FLOATS(m_sqr_zdiff, s2), m_mask_left));
+
+            if(need_savg) {
+                union_mDperp.m_Dperp = SSE_SQRT_FLOAT(s2);
+            }
+            if(need_weightavg){
+                pair.dx.s = m_xdiff;
+                pair.dy.s = m_ydiff;
+                pair.dz.s = m_zdiff;
+
+                union_mweight.m_weights = sse_weight_func(&pair);
+            }
+
+            const SSE_FLOATS m_mubin = SSE_MULTIPLY_FLOATS(m_mu,m_inv_dmu);
+            SSE_FLOATS m_sbin     = SSE_SET_FLOAT((DOUBLE) 0);
+            //SSE_FLOATS m_all_ones  = SSE_CAST_INT_TO_FLOAT(SSE_SET_INT(-1));
+            for(int kbin=nsbin-1;kbin>=1;kbin--) {
+                const SSE_FLOATS m_mask_low = SSE_COMPARE_FLOATS_GE(s2,m_supp_sqr[kbin-1]);
+                const SSE_FLOATS m_bin_mask = SSE_BITWISE_AND(m_mask_low,m_mask_left);
+                m_sbin = SSE_BLEND_FLOATS_WITH_MASK(m_sbin,m_kbin[kbin], m_bin_mask);
+                m_mask_left = SSE_COMPARE_FLOATS_LT(s2, m_supp_sqr[kbin-1]);
+                //XOR with 0xFFFF... gives the bins that are smaller than m_supp_sqr[kbin] (and is faster than cmp_p(s/d) in theory)
+                //m_mask_left = SSE_XOR_FLOATS(m_mask_low, m_all_ones);
+                const int test = SSE_TEST_COMPARISON(m_mask_left);
+                if(test==0) {
+                    break;
+                }
+            }
+            const SSE_FLOATS m_nmu_bins_p1 = SSE_ADD_FLOATS(m_nmu_bins,m_one);
+            const SSE_FLOATS m_binproduct = SSE_ADD_FLOATS(SSE_MULTIPLY_FLOATS(m_sbin,m_nmu_bins_p1),m_mubin);
+            union_finalbin.m_ibin = SSE_TRUNCATE_FLOAT_TO_INT(m_binproduct);
+
+            //update the histograms (NOTE(review): lanes that failed the cuts appear to keep m_sbin == 0 and are tallied in row 0 -- confirm against the blend macro)
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+#pragma unroll(SSE_NVEC)
+#endif
+            for(int jj=0;jj<SSE_NVEC;jj++) {
+                int ibin = union_finalbin.ibin[jj];
+                npairs[ibin]++;
+                if(need_savg) {
+                    savg[ibin] += union_mDperp.Dperp[jj];
+                }
+                if(need_weightavg){
+                    const DOUBLE weight = union_mweight.weights[jj];
+                    weightavg[ibin] += weight;
+                }
+            }
+        }
+
+        //remainder loop: scalar pass over the set-1 particles the vector loop left over
+        for(;j<N1;j++) {
+            const DOUBLE dx = *localx1++ - xpos;
+            const DOUBLE dy = *localy1++ - ypos;
+            const DOUBLE dz = FABS(*localz1++ - zpos);
+            for(int w = 0; w < pair.num_weights; w++){
+                pair.weights1[w].d = *local_w1.weights[w]++;
+            }
+
+            if(dz >= pimax) break;
+
+            const DOUBLE sqr_dx_dy = dx*dx + dy*dy;
+            const DOUBLE sqr_dz = dz*dz;
+            const DOUBLE s2 =  sqr_dx_dy + sqr_dz;
+            if(s2 >= sqr_smax || s2 < sqr_smin)
+                continue;
+            if(sqr_dz >= s2 * sqr_mumax) continue;
+            const DOUBLE mu = SQRT(sqr_dz/s2);
+
+            DOUBLE s = ZERO, pairweight = ZERO; // only assigned (and only read) when the matching need_* flag is set
+            if(need_weightavg){
+                pair.dx.d = dx;
+                pair.dy.d = dy;
+                pair.dz.d = dz;
+                pairweight = fallback_weight_func(&pair);
+            }
+
+            if(need_savg) {
+                s = SQRT(s2);
+            }
+
+            int mu_bin = (int) (mu*inv_dmu);
+            mu_bin = mu_bin > nmu_bins ? nmu_bins:mu_bin;
+            for(int kbin=nsbin-1;kbin>=1;kbin--) {
+                if(s2 >= supp_sqr[kbin-1]) {
+                    const int ibin = kbin*(nmu_bins+1) + mu_bin;
+                    npairs[ibin]++;
+                    if(need_savg) {
+                        savg[ibin] += s;
+                    }
+                    if(need_weightavg){
+                        weightavg[ibin] += pairweight;
+                    }
+                    break;
+                }
+            }//searching for kbin
+        }
+    }
+    // Add the local histograms to the caller's running totals.
+    for(int64_t i=0;i<totnbins;i++) {
+        src_npairs[i] += npairs[i];
+        if(need_savg) {
+            src_savg[i] += savg[i];
+        }
+        if(need_weightavg) {
+            src_weightavg[i] += weightavg[i];
+        }
+    }
+
+    return EXIT_SUCCESS;
+}
+#endif //__SSE4_2__
+
+/* Scalar kernel (no SIMD intrinsics): adds the (s, mu)-binned pair counts between particle sets 0 and 1 -- plus summed separations / pair weights when src_savg / src_weightavg are non-NULL -- to the src_* arrays. Returns EXIT_SUCCESS, or EXIT_FAILURE when src_npairs is NULL. */
+static inline int countpairs_s_mu_fallback_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0,
+                                                  const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1,
+                                                  const int same_cell, // same_cell == 1: particle i is paired only with set-1 particles after position i
+                                                  const DOUBLE sqr_smax, const DOUBLE sqr_smin, const int nsbin, const int nmu_bins, // pairs need sqr_smin <= s^2 < sqr_smax
+                                                  const DOUBLE *supp_sqr, const DOUBLE mu_max, const DOUBLE pimax, // supp_sqr: squared s-bin edges; pairs need |dz| < pimax
+                                                  const DOUBLE off_xwrap, const DOUBLE off_ywrap, const DOUBLE off_zwrap, // added to every set-0 position
+                                                  DOUBLE *src_savg, uint64_t *src_npairs, // src_savg may be NULL; src_npairs must not be
+                                                  DOUBLE *src_weightavg, const weight_method_t weight_method) // src_weightavg may be NULL
+{
+
+    if(N0 == 0 || N1 == 0) {
+        return EXIT_SUCCESS;
+    }
+    // src_npairs is the only mandatory output array.
+    if(src_npairs == NULL) {
+        return EXIT_FAILURE;
+    }
+
+    /*----------------- FALLBACK CODE --------------------*/
+    const int32_t need_savg = src_savg != NULL;
+    const int32_t need_weightavg = src_weightavg != NULL;
+    const int64_t totnbins = (nmu_bins+1)*(nsbin+1); // (nmu_bins+1) slots per s-bin row
+    uint64_t npairs[totnbins];
+    DOUBLE savg[totnbins], weightavg[totnbins];
+    for(int64_t i=0;i<totnbins;i++) {
+        npairs[i] = 0;
+        if(need_savg) {
+            savg[i]=ZERO;
+        }
+        if(need_weightavg){
+            weightavg[i]=ZERO;
+        }
+    }
+
+    // A copy whose pointers we can advance
+    weight_struct_DOUBLE local_w0 = {.weights={NULL}, .num_weights=0},
+                         local_w1 = {.weights={NULL}, .num_weights=0};
+    pair_struct_DOUBLE pair = {.num_weights=0};
+    weight_func_t_DOUBLE weight_func = NULL;
+    if(need_weightavg){
+        // Same particle list, new copy of num_weights pointers into that list
+        local_w0 = *weights0;
+        local_w1 = *weights1;
+
+        pair.num_weights = local_w0.num_weights;
+
+        weight_func = get_weight_func_by_method_DOUBLE(weight_method);
+    }
+
+    // mu is binned linearly: mu_bin = (int) (mu / dmu), with dmu = mu_max / nmu_bins.
+    const DOUBLE dmu = mu_max/nmu_bins;
+    const DOUBLE inv_dmu = 1.0/dmu;
+    const DOUBLE sqr_mu_max = mu_max * mu_max;
+
+    /* naive implementation that is guaranteed to compile */
+    int64_t nleft=N1, n_off = 0; // nleft: set-1 particles still in play; n_off: set-1 particles already skipped
+    for(int64_t i=0;i<N0;i++) {
+        const DOUBLE xpos = *x0++ + off_xwrap;
+        const DOUBLE ypos = *y0++ + off_ywrap;
+        const DOUBLE zpos = *z0++ + off_zwrap;
+        for(int w = 0; w < pair.num_weights; w++){
+            pair.weights0[w].d = *local_w0.weights[w]++;
+        }
+
+        /* If in the same cell, unique pairs are guaranteed by not including the current particle */
+        if(same_cell == 1) {
+            z1++; n_off++;
+            nleft--;
+        } else {
+            /* For a different cell, all pairs are unique pairs, since two cells are only opened for pairs once (accounted for in the assign_ngb_cells function)*/
+            while(nleft > 0) {
+                /*Particles are sorted on 'z', in increasing order */
+                const DOUBLE dz = *z1 - zpos;
+                if(dz > -pimax) break;
+                z1++; n_off++;
+                nleft--;
+            }
+            /*If no particle in the second cell satisfies distance constraints on 'dz' for the current 'i'th particle in first cell,
+              then there can be no more pairs from any particles in the first cell (since the first cell is also sorted in increasing order in 'z')
+             */
+            if(nleft == 0) {
+                i=N0;
+                break;
+            }
+        }
+        DOUBLE *localz1 = z1;
+        DOUBLE *localx1 = x1 + n_off;
+        DOUBLE *localy1 = y1 + n_off;
+        for(int w = 0; w < pair.num_weights; w++){
+            local_w1.weights[w] = weights1->weights[w] + n_off;
+        }
+        // Walk the remaining set-1 particles; the loop ends early once |dz| reaches pimax.
+        for(int64_t j=0;j<nleft;j++) {
+            const DOUBLE dx = *localx1++ - xpos;
+            const DOUBLE dy = *localy1++ - ypos;
+            const DOUBLE dz = FABS((*localz1++ - zpos));
+            for(int w = 0; w < pair.num_weights; w++){
+                pair.weights1[w].d = *local_w1.weights[w]++;
+            }
+
+            if(dz >= pimax) break;
+
+            const DOUBLE sqr_dx_dy = dx*dx + dy*dy;
+            const DOUBLE sqr_dz = dz*dz;
+            const DOUBLE s2 =  sqr_dx_dy + sqr_dz;
+            if(s2 >= sqr_smax || s2 < sqr_smin) {
+                continue;
+            }
+            // Reject mu >= mu_max without taking a square root: dz^2 >= s^2 * mu_max^2.
+            if(sqr_dz >= s2 * sqr_mu_max) {
+                continue;
+            }
+            const DOUBLE mu = SQRT(sqr_dz/s2);
+
+            DOUBLE s = ZERO, pairweight = ZERO; // only assigned (and only read) when the matching need_* flag is set
+            if(need_savg) {
+                s = SQRT(s2);
+            }
+
+            if(need_weightavg){
+                pair.dx.d = dx;
+                pair.dy.d = dy;
+                pair.dz.d = dz;
+                pairweight = weight_func(&pair);
+            }
+
+            int mu_bin = (int) (mu*inv_dmu);
+            mu_bin = mu_bin > nmu_bins ? nmu_bins:mu_bin;
+            for(int kbin=nsbin-1;kbin>=1;kbin--) { // search from the outermost s-bin inwards
+                if(s2 >= supp_sqr[kbin-1]) {
+                    const int ibin = kbin*(nmu_bins+1) + mu_bin;
+                    npairs[ibin]++;
+                    if(need_savg) {
+                        savg[ibin] += s;
+                    }
+                    if(need_weightavg){
+                        weightavg[ibin] += pairweight;
+                    }
+                    break;
+                }
+            }
+        }
+    }
+    for(int64_t i=0;i<totnbins;i++) { // add the local histograms to the caller's running totals
+        src_npairs[i] += npairs[i];
+        if(need_savg) {
+            src_savg[i] += savg[i];
+        }
+        if(need_weightavg){
+            src_weightavg[i] += weightavg[i];
+        }
+    }
+   /*----------------- FALLBACK CODE --------------------*/
+    return EXIT_SUCCESS;
+}
diff --git a/theory/Makefile b/theory/Makefile
index 92740455..5b1f3dd3 100644
--- a/theory/Makefile
+++ b/theory/Makefile
@@ -3,23 +3,23 @@ ROOT_DIR:=..
 include $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk
 
 PYTHON_EXT_DIR:=python_bindings
-TARGETS:=DD DDrppi wp xi vpf examples $(PYTHON_EXT_DIR)
+TARGETS:=DD DDrppi DDsmu wp xi vpf examples $(PYTHON_EXT_DIR)
 
 all: $(TARGETS) $(PYTHON_EXT_DIR) logbins
 
-logbins: logbins.c 
+logbins: logbins.c
 	$(CC) $(CFLAGS) $< $(CLINK) -o $@
 
-.PHONY: $(TARGETS) libs install tests distclean realclean distclena realclena clean celna 
+.PHONY: $(TARGETS) libs install tests distclean realclean distclena realclena clean celna
 
-DD DDrppi xi vpf:
+DD DDrppi DDsmu xi vpf:
 	$(MAKE) -C $@
 
 python_bindings: libs
 	$(MAKE) -C $@
 
 ### Must create the libraries first before making the examples.
-examples: examples/run_correlations 
+examples: examples/run_correlations
 
 ## Not a phony target
 examples/run_correlations: libs
@@ -29,11 +29,12 @@ distclean:realclean
 distclena:realclean
 realclena:realclean
 
-realclean: 
+realclean:
 	$(RM) $(ROOT_DIR)/bin/logbins
 	$(RM) -R *.dSYM
 	$(MAKE) -C DD distclean
 	$(MAKE) -C DDrppi distclean
+	$(MAKE) -C DDsmu distclean
 	$(MAKE) -C wp distclean
 	$(MAKE) -C xi distclean
 	$(MAKE) -C vpf distclean
@@ -45,6 +46,7 @@ clean:
 	$(RM) -R *.dSYM
 	$(MAKE) -C DD clean
 	$(MAKE) -C DDrppi clean
+	$(MAKE) -C DDsmu clean
 	$(MAKE) -C wp clean
 	$(MAKE) -C xi clean
 	$(MAKE) -C vpf clean
@@ -54,9 +56,10 @@ clean:
 	$(MAKE) -C ../io clean
 	$(MAKE) -C ../utils clean
 
-install: $(ROOT_DIR)/bin/logbins examples $(TARGETS) $(PYTHON_EXT_DIR) 
+install: $(ROOT_DIR)/bin/logbins examples $(TARGETS) $(PYTHON_EXT_DIR)
 	$(MAKE) -C DD install
 	$(MAKE) -C DDrppi install
+	$(MAKE) -C DDsmu install
 	$(MAKE) -C wp install
 	$(MAKE) -C xi install
 	$(MAKE) -C vpf install
@@ -67,15 +70,16 @@ $(ROOT_DIR)/bin/logbins: logbins
 
 lib: libs
 
-libs: 
+libs:
 	$(MAKE) -C DD lib
 	$(MAKE) -C DDrppi lib
+	$(MAKE) -C DDsmu lib
 	$(MAKE) -C wp lib
 	$(MAKE) -C xi lib
-	$(MAKE) -C vpf lib	
+	$(MAKE) -C vpf lib
 
 test: tests
-tests:  
+tests:
 	$(MAKE) -C tests
 
 include $(ROOT_DIR)/rules.mk
diff --git a/theory/examples/Makefile b/theory/examples/Makefile
index e9836c62..410c7dfa 100644
--- a/theory/examples/Makefile
+++ b/theory/examples/Makefile
@@ -8,12 +8,14 @@ DATA_DIR := ../tests/data
 THEORY_DIR := $(ROOT_DIR)/theory
 DD_DIR := $(THEORY_DIR)/DD
 DDrppi_DIR := $(THEORY_DIR)/DDrppi
+DDsmu_DIR := $(THEORY_DIR)/DDsmu
 WP_DIR := $(THEORY_DIR)/wp
 XI_DIR := $(THEORY_DIR)/xi
 VPF_DIR := $(THEORY_DIR)/vpf
 
 DD_LIB := countpairs
 DDrppi_LIB := countpairs_rp_pi
+DDsmu_LIB := countpairs_s_mu
 WP_LIB := countpairs_wp
 XI_LIB := countpairs_xi
 VPF_LIB := countspheres
@@ -23,14 +25,13 @@ include $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk
 TARGET := run_correlations
 TARGETSRC   := run_correlations.c $(IO_DIR)/ftread.c $(IO_DIR)/io.c  $(UTILS_DIR)/utils.c $(UTILS_DIR)/progressbar.c 
 TARGETOBJS  := $(TARGETSRC:.c=.o)
-C_LIBRARIES := $(DD_DIR)/lib$(DD_LIB).a $(DDrppi_DIR)/lib$(DDrppi_LIB).a $(WP_DIR)/lib$(WP_LIB).a \
+C_LIBRARIES := $(DD_DIR)/lib$(DD_LIB).a $(DDrppi_DIR)/lib$(DDrppi_LIB).a $(DDsmu_DIR)/lib$(DDsmu_LIB).a $(WP_DIR)/lib$(WP_LIB).a \
              $(XI_DIR)/lib$(XI_LIB).a $(VPF_DIR)/lib$(VPF_LIB).a
-
-INCL :=	$(DD_DIR)/$(DD_LIB).h $(DDrppi_DIR)/$(DDrppi_LIB).h $(WP_DIR)/$(WP_LIB).h \
+INCL :=	$(DD_DIR)/$(DD_LIB).h $(DDrppi_DIR)/$(DDrppi_LIB).h $(DDsmu_DIR)/$(DDsmu_LIB).h $(WP_DIR)/$(WP_LIB).h \
         $(XI_DIR)/$(XI_LIB).h $(VPF_DIR)/$(VPF_LIB).h \
         $(UTILS_DIR)/defs.h 
 
-EXTRA_INCL:= -I$(DD_DIR) -I$(DDrppi_DIR) -I$(WP_DIR) -I$(XI_DIR) -I$(VPF_DIR) $(GSL_CFLAGS)
+EXTRA_INCL:= -I$(DD_DIR) -I$(DDrppi_DIR) -I$(DDsmu_DIR) -I$(WP_DIR) -I$(XI_DIR) -I$(VPF_DIR) $(GSL_CFLAGS)
 EXTRA_LINK:= $(GSL_LINK)
 
 all: $(TARGET) $(TARGETSRC) $(INCL) $(C_LIBRARY) $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile uncompress
@@ -41,6 +42,9 @@ $(DD_DIR)/lib$(DD_LIB).a: $(DD_DIR)/*.c $(DD_DIR)/*.c.src $(DD_DIR)/*.h.src $(RO
 $(DDrppi_DIR)/lib$(DDrppi_LIB).a: $(DDrppi_DIR)/*.c $(DDrppi_DIR)/*.c.src $(DDrppi_DIR)/*.h.src $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk
 	$(MAKE) -C $(DDrppi_DIR) libs
 
+$(DDsmu_DIR)/lib$(DDsmu_LIB).a: $(DDsmu_DIR)/*.c $(DDsmu_DIR)/*.c.src $(DDsmu_DIR)/*.h.src $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk
+	$(MAKE) -C $(DDsmu_DIR) libs
+
 $(WP_DIR)/lib$(WP_LIB).a: $(WP_DIR)/*.c $(WP_DIR)/*.c.src $(WP_DIR)/*.h.src $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk
 	$(MAKE) -C $(WP_DIR) libs
 
diff --git a/theory/examples/run_correlations.c b/theory/examples/run_correlations.c
index 91f92eff..13a6286e 100644
--- a/theory/examples/run_correlations.c
+++ b/theory/examples/run_correlations.c
@@ -32,6 +32,7 @@
 */
 #include "countpairs.h"
 #include "countpairs_rp_pi.h"
+#include "countpairs_s_mu.h"
 #include "countpairs_wp.h"
 #include "countpairs_xi.h"
 
@@ -54,6 +55,8 @@ void Printhelp(void)
     fprintf(stderr,"     * binfile      = name of ascii file containing the r-bins (rmin rmax for each bin)\n") ;
     fprintf(stderr,"     * boxsize      = BoxSize (in same units as X/Y/Z of the data)\n");
     fprintf(stderr,"     * pimax        = pimax   (in same units as X/Y/Z of the data)\n");
+    fprintf(stderr,"     * mu_max       = Max. value of the cosine of the angle to the LOS (must be within [0.0, 1.0])\n");
+    fprintf(stderr,"     * nmu_bins     = Number of linear bins to create (the bins themselves range from [0.0, mu_max])\n");
 #if defined(_OPENMP)
     fprintf(stderr,"     * numthreads   = number of threads to use\n");
 #endif
@@ -69,6 +72,8 @@ int main(int argc, char **argv)
     double boxsize;
     struct timeval t0,t1;
     DOUBLE pimax;
+    DOUBLE mu_max=1.0;
+    int nmu_bins=10;
     int nthreads=1;//default to single thread
 
     struct config_options options = get_config_options();
@@ -77,9 +82,9 @@ int main(int argc, char **argv)
     options.float_type = sizeof(DOUBLE);
     
 #if defined(_OPENMP)
-    const char argnames[][30]={"file","format","binfile","boxsize","pimax","Nthreads"};
+    const char argnames[][30]={"file","format","binfile","boxsize","pimax","mu_max","nmu_bins","Nthreads"};
 #else
-    const char argnames[][30]={"file","format","binfile","boxsize","pimax"};
+    const char argnames[][30]={"file","format","binfile","boxsize","pimax","mu_max","nmu_bins"};
     nthreads = 4;
 #endif
     int nargs=sizeof(argnames)/(sizeof(char)*30);
@@ -97,8 +102,10 @@ int main(int argc, char **argv)
             my_snprintf(binfile,MAXLEN,"%s",argv[3]);
             boxsize=atof(argv[4]);
             pimax=atof(argv[5]);
+            mu_max=atof(argv[6]);
+            nmu_bins=atoi(argv[7]);
 #if defined(_OPENMP)
-            nthreads = atoi(argv[6]);
+            nthreads = atoi(argv[8]);
 #endif
         }
     } else {
@@ -107,6 +114,8 @@ int main(int argc, char **argv)
         my_snprintf(binfile, MAXLEN,"%s","../tests/bins");
         boxsize=420.0;
         pimax=40.0;
+        mu_max=1.0;
+        nmu_bins=10;
     }
 
     fprintf(stderr,ANSI_COLOR_BLUE  "Running `%s' with the parameters \n",argv[0]);
@@ -116,8 +125,10 @@ int main(int argc, char **argv)
     fprintf(stderr,"\t\t %-10s = %s \n",argnames[2],binfile);
     fprintf(stderr,"\t\t %-10s = %10.4lf\n",argnames[3],boxsize);
     fprintf(stderr,"\t\t %-10s = %10.4lf\n",argnames[4],pimax);
+    fprintf(stderr,"\t\t %-10s = %10.4lf\n",argnames[5],mu_max);
+    fprintf(stderr,"\t\t %-10s = %d\n",argnames[6],nmu_bins);
 #if defined(_OPENMP)
-    fprintf(stderr,"\t\t %-10s = %d\n",argnames[5],nthreads);
+    fprintf(stderr,"\t\t %-10s = %d\n",argnames[7],nthreads);
 #endif
     fprintf(stderr,"\t\t -------------------------------------" ANSI_COLOR_RESET "\n");
 
@@ -130,6 +141,46 @@ int main(int argc, char **argv)
     DOUBLE *z2 = z1;
     int64_t ND2 = ND1;
 
+    //Do the straight-up DD counts
+    {
+        gettimeofday(&t0,NULL);
+#if defined(_OPENMP)
+        fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent DD(r) calculation would be:\n `%s %s %s %s %s %s %d'" ANSI_COLOR_RESET "\n",
+                "../DD/DD",file,fileformat,file,fileformat,binfile,nthreads);
+#else
+        fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent DD(r) calculation would be:\n `%s %s %s %s %s %s'" ANSI_COLOR_RESET "\n",
+                "../DD/DD",file,fileformat,file,fileformat,binfile);
+#endif
+        
+        results_countpairs results;
+        int status = countpairs(ND1,x1,y1,z1,
+                                ND2,x2,y2,z2,
+                                nthreads,
+                                autocorr,
+                                binfile,
+                                &results,
+                                &options, NULL);
+        if(status != EXIT_SUCCESS) {
+            return status;
+        }
+
+        gettimeofday(&t1,NULL);
+        double pair_time = ADD_DIFF_TIME(t0,t1);
+#if 0
+        DOUBLE rlow=results.rupp[0];
+        for(int i=1;i<results.nbin;i++) {
+            fprintf(stdout,"%10"PRIu64" %20.8lf %20.8lf %20.8lf \n",results.npairs[i],results.rpavg[i],rlow,results.rupp[i]);
+            rlow=results.rupp[i];
+        }
+#endif
+        fprintf(stderr,ANSI_COLOR_GREEN "Done 3-d auto-correlation. Ngalaxies = %12"PRId64" Time taken = %8.2lf seconds " ANSI_COLOR_RESET "\n", ND1, pair_time);
+        //The results structure contains the pair-counts
+
+
+        //free the result structure
+        free_results(&results);
+    }
+
     //Do the DD(rp, pi) counts
     {
         gettimeofday(&t0,NULL);
@@ -175,46 +226,53 @@ int main(int argc, char **argv)
     }
 
 
-    //Do the straight-up DD counts
+    //Do the DD(s, mu) counts
     {
         gettimeofday(&t0,NULL);
 #if defined(_OPENMP)
-        fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent DD(r) calculation would be:\n `%s %s %s %s %s %s %d'" ANSI_COLOR_RESET "\n",
-                "../DD/DD",file,fileformat,file,fileformat,binfile,nthreads);
+        fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent DD(s,mu) calculation would be:\n `%s %s %s %s %s %s %lf %d %d'"ANSI_COLOR_RESET"\n",
+                "../DDsmu/DDsmu",file,fileformat,file,fileformat,binfile,mu_max,nmu_bins,nthreads);
 #else
-        fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent DD(r) calculation would be:\n `%s %s %s %s %s %s'" ANSI_COLOR_RESET "\n",
-                "../DD/DD",file,fileformat,file,fileformat,binfile);
+        fprintf(stderr,ANSI_COLOR_MAGENTA "Command-line for running equivalent DD(s,mu) calculation would be:\n `%s %s %s %s %s %s %lf %d'"ANSI_COLOR_RESET"\n",
+                "../DDsmu/DDsmu",file,fileformat,file,fileformat,binfile,mu_max,nmu_bins);
 #endif
-        
-        results_countpairs results;
-        int status = countpairs(ND1,x1,y1,z1,
-                                ND2,x2,y2,z2,
-                                nthreads,
-                                autocorr,
-                                binfile,
-                                &results,
-                                &options, NULL);
+
+        results_countpairs_s_mu results;
+        int status = countpairs_s_mu(ND1,x1,y1,z1,
+                                     ND2,x2,y2,z2,
+                                     nthreads,
+                                     autocorr,
+                                     binfile,
+                                     mu_max,
+                                     nmu_bins,
+                                     &results,
+                                     &options, NULL);
         if(status != EXIT_SUCCESS) {
             return status;
         }
-
+        
         gettimeofday(&t1,NULL);
         double pair_time = ADD_DIFF_TIME(t0,t1);
 #if 0
-        DOUBLE rlow=results.rupp[0];
-        for(int i=1;i<results.nbin;i++) {
-            fprintf(stdout,"%10"PRIu64" %20.8lf %20.8lf %20.8lf \n",results.npairs[i],results.rpavg[i],rlow,results.rupp[i]);
-            rlow=results.rupp[i];
+    double smin = results.supp[0];
+    const double dmu = mu_max/(double) nmu_bins;
+    for(int i=1;i<results.nsbin;i++) {
+        const double smax = results.supp[i];
+        for(int j=0;j<nmu_bins;j++) {
+            int index = i*(nmu_bins+1) + j;
+            fprintf(stdout,"%e\t%e\t%e\t%12"PRIu64"\t%e\n", smin, smax, (j+1)*dmu, results.npairs[index], results.weightavg[index]);
         }
+        smin = smax;
+    }
 #endif
-        fprintf(stderr,ANSI_COLOR_GREEN "Done 3-d auto-correlation. Ngalaxies = %12"PRId64" Time taken = %8.2lf seconds " ANSI_COLOR_RESET "\n", ND1, pair_time);
-        //The results structure contains the pair-counts
-
+        fprintf(stderr,ANSI_COLOR_GREEN "Done DD(s,mu) auto-correlation. Ngalaxies = %12"PRId64" Time taken = %8.2lf seconds "ANSI_COLOR_RESET"\n", ND1, pair_time);
 
         //free the result structure
-        free_results(&results);
+        free_results_s_mu(&results);
     }
 
+    
+    
     //Do the wp counts
     {
         gettimeofday(&t0,NULL);
diff --git a/theory/python_bindings/Makefile b/theory/python_bindings/Makefile
index 55502e48..657e1043 100644
--- a/theory/python_bindings/Makefile
+++ b/theory/python_bindings/Makefile
@@ -10,14 +10,17 @@ DD_DIR := $(THEORY_DIR)/DD
 DDrppi_DIR := $(THEORY_DIR)/DDrppi
 WP_DIR := $(THEORY_DIR)/wp
 XI_DIR := $(THEORY_DIR)/xi
+DDSMU_DIR := $(THEORY_DIR)/DDsmu
 VPF_DIR := $(THEORY_DIR)/vpf
 
 DD_LIB := countpairs
 DDrppi_LIB := countpairs_rp_pi
 WP_LIB := countpairs_wp
 XI_LIB := countpairs_xi
+DDSMU_LIB := countpairs_s_mu
 VPF_LIB := countspheres
 
+
 include $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk
 
 PROJECT := _countpairs
@@ -25,14 +28,14 @@ SOURCES := $(PROJECT).c
 OBJECTS := $(SOURCES:.c=.o)
 PYTHON_EXTN := $(PROJECT).so.$(MAJOR).$(MINOR).$(PATCHLEVEL)
 C_LIBRARIES := $(DD_DIR)/lib$(DD_LIB).a $(DDrppi_DIR)/lib$(DDrppi_LIB).a $(WP_DIR)/lib$(WP_LIB).a \
-             $(XI_DIR)/lib$(XI_LIB).a $(VPF_DIR)/lib$(VPF_LIB).a
+             $(XI_DIR)/lib$(XI_LIB).a $(DDSMU_DIR)/lib$(DDSMU_LIB).a $(VPF_DIR)/lib$(VPF_LIB).a
 INCL := $(DD_DIR)/$(DD_LIB).h $(DDrppi_DIR)/$(DDrppi_LIB).h $(WP_DIR)/$(WP_LIB).h \
-        $(XI_DIR)/$(XI_LIB).h $(VPF_DIR)/$(VPF_LIB).h \
+        $(XI_DIR)/$(XI_LIB).h $(DDSMU_DIR)/$(DDSMU_LIB).h $(VPF_DIR)/$(VPF_LIB).h \
         $(UTILS_DIR)/defs.h $(IO_DIR)/io.h $(IO_DIR)/ftread.h \
         $(UTILS_DIR)/utils.h \
 	$(UTILS_DIR)/function_precision.h $(UTILS_DIR)/progressbar.h \
         $(UTILS_DIR)/cpu_features.h $(UTILS_DIR)/macros.h
-LIB_INCLUDE:=-I$(DD_DIR) -I$(DDrppi_DIR) -I$(WP_DIR) -I$(XI_DIR) -I$(VPF_DIR)
+LIB_INCLUDE:=-I$(DD_DIR) -I$(DDrppi_DIR) -I$(WP_DIR) -I$(XI_DIR) -I$(DDSMU_DIR) -I$(VPF_DIR)
 
 
 all: sharedlib $(SOURCES) $(C_LIBRARIES) $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile 
@@ -60,6 +63,9 @@ $(WP_DIR)/lib$(WP_LIB).a: $(WP_DIR)/*.c $(WP_DIR)/*.c.src $(WP_DIR)/*.h.src $(RO
 $(XI_DIR)/lib$(XI_LIB).a: $(XI_DIR)/*.c $(XI_DIR)/*.c.src $(XI_DIR)/*.h.src $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk
 	$(MAKE) -C $(XI_DIR) libs
 
+$(DDSMU_DIR)/lib$(DDSMU_LIB).a: $(DDSMU_DIR)/*.c $(DDSMU_DIR)/*.c.src $(DDSMU_DIR)/*.h.src $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk
+	$(MAKE) -C $(DDSMU_DIR) libs
+
 $(VPF_DIR)/lib$(VPF_LIB).a: $(VPF_DIR)/*.c $(VPF_DIR)/*.c.src $(VPF_DIR)/*.h.src $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk
 	$(MAKE) -C $(VPF_DIR) libs
 
diff --git a/theory/python_bindings/_countpairs.c b/theory/python_bindings/_countpairs.c
index 38a2101f..38debd15 100644
--- a/theory/python_bindings/_countpairs.c
+++ b/theory/python_bindings/_countpairs.c
@@ -22,6 +22,7 @@
 #include "countpairs_rp_pi.h"
 #include "countpairs_wp.h"
 #include "countpairs_xi.h"
+#include "countpairs_s_mu.h"
 
 //for the vpf
 #include "countspheres.h"
@@ -63,6 +64,7 @@ static char module_docstring[]             =    "Python extensions for calculati
     "countpairs_rp_pi : Calculate the 2-D DD("RP_CHAR","PI_CHAR") auto/cross-correlation function given two sets of arrays with Cartesian XYZ positions.\n"
     "countpairs_wp    : Calculate the projected auto-correlation function wp (assumes PERIODIC) given one set of arrays with Cartesian XYZ positions\n"
     "countpairs_xi    : Calculate the 3-d auto-correlation function xi (assumes PERIODIC) given one set of arrays with Cartesian XYZ positions\n"
+    "countpairs_s_mu  : Calculate the 2-D DD(s,"MU_CHAR") auto/cross-correlation function given two sets of arrays with Cartesian XYZ positions.\n"
     "countpairs_vpf   : Calculate the counts-in-spheres given one set of arrays with Cartesian XYZ positions\n"
     "\n"
     "See `Corrfunc/call_correlation_functions.py` for example calls to each function in the extension.\n";
@@ -74,6 +76,7 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
 static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyObject *kwargs);
 static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObject *kwargs);
 static PyObject *countpairs_countpairs_xi(PyObject *self, PyObject *args, PyObject *kwargs);
+static PyObject *countpairs_countpairs_s_mu(PyObject *self, PyObject *args, PyObject *kwargs);
 static PyObject *countpairs_countspheres_vpf(PyObject *self, PyObject *args, PyObject *kwargs);
 static PyObject *countpairs_error_out(PyObject *module, const char *msg);
 
@@ -90,7 +93,7 @@ static PyMethodDef module_methods[] = {
      "Calculate the 3-D pair-counts, "XI_CHAR"(r), auto/cross-correlation \n"
      "function given two sets of points represented by X1/Y1/Z1 and X2/Y2/Z2 \n"
      "arrays.\n\n"
-     
+
      "Note, that this module only returns pair counts and not the actual \n"
      "correlation function "XI_CHAR"(r). See the mocks/wtheta/wtheta.c for \n"
      "computing "XI_CHAR"(r) from the output of DD(r). Also note that the \n"
@@ -100,10 +103,10 @@ static PyMethodDef module_methods[] = {
      "Parameters \n"
      "-----------\n"
      "Every parameter can be passed as a keyword of the corresponding name.\n\n"
-     
+
      "autocorr : boolean\n"
      "   Boolean flag for auto/cross-correlation. If autocorr is set to 1,\n"
-     "    are not used (but must still be passed, perhaps again as X1/Y1/Z1).\n"     
+     "    are not used (but must still be passed, perhaps again as X1/Y1/Z1).\n"
      "\n"
      "nthreads : integer\n"
      "   The number of OpenMP threads to use. Has no effect if OpenMP was not\n"
@@ -114,17 +117,17 @@ static PyMethodDef module_methods[] = {
      "   contain white-space separated values  of (rmin, rmax)  for each\n"
      "   ``r`` wanted. The bins do not need to be contiguous but must be in\n"
      "   increasing order (smallest bins come first). \n\n"
-     
+
      "X1/Y1/Z1 : array-like, real (float/double)\n"
      "   The array of X/Y/Z positions for the first set of points.\n"
      "   Calculations are done in the precision of the supplied arrays.\n\n"
-     
+
      "weights1 : array-like, real (float/double), shape (n_particles,) or (n_weights_per_particle,n_particles), optional\n"
      "   Weights for computing a weighted pair count.\n\n"
-     
+
      "weight_type : str, optional\n"
      "   The type of pair weighting to apply.\n"
-     "   Options: \"pair_product\", None\n" 
+     "   Options: \"pair_product\", None\n"
      "   Default: None.\n\n"
 
      "periodic : boolean\n"
@@ -133,7 +136,7 @@ static PyMethodDef module_methods[] = {
      "X2/Y2/Z2 : array-like, real (float/double)\n"
      "   Array of XYZ positions for the second set of points. *Must* be the same\n"
      "   precision as the X1/Y1/Z1 arrays. Only required when ``autocorr==0``.\n\n"
-     
+
      "weights2\n : array-like, real (float/double), shape (n_particles,) or (n_weights_per_particle,n_particles), optional\n"
      "   Weights for computing a weighted pair count."
 
@@ -145,7 +148,7 @@ static PyMethodDef module_methods[] = {
      "   Present to facilitate exact calculations for periodic wrapping.\n"
      "   If boxsize is not supplied, then the wrapping is done based on\n"
      "   the maximum difference within each dimension of the X/Y/Z arrays.\n\n"
-    
+
      "output_ravg : boolean (default false)\n"
      "   Boolean flag to output the average ``r`` for each bin. Code will\n"
      "   run slower if you set this flag. Also, note, if you are calculating\n"
@@ -161,7 +164,7 @@ static PyMethodDef module_methods[] = {
      "   Controls the maximum number of cells per dimension. Total number of cells \n"
      "   can be up to (max_cells_per_dim)^3. Only increase if ``rmax`` is too small \n"
      "   relative to the boxsize (and increasing helps the runtime). \n\n"
-     
+
      "c_api_timer : boolean (default false)\n"
      "   Boolean flag to measure actual time spent in the C libraries. Here\n"
      "   to allow for benchmarking and scaling studies.\n\n"
@@ -173,16 +176,16 @@ static PyMethodDef module_methods[] = {
      "  set on the current computer. However, if you set ``isa`` to, say,\n"
      "  ``AVX`` and ``AVX`` is not available on the computer, then the code will\n"
      "  revert to using ``FALLBACK`` (even though ``SSE42`` might be available).\n\n"
-       
+
      "  Unless you are benchmarking the different instruction sets, you should\n"
      "  always leave ``isa`` to the default value. And if you *are* benchmarking,\n"
      "  then the integer values correspond to the ``enum`` for the instruction set\n"
      "  defined in ``utils/defs.h``.\n\n"
-       
+
     "Returns\n"
     "--------\n\n"
     "A tuple (results, time) \n\n"
-     
+
     "results : A python list\n"
     "   A python list containing [rmin, rmax, ravg, npairs, weight_avg] for each radial bin\n"
     "   specified in the ``binfile``. If ``output_ravg`` is not set, then ``ravg``\n"
@@ -192,12 +195,12 @@ static PyMethodDef module_methods[] = {
 
     "time : if ``c_api_timer`` is set, then the return value contains the time spent\n"
     "   in the API; otherwise time is set to 0.0\n\n"
-     
+
     "Example\n"
     "-------\n\n"
-     
+
     ">>> from Corrfunc._countpairs import countpairs\n"
-    ">>> from Corrfunc.io import read_catalog\n" 
+    ">>> from Corrfunc.io import read_catalog\n"
     ">>> x,y,z = read_catalog()\n"
     ">>> autocorr=1\n"
     ">>> nthreads=2\n"
@@ -229,7 +232,7 @@ static PyMethodDef module_methods[] = {
      "\n"
      "autocorr: boolean, required\n"
      "   Boolean flag for auto/cross-correlation. If autocorr is set to 1,\n"
-     "    are not used (but must still be passed, perhaps again as X1/Y1/Z1).\n"     
+     "    are not used (but must still be passed, perhaps again as X1/Y1/Z1).\n"
      "\n"
      "nthreads: integer\n"
      "    The number of OpenMP threads to use. Has no effect if OpenMP was not\n"
@@ -248,17 +251,17 @@ static PyMethodDef module_methods[] = {
      "   contain white-space separated values  of (rpmin, rpmax)  for each\n"
      "   ``rp`` wanted. The bins do not need to be contiguous but must be in\n"
      "   increasing order (smallest bins come first). \n\n"
-     
+
      "X1/Y1/Z1 : array-like, real (float/double)\n"
      "   The array of X/Y/Z positions for the first set of points.\n"
      "   Calculations are done in the precision of the supplied arrays.\n"
      "\n"
      "weights1 : array-like, real (float/double), shape (n_particles,) or (n_weights_per_particle,n_particles), optional\n"
      "   Weights for computing a weighted pair count.\n\n"
-     
+
      "weight_type : str, optional\n"
      "   The type of pair weighting to apply.\n"
-     "   Options: \"pair_product\", None\n" 
+     "   Options: \"pair_product\", None\n"
      "   Default: None.\n\n"
 
      "periodic : boolean\n"
@@ -295,7 +298,7 @@ static PyMethodDef module_methods[] = {
      "   Controls the maximum number of cells per dimension. Total number of cells \n"
      "   can be up to (max_cells_per_dim)^3. Only increase if ``rmax`` is too small \n"
      "   relative to the boxsize (and increasing helps the runtime). \n\n"
-     
+
      "c_api_timer : boolean (default false)\n"
      "   Boolean flag to measure actual time spent in the C libraries. Here\n"
      "   to allow for benchmarking and scaling studies.\n"
@@ -325,7 +328,7 @@ static PyMethodDef module_methods[] = {
      "   will be set to 0.0 for all bins; similarly for ``weight_avg``. ``npairs`` contains the number of pairs\n"
      "   in that bin and can be used to compute the actual wp("RP_CHAR") by\n"
      "   combining with (DR, RR) counts.\n"
-     "\n" 
+     "\n"
      "time : if ``c_api_timer`` is set, then the return value contains the time spent\n"
      "   in the API; otherwise time is set to 0.0\n"
      "\n"
@@ -333,7 +336,7 @@ static PyMethodDef module_methods[] = {
      "--------\n"
      "\n"
      ">>> from Corrfunc._countpairs import countpairs_rp_pi\n"
-     ">>> from Corrfunc.io import read_catalog\n" 
+     ">>> from Corrfunc.io import read_catalog\n"
      ">>> x,y,z = read_catalog()\n"
      ">>> autocorr=1\n"
      ">>> nthreads=2\n"
@@ -399,10 +402,10 @@ static PyMethodDef module_methods[] = {
      "\n"
      "weights : array-like, real (float/double), shape (n_particles,) or (n_weights_per_particle,n_particles), optional\n"
      "   Weights for computing a weighted correlation function.\n\n"
-     
+
      "weight_type : str, optional\n"
      "   The type of pair weighting to apply.\n"
-     "   Options: \"pair_product\", None\n" 
+     "   Options: \"pair_product\", None\n"
      "   Default: None.\n\n"
 
      "verbose : boolean (default false)\n"
@@ -461,7 +464,7 @@ static PyMethodDef module_methods[] = {
      "   ``rpavg`` will be set to 0.0 for all bins; similarly for ``weight_avg``. ``wp`` contains the projected\n"
      "   correlation function while ``npairs`` contains the number of unique pairs\n"
      "   in that bin.  If weight are used, then ``wp`` is weighted, while ``npairs`` is not.\n"
-     "\n" 
+     "\n"
      "time : if ``c_api_timer`` is set, then the return value contains the time spent\n"
      "   in the API; otherwise time is set to 0.0\n"
      "\n"
@@ -476,7 +479,7 @@ static PyMethodDef module_methods[] = {
      "--------\n"
      "\n"
      ">>> from _countpairs import countpairs_wp\n"
-     ">>> from Corrfunc.io import read_catalog\n" 
+     ">>> from Corrfunc.io import read_catalog\n"
      ">>> x,y,z = read_catalog()\n"
      ">>> nthreads=2\n"
      ">>> pimax=40.0\n"
@@ -531,10 +534,10 @@ static PyMethodDef module_methods[] = {
      "\n"
      "weights : array-like, real (float/double), shape (n_particles,) or (n_weights_per_particle,n_particles), optional\n"
      "   Weights for computing a weighted correlation function.\n\n"
-     
+
      "weight_type : str, optional\n"
      "   The type of pair weighting to apply.\n"
-     "   Options: \"pair_product\", None\n" 
+     "   Options: \"pair_product\", None\n"
      "   Default: None.\n\n"
 
      "verbose : boolean (default false)\n"
@@ -585,7 +588,7 @@ static PyMethodDef module_methods[] = {
      "   ``ravg`` will be set to 0.0 for all bins; similarly for ``weightavg``. ``xi`` contains the projected\n"
      "   correlation function while ``npairs`` contains the number of unique pairs\n"
      "   in that bin.  If weights are used, then ``xi`` is weighted, while ``npairs`` is not.\n"
-     "\n" 
+     "\n"
      "time : if ``c_api_timer`` is set, then the return value contains the time spent\n"
      "   in the API; otherwise time is set to 0.0\n"
      "\n"
@@ -593,7 +596,7 @@ static PyMethodDef module_methods[] = {
      "--------\n"
      "\n"
      ">>> from _countpairs import countpairs_xi\n"
-     ">>> from Corrfunc.io import read_catalog\n" 
+     ">>> from Corrfunc.io import read_catalog\n"
      ">>> x,y,z = read_catalog()\n"
      ">>> nthreads=2\n"
      ">>> boxsize = 420.0\n"
@@ -601,6 +604,153 @@ static PyMethodDef module_methods[] = {
      "                               x, y, z, verbose=True, output_ravg=True)\n"
      "\n"
     },
+    {"countpairs_s_mu"      ,(PyCFunction) countpairs_countpairs_s_mu ,METH_VARARGS | METH_KEYWORDS,
+     "countpairs_s_mu(autocorr, nthreads, binfile, mu_max, nmu_bins, X1, Y1, Z1, weights1=None, weight_type=None,\n"
+     "                periodic=True, X2=None, Y2=None, Z2=None, weights2=None, verbose=False,\n"
+     "                boxsize=0.0, output_savg=False, xbin_refine_factor=2, ybin_refine_factor=2,\n"
+     "                zbin_refine_factor=1, max_cells_per_dim=100, c_api_timer=False, isa=-1)\n"
+     "\n"
+     "Calculate the 2-D pair-counts corresponding to the real-space correlation\n"
+     "function, "XI_CHAR"(s, "MU_CHAR"). Pairs whose 3-D separation ``s`` falls\n"
+     "within the ``s`` bins (specified in ``binfile``), and which are separated by\n"
+     "less than ``s*mu_max`` in the Z-dimension are counted.\n\n"
+
+     "Note, that this module only returns pair counts and not the actual\n"
+     "correlation function "XI_CHAR"(s, "MU_CHAR"). \n"
+     "Also note that the python wrapper for this extension: `Corrfunc.theory.DDsmu`\n"
+     "is more user-friendly.\n"
+     UNICODE_WARNING
+     "\n"
+     "Parameters\n"
+     "-----------\n"
+     "Every parameter can be passed as a keyword of the corresponding name.\n"
+     "\n"
+     "autocorr: boolean, required\n"
+     "   Boolean flag for auto/cross-correlation. If autocorr is set to 1,\n"
+     "   then X2/Y2/Z2 are not used (but must still be passed, perhaps again as X1/Y1/Z1).\n"
+     "\n"
+     "nthreads: integer\n"
+     "    The number of OpenMP threads to use. Has no effect if OpenMP was not\n"
+     "    enabled during library compilation.\n"
+     "\n"
+     "binfile : string\n"
+     "   Filename specifying the ``s`` bins for ``DDsmu``. The file should\n"
+     "   contain white-space separated values  of (smin, smax)  for each\n"
+     "   ``s`` wanted. The bins must be contiguous and in\n"
+     "   increasing order (smallest bins come first). \n"
+     "\n"
+     "mu_max: double. Must be in range (0.0, 1.0]\n"
+     "   A double-precision value for the maximum cosine of the angular separation from\n"
+     "   the line of sight (LOS). Here, LOS is taken to be along the Z direction.\n"
+     "   Note that only pairs with ``0 <= cos("THETA_CHAR"_LOS) < mu_max``\n"
+     "   are counted (no equality).\n\n"
+     "\n"
+     "nmu_bins: Integer. Must be at least 1\n"
+     "   Number of bins for ``mu``\n\n"
+     "\n"
+     "X1/Y1/Z1 : array-like, real (float/double)\n"
+     "   The array of X/Y/Z positions for the first set of points.\n"
+     "   Calculations are done in the precision of the supplied arrays.\n"
+     "\n"
+     "weights1 : array-like, real (float/double), shape (n_particles,) or (n_weights_per_particle,n_particles), optional\n"
+     "   Weights for computing a weighted pair count.\n\n"
+
+     "weight_type : str, optional\n"
+     "   The type of pair weighting to apply.\n"
+     "   Options: \"pair_product\", None\n"
+     "   Default: None.\n\n"
+
+     "periodic : boolean\n"
+     "   Boolean flag to indicate periodic boundary conditions.\n"
+     "\n"
+
+     "X2/Y2/Z2 : array-like, real (float/double)\n"
+     "   Array of XYZ positions for the second set of points. *Must* be the same\n"
+     "   precision as the X1/Y1/Z1 arrays. Only required when ``autocorr==0``.\n"
+     "\n"
+
+     "weights2 : array-like, real (float/double), shape (n_particles,) or (n_weights_per_particle,n_particles), optional\n"
+     "   Weights for computing a weighted pair count.\n\n"
+
+     "verbose : boolean (default false)\n"
+     "   Boolean flag to control output of informational messages\n"
+     "\n"
+
+     "boxsize : double\n"
+     "   The side-length of the cube in the cosmological simulation.\n"
+     "   Present to facilitate exact calculations for periodic wrapping.\n"
+     "   If boxsize is not supplied, then the wrapping is done based on\n"
+     "   the maximum difference within each dimension of the X/Y/Z arrays.\n"
+     "\n"
+
+     "output_savg : boolean (default false)\n"
+     "   Boolean flag to output the average ``s`` for each bin. Code will\n"
+     "   run slower if you set this flag. Also, note, if you are calculating\n"
+     "   in single-precision, ``s`` will suffer from numerical loss of\n"
+     "   precision and can not be trusted. If you need accurate ``s``\n"
+     "   values, then pass in double precision arrays for the particle positions.\n"
+     "\n"
+
+     "(xyz)bin_refine_factor: integer (default (2,2,1) typical values in [1-3]) \n"
+     "   Controls the refinement on the cell sizes. Can have up to a 20% impact \n"
+     "   on runtime. \n\n"
+
+     "max_cells_per_dim: integer (default 100, typical values in [50-300]) \n"
+     "   Controls the maximum number of cells per dimension. Total number of cells \n"
+     "   can be up to (max_cells_per_dim)^3. Only increase if ``smax`` is too small \n"
+     "   relative to the boxsize (and increasing helps the runtime). \n\n"
+
+     "c_api_timer : boolean (default false)\n"
+     "   Boolean flag to measure actual time spent in the C libraries. Here\n"
+     "   to allow for benchmarking and scaling studies.\n"
+     "\n"
+
+     "isa : integer (default -1)\n"
+     "  Controls the runtime dispatch for the instruction set to use. Possible\n"
+     "  options are: [-1, AVX, SSE42, FALLBACK]\n"
+     "\n"
+     "  Setting isa to -1 will pick the fastest available instruction\n"
+     "  set on the current computer. However, if you set ``isa`` to, say,\n"
+     "  ``AVX`` and ``AVX`` is not available on the computer, then the code will\n"
+     "  revert to using ``FALLBACK`` (even though ``SSE42`` might be available).\n"
+     "\n"
+     "  Unless you are benchmarking the different instruction sets, you should\n"
+     "  always leave ``isa`` to the default value. And if you *are* benchmarking,\n"
+     "  then the integer values correspond to the ``enum`` for the instruction set\n"
+     "  defined in ``utils/defs.h``.\n"
+     "\n"
+
+     "Returns\n"
+     "--------\n"
+     "\n"
+     "A tuple (results, time) \n"
+     "\n"
+     "results : A python list\n"
+     "   A python list containing ``nmu_bins`` entries of [smin, smax, savg, mu_max, npairs, weightavg]\n"
+     "   for each spatial bin specified in the ``binfile``. There will be a total of ``nmu_bins`` bins\n"
+     "   ranging from [0, ``mu_max``) *per* spatial bin. If ``output_savg`` is not set, then ``savg``\n"
+     "   will be set to 0.0 for all bins; similarly for ``weightavg``. ``npairs`` \n"
+     "   contains the number of pairs in that bin.\n"
+     "\n"
+     "time : if ``c_api_timer`` is set, then the return value contains the time spent\n"
+     "   in the API; otherwise time is set to 0.0\n"
+     "\n"
+
+     "Example\n"
+     "--------\n"
+     "\n"
+     ">>> from Corrfunc._countpairs import countpairs_s_mu\n"
+     ">>> from Corrfunc.io import read_catalog\n"
+     ">>> x,y,z = read_catalog()\n"
+     ">>> autocorr=1\n"
+     ">>> nthreads=2\n"
+     ">>> mu_max=1.0\n"
+     ">>> nmu_bins=40\n"
+     ">>> (DDsmu, time) = countpairs_s_mu(autocorr, nthreads, '../tests/bins', mu_max, nmu_bins, \n"
+     "                                    x, y, z, X2=x, Y2=y, Z2=z,\n"
+     "                                    verbose=True, output_savg=True)\n"
+     "\n"
+    },
     {"countspheres_vpf"      ,(PyCFunction) countpairs_countspheres_vpf ,METH_VARARGS | METH_KEYWORDS,
      "countspheres_vpf(rmax, nbins, nspheres, numpN, seed,\n"
      "                 X, Y, Z, verbose=False, periodic=True,\n"
@@ -730,7 +880,7 @@ static PyObject *countpairs_error_out(PyObject *module, const char *msg)
 {
 #if PY_MAJOR_VERSION < 3
     (void) module;//to avoid unused warning with python2
-#endif    
+#endif
 
     struct module_state *st = GETSTATE(module);
     PyErr_SetString(st->error, msg);
@@ -786,7 +936,7 @@ PyMODINIT_FUNC init_countpairs(void)
         Py_DECREF(module);
         INITERROR;
     }
-    
+
     /* Load `numpy` functionality. */
     import_array();
 
@@ -803,24 +953,24 @@ PyMODINIT_FUNC init_countpairs(void)
 static int64_t check_dims_and_datatype(PyObject *module, PyArrayObject *x1_obj, PyArrayObject *y1_obj, PyArrayObject *z1_obj, PyArrayObject *weights1_obj, size_t *element_size)
 {
     char msg[1024];
-    
+
     const int check_weights = weights1_obj != NULL;
 
     /* All the position arrays should be 1-D*/
     const int nxdims = PyArray_NDIM(x1_obj);
     const int nydims = PyArray_NDIM(y1_obj);
     const int nzdims = PyArray_NDIM(z1_obj);
-    
+
     if(nxdims != 1 || nydims != 1 || nzdims != 1) {
         snprintf(msg, 1024, "ERROR: Expected 1-D numpy arrays.\nFound (nxdims, nydims, nzdims) = (%d, %d, %d) instead",
                  nxdims, nydims, nzdims);
         countpairs_error_out(module, msg);
         return -1;
     }
-    
+
     /* The weights array can be 1-D or 2-D of shape (n_weights, n_particles) */
     const int n_weight_dims = check_weights ? PyArray_NDIM(weights1_obj) : 1;
-    
+
     if(n_weight_dims != 1 && n_weight_dims != 2) {
         snprintf(msg, 1024, "ERROR: Expected 1-D or 2-D weight array.\nFound n_weight_dims = %d instead", n_weight_dims);
         countpairs_error_out(module, msg);
@@ -854,7 +1004,7 @@ static int64_t check_dims_and_datatype(PyObject *module, PyArrayObject *x1_obj,
         countpairs_error_out(module, msg);
         return -1;
     }
-    
+
     // Current version of the code only supports weights of the same dtype as positions
     if( x_type != y_type || y_type != z_type || (check_weights && z_type != weights_type)) {
         PyArray_Descr *x_descr = PyArray_DescrFromType(x_type);
@@ -874,12 +1024,12 @@ static int64_t check_dims_and_datatype(PyObject *module, PyArrayObject *x1_obj,
         countpairs_error_out(module, msg);
         return -1;
     }
-    
+
     /* Check if the number of elements in the 3 Python arrays are identical */
     const int64_t nx1 = (int64_t)PyArray_SIZE(x1_obj);
     const int64_t ny1 = (int64_t)PyArray_SIZE(y1_obj);
     const int64_t nz1 = (int64_t)PyArray_SIZE(z1_obj);
-    
+
     if(nx1 != ny1 || ny1 != nz1) {
       snprintf(msg, 1024, "ERROR: Expected arrays to have the same number of elements in all 3-dimensions.\nFound (nx, ny, nz) = (%"PRId64", %"PRId64", %"PRId64") instead",
                nx1, ny1, nz1);
@@ -904,25 +1054,25 @@ static int64_t check_dims_and_datatype(PyObject *module, PyArrayObject *x1_obj,
     } else {
       *element_size = sizeof(double);
     }
-    
+
     return nx1;
 }
 
 static int print_kwlist_into_msg(char *msg, const size_t totsize, size_t len, char *kwlist[], const size_t nitems)
 {
     for(size_t i=0;i<nitems;i++) {
-        
+        /* Append "kwlist[i], " to msg at offset len; return EXIT_FAILURE once keyword + ", " + NUL no longer fit in totsize bytes */
         if(len+strlen(kwlist[i]) >= totsize-2) {
             return EXIT_FAILURE;
         }
-        
+        msg[len+strlen(kwlist[i])+2] = '\0';/* pre-terminate: msg stays a valid C string if a later keyword hits the EXIT_FAILURE return (callers still pass msg on after a failure) */
         memcpy(msg+len, kwlist[i], strlen(kwlist[i]));
         len += strlen(kwlist[i]);
         msg[len] = ',';
         msg[len+1] = ' ';
         len += 2;
     }
-    
+    /* Needed when nitems == 0; otherwise the loop has already written this terminator. The trailing ", " is left in place */
     msg[len]='\0';
     return EXIT_SUCCESS;
 }
@@ -933,11 +1083,11 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
     //Error-handling is global in python2 -> stored in struct module_state _struct declared at the top of this file
 #if PY_MAJOR_VERSION < 3
     (void) self;
-    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash. 
+    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash.
 #else
     //In python3, self is simply the module object that was returned earlier by init
     PyObject *module = self;
-#endif    
+#endif
     PyArrayObject *x1_obj=NULL, *y1_obj=NULL, *z1_obj=NULL, *weights1_obj=NULL;
     PyArrayObject *x2_obj=NULL, *y2_obj=NULL, *z2_obj=NULL, *weights2_obj=NULL;
 
@@ -951,7 +1101,7 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
     options.periodic = 1;
     options.need_avg_sep = 0;
     options.c_api_timer = 0;
-    
+
     int8_t xbin_ref=options.bin_refine_factors[0],
         ybin_ref=options.bin_refine_factors[1],
         zbin_ref=options.bin_refine_factors[2];
@@ -1004,7 +1154,7 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
                                        &weighting_method_str)
 
          ) {
-        
+
         PyObject_Print(kwargs, stdout, 0);
         fprintf(stdout, "\n");
 
@@ -1017,7 +1167,7 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
         if(status != EXIT_SUCCESS) {
             fprintf(stderr,"Error message does not contain all of the keywords\n");
         }
-        
+
         countpairs_error_out(module,msg);
         Py_RETURN_NONE;
     }
@@ -1036,16 +1186,16 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
         set_bin_refine_scheme(&options, BINNING_CUST);//custom binning -> code will honor requested binning scheme
     }
 
-    
+
     /* We have numpy arrays and all the required inputs*/
     /* How many data points are there? And are they all of floating point type */
     size_t element_size;
     const int64_t ND1 = check_dims_and_datatype(module, x1_obj, y1_obj, z1_obj, weights1_obj, &element_size);
     if(ND1 == -1) {
-        //Error has already been set -> simply return 
+        //Error has already been set -> simply return
         Py_RETURN_NONE;
     }
-    
+
     /* Ensure the weights are of the right shape (n_weights, n_particles) */
     if(weights1_obj != NULL){
         // A numpy dimension of length -1 will be expanded to n_weights
@@ -1053,7 +1203,7 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
         PyArray_Dims pdims = {.ptr = &(dims[0]), .len = 2};
         weights1_obj = (PyArrayObject *) PyArray_Newshape(weights1_obj, &pdims, NPY_CORDER);
     }
-    
+
     /* Validate the user's choice of weighting method */
     weight_method_t weighting_method;
     int wstatus = get_weight_method_by_name(weighting_method_str, &weighting_method);
@@ -1072,7 +1222,7 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
         countpairs_error_out(module, msg);
         Py_RETURN_NONE;
     }
-    
+
     if(extra.weights0.num_weights > 0 && found_weights > MAX_NUM_WEIGHTS){
         char msg[1024];
         snprintf(msg, 1024, "ValueError: In %s: Provided %d weights-per-particle, but the code was compiled with MAX_NUM_WEIGHTS=%d.\n",
@@ -1099,17 +1249,17 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
         size_t element_size2;
         ND2 = check_dims_and_datatype(module, x2_obj, y2_obj, z2_obj, weights2_obj, &element_size2);
         if(ND2 == -1) {
-            //Error has already been set -> simply return 
+            //Error has already been set -> simply return
             Py_RETURN_NONE;
         }
-        
+
         /* Ensure the weights are of the right shape (n_weights, n_particles) */
         if(weights2_obj != NULL){
             npy_intp dims[2] = {-1, ND2};
             PyArray_Dims pdims = {.ptr = &(dims[0]), .len = 2};
             weights2_obj = (PyArrayObject *) PyArray_Newshape(weights2_obj, &pdims, NPY_CORDER);
         }
-    
+
         if(element_size != element_size2) {
             snprintf(msg, 1024, "TypeError: In %s: The two arrays must have the same data-type. First array is of type %s while second array is of type %s\n",
                      __FUNCTION__, element_size == 4 ? "floats":"doubles", element_size2 == 4 ? "floats":"doubles");
@@ -1118,9 +1268,9 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
         }
     }
 
-    
-    /* 
-       Interpret the input objects as numpy arrays (of whatever the input type the python object has). 
+
+    /*
+       Interpret the input objects as numpy arrays (of whatever the input type the python object has).
        NULL initialization is necessary since we might be calling XDECREF.
        The input objects can be converted into the required DOUBLE array.
     */
@@ -1132,7 +1282,7 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
     if(weights1_obj != NULL){
         weights1_array = PyArray_FromArray(weights1_obj, NOTYPE_DESCR, requirements);
     }
-    
+
     /* NULL initialization is necessary since we might be calling XDECREF*/
     PyObject *x2_array = NULL, *y2_array = NULL, *z2_array = NULL, *weights2_array = NULL;
     if(autocorr == 0) {
@@ -1143,14 +1293,14 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
             weights2_array = PyArray_FromArray(weights2_obj, NOTYPE_DESCR, requirements);
         }
     }
-    
+
     if (x1_array == NULL || y1_array == NULL || z1_array == NULL ||
         (autocorr==0 && (x2_array == NULL || y2_array == NULL || z2_array == NULL))) {
         Py_XDECREF(x1_array);
         Py_XDECREF(y1_array);
         Py_XDECREF(z1_array);
         Py_XDECREF(weights1_array);
-        
+
         Py_XDECREF(x2_array);
         Py_XDECREF(y2_array);
         Py_XDECREF(z2_array);
@@ -1165,7 +1315,7 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
 
     /* Get pointers to the data */
     void *X1 = NULL, *Y1=NULL, *Z1=NULL, *weights1=NULL;
-    X1 = PyArray_DATA((PyArrayObject *) x1_array); 
+    X1 = PyArray_DATA((PyArrayObject *) x1_array);
     Y1 = PyArray_DATA((PyArrayObject *) y1_array);
     Z1 = PyArray_DATA((PyArrayObject *) z1_array);
     if(weights1_array != NULL){
@@ -1181,7 +1331,7 @@ static PyObject *countpairs_countpairs(PyObject *self, PyObject *args, PyObject
             weights2 = PyArray_DATA((PyArrayObject *) weights2_array);
         }
     }
-    
+
     /* Pack the weights into extra_options */
     for(int64_t w = 0; w < extra.weights0.num_weights; w++){
         extra.weights0.weights[w] = (char *) weights1 + w*ND1*element_size;
@@ -1239,16 +1389,16 @@ static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyO
 {
 #if PY_MAJOR_VERSION < 3
     (void) self;
-    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash. 
+    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash.
 #else
     //In python3, self is simply the module object that was returned earlier by init
     PyObject *module = self;
-#endif    
+#endif
     PyArrayObject *x1_obj=NULL, *y1_obj=NULL, *z1_obj=NULL, *weights1_obj=NULL;
     PyArrayObject *x2_obj=NULL, *y2_obj=NULL, *z2_obj=NULL, *weights2_obj=NULL;
     int autocorr=0;
     int nthreads=4;
-    
+
     double pimax;
     char *binfile, *weighting_method_str = NULL;
     struct config_options options = get_config_options();
@@ -1259,7 +1409,7 @@ static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyO
     int8_t xbin_ref=options.bin_refine_factors[0],
         ybin_ref=options.bin_refine_factors[1],
         zbin_ref=options.bin_refine_factors[2];
-    
+
     static char *kwlist[] = {
         "autocorr",
         "nthreads",
@@ -1320,7 +1470,7 @@ static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyO
         if(status != EXIT_SUCCESS) {
             fprintf(stderr,"Error message does not contain all of the keywords\n");
         }
-        
+
         countpairs_error_out(module,msg);
         Py_RETURN_NONE;
     }
@@ -1343,10 +1493,10 @@ static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyO
     /* How many data points are there? And are they all of floating point type */
     const int64_t ND1 = check_dims_and_datatype(module, x1_obj, y1_obj, z1_obj, weights1_obj, &element_size);
     if(ND1 == -1) {
-        //Error has already been set -> simply return 
+        //Error has already been set -> simply return
         Py_RETURN_NONE;
     }
-    
+
     /* Ensure the weights are of the right shape (n_weights, n_particles) */
     if(weights1_obj != NULL){
         // A numpy dimension of length -1 will be expanded to n_weights
@@ -1354,7 +1504,7 @@ static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyO
         PyArray_Dims pdims = {.ptr = &(dims[0]), .len = 2};
         weights1_obj = (PyArrayObject *) PyArray_Newshape(weights1_obj, &pdims, NPY_CORDER);
     }
-    
+
     /* Validate the user's choice of weighting method */
     weight_method_t weighting_method;
     int wstatus = get_weight_method_by_name(weighting_method_str, &weighting_method);
@@ -1373,7 +1523,7 @@ static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyO
         countpairs_error_out(module, msg);
         Py_RETURN_NONE;
     }
-    
+
     if(extra.weights0.num_weights > 0 && found_weights > MAX_NUM_WEIGHTS){
         char msg[1024];
         snprintf(msg, 1024, "ValueError: In %s: Provided %d weights-per-particle, but the code was compiled with MAX_NUM_WEIGHTS=%d.\n",
@@ -1401,7 +1551,7 @@ static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyO
         size_t element_size2;
         ND2 = check_dims_and_datatype(module, x2_obj, y2_obj, z2_obj, weights2_obj, &element_size2);
         if(ND2 == -1) {
-            //Error has already been set -> simply return 
+            //Error has already been set -> simply return
             Py_RETURN_NONE;
         }
         /* Ensure the weights are of the right shape (n_weights, n_particles) */
@@ -1417,8 +1567,8 @@ static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyO
             countpairs_error_out(module, msg);
             Py_RETURN_NONE;
         }
-    } 
-    
+    }
+
     /* Interpret the input objects as numpy arrays. */
     const int requirements = NPY_ARRAY_IN_ARRAY;
     PyObject *x1_array = NULL, *y1_array = NULL, *z1_array = NULL, *weights1_array = NULL;
@@ -1429,7 +1579,7 @@ static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyO
     if(weights1_obj != NULL){
         weights1_array = PyArray_FromArray(weights1_obj, NOTYPE_DESCR, requirements);
     }
-    
+
     if(autocorr == 0) {
         x2_array = PyArray_FromArray(x2_obj, NOTYPE_DESCR, requirements);
         y2_array = PyArray_FromArray(y2_obj, NOTYPE_DESCR, requirements);
@@ -1461,7 +1611,7 @@ static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyO
     /* Get pointers to the data as C-types. */
     void *X1 = NULL, *Y1 = NULL, *Z1 = NULL, *weights1=NULL;
     void *X2 = NULL, *Y2 = NULL, *Z2 = NULL, *weights2=NULL;
-    X1 = PyArray_DATA((PyArrayObject *) x1_array); 
+    X1 = PyArray_DATA((PyArrayObject *) x1_array);
     Y1 = PyArray_DATA((PyArrayObject *) y1_array);
     Z1 = PyArray_DATA((PyArrayObject *) z1_array);
     if(weights1_array != NULL){
@@ -1487,7 +1637,7 @@ static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyO
 
     NPY_BEGIN_THREADS_DEF;
     NPY_BEGIN_THREADS;
-    
+
     options.float_type = element_size;
     results_countpairs_rp_pi results;
     double c_api_time = 0.0;
@@ -1504,7 +1654,7 @@ static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyO
         c_api_time = options.c_api_time;
     }
     NPY_END_THREADS;
-    
+
     /* Clean up. */
     Py_DECREF(x1_array);Py_DECREF(y1_array);Py_DECREF(z1_array);Py_XDECREF(weights1_array);//x1 should absolutely not be NULL
     Py_XDECREF(x2_array);Py_XDECREF(y2_array);Py_XDECREF(z2_array);Py_XDECREF(weights2_array);//x2 might be NULL depending on value of autocorr
@@ -1531,7 +1681,7 @@ static PyObject *countpairs_countpairs_rp_pi(PyObject *self, PyObject *args, PyO
         rlow=results.rupp[i];
     }
     free_results_rp_pi(&results);
-    
+
     return Py_BuildValue("(Od)", ret, c_api_time);
 }
 
@@ -1539,11 +1689,11 @@ static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObje
 {
 #if PY_MAJOR_VERSION < 3
     (void) self;//to suppress the unused variable warning. Terrible hack
-    PyObject *module = NULL;//need not be used -> setting to NULL so any attempts to dereference will result in a crash. 
+    PyObject *module = NULL;//need not be used -> setting to NULL so any attempts to dereference will result in a crash.
 #else
     //In python3, self is simply the module object that was returned earlier by init
     PyObject *module = self;
-#endif    
+#endif
     PyArrayObject *x1_obj=NULL, *y1_obj=NULL, *z1_obj=NULL, *weights1_obj=NULL;
     double boxsize,pimax;
     int nthreads=1;
@@ -1560,7 +1710,7 @@ static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObje
     int8_t xbin_ref=options.bin_refine_factors[0],
         ybin_ref=options.bin_refine_factors[1],
         zbin_ref=options.bin_refine_factors[2];
-    
+
     static char *kwlist[] = {
         "boxsize",
         "pimax",
@@ -1582,7 +1732,7 @@ static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObje
         "isa",/* instruction set to use of type enum isa; valid values are AVX, SSE, FALLBACK */
         NULL
     };
-    
+
     if( ! PyArg_ParseTupleAndKeywords(args, kwargs, "ddisO!O!O!|O!sbbbbbhbbi", kwlist,
                                       &boxsize,&pimax,&nthreads,&binfile,
                                       &PyArray_Type,&x1_obj,
@@ -1597,7 +1747,7 @@ static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObje
                                       &(options.c_api_timer),
                                       &(options.c_cell_timer),
                                       &(options.instruction_set))
-        
+
         ){
         PyObject_Print(kwargs, stdout, 0);
         fprintf(stdout, "\n");
@@ -1611,7 +1761,7 @@ static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObje
         if(status != EXIT_SUCCESS) {
             fprintf(stderr,"Error message does not contain all of the keywords\n");
         }
-        
+
         countpairs_error_out(module,msg);
         Py_RETURN_NONE;
     }
@@ -1630,14 +1780,14 @@ static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObje
         options.bin_refine_factors[2] = zbin_ref;
         set_bin_refine_scheme(&options, BINNING_CUST);//custom binning -> code will honor requested binning scheme
     }
-    
+
     /* How many data points are there? And are they all of floating point type */
     const int64_t ND1 = check_dims_and_datatype(module, x1_obj, y1_obj, z1_obj, weights1_obj, &element_size);
     if(ND1 == -1) {
-        //Error has already been set -> simply return 
+        //Error has already been set -> simply return
         Py_RETURN_NONE;
     }
-    
+
     /* Ensure the weights are of the right shape (n_weights, n_particles) */
     if(weights1_obj != NULL){
         // A numpy dimension of length -1 will be expanded to n_weights
@@ -1645,7 +1795,7 @@ static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObje
         PyArray_Dims pdims = {.ptr = &(dims[0]), .len = 2};
         weights1_obj = (PyArrayObject *) PyArray_Newshape(weights1_obj, &pdims, NPY_CORDER);
     }
-    
+
     /* Validate the user's choice of weighting method */
     weight_method_t weighting_method;
     int wstatus = get_weight_method_by_name(weighting_method_str, &weighting_method);
@@ -1664,7 +1814,7 @@ static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObje
         countpairs_error_out(module, msg);
         Py_RETURN_NONE;
     }
-    
+
     if(extra.weights0.num_weights > 0 && found_weights > MAX_NUM_WEIGHTS){
         char msg[1024];
         snprintf(msg, 1024, "ValueError: In %s: Provided %d weights-per-particle, but the code was compiled with MAX_NUM_WEIGHTS=%d.\n",
@@ -1672,7 +1822,7 @@ static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObje
         countpairs_error_out(module, msg);
         Py_RETURN_NONE;
     }
-    
+
     /* Interpret the input objects as numpy arrays. */
     const int requirements = NPY_ARRAY_IN_ARRAY;
     PyObject *x1_array = NULL, *y1_array = NULL, *z1_array = NULL, *weights1_array = NULL;
@@ -1683,7 +1833,7 @@ static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObje
         weights1_array = PyArray_FromArray(weights1_obj, NOTYPE_DESCR, requirements);
     }
 
-    
+
     if (x1_array == NULL || y1_array == NULL || z1_array == NULL) {
         Py_XDECREF(x1_array);
         Py_XDECREF(y1_array);
@@ -1706,7 +1856,7 @@ static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObje
     if(weights1_array != NULL){
         weights1 = PyArray_DATA((PyArrayObject *) weights1_array);
     }
-    
+
     /* Pack the weights into extra_options */
     for(int64_t w = 0; w < extra.weights0.num_weights; w++){
         extra.weights0.weights[w] = (char *) weights1 + w*ND1*element_size;
@@ -1715,7 +1865,7 @@ static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObje
     NPY_BEGIN_THREADS_DEF;
     NPY_BEGIN_THREADS;
 
-    
+
     results_countpairs_wp results;
     options.float_type = element_size;
     double c_api_time = 0.0;
@@ -1738,7 +1888,7 @@ static PyObject *countpairs_countpairs_wp(PyObject *self, PyObject *args, PyObje
     if(status != EXIT_SUCCESS) {
         Py_RETURN_NONE;
     }
-    
+
 #if 0
     for(int i=1;i<results.nbin;i++) {
         const double rpavg = results.rpavg[i];
@@ -1779,11 +1929,11 @@ static PyObject *countpairs_countpairs_xi(PyObject *self, PyObject *args, PyObje
 {
 #if PY_MAJOR_VERSION < 3
     (void) self;//to suppress the unused variable warning. Terrible hack
-    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash. 
+    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash.
 #else
     //In python3, self is simply the module object that was returned earlier by init
     PyObject *module = self;
-#endif    
+#endif
 
     PyArrayObject *x1_obj, *y1_obj, *z1_obj, *weights1_obj = NULL;
     double boxsize;
@@ -1818,7 +1968,7 @@ static PyObject *countpairs_countpairs_xi(PyObject *self, PyObject *args, PyObje
         NULL
     };
 
-    
+
     if( ! PyArg_ParseTupleAndKeywords(args, kwargs, "disO!O!O!|O!sbbbbbhbi", kwlist,
                                       &boxsize,&nthreads,&binfile,
                                       &PyArray_Type,&x1_obj,
@@ -1836,7 +1986,7 @@ static PyObject *countpairs_countpairs_xi(PyObject *self, PyObject *args, PyObje
 
         PyObject_Print(kwargs, stdout, 0);
         fprintf(stdout, "\n");
-        
+
         char msg[1024];
         int len=snprintf(msg, 1024,"ArgumentError: In xi> Could not parse the arguments. Input parameters are: \n");
 
@@ -1846,7 +1996,7 @@ static PyObject *countpairs_countpairs_xi(PyObject *self, PyObject *args, PyObje
         if(status != EXIT_SUCCESS) {
             fprintf(stderr,"Error message does not contain all of the keywords\n");
         }
-        
+
         countpairs_error_out(module,msg);
         Py_RETURN_NONE;
     }
@@ -1869,10 +2019,10 @@ static PyObject *countpairs_countpairs_xi(PyObject *self, PyObject *args, PyObje
     size_t element_size;
     const int64_t ND1 = check_dims_and_datatype(module, x1_obj, y1_obj, z1_obj, weights1_obj, &element_size);
     if(ND1 == -1) {
-        //Error has already been set -> simply return 
+        //Error has already been set -> simply return
         Py_RETURN_NONE;
     }
-    
+
     /* Ensure the weights are of the right shape (n_weights, n_particles) */
     if(weights1_obj != NULL){
         // A numpy dimension of length -1 will be expanded to n_weights
@@ -1880,7 +2030,7 @@ static PyObject *countpairs_countpairs_xi(PyObject *self, PyObject *args, PyObje
         PyArray_Dims pdims = {.ptr = &(dims[0]), .len = 2};
         weights1_obj = (PyArrayObject *) PyArray_Newshape(weights1_obj, &pdims, NPY_CORDER);
     }
-    
+
     /* Validate the user's choice of weighting method */
     weight_method_t weighting_method;
     int wstatus = get_weight_method_by_name(weighting_method_str, &weighting_method);
@@ -1899,7 +2049,7 @@ static PyObject *countpairs_countpairs_xi(PyObject *self, PyObject *args, PyObje
         countpairs_error_out(module, msg);
         Py_RETURN_NONE;
     }
-    
+
     if(extra.weights0.num_weights > 0 && found_weights > MAX_NUM_WEIGHTS){
         char msg[1024];
         snprintf(msg, 1024, "ValueError: In %s: Provided %d weights-per-particle, but the code was compiled with MAX_NUM_WEIGHTS=%d.\n",
@@ -1907,7 +2057,7 @@ static PyObject *countpairs_countpairs_xi(PyObject *self, PyObject *args, PyObje
         countpairs_error_out(module, msg);
         Py_RETURN_NONE;
     }
-    
+
     /* Interpret the input objects as numpy arrays. */
     const int requirements = NPY_ARRAY_IN_ARRAY;
     PyObject *x1_array = NULL, *y1_array = NULL, *z1_array = NULL, *weights1_array = NULL;
@@ -1938,12 +2088,12 @@ static PyObject *countpairs_countpairs_xi(PyObject *self, PyObject *args, PyObje
     if(weights1_array != NULL){
         weights1 = PyArray_DATA((PyArrayObject *) weights1_array);
     }
-    
+
     /* Pack the weights into extra_options */
     for(int64_t w = 0; w < extra.weights0.num_weights; w++){
         extra.weights0.weights[w] = (char *) weights1 + w*ND1*element_size;
     }
-    
+
     NPY_BEGIN_THREADS_DEF;
     NPY_BEGIN_THREADS;
 
@@ -1994,15 +2144,320 @@ static PyObject *countpairs_countpairs_xi(PyObject *self, PyObject *args, PyObje
     return Py_BuildValue("(Od)", ret, c_api_time);
 }
 
+
+static PyObject *countpairs_countpairs_s_mu(PyObject *self, PyObject *args, PyObject *kwargs)
+{
+#if PY_MAJOR_VERSION < 3
+    (void) self;
+    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash.
+#else
+    //In python3, self is simply the module object that was returned earlier by init
+    PyObject *module = self;
+#endif
+    PyArrayObject *x1_obj=NULL, *y1_obj=NULL, *z1_obj=NULL, *weights1_obj=NULL;
+    PyArrayObject *x2_obj=NULL, *y2_obj=NULL, *z2_obj=NULL, *weights2_obj=NULL;
+    int autocorr=0;
+    int nthreads=4;
+
+    double mu_max;
+    int nmu_bins;
+    char *binfile, *weighting_method_str = NULL;
+    struct config_options options = get_config_options();
+    options.verbose = 0;
+    options.instruction_set = -1;
+    options.periodic = 1;
+    options.c_api_timer = 0;
+    int8_t xbin_ref=options.bin_refine_factors[0],
+        ybin_ref=options.bin_refine_factors[1],
+        zbin_ref=options.bin_refine_factors[2];
+
+    static char *kwlist[] = {
+        "autocorr",
+        "nthreads",
+        "binfile",
+        "mu_max",
+        "nmu_bins",
+        "X1",
+        "Y1",
+        "Z1",
+        "weights1",
+        "X2",
+        "Y2",
+        "Z2",
+        "weights2",
+        "periodic",
+        "verbose", /* keyword verbose -> print extra info at runtime + progressbar */
+        "boxsize",
+        "output_savg",
+        "xbin_refine_factor",
+        "ybin_refine_factor",
+        "zbin_refine_factor",
+        "max_cells_per_dim",
+        "c_api_timer",
+        "isa",/* instruction set to use of type enum isa; valid values are AVX, SSE, FALLBACK */
+        "weight_type",
+        NULL
+    };
+
+    if ( ! PyArg_ParseTupleAndKeywords(args, kwargs, "iisdiO!O!O!|O!O!O!O!O!bbdbbbbhbis", kwlist,
+                                       &autocorr,&nthreads,&binfile, &mu_max, &nmu_bins,
+                                       &PyArray_Type,&x1_obj,
+                                       &PyArray_Type,&y1_obj,
+                                       &PyArray_Type,&z1_obj,
+                                       &PyArray_Type,&weights1_obj,
+                                       &PyArray_Type,&x2_obj,
+                                       &PyArray_Type,&y2_obj,
+                                       &PyArray_Type,&z2_obj,
+                                       &PyArray_Type,&weights2_obj,
+                                       &(options.periodic),
+                                       &(options.verbose),
+                                       &(options.boxsize),
+                                       &(options.need_avg_sep),
+                                       &xbin_ref, &ybin_ref, &zbin_ref,
+                                       &(options.max_cells_per_dim),
+                                       &(options.c_api_timer),
+                                       &(options.instruction_set),
+                                       &weighting_method_str)
+
+         ) {
+        PyObject_Print(kwargs, stdout, 0);
+        fprintf(stdout, "\n");
+
+        char msg[1024];
+        int len=snprintf(msg, 1024,"ArgumentError: In DDsmu> Could not parse the arguments. Input parameters are: \n");
+
+        /* How many keywords do we have? Subtract 1 because of the last NULL */
+        const size_t nitems = sizeof(kwlist)/sizeof(*kwlist) - 1;
+        int status = print_kwlist_into_msg(msg, 1024, len, kwlist, nitems);
+        if(status != EXIT_SUCCESS) {
+            fprintf(stderr,"Error message does not contain all of the keywords\n");
+        }
+
+        countpairs_error_out(module,msg);
+        Py_RETURN_NONE;
+    }
+    options.autocorr=autocorr;
+    /*This is for the fastest isa */
+    if(options.instruction_set == -1) {
+        options.instruction_set = highest_isa;
+    }
+
+    if(xbin_ref != options.bin_refine_factors[0] ||
+       ybin_ref != options.bin_refine_factors[1] ||
+       zbin_ref != options.bin_refine_factors[2]) {
+        options.bin_refine_factors[0] = xbin_ref;
+        options.bin_refine_factors[1] = ybin_ref;
+        options.bin_refine_factors[2] = zbin_ref;
+        set_bin_refine_scheme(&options, BINNING_CUST);//custom binning -> code will honor requested binning scheme
+    }
+
+    size_t element_size;
+    /* How many data points are there? And are they all of floating point type */
+    const int64_t ND1 = check_dims_and_datatype(module, x1_obj, y1_obj, z1_obj, weights1_obj, &element_size);
+    if(ND1 == -1) {
+        //Error has already been set -> simply return
+        Py_RETURN_NONE;
+    }
+
+    /* Ensure the weights are of the right shape (n_weights, n_particles) */
+    if(weights1_obj != NULL){
+        // A numpy dimension of length -1 will be expanded to n_weights
+        npy_intp dims[2] = {-1, ND1};
+        PyArray_Dims pdims = {.ptr = &(dims[0]), .len = 2};
+        weights1_obj = (PyArrayObject *) PyArray_Newshape(weights1_obj, &pdims, NPY_CORDER);
+    }
+
+    /* Validate the user's choice of weighting method */
+    weight_method_t weighting_method;
+    int wstatus = get_weight_method_by_name(weighting_method_str, &weighting_method);
+    if(wstatus != EXIT_SUCCESS){
+        char msg[1024];
+        snprintf(msg, 1024, "ValueError: In %s: unknown weight_type %s!", __FUNCTION__, weighting_method_str);
+        countpairs_error_out(module, msg);
+        Py_RETURN_NONE;
+    }
+    int found_weights = weights1_obj == NULL ? 0 : PyArray_SHAPE(weights1_obj)[0];
+    struct extra_options extra = get_extra_options(weighting_method);
+    if(extra.weights0.num_weights > 0 && extra.weights0.num_weights != found_weights){
+        char msg[1024];
+        snprintf(msg, 1024, "ValueError: In %s: specified weighting method %s which requires %"PRId64" weight(s)-per-particle, but found %d weight(s) instead!\n",
+                 __FUNCTION__, weighting_method_str, extra.weights0.num_weights, found_weights);
+        countpairs_error_out(module, msg);
+        Py_RETURN_NONE;
+    }
+
+    if(extra.weights0.num_weights > 0 && found_weights > MAX_NUM_WEIGHTS){
+        char msg[1024];
+        snprintf(msg, 1024, "ValueError: In %s: Provided %d weights-per-particle, but the code was compiled with MAX_NUM_WEIGHTS=%d.\n",
+                 __FUNCTION__, found_weights, MAX_NUM_WEIGHTS);
+        countpairs_error_out(module, msg);
+        Py_RETURN_NONE;
+    }
+
+    int64_t ND2=ND1;
+    if(autocorr == 0) {
+        char msg[1024];
+        if(x2_obj == NULL || y2_obj == NULL || z2_obj == NULL) {
+            snprintf(msg, 1024, "ValueError: In %s: If autocorr is 0, need to pass the second set of positions (X2=numpy array, Y2=numpy array, Z2=numpy array).\n",
+                     __FUNCTION__);
+            countpairs_error_out(module, msg);
+            Py_RETURN_NONE;
+        }
+        if((weights1_obj == NULL) != (weights2_obj == NULL)){
+            snprintf(msg, 1024, "ValueError: In %s: If autocorr is 0, must pass either zero or two sets of weights.\n",
+                     __FUNCTION__);
+            countpairs_error_out(module, msg);
+            Py_RETURN_NONE;
+        }
+
+        size_t element_size2;
+        ND2 = check_dims_and_datatype(module, x2_obj, y2_obj, z2_obj, weights2_obj, &element_size2);
+        if(ND2 == -1) {
+            //Error has already been set -> simply return
+            Py_RETURN_NONE;
+        }
+        /* Ensure the weights are of the right shape (n_weights, n_particles) */
+        if(weights2_obj != NULL){
+            npy_intp dims[2] = {-1, ND2};
+            PyArray_Dims pdims = {.ptr = &(dims[0]), .len = 2};
+            weights2_obj = (PyArrayObject *) PyArray_Newshape(weights2_obj, &pdims, NPY_CORDER);
+        }
+
+        if(element_size != element_size2) {
+            snprintf(msg, 1024, "TypeError: In %s: The two arrays must have the same data-type. First array is of type %s while second array is of type %s\n",
+                     __FUNCTION__, element_size == 4 ? "floats":"doubles", element_size2 == 4 ? "floats":"doubles");
+            countpairs_error_out(module, msg);
+            Py_RETURN_NONE;
+        }
+    }
+
+    /* Interpret the input objects as numpy arrays. */
+    const int requirements = NPY_ARRAY_IN_ARRAY;
+    PyObject *x1_array = NULL, *y1_array = NULL, *z1_array = NULL, *weights1_array = NULL;
+    PyObject *x2_array = NULL, *y2_array = NULL, *z2_array = NULL, *weights2_array = NULL;
+    x1_array = PyArray_FromArray(x1_obj, NOTYPE_DESCR, requirements);
+    y1_array = PyArray_FromArray(y1_obj, NOTYPE_DESCR, requirements);
+    z1_array = PyArray_FromArray(z1_obj, NOTYPE_DESCR, requirements);
+    if(weights1_obj != NULL){
+        weights1_array = PyArray_FromArray(weights1_obj, NOTYPE_DESCR, requirements);
+    }
+
+    if(autocorr == 0) {
+        x2_array = PyArray_FromArray(x2_obj, NOTYPE_DESCR, requirements);
+        y2_array = PyArray_FromArray(y2_obj, NOTYPE_DESCR, requirements);
+        z2_array = PyArray_FromArray(z2_obj, NOTYPE_DESCR, requirements);
+        if(weights2_obj != NULL){
+            weights2_array = PyArray_FromArray(weights2_obj, NOTYPE_DESCR, requirements);
+        }
+    }
+
+    if (x1_array == NULL || y1_array == NULL || z1_array == NULL ||
+        (autocorr == 0 && (x2_array == NULL || y2_array == NULL || z2_array == NULL))) {
+        Py_XDECREF(x1_array);
+        Py_XDECREF(y1_array);
+        Py_XDECREF(z1_array);
+        Py_XDECREF(weights1_array);
+
+        Py_XDECREF(x2_array);
+        Py_XDECREF(y2_array);
+        Py_XDECREF(z2_array);
+        Py_XDECREF(weights2_array);
+        char msg[1024];
+        snprintf(msg, 1024, "TypeError: In %s: Could not convert input to arrays of allowed floating point types (doubles or floats). Are you passing numpy arrays?",
+                 __FUNCTION__);
+        countpairs_error_out(module, msg);
+        Py_RETURN_NONE;
+    }
+
+
+    /* Get pointers to the data as C-types. */
+    void *X1 = NULL, *Y1 = NULL, *Z1 = NULL, *weights1=NULL;
+    void *X2 = NULL, *Y2 = NULL, *Z2 = NULL, *weights2=NULL;
+    X1 = PyArray_DATA((PyArrayObject *) x1_array);
+    Y1 = PyArray_DATA((PyArrayObject *) y1_array);
+    Z1 = PyArray_DATA((PyArrayObject *) z1_array);
+    if(weights1_array != NULL){
+        weights1 = PyArray_DATA((PyArrayObject *) weights1_array);
+    }
+
+    if(autocorr == 0) {
+        X2 = PyArray_DATA((PyArrayObject *) x2_array);
+        Y2 = PyArray_DATA((PyArrayObject *) y2_array);
+        Z2 = PyArray_DATA((PyArrayObject *) z2_array);
+        if(weights2_array != NULL){
+            weights2 = PyArray_DATA((PyArrayObject *) weights2_array);
+        }
+    }
+
+    /* Pack the weights into extra_options */
+    for(int64_t w = 0; w < extra.weights0.num_weights; w++){
+        extra.weights0.weights[w] = (char *) weights1 + w*ND1*element_size;
+        if(autocorr == 0){
+            extra.weights1.weights[w] = (char *) weights2 + w*ND2*element_size;
+        }
+    }
+
+    NPY_BEGIN_THREADS_DEF;
+    NPY_BEGIN_THREADS;
+
+    options.float_type = element_size;
+    results_countpairs_s_mu results;
+    double c_api_time = 0.0;
+    int status = countpairs_s_mu(ND1,X1,Y1,Z1,
+                                 ND2,X2,Y2,Z2,
+                                 nthreads,
+                                 autocorr,
+                                 binfile,
+                                 mu_max,
+                                 nmu_bins,
+                                 &results,
+                                 &options,
+                                 &extra);
+    if(options.c_api_timer) {
+        c_api_time = options.c_api_time;
+    }
+    NPY_END_THREADS;
+
+    /* Clean up. */
+    Py_DECREF(x1_array);Py_DECREF(y1_array);Py_DECREF(z1_array);Py_XDECREF(weights1_array);//x1 should absolutely not be NULL
+    Py_XDECREF(x2_array);Py_XDECREF(y2_array);Py_XDECREF(z2_array);Py_XDECREF(weights2_array);//x2 might be NULL depending on value of autocorr
+    if(status != EXIT_SUCCESS) {
+        Py_RETURN_NONE;
+    }
+
+
+    /* Build the output list */
+    PyObject *ret = PyList_New(0);//create an empty list
+    double smin=results.supp[0];
+    const double dmu = mu_max/(double)nmu_bins;//mu_min is assumed to be 0.0
+    for(int i=1;i<results.nsbin;i++) {
+        const double smax=results.supp[i];
+        for(int j=0;j<results.nmu_bins;j++) {
+            const int bin_index = i*(results.nmu_bins + 1) + j;
+            PyObject *item = NULL;
+            const double savg = results.savg[bin_index];
+            const double weight_avg = results.weightavg[bin_index];
+            item = Py_BuildValue("(ddddkd)", smin, smax,savg,(j+1)*dmu,results.npairs[bin_index], weight_avg);
+            PyList_Append(ret, item);
+            Py_XDECREF(item);
+        }
+        smin=smax;
+    }
+    free_results_s_mu(&results);
+
+    return Py_BuildValue("(Od)", ret, c_api_time);
+}
+
+
 static PyObject *countpairs_countspheres_vpf(PyObject *self, PyObject *args, PyObject *kwargs)
 {
 #if PY_MAJOR_VERSION < 3
     (void) self;//to suppress the unused variable warning. Terrible hack
-    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash. 
+    PyObject *module = NULL;//should not be used -> setting to NULL so any attempts to dereference will result in a crash.
 #else
     //In python3, self is simply the module object that was returned earlier by init
     PyObject *module = self;
-#endif    
+#endif
 
     PyArrayObject *x1_obj=NULL, *y1_obj=NULL, *z1_obj=NULL;
     double rmax;
@@ -2018,11 +2473,11 @@ static PyObject *countpairs_countspheres_vpf(PyObject *self, PyObject *args, PyO
     /* Reset the bin refine factors default (since the VPF is symmetric in XYZ, conceptually the binning should be identical in all three directions)*/
     int bin_ref[] = {1,1,1};
     set_bin_refine_factors(&options, bin_ref);
-    
+
     int8_t xbin_ref=options.bin_refine_factors[0],
         ybin_ref=options.bin_refine_factors[1],
         zbin_ref=options.bin_refine_factors[2];
-    
+
     static char *kwlist[] = {
         "rmax",
         "nbins",
@@ -2062,17 +2517,17 @@ static PyObject *countpairs_countspheres_vpf(PyObject *self, PyObject *args, PyO
 
         PyObject_Print(kwargs, stdout, 0);
         fprintf(stdout, "\n");
-        
+
         char msg[1024];
         int len=snprintf(msg, 1024,"ArgumentError: In vpf> Could not parse the arguments. Input parameters are: \n");
-        
+
         /* How many keywords do we have? Subtract 1 because of the last NULL */
         const size_t nitems = sizeof(kwlist)/sizeof(*kwlist) - 1;
         int status = print_kwlist_into_msg(msg, 1024, len, kwlist, nitems);
         if(status != EXIT_SUCCESS) {
             fprintf(stderr,"Error message does not contain all of the keywords\n");
         }
-        
+
         countpairs_error_out(module,msg);
         Py_RETURN_NONE;
     }
@@ -2088,12 +2543,12 @@ static PyObject *countpairs_countspheres_vpf(PyObject *self, PyObject *args, PyO
         options.bin_refine_factors[2] = zbin_ref;
         set_bin_refine_scheme(&options, BINNING_CUST);//custom binning -> code will honor requested binning scheme
     }
-    
+
     /* How many data points are there? And are they all of floating point type */
     size_t element_size;
     const int64_t ND1 = check_dims_and_datatype(module, x1_obj, y1_obj, z1_obj, NULL, &element_size);
     if(ND1 == -1) {
-        //Error has already been set -> simply return 
+        //Error has already been set -> simply return
         Py_RETURN_NONE;
     }
 
diff --git a/theory/python_bindings/call_correlation_functions.py b/theory/python_bindings/call_correlation_functions.py
index f9e78fb6..1ceb5f46 100644
--- a/theory/python_bindings/call_correlation_functions.py
+++ b/theory/python_bindings/call_correlation_functions.py
@@ -26,7 +26,8 @@
     countpairs_rp_pi as DDrppi,\
     countpairs_wp as wp,\
     countpairs_xi as xi,\
-    countspheres_vpf as vpf
+    countspheres_vpf as vpf,\
+    countpairs_s_mu as DDsmu
 
 
 def read_text_file(filename, encoding="utf-8"):
@@ -273,6 +274,35 @@ def main():
               .format(items[0], items[1], items[2], items[3], items[4], items[5]))
     print("-------------------------------------------------------------------------")
 
+    mu_max = 0.5  # upper edge of the last mu bin (mu starts at 0)
+    nmu_bins = 10  # bin width = mu_max/nmu_bins = 0.05 (needs 2 decimals)
+
+    print("\nRunning 2-D correlation function DD(s,mu)")
+    results_DDsmu, _ = DDsmu(autocorr=autocorr,
+                             nthreads=nthreads,
+                             binfile=binfile,
+                             mu_max=mu_max,
+                             nmu_bins=nmu_bins,
+                             X1=x,
+                             Y1=y,
+                             Z1=z,
+                             weights1=np.ones_like(x),
+                             weight_type='pair_product',
+                             verbose=True,
+                             periodic=periodic,
+                             boxsize=boxsize,
+                             output_savg=True)
+    print("\n#            ****** DD(s,mu): first {0} bins  *******      "
+          .format(numbins_to_print))
+    print("#      smin        smax       savg     mu_max     npairs    weightavg")
+    print("########################################################################")
+    for ibin in range(numbins_to_print):
+        items = results_DDsmu[ibin]
+        print("{0:12.4f} {1:12.4f} {2:10.4f} {3:10.2f} {4:10d} {5:10.4f}"
+              .format(items[0], items[1], items[2], items[3], items[4], items[5]))
+    print("------------------------------------------------------------------------")
+
+
 
     print("\nRunning 2-D projected correlation function wp(rp)")
     results_wp, _, _ = wp(boxsize=boxsize, pimax=pimax, nthreads=nthreads,
diff --git a/theory/tests/Makefile b/theory/tests/Makefile
index a4a24546..41e7d2d1 100644
--- a/theory/tests/Makefile
+++ b/theory/tests/Makefile
@@ -8,12 +8,14 @@ IO_DIR := $(ROOT_DIR)/io
 THEORY_DIR := $(ROOT_DIR)/theory
 DD_DIR := $(THEORY_DIR)/DD
 DDrppi_DIR := $(THEORY_DIR)/DDrppi
+DDsmu_DIR := $(THEORY_DIR)/DDsmu
 WP_DIR := $(THEORY_DIR)/wp
 XI_DIR := $(THEORY_DIR)/xi
 VPF_DIR := $(THEORY_DIR)/vpf
 
 DD_LIB := countpairs
 DDrppi_LIB := countpairs_rp_pi
+DDsmu_LIB := countpairs_s_mu
 WP_LIB := countpairs_wp
 XI_LIB := countpairs_xi
 VPF_LIB := countspheres
@@ -29,10 +31,10 @@ endif
 
 SRC1   := test_periodic.c $(IO_DIR)/io.c $(IO_DIR)/ftread.c $(UTILS_DIR)/utils.c
 OBJS1  := $(SRC1:.c=.o)
-C_LIBRARIES := $(DD_DIR)/lib$(DD_LIB).a $(DDrppi_DIR)/lib$(DDrppi_LIB).a $(WP_DIR)/lib$(WP_LIB).a \
+C_LIBRARIES := $(DD_DIR)/lib$(DD_LIB).a $(DDrppi_DIR)/lib$(DDrppi_LIB).a $(DDsmu_DIR)/lib$(DDsmu_LIB).a $(WP_DIR)/lib$(WP_LIB).a \
              $(XI_DIR)/lib$(XI_LIB).a $(VPF_DIR)/lib$(VPF_LIB).a
-INCL   := $(IO_DIR)/io.h $(IO_DIR)/ftread.h $(UTILS_DIR)/utils.h \
-          $(DD_DIR)/$(DD_LIB).h $(DDrppi_DIR)/$(DDrppi_LIB).h $(WP_DIR)/$(WP_LIB).h \
+INCL   := $(IO_DIR)/io.h $(IO_DIR)/ftread.h $(UTILS_DIR)/utils.h $(UTILS_DIR)/tests_common.h \
+          $(DD_DIR)/$(DD_LIB).h $(DDrppi_DIR)/$(DDrppi_LIB).h $(DDsmu_DIR)/$(DDsmu_LIB).h  $(WP_DIR)/$(WP_LIB).h \
           $(XI_DIR)/$(XI_LIB).h $(VPF_DIR)/$(VPF_LIB).h
 
 SRC2   := test_nonperiodic.c $(UTILS_DIR)/utils.c $(IO_DIR)/io.c $(IO_DIR)/ftread.c
@@ -40,10 +42,10 @@ OBJS2  := $(SRC2:.c=.o)
 
 all: tests $(TARGETS) $(INCL) uncompress $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile
 
-test_periodic: $(OBJS1) $(C_LIBRARIES) $(INCL) $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile 
+test_periodic: $(OBJS1) $(C_LIBRARIES) $(INCL) $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile
 	$(CC) $(OBJS1) $(C_LIBRARIES) $(GSL_LINK) $(CLINK) -o $@
 
-test_nonperiodic: $(OBJS2) $(C_LIBRARIES) $(INCL) $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile 
+test_nonperiodic: $(OBJS2) $(C_LIBRARIES) $(INCL) $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile
 	$(CC) $(OBJS2) $(C_LIBRARIES) $(CLINK) -o $@
 
 %.o: %.c $(INCL) $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile
@@ -55,6 +57,9 @@ $(DD_DIR)/lib$(DD_LIB).a: $(DD_DIR)/*.c $(DD_DIR)/*.c.src $(DD_DIR)/*.h.src $(RO
 $(DDrppi_DIR)/lib$(DDrppi_LIB).a: $(DDrppi_DIR)/*.c $(DDrppi_DIR)/*.c.src $(DDrppi_DIR)/*.h.src $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk
 	$(MAKE) -C $(DDrppi_DIR) libs
 
+$(DDsmu_DIR)/lib$(DDsmu_LIB).a: $(DDsmu_DIR)/*.c $(DDsmu_DIR)/*.c.src $(DDsmu_DIR)/*.h.src $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk
+	$(MAKE) -C $(DDsmu_DIR) libs
+
 $(WP_DIR)/lib$(WP_LIB).a: $(WP_DIR)/*.c $(WP_DIR)/*.c.src $(WP_DIR)/*.h.src $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk
 	$(MAKE) -C $(WP_DIR) libs
 
@@ -65,9 +70,9 @@ $(VPF_DIR)/lib$(VPF_LIB).a: $(VPF_DIR)/*.c $(VPF_DIR)/*.c.src $(VPF_DIR)/*.h.src
 	$(MAKE) -C $(VPF_DIR) libs
 
 python_lib: tests $(OBJS1) $(INCL) $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile | $(ROOT_DIR)/lib
-	@echo 
+	@echo
 	@echo "All THEORY tests are done. Now checking that the C extensions work."
-	@echo 
+	@echo
 	$(MAKE) -C ../python_bindings tests
 
 tests: test_periodic test_nonperiodic
@@ -86,8 +91,12 @@ wp: test_periodic
 	./test_periodic 2
 
 DDrppi: test_periodic test_nonperiodic
-	./test_periodic 0 5 6 7
-	./test_nonperiodic 1 2 
+	./test_periodic 0 6 7 8
+	./test_nonperiodic 1 3
+
+DDsmu: test_periodic test_nonperiodic
+	./test_periodic 5
+	./test_nonperiodic 2
 
 DD: test_periodic test_nonperiodic
 	./test_periodic 1
@@ -99,8 +108,9 @@ vpf: test_periodic
 xi: test_periodic
 	./test_periodic 4
 
+.PHONY: celna clena celan clean
+
+celna clena celan: clean
 clean:
 	$(RM) $(targets) $(OBJS1) $(OBJS2)
 	$(RM) -R *.dSYM
-
-
diff --git a/theory/tests/Mr19_DDsmu_nonperiodic b/theory/tests/Mr19_DDsmu_nonperiodic
new file mode 100644
index 00000000..39f5c453
--- /dev/null
+++ b/theory/tests/Mr19_DDsmu_nonperiodic
@@ -0,0 +1,140 @@
+     16692           0.20339722          -0.62204752            0.05000000           0.24966241
+     16470           0.20331567          -0.62204752            0.10000000           0.24710486
+     16016           0.20372512          -0.62204752            0.15000000           0.24565266
+     16050           0.20363876          -0.62204752            0.20000000           0.24788391
+     15916           0.20350369          -0.62204752            0.25000000           0.25101277
+     16272           0.20313386          -0.62204752            0.30000000           0.24895674
+     16230           0.20348917          -0.62204752            0.35000000           0.25338644
+     16546           0.20385236          -0.62204752            0.40000000           0.24980049
+     16526           0.20347987          -0.62204752            0.45000000           0.24824338
+     16324           0.20365368          -0.62204752            0.50000000           0.24607593
+     25118           0.29047101          -0.46820059            0.05000000           0.24741942
+     24792           0.28963305          -0.46820059            0.10000000           0.25267582
+     25086           0.29024286          -0.46820059            0.15000000           0.24901836
+     24722           0.29013252          -0.46820059            0.20000000           0.25010222
+     25022           0.29068373          -0.46820059            0.25000000           0.25151603
+     25014           0.28978432          -0.46820059            0.30000000           0.24788683
+     25092           0.29024169          -0.46820059            0.35000000           0.24838181
+     24996           0.29044902          -0.46820059            0.40000000           0.25580759
+     25044           0.28959921          -0.46820059            0.45000000           0.24973243
+     24610           0.29003703          -0.46820059            0.50000000           0.24841258
+     39030           0.41310991          -0.31435498            0.05000000           0.24873583
+     38388           0.41324114          -0.31435498            0.10000000           0.24837949
+     37854           0.41296066          -0.31435498            0.15000000           0.24745815
+     38026           0.41399658          -0.31435498            0.20000000           0.25035322
+     38274           0.41374743          -0.31435498            0.25000000           0.24580413
+     38604           0.41366503          -0.31435498            0.30000000           0.25080159
+     37936           0.41347934          -0.31435498            0.35000000           0.25182989
+     38476           0.41330508          -0.31435498            0.40000000           0.25113755
+     38446           0.41365482          -0.31435498            0.45000000           0.24936805
+     38322           0.41335469          -0.31435498            0.50000000           0.25166295
+     58284           0.58847036          -0.16050875            0.05000000           0.25020375
+     57664           0.58868424          -0.16050875            0.10000000           0.24883266
+     58050           0.58847073          -0.16050875            0.15000000           0.24703402
+     58574           0.58908260          -0.16050875            0.20000000           0.24857010
+     58054           0.58811167          -0.16050875            0.25000000           0.24999327
+     57676           0.58852707          -0.16050875            0.30000000           0.25020662
+     57812           0.58820372          -0.16050875            0.35000000           0.25040234
+     58400           0.58818573          -0.16050875            0.40000000           0.25057091
+     57954           0.58872986          -0.16050875            0.45000000           0.24965003
+     57778           0.58854156          -0.16050875            0.50000000           0.24997801
+     83592           0.83726945          -0.00666210            0.05000000           0.24998233
+     84400           0.83844570          -0.00666210            0.10000000           0.24862569
+     85084           0.83793227          -0.00666210            0.15000000           0.24851677
+     84796           0.83807338          -0.00666210            0.20000000           0.25001269
+     84248           0.83848332          -0.00666210            0.25000000           0.24972551
+     84860           0.83836965          -0.00666210            0.30000000           0.24923526
+     85374           0.83781963          -0.00666210            0.35000000           0.24773429
+     84728           0.83813240          -0.00666210            0.40000000           0.25042875
+     84652           0.83791795          -0.00666210            0.45000000           0.25026937
+     85016           0.83819879          -0.00666210            0.50000000           0.24971834
+    118788           1.19295365           0.14718457            0.05000000           0.25037935
+    118906           1.19265243           0.14718457            0.10000000           0.24915018
+    117776           1.19427135           0.14718457            0.15000000           0.24950589
+    120088           1.19373114           0.14718457            0.20000000           0.25066734
+    119402           1.19333891           0.14718457            0.25000000           0.25095498
+    121212           1.19294985           0.14718457            0.30000000           0.24910440
+    119042           1.19384043           0.14718457            0.35000000           0.24951204
+    120494           1.19303713           0.14718457            0.40000000           0.25028328
+    119208           1.19336002           0.14718457            0.45000000           0.24925978
+    120362           1.19308712           0.14718457            0.50000000           0.25168495
+    171592           1.70474276           0.30103000            0.05000000           0.24881097
+    170524           1.70434256           0.30103000            0.10000000           0.25096788
+    172864           1.70482526           0.30103000            0.15000000           0.25124184
+    173080           1.70509778           0.30103000            0.20000000           0.24959425
+    173230           1.70451935           0.30103000            0.25000000           0.25037203
+    174068           1.70575581           0.30103000            0.30000000           0.25001682
+    173370           1.70491087           0.30103000            0.35000000           0.24961422
+    173504           1.70421163           0.30103000            0.40000000           0.25015015
+    173384           1.70495092           0.30103000            0.45000000           0.25004891
+    175106           1.70524692           0.30103000            0.50000000           0.24864867
+    281176           2.44058087           0.45487534            0.05000000           0.24921168
+    282048           2.43990693           0.45487534            0.10000000           0.24960329
+    283034           2.43795647           0.45487534            0.15000000           0.24988415
+    283986           2.43917192           0.45487534            0.20000000           0.25103485
+    282324           2.43828188           0.45487534            0.25000000           0.24987784
+    283626           2.43841569           0.45487534            0.30000000           0.24946471
+    282438           2.43973167           0.45487534            0.35000000           0.24984319
+    282940           2.43923414           0.45487534            0.40000000           0.24882798
+    286006           2.43798930           0.45487534            0.45000000           0.24928019
+    285594           2.43981163           0.45487534            0.50000000           0.24905817
+    532804           3.49212451           0.60872281            0.05000000           0.24996206
+    530772           3.48966468           0.60872281            0.10000000           0.24898679
+    530256           3.49046850           0.60872281            0.15000000           0.25031266
+    531860           3.49468260           0.60872281            0.20000000           0.24968356
+    534886           3.49251076           0.60872281            0.25000000           0.25001477
+    534710           3.49149216           0.60872281            0.30000000           0.24922175
+    536232           3.49198070           0.60872281            0.35000000           0.24901246
+    535324           3.48953193           0.60872281            0.40000000           0.24939518
+    536586           3.49189361           0.60872281            0.45000000           0.24975535
+    538130           3.49257671           0.60872281            0.50000000           0.24955356
+   1138098           4.98779581           0.76256829            0.05000000           0.25000554
+   1134530           4.98682546           0.76256829            0.10000000           0.24994273
+   1131410           4.98576422           0.76256829            0.15000000           0.25053053
+   1129014           4.98384170           0.76256829            0.20000000           0.25053172
+   1128418           4.98402162           0.76256829            0.25000000           0.24980761
+   1134178           4.98555156           0.76256829            0.30000000           0.24973701
+   1137424           4.98529618           0.76256829            0.35000000           0.24908474
+   1136920           4.98521532           0.76256829            0.40000000           0.24988729
+   1130638           4.98483436           0.76256829            0.45000000           0.24977635
+   1130654           4.98518466           0.76256829            0.50000000           0.25027365
+   2565210           7.11954114           0.91641447            0.05000000           0.24984163
+   2550026           7.11785952           0.91641447            0.10000000           0.24953513
+   2550216           7.11824618           0.91641447            0.15000000           0.24930153
+   2551448           7.11813405           0.91641447            0.20000000           0.24937393
+   2551182           7.11654315           0.91641447            0.25000000           0.24950969
+   2551058           7.11708846           0.91641447            0.30000000           0.24947281
+   2555568           7.11670860           0.91641447            0.35000000           0.24988287
+   2541210           7.11573632           0.91641447            0.40000000           0.24989037
+   2534490           7.11495121           0.91641447            0.45000000           0.24978067
+   2531530           7.11853657           0.91641447            0.50000000           0.25041968
+   6140136          10.15832627           1.07025958            0.05000000           0.24935784
+   6134522          10.15851017           1.07025958            0.10000000           0.24941126
+   6114376          10.15981186           1.07025958            0.15000000           0.24898925
+   6095996          10.16081907           1.07025958            0.20000000           0.24984614
+   6070144          10.16135587           1.07025958            0.25000000           0.24990914
+   6056906          10.16012949           1.07025958            0.30000000           0.24959204
+   6056284          10.15961453           1.07025958            0.35000000           0.24946016
+   6041606          10.15759601           1.07025958            0.40000000           0.24930955
+   6056874          10.15689495           1.07025958            0.45000000           0.24932114
+   6047742          10.15663323           1.07025958            0.50000000           0.24934432
+  15182384          14.48878258           1.22410814            0.05000000           0.24909997
+  15147790          14.48819977           1.22410814            0.10000000           0.24957603
+  15131684          14.48850811           1.22410814            0.15000000           0.24946850
+  15123448          14.48342154           1.22410814            0.20000000           0.24950576
+  15093890          14.49069292           1.22410814            0.25000000           0.24969058
+  15064712          14.49152684           1.22410814            0.30000000           0.24984536
+  15035580          14.49233646           1.22410814            0.35000000           0.24979204
+  15020350          14.48772643           1.22410814            0.40000000           0.24961736
+  15006472          14.48914620           1.22410814            0.45000000           0.24972672
+  15009476          14.49032842           1.22410814            0.50000000           0.24964864
+  39140742          20.67496688           1.37795248            0.05000000           0.24958415
+  39105020          20.66745098           1.37795248            0.10000000           0.24958584
+  39004936          20.66815741           1.37795248            0.15000000           0.24949485
+  38945200          20.66820055           1.37795248            0.20000000           0.24967691
+  38888220          20.66681410           1.37795248            0.25000000           0.24965315
+  38721022          20.66850293           1.37795248            0.30000000           0.24958649
+  38618624          20.66668659           1.37795248            0.35000000           0.24966218
+  38651274          20.66767160           1.37795248            0.40000000           0.24973893
+  38683556          20.67058559           1.37795248            0.45000000           0.24975961
+  38778746          20.66625446           1.37795248            0.50000000           0.24983466
diff --git a/theory/tests/Mr19_DDsmu_periodic b/theory/tests/Mr19_DDsmu_periodic
new file mode 100644
index 00000000..e1606f75
--- /dev/null
+++ b/theory/tests/Mr19_DDsmu_periodic
@@ -0,0 +1,140 @@
+     16696           0.20340501          -0.62204752           0.05000000           0.24968146
+     16488           0.20332144          -0.62204752           0.10000000           0.24706347
+     16020           0.20373232          -0.62204752           0.15000000           0.24562233
+     16060           0.20364371          -0.62204752           0.20000000           0.24784059
+     15934           0.20350385          -0.62204752           0.25000000           0.25108589
+     16284           0.20314149          -0.62204752           0.30000000           0.24887897
+     16234           0.20349115          -0.62204752           0.35000000           0.25340692
+     16550           0.20385818          -0.62204752           0.40000000           0.24976416
+     16538           0.20347811          -0.62204752           0.45000000           0.24813122
+     16334           0.20365597          -0.62204752           0.50000000           0.24612709
+     25138           0.29047320          -0.46820059           0.05000000           0.24740801
+     24816           0.28963528          -0.46820059           0.10000000           0.25263034
+     25100           0.29024627          -0.46820059           0.15000000           0.24906452
+     24734           0.29013424          -0.46820059           0.20000000           0.25003660
+     25046           0.29069771          -0.46820059           0.25000000           0.25139261
+     25030           0.28979884          -0.46820059           0.30000000           0.24791067
+     25124           0.29025353          -0.46820059           0.35000000           0.24846020
+     25010           0.29045907          -0.46820059           0.40000000           0.25582840
+     25088           0.28960990          -0.46820059           0.45000000           0.24973555
+     24652           0.29005464          -0.46820059           0.50000000           0.24851525
+     39076           0.41310723          -0.31435498           0.05000000           0.24874608
+     38432           0.41324844          -0.31435498           0.10000000           0.24834401
+     37920           0.41298222          -0.31435498           0.15000000           0.24751310
+     38088           0.41400263          -0.31435498           0.20000000           0.25035322
+     38340           0.41375542          -0.31435498           0.25000000           0.24579433
+     38664           0.41367900          -0.31435498           0.30000000           0.25075408
+     38002           0.41348494          -0.31435498           0.35000000           0.25190770
+     38538           0.41331965          -0.31435498           0.40000000           0.25107039
+     38528           0.41366017          -0.31435498           0.45000000           0.24932288
+     38382           0.41336992          -0.31435498           0.50000000           0.25165435
+     58392           0.58845657          -0.16050875           0.05000000           0.25012367
+     57786           0.58870852          -0.16050875           0.10000000           0.24888890
+     58200           0.58847174          -0.16050875           0.15000000           0.24709230
+     58712           0.58909389          -0.16050875           0.20000000           0.24857269
+     58186           0.58813286          -0.16050875           0.25000000           0.25012674
+     57800           0.58853899          -0.16050875           0.30000000           0.25024506
+     57968           0.58821597          -0.16050875           0.35000000           0.25039608
+     58532           0.58819346          -0.16050875           0.40000000           0.25062984
+     58088           0.58872466          -0.16050875           0.45000000           0.24969933
+     57948           0.58854816          -0.16050875           0.50000000           0.24996402
+     83794           0.83729359          -0.00666210           0.05000000           0.25001428
+     84672           0.83845940          -0.00666210           0.10000000           0.24871135
+     85358           0.83797630          -0.00666210           0.15000000           0.24843082
+     85076           0.83811044          -0.00666210           0.20000000           0.25001990
+     84548           0.83850795          -0.00666210           0.25000000           0.24976886
+     85196           0.83841976          -0.00666210           0.30000000           0.24923974
+     85676           0.83787346          -0.00666210           0.35000000           0.24775034
+     85004           0.83817048          -0.00666210           0.40000000           0.25048931
+     84954           0.83795899          -0.00666210           0.45000000           0.25029859
+     85288           0.83825321          -0.00666210           0.50000000           0.24974315
+    119352           1.19295578           0.14718457           0.05000000           0.25031211
+    119520           1.19272930           0.14718457           0.10000000           0.24918593
+    118354           1.19427653           0.14718457           0.15000000           0.24953022
+    120690           1.19377006           0.14718457           0.20000000           0.25062360
+    119956           1.19338542           0.14718457           0.25000000           0.25086077
+    121794           1.19294302           0.14718457           0.30000000           0.24909890
+    119662           1.19387752           0.14718457           0.35000000           0.24958285
+    121042           1.19307377           0.14718457           0.40000000           0.25038331
+    119736           1.19342878           0.14718457           0.45000000           0.24926798
+    120924           1.19318640           0.14718457           0.50000000           0.25182151
+    172634           1.70484178           0.30103000           0.05000000           0.24901150
+    171622           1.70447783           0.30103000           0.10000000           0.25106794
+    173936           1.70492875           0.30103000           0.15000000           0.25128342
+    174218           1.70516371           0.30103000           0.20000000           0.24955715
+    174358           1.70468983           0.30103000           0.25000000           0.25049471
+    175166           1.70582292           0.30103000           0.30000000           0.25006190
+    174508           1.70498750           0.30103000           0.35000000           0.24963414
+    174688           1.70438109           0.30103000           0.40000000           0.25008698
+    174588           1.70507521           0.30103000           0.45000000           0.25011769
+    176394           1.70539512           0.30103000           0.50000000           0.24870294
+    283488           2.44073106           0.45487534           0.05000000           0.24923506
+    284490           2.44015027           0.45487534           0.10000000           0.24965515
+    285480           2.43804390           0.45487534           0.15000000           0.24986291
+    286390           2.43923239           0.45487534           0.20000000           0.25104629
+    284834           2.43849273           0.45487534           0.25000000           0.24991494
+    286494           2.43853899           0.45487534           0.30000000           0.24953764
+    285406           2.43996221           0.45487534           0.35000000           0.25001257
+    285932           2.43958068           0.45487534           0.40000000           0.24889016
+    289136           2.43852888           0.45487534           0.45000000           0.24934841
+    288778           2.44042085           0.45487534           0.50000000           0.24910449
+    538774           3.49265466           0.60872281           0.05000000           0.25012685
+    536890           3.49012486           0.60872281           0.10000000           0.24912778
+    536766           3.49103694           0.60872281           0.15000000           0.25047942
+    538644           3.49523525           0.60872281           0.20000000           0.24974164
+    541876           3.49306442           0.60872281           0.25000000           0.25001247
+    542040           3.49198551           0.60872281           0.30000000           0.24933670
+    544062           3.49246414           0.60872281           0.35000000           0.24904686
+    543728           3.48998211           0.60872281           0.40000000           0.24936274
+    545484           3.49250135           0.60872281           0.45000000           0.24991215
+    546724           3.49294220           0.60872281           0.50000000           0.24958501
+   1156554           4.98855112           0.76256829           0.05000000           0.25013543
+   1154302           4.98751036           0.76256829           0.10000000           0.25012477
+   1151936           4.98682179           0.76256829           0.15000000           0.25064812
+   1150726           4.98495569           0.76256829           0.20000000           0.25063066
+   1151192           4.98529231           0.76256829           0.25000000           0.24988710
+   1157706           4.98676745           0.76256829           0.30000000           0.24977810
+   1161690           4.98614358           0.76256829           0.35000000           0.24910945
+   1161492           4.98583037           0.76256829           0.40000000           0.24997636
+   1155500           4.98580328           0.76256829           0.45000000           0.24973296
+   1156280           4.98603707           0.76256829           0.50000000           0.25020194
+   2630150           7.12157827           0.91641447           0.05000000           0.24980989
+   2616906           7.12028119           0.91641447           0.10000000           0.24948964
+   2617738           7.12041075           0.91641447           0.15000000           0.24922114
+   2621348           7.11986117           0.91641447           0.20000000           0.24937453
+   2623228           7.11807429           0.91641447           0.25000000           0.24950429
+   2624028           7.11885765           0.91641447           0.30000000           0.24953469
+   2629056           7.11856852           0.91641447           0.35000000           0.24991871
+   2616664           7.11762632           0.91641447           0.40000000           0.25000758
+   2607754           7.11656104           0.91641447           0.45000000           0.24987818
+   2605020           7.12005098           0.91641447           0.50000000           0.25040764
+   6363294          10.16119903           1.07025958           0.05000000           0.24945822
+   6365066          10.16177346           1.07025958           0.10000000           0.24950650
+   6350118          10.16341397           1.07025958           0.15000000           0.24906329
+   6338594          10.16494264           1.07025958           0.20000000           0.24988379
+   6318838          10.16566686           1.07025958           0.25000000           0.24998068
+   6311932          10.16404943           1.07025958           0.30000000           0.24965104
+   6309244          10.16369761           1.07025958           0.35000000           0.24947432
+   6299678          10.16204343           1.07025958           0.40000000           0.24929928
+   6316354          10.16138045           1.07025958           0.45000000           0.24933026
+   6304980          10.16126849           1.07025958           0.50000000           0.24930246
+  15979970          14.49650011           1.22410814           0.05000000           0.24919628
+  15971062          14.49621019           1.22410814           0.10000000           0.24968706
+  15977284          14.49627589           1.22410814           0.15000000           0.24954267
+  15998070          14.49184107           1.22410814           0.20000000           0.24954113
+  15986760          14.49923658           1.22410814           0.25000000           0.24976108
+  15975338          14.49975675           1.22410814           0.30000000           0.24983244
+  15962496          14.49935401           1.22410814           0.35000000           0.24983263
+  15944790          14.49520336           1.22410814           0.40000000           0.24960287
+  15927326          14.49714924           1.22410814           0.45000000           0.24974935
+  15924152          14.49931252           1.22410814           0.50000000           0.24963448
+  42194174          20.69085123           1.37795248           0.05000000           0.24954990
+  42226860          20.68331626           1.37795248           0.10000000           0.24947298
+  42206962          20.68446245           1.37795248           0.15000000           0.24942812
+  42206278          20.68412907           1.37795248           0.20000000           0.24965358
+  42193166          20.68325619           1.37795248           0.25000000           0.24964480
+  42074518          20.68514150           1.37795248           0.30000000           0.24956370
+  42014558          20.68325092           1.37795248           0.35000000           0.24967507
+  42075126          20.68469578           1.37795248           0.40000000           0.24977415
+  42129724          20.68714916           1.37795248           0.45000000           0.24974958
+  42257974          20.68303823           1.37795248           0.50000000           0.24983016
diff --git a/theory/tests/test_nonperiodic.c b/theory/tests/test_nonperiodic.c
index 16a9e0d3..1a6300b4 100644
--- a/theory/tests/test_nonperiodic.c
+++ b/theory/tests/test_nonperiodic.c
@@ -6,29 +6,18 @@
   directory at https://github.com/manodeep/Corrfunc/
 */
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <string.h>
-#include <sys/time.h>
-#include <inttypes.h>
-
-#ifndef MAXLEN
-#define MAXLEN 500
-#endif
-
-#include "defs.h"
-#include "utils.h"
+#include "tests_common.h"
 #include "io.h"
-#include "ftread.h"
 
 #include "../DD/countpairs.h"
 #include "../DDrppi/countpairs_rp_pi.h"
+#include "../DDsmu/countpairs_s_mu.h"
 
 char tmpoutputfile[]="./test_nonperiodic_output.txt";
 
 int test_nonperiodic_DD(const char *correct_outputfile);
 int test_nonperiodic_DDrppi(const char *correct_outputfile);
+int test_nonperiodic_DDsmu(const char *correct_outputfile);
 void read_data_and_set_globals(const char *firstfilename, const char *firstformat,const char *secondfilename,const char *secondformat);
 
 //Global variables
@@ -38,25 +27,15 @@ double *X1=NULL,*Y1=NULL,*Z1=NULL,*weights1=NULL;
 int ND2;
 double *X2=NULL,*Y2=NULL,*Z2=NULL,*weights2=NULL;
 
-char binfile[]="bins";
-double pimax=40.0;
-double boxsize=420.0;
-#if defined(_OPENMP)
-const int nthreads=4;
-#else
-const int nthreads=1;
-#endif
-
 char current_file1[MAXLEN],current_file2[MAXLEN];
 struct config_options options;
-
-const double maxdiff = 1e-9;
-const double maxreldiff = 1e-6;
 //end of global variables
 
 int test_nonperiodic_DD(const char *correct_outputfile)
 {
     int autocorr = (X1==X2) ? 1:0;
+    results_countpairs results;
+    int ret = EXIT_FAILURE;
     
     // Set up the weights pointers
     weight_method_t weight_method = PAIR_PRODUCT;
@@ -64,53 +43,54 @@ int test_nonperiodic_DD(const char *correct_outputfile)
     extra.weights0.weights[0] = weights1;
     extra.weights1.weights[0] = weights2;
 
-    //Do the straight-up DD counts
-    results_countpairs results;
-    int status = countpairs(ND1,X1,Y1,Z1,
-                            ND2,X2,Y2,Z2,
-                            nthreads,
-                            autocorr,
-                            binfile,
-                            &results,
-                            &options,
-                            &extra);
-    if(status != EXIT_SUCCESS) {
-        return status;
-    }
-
-    int ret = EXIT_FAILURE;
-    FILE *fp=my_fopen(correct_outputfile,"r");
-    if(fp == NULL) {
-        free_results(&results);
-        return EXIT_FAILURE;
-    }
-    for(int i=1;i<results.nbin;i++) {
-        uint64_t npairs;
-        double rpavg, weightavg;
-        ret = EXIT_FAILURE;
-        int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf%*[^\n]", &npairs, &rpavg, &weightavg);
-        if(nitems != 3) {
-            break;
+    BEGIN_INTEGRATION_TEST_SECTION
+    
+        //Do the straight-up DD counts
+        int status = countpairs(ND1,X1,Y1,Z1,
+                                ND2,X2,Y2,Z2,
+                                nthreads,
+                                autocorr,
+                                binfile,
+                                &results,
+                                &options,
+                                &extra);
+        if(status != EXIT_SUCCESS) {
+            return status;
         }
-        int floats_equal = AlmostEqualRelativeAndAbs_double(rpavg, results.rpavg[i], maxdiff, maxreldiff);
-        int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[i], maxdiff, maxreldiff);
-
-        //Check for exact equality of npairs and float "equality" for rpavg
-        if(npairs == results.npairs[i] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
-            ret = EXIT_SUCCESS;
-        } else {
-            ret = EXIT_FAILURE;//not required but showing intent 
-            fprintf(stderr,"True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[i]);
-            fprintf(stderr,"True rpavg  = %e Computed rpavg = %e. floats_equal = %d\n", rpavg, results.rpavg[i], floats_equal);
-            fprintf(stderr,"True weightavg = %e Computed weightavg = %e. weights_equal = %d\n", weightavg, results.weightavg[i], weights_equal);
-            break;
+        
+        FILE *fp=my_fopen(correct_outputfile,"r");
+        if(fp == NULL) {
+            free_results(&results);
+            return EXIT_FAILURE;
         }
-    }
-    fclose(fp);
-
+        for(int i=1;i<results.nbin;i++) {
+            uint64_t npairs;
+            double rpavg, weightavg;
+            ret = EXIT_FAILURE;
+            int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf%*[^\n]", &npairs, &rpavg, &weightavg);
+            if(nitems != 3) {
+                break;
+            }
+            int floats_equal = AlmostEqualRelativeAndAbs_double(rpavg, results.rpavg[i], maxdiff, maxreldiff);
+            int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[i], maxdiff, maxreldiff);
+            
+            //Check for exact equality of npairs and float "equality" for rpavg
+            if(npairs == results.npairs[i] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
+                ret = EXIT_SUCCESS;
+            } else {
+                ret = EXIT_FAILURE;//not required but showing intent
+                fprintf(stderr,"True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[i]);
+                fprintf(stderr,"True rpavg  = %e Computed rpavg = %e. floats_equal = %d\n", rpavg, results.rpavg[i], floats_equal);
+                fprintf(stderr,"True weightavg = %e Computed weightavg = %e. weights_equal = %d\n", weightavg, results.weightavg[i], weights_equal);
+                break;
+            }
+        }
+        fclose(fp);
+    END_INTEGRATION_TEST_SECTION;
+        
     /* If the test failed, then write to temporary file, so a comparison can be made */
     if(ret != EXIT_SUCCESS) {
-        fp=my_fopen(tmpoutputfile,"w"); 
+        FILE *fp=my_fopen(tmpoutputfile,"w");
         double rlow = results.rupp[0];
         for(int i=1;i<results.nbin;i++) {
             fprintf(fp,"%10"PRIu64" %20.8lf %20.8lf %20.8lf %20.8lf \n",results.npairs[i],results.rpavg[i],rlow,results.rupp[i], results.weightavg[i]);
@@ -125,70 +105,72 @@ int test_nonperiodic_DD(const char *correct_outputfile)
 
 int test_nonperiodic_DDrppi(const char *correct_outputfile)
 {
-    int autocorr = (X1==X2) ? 1:0;
+    results_countpairs_rp_pi results;
+    int ret = EXIT_FAILURE;
     
+    int autocorr = (X1==X2) ? 1:0;
     // Set up the weights pointers
     weight_method_t weight_method = PAIR_PRODUCT;
     struct extra_options extra = get_extra_options(weight_method);
     extra.weights0.weights[0] = weights1;
     extra.weights1.weights[0] = weights2;
-
-    results_countpairs_rp_pi results;
-    int status = countpairs_rp_pi(ND1,X1,Y1,Z1,
-                                  ND2,X2,Y2,Z2,
-                                  nthreads,
-                                  autocorr,
-                                  binfile,
-                                  pimax,
-                                  &results,
-                                  &options,
-                                  &extra);
-    if(status != EXIT_SUCCESS) {
-        return status;
-    }
-
-    int ret = EXIT_FAILURE;
-    const int npibin = results.npibin;
-    const double dpi = pimax/(double)results.npibin ;
-    FILE *fp=my_fopen(correct_outputfile,"r");
-    if(fp == NULL) {
-        free_results_rp_pi(&results);
-        return EXIT_FAILURE;
-    }
-
     
-    for(int i=1;i<results.nbin;i++) {
-        for(int j=0;j<npibin;j++) {
-            int index = i*(npibin+1) + j;
-            uint64_t npairs;
-            double rpavg, weightavg;
-            ret = EXIT_FAILURE;
-            int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf%*[^\n]", &npairs, &rpavg, &weightavg);
-            if(nitems != 3) {
-                i = results.nbin;
-                ret = EXIT_FAILURE;//not required but showing intent
-                break;
-            }
-            int floats_equal = AlmostEqualRelativeAndAbs_double(rpavg, results.rpavg[index], maxdiff, maxreldiff);
-            int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[index], maxdiff, maxreldiff);
-            
-            //Check for exact equality of npairs and float "equality" for rpavg
-            if(npairs == results.npairs[index] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
-                ret = EXIT_SUCCESS;
-            } else {
-                fprintf(stderr,"Failed. True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[index]);
-                fprintf(stderr,"Failed. True rpavg = %e Computed rpavg = %e. floats_equal = %d\n", rpavg, results.rpavg[index], floats_equal);
-                fprintf(stderr,"Failed. True weightavg = %e Computed weightavg = %e. weights_equal = %d\n", weightavg, results.weightavg[index], weights_equal);
-                ret = EXIT_FAILURE;//not required but showing intent 
-                i=results.nbin;
-                break;
-            }
+    BEGIN_INTEGRATION_TEST_SECTION
+        int status = countpairs_rp_pi(ND1,X1,Y1,Z1,
+                                      ND2,X2,Y2,Z2,
+                                      nthreads,
+                                      autocorr,
+                                      binfile,
+                                      pimax,
+                                      &results,
+                                      &options,
+                                      &extra);
+        if(status != EXIT_SUCCESS) {
+            return status;
         }
-    }
-    fclose(fp);
 
+        const int npibin = results.npibin;
+        FILE *fp=my_fopen(correct_outputfile,"r");
+        if(fp == NULL) {
+            free_results_rp_pi(&results);
+            return EXIT_FAILURE;
+        }
+        
+        for(int i=1;i<results.nbin;i++) {
+            for(int j=0;j<npibin;j++) {
+                int index = i*(npibin+1) + j;
+                uint64_t npairs;
+                double rpavg, weightavg;
+                ret = EXIT_FAILURE;
+                int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf%*[^\n]", &npairs, &rpavg, &weightavg);
+                if(nitems != 3) {
+                    i = results.nbin;
+                    ret = EXIT_FAILURE;//not required but showing intent
+                    break;
+                }
+                int floats_equal = AlmostEqualRelativeAndAbs_double(rpavg, results.rpavg[index], maxdiff, maxreldiff);
+                int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[index], maxdiff, maxreldiff);
+                
+                //Check for exact equality of npairs and float "equality" for rpavg
+                if(npairs == results.npairs[index] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
+                    ret = EXIT_SUCCESS;
+                } else {
+                    fprintf(stderr,"Failed. True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[index]);
+                    fprintf(stderr,"Failed. True rpavg = %e Computed rpavg = %e. floats_equal = %d\n", rpavg, results.rpavg[index], floats_equal);
+                    fprintf(stderr,"Failed. True weightavg = %e Computed weightavg = %e. weights_equal = %d\n", weightavg, results.weightavg[index], weights_equal);
+                    ret = EXIT_FAILURE;//not required but showing intent
+                    i=results.nbin;
+                    break;
+                }
+            }
+        }
+        fclose(fp);
+    END_INTEGRATION_TEST_SECTION;
+    
     if(ret != EXIT_SUCCESS) {
-        fp = my_fopen(tmpoutputfile,"w");
+        FILE *fp = my_fopen(tmpoutputfile,"w");
+        const int npibin = results.npibin;
+        const double dpi = pimax/(double)results.npibin ;
         for(int i=1;i<results.nbin;i++) {
             const double logrp = log10(results.rupp[i]);
             for(int j=0;j<npibin;j++) {
@@ -204,6 +186,90 @@ int test_nonperiodic_DDrppi(const char *correct_outputfile)
     return ret;
 }
 
+int test_nonperiodic_DDsmu(const char *correct_outputfile)
+{
+    results_countpairs_s_mu results;
+    int ret = EXIT_FAILURE;
+    
+    int autocorr = (X1==X2) ? 1:0;
+    // Set up the weights pointers
+    weight_method_t weight_method = PAIR_PRODUCT;
+    struct extra_options extra = get_extra_options(weight_method);
+    extra.weights0.weights[0] = weights1;
+    extra.weights1.weights[0] = weights2;
+
+    BEGIN_INTEGRATION_TEST_SECTION
+        int status = countpairs_s_mu(ND1,X1,Y1,Z1,
+                                     ND2,X2,Y2,Z2,
+                                     nthreads,
+                                     autocorr,
+                                     binfile,
+                                     theory_mu_max,
+                                     nmu_bins,
+                                     &results,
+                                     &options,
+                                     &extra);
+        if(status != EXIT_SUCCESS) {
+            return status;
+        }
+
+        const int nmubin = results.nmu_bins;
+        FILE *fp=my_fopen(correct_outputfile,"r");
+        if(fp == NULL) {
+            free_results_s_mu(&results);
+            return EXIT_FAILURE;
+        }
+        
+        for(int i=1;i<results.nsbin;i++) {
+            for(int j=0;j<nmubin;j++) {
+                int index = i*(nmubin+1) + j;
+                uint64_t npairs;
+                double savg, weightavg;
+                ret = EXIT_FAILURE;
+                int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf%*[^\n]", &npairs, &savg, &weightavg);
+                if(nitems != 3) {
+                    i = results.nsbin;
+                    ret = EXIT_FAILURE;//not required but showing intent
+                    break;
+                }
+                int floats_equal = AlmostEqualRelativeAndAbs_double(savg, results.savg[index], maxdiff, maxreldiff);
+                int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[index], maxdiff, maxreldiff);
+                
+                //Check for exact equality of npairs and float "equality" for savg and weightavg
+                if(npairs == results.npairs[index] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
+                    ret = EXIT_SUCCESS;
+                } else {
+                    fprintf(stderr,"Failed. True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[index]);
+                    fprintf(stderr,"Failed. True savg = %e Computed savg = %e. floats_equal = %d\n", savg, results.savg[index], floats_equal);
+                    fprintf(stderr,"Failed. True weightavg = %e Computed weightavg = %e. weights_equal = %d\n", weightavg, results.weightavg[index], weights_equal);
+                    ret = EXIT_FAILURE;//not required but showing intent
+                    i=results.nsbin;
+                    break;
+                }
+            }
+        }
+        fclose(fp);
+    END_INTEGRATION_TEST_SECTION;
+        
+    if(ret != EXIT_SUCCESS) {
+        FILE *fp = my_fopen(tmpoutputfile,"w");
+        const int nmubin = results.nmu_bins;
+        const double dmu = theory_mu_max/(double)results.nmu_bins ;
+        for(int i=1;i<results.nsbin;i++) {
+            const double logs = log10(results.supp[i]);
+            for(int j=0;j<nmubin;j++) {
+                int index = i*(nmubin+1) + j;
+                fprintf(fp,"%10"PRIu64" %20.8lf %20.8lf  %20.8lf %20.8lf\n",results.npairs[index],results.savg[index],logs,(j+1)*dmu, results.weightavg[index]);
+            }
+        }
+        fclose(fp);
+    }
+
+    //free the result structure
+    free_results_s_mu(&results);
+    return ret;
+}
+
 void read_data_and_set_globals(const char *firstfilename, const char *firstformat,const char *secondfilename,const char *secondformat)
 {
     int free_X2=0;
@@ -264,19 +330,18 @@ int main(int argc, char **argv)
     struct timeval tstart,t0,t1;
     char file[]="../tests/data/gals_Mr19.ff";
     char fileformat[]="f";
-    
+
     options = get_config_options();
     options.need_avg_sep=1;
     options.verbose=0;
     options.periodic=0;
     options.float_type=sizeof(double);
-    //options.instruction_set = FALLBACK;
 
     gettimeofday(&tstart,NULL);
 
     //set the globals
     ND1 = read_positions(file,fileformat, sizeof(double), 4, &X1, &Y1, &Z1, &weights1);
-    
+
     ND2 = ND1;
     X2 = X1;
     Y2 = Y1;
@@ -286,24 +351,36 @@ int main(int argc, char **argv)
     strncpy(current_file1,file,MAXLEN);
     strncpy(current_file2,file,MAXLEN);
     reset_bin_refine_factors(&options);
-    
+
     int failed=0;
     int status;
 
-    const char alltests_names[][MAXLEN] = {"Mr19 DD (nonperiodic)","Mr19 DDrppi (nonperiodic)","CMASS DDrppi DR (nonperiodic)"};
+    const char alltests_names[][MAXLEN] = {"Mr19 DD (nonperiodic)",
+                                           "Mr19 DDrppi (nonperiodic)",
+                                           "Mr19 DDsmu (nonperiodic)",
+                                           "CMASS DDrppi DR (nonperiodic)"};
     const int ntests = sizeof(alltests_names)/(sizeof(char)*MAXLEN);
-    const int function_pointer_index[] = {0,1,1};//0->DD, 1->DDrppi
-
-    const char correct_outputfiles[][MAXLEN] = {"Mr19_DD_nonperiodic","Mr19_DDrppi_nonperiodic","cmass_DR_nonperiodic"};
-    const char firstfilename[][MAXLEN] = {"../tests/data/gals_Mr19.ff","../tests/data/gals_Mr19.ff","../tests/data/cmassmock_Zspace.ff"};
-    const char firstfiletype[][MAXLEN] = {"f","f","f"};
-    const char secondfilename[][MAXLEN] = {"../tests/data/gals_Mr19.ff","../tests/data/gals_Mr19.ff","../tests/data/random_Zspace.ff"};
-    const char secondfiletype[][MAXLEN] = {"f","f","f"};
-    
-    const double allpimax[]             = {40.0,40.0,80.0};
-
-    int (*allfunctions[]) (const char *) = {test_nonperiodic_DD,test_nonperiodic_DDrppi};
-    const int numfunctions=2;//2 functions total
+    const int function_pointer_index[] = {0,1,2,1};//0->DD, 1->DDrppi, 2->DDsmu
+
+    const char correct_outputfiles[][MAXLEN] = {"Mr19_DD_nonperiodic",
+                                                "Mr19_DDrppi_nonperiodic",
+                                                "Mr19_DDsmu_nonperiodic",
+                                                "cmass_DR_nonperiodic"};
+    const char firstfilename[][MAXLEN] = {"../tests/data/gals_Mr19.ff",
+                                          "../tests/data/gals_Mr19.ff",
+                                          "../tests/data/gals_Mr19.ff",
+                                          "../tests/data/cmassmock_Zspace.ff"};
+    const char firstfiletype[][MAXLEN] = {"f","f","f","f"};
+    const char secondfilename[][MAXLEN] = {"../tests/data/gals_Mr19.ff",
+                                           "../tests/data/gals_Mr19.ff",
+                                           "../tests/data/gals_Mr19.ff",
+                                           "../tests/data/random_Zspace.ff"};
+    const char secondfiletype[][MAXLEN] = {"f","f","f","f"};
+
+    const double allpimax[]             = {40.0,40.0,40.0,80.0};
+
+    int (*allfunctions[]) (const char *) = {test_nonperiodic_DD,test_nonperiodic_DDrppi,test_nonperiodic_DDsmu};
+    const int numfunctions=3;//3 functions total
 
     int total_tests=0,skipped=0;
 
diff --git a/theory/tests/test_periodic.c b/theory/tests/test_periodic.c
index e59da2b3..9a0e6615 100644
--- a/theory/tests/test_periodic.c
+++ b/theory/tests/test_periodic.c
@@ -6,23 +6,12 @@
   directory at https://github.com/manodeep/Corrfunc/
 */
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <string.h>
-#include <sys/time.h>
-#include <inttypes.h>
-
-#ifndef MAXLEN
-#define MAXLEN 500
-#endif
-
-#include "defs.h"
+#include "tests_common.h"
 #include "io.h"
-#include "utils.h"
 
 #include "../DD/countpairs.h"
 #include "../DDrppi/countpairs_rp_pi.h"
+#include "../DDsmu/countpairs_s_mu.h"
 #include "../wp/countpairs_wp.h"
 #include "../xi/countpairs_xi.h"
 #include "../vpf/countspheres.h"
@@ -30,8 +19,8 @@
 char tmpoutputfile[]="./test_periodic_output.txt";
 
 int test_periodic_DD(const char *correct_outputfile);
-/* int test_periodic_DD_weighted(const char *correct_outputfile); */
 int test_periodic_DDrppi(const char *correct_outputfile);
+int test_periodic_DDsmu(const char *correct_outputfile);
 int test_wp(const char *correct_outputfile);
 int test_vpf(const char *correct_outputfile);
 int test_xi(const char *correct_outputfile);
@@ -39,6 +28,7 @@ int test_xi(const char *correct_outputfile);
 void read_data_and_set_globals(const char *firstfilename, const char *firstformat,
                                const char *secondfilename, const char *secondformat);
 
+
 //Global variables
 int ND1;
 double *X1=NULL,*Y1=NULL,*Z1=NULL,*weights1=NULL;
@@ -46,26 +36,16 @@ double *X1=NULL,*Y1=NULL,*Z1=NULL,*weights1=NULL;
 int ND2;
 double *X2=NULL,*Y2=NULL,*Z2=NULL,*weights2=NULL;
 
-char binfile[]="bins";
-double pimax=40.0;
-double boxsize=420.0;
-#ifdef _OPENMP
-const int nthreads=4;
-#else
-const int nthreads=1;
-#endif
-
 char current_file1[MAXLEN],current_file2[MAXLEN];
 
 struct config_options options;
-const double maxdiff = 1e-9;
-const double maxreldiff = 1e-6;
-
 //end global variables
 
 int test_periodic_DD(const char *correct_outputfile)
 {
     int autocorr = (X1==X2) ? 1:0;
+    results_countpairs results;
+    int ret = EXIT_FAILURE;
     
     // Set up the weights pointers
     weight_method_t weight_method = PAIR_PRODUCT;
@@ -73,55 +53,57 @@ int test_periodic_DD(const char *correct_outputfile)
     extra.weights0.weights[0] = weights1;
     extra.weights1.weights[0] = weights2;
 
-    //Do the straight-up DD counts
-    results_countpairs results;
-    int status = countpairs(ND1,X1,Y1,Z1,
-                            ND2,X2,Y2,Z2,
-                            nthreads,
-                            autocorr,
-                            binfile,
-                            &results,
-                            &options,
-                            &extra);
-    if(status != EXIT_SUCCESS) {
-        return status;
-    }
-
-    int ret = EXIT_FAILURE;
-    double rlow=results.rupp[0];
-    FILE *fp = my_fopen(correct_outputfile,"r");
-    for(int i=1;i<results.nbin;i++) {
-        uint64_t npairs;
-        double rpavg, weightavg;
-        ret = EXIT_FAILURE;
-        int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf", &npairs, &rpavg, &weightavg);
-        if(nitems != 3) {
-            ret = EXIT_FAILURE;//not required but showing intent
-            break;
-        }
-        int floats_equal = AlmostEqualRelativeAndAbs_double(rpavg, results.rpavg[i], maxdiff, maxreldiff);
-        int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[i], maxdiff, maxreldiff);
+    BEGIN_INTEGRATION_TEST_SECTION
         
-        //Check for exact equality of npairs and float "equality" for rpavg
-        if(npairs == results.npairs[i] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
-            ret = EXIT_SUCCESS;
-        } else {
-            ret = EXIT_FAILURE;//not required but showing intent 
-            fprintf(stderr,"Failed. True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[i]);
-            fprintf(stderr,"Failed. True rpavg = %e Computed rpavg = %e. floats_equal = %d\n", rpavg, results.rpavg[i], floats_equal);
-            fprintf(stderr,"Failed. True weightavg = %e Computed weightavg = %e. weights_equal = %d\n", weightavg, results.weightavg[i], weights_equal);
-            break;
+        //Do the straight-up DD counts
+        int status = countpairs(ND1,X1,Y1,Z1,
+                                ND2,X2,Y2,Z2,
+                                nthreads,
+                                autocorr,
+                                binfile,
+                                &results,
+                                &options,
+                                &extra);
+        if(status != EXIT_SUCCESS) {
+            return status;
         }
-
-    }
-    fclose(fp);
     
+        FILE *fp = my_fopen(correct_outputfile,"r");
+        for(int i=1;i<results.nbin;i++) {
+            uint64_t npairs;
+            double rpavg, weightavg;
+            ret = EXIT_FAILURE;
+            int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf", &npairs, &rpavg, &weightavg);
+            if(nitems != 3) {
+                ret = EXIT_FAILURE;//not required but showing intent
+                break;
+            }
+            int floats_equal = AlmostEqualRelativeAndAbs_double(rpavg, results.rpavg[i], maxdiff, maxreldiff);
+            int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[i], maxdiff, maxreldiff);
+            
+            //Check for exact equality of npairs and float "equality" for rpavg
+            if(npairs == results.npairs[i] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
+                ret = EXIT_SUCCESS;
+            } else {
+                ret = EXIT_FAILURE;//not required but showing intent
+                fprintf(stderr,"Failed. True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[i]);
+                fprintf(stderr,"Failed. True rpavg = %e Computed rpavg = %e. floats_equal = %d\n", rpavg, results.rpavg[i], floats_equal);
+                fprintf(stderr,"Failed. True weightavg = %e Computed weightavg = %e. weights_equal = %d\n", weightavg, results.weightavg[i], weights_equal);
+                break;
+            }
+            
+        }
+        fclose(fp);
+
+    END_INTEGRATION_TEST_SECTION;
+        
     if(ret != EXIT_SUCCESS) {
-        fp=my_fopen(tmpoutputfile,"w");
+        FILE *fp=my_fopen(tmpoutputfile,"w");
         if(fp == NULL) {
             free_results(&results);
             return EXIT_FAILURE;
         }
+        double rlow=results.rupp[0];
         for(int i=1;i<results.nbin;i++) {
             fprintf(fp,"%10"PRIu64" %20.8lf %20.8lf %20.8lf %20.8lf \n",results.npairs[i],results.rpavg[i],rlow,results.rupp[i],results.weightavg[i]);
             rlow = results.rupp[i];
@@ -136,69 +118,73 @@ int test_periodic_DD(const char *correct_outputfile)
 
 int test_periodic_DDrppi(const char *correct_outputfile)
 {
+    results_countpairs_rp_pi results;
+    int ret = EXIT_FAILURE;
     int autocorr = (X1==X2) ? 1:0;
-    
+
     // Set up the weights pointers
     weight_method_t weight_method = PAIR_PRODUCT;
     struct extra_options extra = get_extra_options(weight_method);
     extra.weights0.weights[0] = weights1;
     extra.weights1.weights[0] = weights2;
 
-    results_countpairs_rp_pi results;
-    int status = countpairs_rp_pi(ND1,X1,Y1,Z1,
-                                  ND2,X2,Y2,Z2,
-                                  nthreads,
-                                  autocorr,
-                                  binfile,
-                                  pimax,
-                                  &results,
-                                  &options,
-                                  &extra);
-    if(status != EXIT_SUCCESS) {
-        return status;
-    }
-
-    int ret = EXIT_FAILURE;
-    const int npibin = results.npibin;
-    const double dpi = pimax/(double)results.npibin ;
-    FILE *fp = my_fopen(correct_outputfile, "r");
-    for(int i=1;i<results.nbin;i++) {
-        for(int j=0;j<npibin;j++) {
-            int index = i*(npibin+1) + j;
-            uint64_t npairs;
-            double rpavg, weightavg;
-            ret = EXIT_FAILURE;
-            int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf%*[^\n]", &npairs, &rpavg, &weightavg);
-            if(nitems != 3) {
-                ret = EXIT_FAILURE;//not required but showing intent
-                i = results.nbin;
-                break;
-            }
-            int floats_equal = AlmostEqualRelativeAndAbs_double(rpavg, results.rpavg[index], maxdiff, maxreldiff);
-            int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[index], maxdiff, maxreldiff);
-            
-            //Check for exact equality of npairs and float "equality" for rpavg
-            if(npairs == results.npairs[index] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
-                ret = EXIT_SUCCESS;
-            } else {
-                fprintf(stderr,"True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[index]);
-                fprintf(stderr,"True rpavg  = %20.12e Computed rpavg = %20.12e. floats_equal = %d\n", rpavg, results.rpavg[index], floats_equal);
-                fprintf(stderr,"True weightavg = %e Computed weightavg = %e. weights_equal = %d\n", weightavg, results.weightavg[index], weights_equal);
+    BEGIN_INTEGRATION_TEST_SECTION
+        int status = countpairs_rp_pi(ND1,X1,Y1,Z1,
+                                      ND2,X2,Y2,Z2,
+                                      nthreads,
+                                      autocorr,
+                                      binfile,
+                                      pimax,
+                                      &results,
+                                      &options,
+                                      &extra);
+        if(status != EXIT_SUCCESS) {
+            return status;
+        }
 
-                ret = EXIT_FAILURE;//not required but showing intent 
-                i = results.nbin;                
-                break;
+        const int npibin = results.npibin;
+        FILE *fp = my_fopen(correct_outputfile, "r");
+        for(int i=1;i<results.nbin;i++) {
+            for(int j=0;j<npibin;j++) {
+                int index = i*(npibin+1) + j;
+                uint64_t npairs;
+                double rpavg, weightavg;
+                ret = EXIT_FAILURE;
+                int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf%*[^\n]", &npairs, &rpavg, &weightavg);
+                if(nitems != 3) {
+                    ret = EXIT_FAILURE;//not required but showing intent
+                    i = results.nbin;
+                    break;
+                }
+                int floats_equal = AlmostEqualRelativeAndAbs_double(rpavg, results.rpavg[index], maxdiff, maxreldiff);
+                int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[index], maxdiff, maxreldiff);
+                
+                //Check for exact equality of npairs and float "equality" for rpavg
+                if(npairs == results.npairs[index] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
+                    ret = EXIT_SUCCESS;
+                } else {
+                    fprintf(stderr,"True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[index]);
+                    fprintf(stderr,"True rpavg  = %20.12e Computed rpavg = %20.12e. floats_equal = %d\n", rpavg, results.rpavg[index], floats_equal);
+                    fprintf(stderr,"True weightavg = %e Computed weightavg = %e. weights_equal = %d\n", weightavg, results.weightavg[index], weights_equal);
+                    
+                    ret = EXIT_FAILURE;//not required but showing intent
+                    i = results.nbin;
+                    break;
+                }
             }
         }
-    }
-    fclose(fp);
-
+        fclose(fp);
+    END_INTEGRATION_TEST_SECTION;
+        
     if(ret != EXIT_SUCCESS) {
-        fp=my_fopen(tmpoutputfile,"w");
+        FILE *fp=my_fopen(tmpoutputfile,"w");
         if(fp == NULL) {
             free_results_rp_pi(&results);
             return EXIT_FAILURE;
         }
+
+        const int npibin = results.npibin;
+        const double dpi = pimax/(double)results.npibin ;
         for(int i=1;i<results.nbin;i++) {
             const double logrp = log10(results.rupp[i]);
             for(int j=0;j<npibin;j++) {
@@ -208,79 +194,168 @@ int test_periodic_DDrppi(const char *correct_outputfile)
         }
         fclose(fp);
     }
-    
+
     //free the result structure
     free_results_rp_pi(&results);
     return ret;
 }
 
-int test_wp(const char *correct_outputfile)
+int test_periodic_DDsmu(const char *correct_outputfile)
 {
-    
+    int autocorr = (X1==X2) ? 1:0;
+    int ret = EXIT_FAILURE;
+    results_countpairs_s_mu results;
+    /* Runs countpairs_s_mu on the global catalogs (ND1,X1,Y1,Z1 / ND2,X2,Y2,Z2) and compares each bin with correct_outputfile. Returns EXIT_SUCCESS only if every compared bin matches; returns the countpairs_s_mu status if the pair counting itself fails. */
     // Set up the weights pointers
     weight_method_t weight_method = PAIR_PRODUCT;
     struct extra_options extra = get_extra_options(weight_method);
     extra.weights0.weights[0] = weights1;
-
-    results_countpairs_wp results;
-    int status = countpairs_wp(ND1,X1,Y1,Z1,
-                               boxsize,
-                               nthreads,
-                               binfile,
-                               pimax,
-                               &results,
-                               &options,
-                               &extra);
-    if(status != EXIT_SUCCESS) {
-        return status;
-    }
-    int ret = EXIT_FAILURE;
-    double rlow=results.rupp[0];
-    FILE *fp=my_fopen(correct_outputfile,"r");
-    for(int i=1;i<results.nbin;i++) {
-        uint64_t npairs;
-        double rpavg,wp,weightavg;
+    extra.weights1.weights[0] = weights2;
+    // NOTE(review): BEGIN/END_INTEGRATION_TEST_SECTION are macros defined outside this file (presumably tests_common.h) -- confirm what they wrap
+    BEGIN_INTEGRATION_TEST_SECTION
+        int status = countpairs_s_mu(ND1,X1,Y1,Z1,
+                                     ND2,X2,Y2,Z2,
+                                     nthreads,
+                                     autocorr,
+                                     binfile,
+                                     theory_mu_max, // NOTE(review): theory_mu_max and nmu_bins are not declared in the visible part of this file; presumably from tests_common.h -- confirm
+                                     nmu_bins,
+                                     &results,
+                                     &options,
+                                     &extra);
+        if(status != EXIT_SUCCESS) {
+            return status;
+        }
+        // Compare with the reference file, one fscanf (one line) per (s, mu) bin; the outer loop starts at i=1
         ret = EXIT_FAILURE;
-        int nitems = fscanf(fp,"%lf %lf %*f %*f %"SCNu64" %lf%*[^\n]", &wp, &rpavg, &npairs, &weightavg);//discard rlow and rupp
-        if(nitems != 4) {
-            ret = EXIT_FAILURE;//not required but showing intent
-            break;
+        const int nmubin = results.nmu_bins;
+        FILE *fp = my_fopen(correct_outputfile, "r"); // NOTE(review): fp is not NULL-checked before fscanf (test_vpf does check) -- confirm my_fopen cannot return NULL here
+        for(int i=1;i<results.nsbin;i++) {
+            for(int j=0;j<nmubin;j++) {
+                int index = i*(nmubin+1) + j; // stride between s bins is nmubin+1; column j=nmubin is never compared here
+                uint64_t npairs;
+                double savg, weightavg;
+                ret = EXIT_FAILURE;
+                int nitems = fscanf(fp,"%"SCNu64" %lf %*f %*f %lf%*[^\n]", &npairs, &savg, &weightavg); // reads npairs, savg, two skipped floats, weightavg; rest of the line is discarded
+                if(nitems != 3) {
+                    ret = EXIT_FAILURE;//not required but showing intent
+                    i = results.nsbin; // makes the outer loop stop once the inner loop breaks
+                    break;
+                }
+                int floats_equal = AlmostEqualRelativeAndAbs_double(savg, results.savg[index], maxdiff, maxreldiff);
+                int weights_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[index], maxdiff, maxreldiff);
+                
+                //Check for exact equality of npairs and float "equality" for savg
+                if(npairs == results.npairs[index] && floats_equal == EXIT_SUCCESS && weights_equal == EXIT_SUCCESS) {
+                    ret = EXIT_SUCCESS;
+                } else {
+                    fprintf(stderr,"True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[index]);
+                    fprintf(stderr,"True savg  = %20.12e Computed savg = %20.12e. floats_equal = %d\n", savg, results.savg[index], floats_equal);
+                    fprintf(stderr,"True weightavg = %e Computed weightavg = %e. weights_equal = %d\n", weightavg, results.weightavg[index], weights_equal);
+                    
+                    ret = EXIT_FAILURE;//not required but showing intent
+                    i = results.nsbin; // makes the outer loop stop once the inner loop breaks
+                    break;
+                }
+            }
+        }
+        fclose(fp);
+    END_INTEGRATION_TEST_SECTION;
+    /* Test failed. Output the computed results into tmpoutputfile */
+    if(ret != EXIT_SUCCESS) {
+        FILE *fp=my_fopen(tmpoutputfile,"w");
+        if(fp == NULL) {
+            free_results_s_mu(&results);
+            return EXIT_FAILURE;
         }
-        int rpavg_equal = AlmostEqualRelativeAndAbs_double(rpavg, results.rpavg[i], maxdiff, maxreldiff);
-        int weightavg_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[i], maxdiff, maxreldiff);
-        int wp_equal = AlmostEqualRelativeAndAbs_double(wp, results.wp[i], maxdiff, maxreldiff);
-
-        //Check for exact equality of npairs and float "equality" for rpavg + wp 
-        if(npairs == results.npairs[i] && rpavg_equal == EXIT_SUCCESS && wp_equal == EXIT_SUCCESS && weightavg_equal == EXIT_SUCCESS) {
-            ret = EXIT_SUCCESS;
-        } else {
-            ret = EXIT_FAILURE;//not required but showing intent 
-            fprintf(stderr,"Failed. True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[i]);
-            fprintf(stderr,"Failed. True wp = %e Computed results wp = %e\n", wp, results.wp[i]);
-            fprintf(stderr,"Failed. True rpavg = %e Computed rpavg = %e.\n",
-                    rpavg, results.rpavg[i]);
-            fprintf(stderr,"Failed. True weightavg = %e Computed weightavg = %e.\n",
-                    weightavg, results.weightavg[i]);
-            fprintf(stderr," wp_equal = %d rpavg_equal = %d weightavg_equal = %d\n", wp_equal, rpavg_equal, weightavg_equal);
-            break;
+        const int nmubin = results.nmu_bins;
+        const double dmu = theory_mu_max/(double)results.nmu_bins;
+        for(int i=1;i<results.nsbin;i++) {
+            const double logs = log10(results.supp[i]);
+            for(int j=0;j<nmubin;j++) {
+                int index = i*(nmubin+1) + j;
+                fprintf(fp,"%10"PRIu64" %20.8lf %20.8lf %20.8lf %20.8lf\n",results.npairs[index],results.savg[index],logs,(j+1)*dmu, results.weightavg[index]); // columns: npairs, savg, log10(supp[i]), (j+1)*dmu, weightavg
+            }
         }
+        fclose(fp);
     }
-    fclose(fp);
 
+    //free the result structure
+    free_results_s_mu(&results);
+    return ret;
+}
+
+
+int test_wp(const char *correct_outputfile)
+{
+    results_countpairs_wp results;
+    int ret = EXIT_FAILURE;
+
+    // Set up the weights pointers
+    weight_method_t weight_method = PAIR_PRODUCT;
+    struct extra_options extra = get_extra_options(weight_method);
+    extra.weights0.weights[0] = weights1;
+    
+    BEGIN_INTEGRATION_TEST_SECTION    
+        int status = countpairs_wp(ND1,X1,Y1,Z1,
+                                   boxsize,
+                                   nthreads,
+                                   binfile,
+                                   pimax,
+                                   &results,
+                                   &options,
+                                   &extra);
+        if(status != EXIT_SUCCESS) {
+            return status;
+        }
+
+        FILE *fp=my_fopen(correct_outputfile,"r");
+        for(int i=1;i<results.nbin;i++) {
+            uint64_t npairs;
+            double rpavg,wp,weightavg;
+            ret = EXIT_FAILURE;
+            int nitems = fscanf(fp,"%lf %lf %*f %*f %"SCNu64" %lf%*[^\n]", &wp, &rpavg, &npairs, &weightavg);//discard rlow and rupp
+            if(nitems != 4) {
+                ret = EXIT_FAILURE;//not required but showing intent
+                break;
+            }
+            int rpavg_equal = AlmostEqualRelativeAndAbs_double(rpavg, results.rpavg[i], maxdiff, maxreldiff);
+            int weightavg_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[i], maxdiff, maxreldiff);
+            int wp_equal = AlmostEqualRelativeAndAbs_double(wp, results.wp[i], maxdiff, maxreldiff);
+            
+            //Check for exact equality of npairs and float "equality" for rpavg + wp
+            if(npairs == results.npairs[i] && rpavg_equal == EXIT_SUCCESS && wp_equal == EXIT_SUCCESS && weightavg_equal == EXIT_SUCCESS) {
+                ret = EXIT_SUCCESS;
+            } else {
+                ret = EXIT_FAILURE;//not required but showing intent
+                fprintf(stderr,"Failed. True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[i]);
+                fprintf(stderr,"Failed. True wp = %e Computed results wp = %e\n", wp, results.wp[i]);
+                fprintf(stderr,"Failed. True rpavg = %e Computed rpavg = %e.\n",
+                        rpavg, results.rpavg[i]);
+                fprintf(stderr,"Failed. True weightavg = %e Computed weightavg = %e.\n",
+                        weightavg, results.weightavg[i]);
+                fprintf(stderr," wp_equal = %d rpavg_equal = %d weightavg_equal = %d\n", wp_equal, rpavg_equal, weightavg_equal);
+                break;
+            }
+        }
+        fclose(fp);
+    END_INTEGRATION_TEST_SECTION;
+        
     /* Test failed. Output the results into a temporary file */
     if(ret != EXIT_SUCCESS) {
-        fp=my_fopen(tmpoutputfile,"w");
+        FILE *fp=my_fopen(tmpoutputfile,"w");
         if(fp == NULL) {
             free_results_wp(&results);
             return EXIT_FAILURE;
         }
+        double rlow=results.rupp[0];
         for(int i=1;i<results.nbin;++i) {
             fprintf(fp,"%e\t%e\t%e\t%e\t%12"PRIu64"\t%e \n",results.wp[i],results.rpavg[i],rlow,results.rupp[i],results.npairs[i], results.weightavg[i]);
             rlow=results.rupp[i];
         }
         fclose(fp);
     }
-    
+
     //free the result structure
     free_results_wp(&results);
     return ret;
@@ -294,59 +369,62 @@ int test_vpf(const char *correct_outputfile)
     const int num_pN=6;
     const unsigned long seed=-1234;
     results_countspheres results;
-    int status = countspheres(ND1, X1, Y1, Z1,
-                              rmax, nbin, nc,
-                              num_pN,
-                              seed,
-                              &results,
-                              &options, NULL);
-
-    if(status != EXIT_SUCCESS) {
-        return status;
-    }
-
     int ret = EXIT_FAILURE;
-    //Output the results
-    FILE *fp=my_fopen(correct_outputfile,"r");
-    if(fp == NULL) {
-        free_results_countspheres(&results);
-        return EXIT_FAILURE;
-    }
-    const double rstep = rmax/(double)nbin ;
-    for(int ibin=0;ibin<results.nbin;ibin++) {
-        double r;
-        int nitems = fscanf(fp, "%lf", &r);
-        if(nitems != 1) {
+
+    BEGIN_INTEGRATION_TEST_SECTION
+        int status = countspheres(ND1, X1, Y1, Z1,
+                                  rmax, nbin, nc,
+                                  num_pN,
+                                  seed,
+                                  &results,
+                                  &options, NULL);
+    
+        if(status != EXIT_SUCCESS) {
+            return status;
+        }
+
+        //Output the results
+        FILE *fp=my_fopen(correct_outputfile,"r");
+        if(fp == NULL) {
+            free_results_countspheres(&results);
             return EXIT_FAILURE;
         }
-        ret = EXIT_FAILURE;
-        for(int i=0;i<num_pN;i++) {
-            double pN;
-            nitems = fscanf(fp, " %lf ", &pN);
+        for(int ibin=0;ibin<results.nbin;ibin++) {
+            double r;
+            int nitems = fscanf(fp, "%lf", &r);
             if(nitems != 1) {
                 return EXIT_FAILURE;
             }
-
-            /* Not quite sure how this is working. The correct output columns only have 4 digits printed,
-               but I am comparing here with ~1e-9 in abs. diff. The only way the comparison should work is
-               if the conversion to 4 digits during printf, round-trips during scanf. But surely there must 
-               be a lot more doubles that can be fit within those missing digits of precision.
-
-               I would have thought the comparison would require maxdiff ~ 1e-4. -- MS
-             */
-            int floats_equal = AlmostEqualRelativeAndAbs_double(pN, (results.pN)[ibin][i], maxdiff, maxreldiff);
-            if(floats_equal != EXIT_SUCCESS) {
-                ibin=results.nbin;
-                ret=EXIT_FAILURE;
-                break;
+            ret = EXIT_FAILURE;
+            for(int i=0;i<num_pN;i++) {
+                double pN;
+                nitems = fscanf(fp, " %lf ", &pN);
+                if(nitems != 1) {
+                    return EXIT_FAILURE;
+                }
+                
+                /* Not quite sure how this is working. The correct output columns only have 4 digits printed,
+                   but I am comparing here with ~1e-9 in abs. diff. The only way the comparison should work is
+                   if the conversion to 4 digits during printf, round-trips during scanf. But surely there must
+                   be a lot more doubles that can be fit within those missing digits of precision.
+
+                   I would have thought the comparison would require maxdiff ~ 1e-4. -- MS
+                */
+                int floats_equal = AlmostEqualRelativeAndAbs_double(pN, (results.pN)[ibin][i], maxdiff, maxreldiff);
+                if(floats_equal != EXIT_SUCCESS) {
+                    ibin=results.nbin;
+                    ret=EXIT_FAILURE;
+                    break;
+                }
+                ret = EXIT_SUCCESS;
             }
-            ret = EXIT_SUCCESS;
         }
-    }
-    fclose(fp);
+        fclose(fp);
+    END_INTEGRATION_TEST_SECTION;
     
     if(ret != EXIT_SUCCESS) {
-        fp=my_fopen(tmpoutputfile,"w");
+        FILE *fp=my_fopen(tmpoutputfile,"w");
+        const double rstep = rmax/(double)nbin ;
         for(int ibin=0;ibin<results.nbin;ibin++) {
             const double r=(ibin+1)*rstep;
             fprintf(fp,"%lf ", r);
@@ -365,63 +443,65 @@ int test_vpf(const char *correct_outputfile)
 
 int test_xi(const char *correct_outputfile)
 {
-
+    results_countpairs_xi results;
+    int ret = EXIT_FAILURE;
+    
     // Set up the weights pointers
     weight_method_t weight_method = PAIR_PRODUCT;
     struct extra_options extra = get_extra_options(weight_method);
     extra.weights0.weights[0] = weights1;
 
-    results_countpairs_xi results;
-    int status = countpairs_xi(ND1,X1,Y1,Z1,
-                               boxsize,
-                               nthreads,
-                               binfile,
-                               &results,
-                               &options,
-                               &extra);
-    if(status != EXIT_SUCCESS) {
-        return status;
-    }
-
-    int ret=EXIT_FAILURE;
-    double rlow=results.rupp[0];
-    FILE *fp=my_fopen(correct_outputfile,"r");
-    for(int i=1;i<results.nbin;i++) {
-        uint64_t npairs;
-        double ravg,xi,weightavg;
-        ret = EXIT_FAILURE;
-        int nitems = fscanf(fp,"%lf %lf %*f %*f %"SCNu64" %lf[^\n]", &xi, &ravg, &npairs, &weightavg); //discard rlow and rupp
-        if(nitems != 4) {
-            ret = EXIT_FAILURE;//not required but showing intent
-            break;
+    BEGIN_INTEGRATION_TEST_SECTION
+        int status = countpairs_xi(ND1,X1,Y1,Z1,
+                                   boxsize,
+                                   nthreads,
+                                   binfile,
+                                   &results,
+                                   &options,
+                                   &extra);
+        if(status != EXIT_SUCCESS) {
+            return status;
         }
-        int ravg_equal = AlmostEqualRelativeAndAbs_double(ravg, results.ravg[i], maxdiff, maxreldiff);
-        int weightavg_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[i], maxdiff, maxreldiff);
-        int xi_equal = AlmostEqualRelativeAndAbs_double(xi, results.xi[i], maxdiff, maxreldiff);
         
-        //Check for exact equality of npairs and float "equality" for ravg + xi 
-        if(npairs == results.npairs[i] && ravg_equal == EXIT_SUCCESS && xi_equal == EXIT_SUCCESS && weightavg_equal == EXIT_SUCCESS) {
-            ret = EXIT_SUCCESS;
-        } else {
-            ret = EXIT_FAILURE;//not required but showing intent 
-            fprintf(stderr,"Failed. True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[i]);
-            fprintf(stderr,"Failed. True xi = %e Computed results xi = %e\n", xi, results.xi[i]);
-            fprintf(stderr,"Failed. True ravg = %e Computed ravg = %e.\n",
-                    ravg, results.ravg[i]);
-            fprintf(stderr,"Failed. True weightavg = %e Computed weightavg = %e.\n",
-                    weightavg, results.weightavg[i]);
-            fprintf(stderr," xi_equal = %d ravg_equal = %d weightavg_equal = %d\n", xi_equal, ravg_equal, weightavg_equal);
-            break;
+        FILE *fp=my_fopen(correct_outputfile,"r");
+        for(int i=1;i<results.nbin;i++) {
+            uint64_t npairs;
+            double ravg,xi,weightavg;
+            ret = EXIT_FAILURE;
+            int nitems = fscanf(fp,"%lf %lf %*f %*f %"SCNu64" %lf[^\n]", &xi, &ravg, &npairs, &weightavg); //discard rlow and rupp
+            if(nitems != 4) {
+                ret = EXIT_FAILURE;//not required but showing intent
+                break;
+            }
+            int ravg_equal = AlmostEqualRelativeAndAbs_double(ravg, results.ravg[i], maxdiff, maxreldiff);
+            int weightavg_equal = AlmostEqualRelativeAndAbs_double(weightavg, results.weightavg[i], maxdiff, maxreldiff);
+            int xi_equal = AlmostEqualRelativeAndAbs_double(xi, results.xi[i], maxdiff, maxreldiff);
+            
+            //Check for exact equality of npairs and float "equality" for ravg + xi
+            if(npairs == results.npairs[i] && ravg_equal == EXIT_SUCCESS && xi_equal == EXIT_SUCCESS && weightavg_equal == EXIT_SUCCESS) {
+                ret = EXIT_SUCCESS;
+            } else {
+                ret = EXIT_FAILURE;//not required but showing intent
+                fprintf(stderr,"Failed. True npairs = %"PRIu64 " Computed results npairs = %"PRIu64"\n", npairs, results.npairs[i]);
+                fprintf(stderr,"Failed. True xi = %e Computed results xi = %e\n", xi, results.xi[i]);
+                fprintf(stderr,"Failed. True ravg = %e Computed ravg = %e.\n",
+                        ravg, results.ravg[i]);
+                fprintf(stderr,"Failed. True weightavg = %e Computed weightavg = %e.\n",
+                        weightavg, results.weightavg[i]);
+                fprintf(stderr," xi_equal = %d ravg_equal = %d weightavg_equal = %d\n", xi_equal, ravg_equal, weightavg_equal);
+                break;
+            }
         }
-    }
-    fclose(fp);
-    
+        fclose(fp);
+    END_INTEGRATION_TEST_SECTION;
+
     if(ret != EXIT_SUCCESS){
-        fp=my_fopen(tmpoutputfile,"w");
+        FILE *fp=my_fopen(tmpoutputfile,"w");
         if(fp == NULL) {
             free_results_xi(&results);
             return EXIT_FAILURE;
         }
+        double rlow=results.rupp[0];
         for(int i=1;i<results.nbin;++i) {
             fprintf(fp,"%e\t%e\t%e\t%e\t%12"PRIu64"\t%e \n",results.xi[i],results.ravg[i],rlow,results.rupp[i],results.npairs[i], results.weightavg[i]);
             rlow=results.rupp[i];
@@ -501,12 +581,12 @@ int main(int argc, char **argv)
 
     char file[]="../tests/data/gals_Mr19.ff";
     char fileformat[]="f";
-    
+
     gettimeofday(&tstart,NULL);
 
     //set the globals
     ND1 = read_positions(file,fileformat, sizeof(double), 4, &X1, &Y1, &Z1, &weights1);
-    
+
     ND2 = ND1;
     X2 = X1;
     Y2 = Y1;
@@ -516,7 +596,7 @@ int main(int argc, char **argv)
     strncpy(current_file1,file,MAXLEN);
     strncpy(current_file2,file,MAXLEN);
     reset_bin_refine_factors(&options);
-    
+
     int failed=0;
     int status;
 
@@ -525,17 +605,19 @@ int main(int argc, char **argv)
                                            "Mr19 wp (periodic)",
                                            "Mr19 vpf (periodic)",
                                            "Mr19 xi periodic)",
+                                           "Mr19 DDsmu (periodic)",
                                            "CMASS DDrppi DD (periodic)",
                                            "CMASS DDrppi DR (periodic)",
                                            "CMASS DDrppi RR (periodic)"};
     const int ntests = sizeof(alltests_names)/(sizeof(char)*MAXLEN);
-    const int function_pointer_index[] = {1,0,2,3,4,1,1,1};//0->DD, 1->DDrppi,2->wp, 3->vpf, 4->xi
+    const int function_pointer_index[] = {1,0,2,3,4,5,1,1,1};//0->DD, 1->DDrppi,2->wp, 3->vpf, 4->xi, 5->DDsmu
 
     const char correct_outputfiles[][MAXLEN] = {"Mr19_DDrppi_periodic",
                                                 "Mr19_DD_periodic",
                                                 "Mr19_wp",
                                                 "Mr19_vpf_periodic",
                                                 "Mr19_xi",
+                                                "Mr19_DDsmu_periodic",
                                                 "cmass_DD_periodic",
                                                 "cmass_DR_periodic",
                                                 "cmass_RR_periodic"};
@@ -544,27 +626,30 @@ int main(int argc, char **argv)
                                           "../tests/data/gals_Mr19.ff",
                                           "../tests/data/gals_Mr19.ff",
                                           "../tests/data/gals_Mr19.ff",
+                                          "../tests/data/gals_Mr19.ff",
                                           "../tests/data/cmassmock_Zspace.ff",
                                           "../tests/data/cmassmock_Zspace.ff",
                                           "../tests/data/random_Zspace.ff"};
-    const char firstfiletype[][MAXLEN] = {"f","f","f","f","f","f","f","f"};
+    const char firstfiletype[][MAXLEN] = {"f","f","f","f","f","f","f","f","f"};
     const char secondfilename[][MAXLEN] = {"../tests/data/gals_Mr19.ff",
                                            "../tests/data/gals_Mr19.ff",
                                            "../tests/data/gals_Mr19.ff",
                                            "../tests/data/gals_Mr19.ff",
                                            "../tests/data/gals_Mr19.ff",
+                                           "../tests/data/gals_Mr19.ff",
                                            "../tests/data/cmassmock_Zspace.ff",
                                            "../tests/data/random_Zspace.ff",
                                            "../tests/data/random_Zspace.ff"};
-    const char secondfiletype[][MAXLEN] = {"f","f","f","f","f","f","f","f"};
-    const double allpimax[]             = {40.0,40.0,40.0,40.0,40.0,80.0,80.0,80.0};
+    const char secondfiletype[][MAXLEN] = {"f","f","f","f","f","f","f","f","f"};
+    const double allpimax[]             = {40.0,40.0,40.0,40.0,40.0,40.0,80.0,80.0,80.0};
 
     int (*allfunctions[]) (const char *) = {test_periodic_DD,
                                             test_periodic_DDrppi,
                                             test_wp,
                                             test_vpf,
-                                            test_xi};
-    const int numfunctions=5;//5 functions total
+                                            test_xi,
+                                            test_periodic_DDsmu};
+    const int numfunctions=6;//6 functions total
 
     int total_tests=0,skipped=0;
 
diff --git a/utils/defs.h b/utils/defs.h
index 020f2542..ffe26016 100644
--- a/utils/defs.h
+++ b/utils/defs.h
@@ -19,7 +19,7 @@
 extern "C" {
 #endif
 
-#define API_VERSION          STR("2.0.1")
+#define API_VERSION          STR("2.1.0")
 
 typedef enum {
   DEFAULT=-42,/* present simply to make the enum a signed int*/
@@ -187,6 +187,12 @@ static inline void set_bin_refine_factors(struct config_options *options, const
     reset_bin_refine_scheme(options);
 }
 
+static inline void set_custom_bin_refine_factors(struct config_options *options, const int bin_refine_factors[3])
+{   /* Store caller-chosen bin refine factors in `options` and flag the refine scheme as custom (BINNING_CUST). */
+    set_bin_refine_factors(options, bin_refine_factors); /* note: this call finishes by resetting the refine scheme */
+    set_bin_refine_scheme(options, BINNING_CUST);        /* ...so the custom scheme has to be selected afterwards */
+}
+    
 static inline void reset_bin_refine_factors(struct config_options *options)
 {
     /* refine factors of 2,2,1 in the xyz dims
diff --git a/utils/macros.h b/utils/macros.h
index 7e6daa93..77793ae7 100644
--- a/utils/macros.h
+++ b/utils/macros.h
@@ -8,7 +8,7 @@
 
 #define ADD_DIFF_TIME(t0,t1)            ((t1.tv_sec - t0.tv_sec) + 1e-6*(t1.tv_usec - t0.tv_usec))
 #define REALTIME_ELAPSED_NS(t0, t1)     ((t1.tv_sec - t0.tv_sec)*1000000000.0 + (t1.tv_nsec - t0.tv_nsec))
-    
+
 #define ALIGNMENT                32
 
 #define STRINGIFY(x)   #x
@@ -25,16 +25,16 @@
         thread_timings->second_cellindex = second_cellid;               \
     }
 
-/* Taken from http://stackoverflow.com/questions/19403233/compile-time-struct-size-check-error-out-if-odd 
+/* Taken from http://stackoverflow.com/questions/19403233/compile-time-struct-size-check-error-out-if-odd
    which is in turn taken from the linux kernel */
 /* #define BUILD_BUG_OR_ZERO(e) (sizeof(struct{ int:-!!(e);})) */
 /* #define ENSURE_STRUCT_SIZE(e, size)  BUILD_BUG_OR_ZERO(sizeof(e) != size) */
-/* However, the previous one gives me an unused-value warning and I do not want 
-   to turn that compiler warning off. Therefore, this version, which results in 
-   an unused local typedef warning is used. I turn off the corresponding warning 
+/* However, the previous one gives me an unused-value warning and I do not want
+   to turn that compiler warning off. Therefore, this version, which results in
+   an unused local typedef warning is used. I turn off the corresponding warning
    in common.mk (-Wno-unused-local-typedefs) via CFLAGS
 */
-#define BUILD_BUG_OR_ZERO(cond, msg) typedef volatile char assertion_on_##msg[( !!(cond) )*2-1 ] 
+#define BUILD_BUG_OR_ZERO(cond, msg) typedef volatile char assertion_on_##msg[( !!(cond) )*2-1 ]
 #define ENSURE_STRUCT_SIZE(e, size)                 BUILD_BUG_OR_ZERO(sizeof(e) == size, sizeof_struct_config_options)
 
 /* Macro Constants */
@@ -56,6 +56,7 @@
 #define RP_UNICODE    "rp"
 #define THETA_UNICODE "\u03B8"
 #define OMEGA_UNICODE "\u03C9"
+#define MU_UNICODE    "\u03BC"
 
 #define PI_SAFE    "pi"
 #define XI_SAFE    "xi"
@@ -63,6 +64,7 @@
 #define RP_SAFE "rp"
 #define THETA_SAFE "theta"
 #define OMEGA      "omega"
+#define MU_SAFE    "mu"
 
 
 #ifdef USE_UNICODE
@@ -70,6 +72,7 @@
 #define XI_CHAR XI_UNICODE
 #define PIMAX_CHAR PIMAX_UNICODE
 #define RP_CHAR  RP_UNICODE
+#define MU_CHAR  MU_UNICODE
 #define THETA_CHAR THETA_UNICODE
 #define OMEGA_CHAR OMEGA_UNICODE
 #define UNICODE_WARNING  "\n\
@@ -83,6 +86,7 @@ the ROOT DIRECTORY of ``Corrfunc`` and re-install the entire packge.\n"
 #else
 #define PI_CHAR PI_SAFE
 #define XI_CHAR XI_SAFE
+#define MU_CHAR MU_SAFE
 #define PIMAX_CHAR PIMAX_SAFE
 #define RP_CHAR    RP_SAFE
 #define THETA_CHAR THETA_SAFE
@@ -142,7 +146,7 @@ the ROOT DIRECTORY of ``Corrfunc`` and re-install the entire packge.\n"
              fprintf(stderr,"Can not handle signal = %d\n", signo);     \
          }                                                              \
          previous_handlers[i] = prev;                                   \
-     }                                                              
+     }
 
 #define RESET_INTERRUPT_HANDLERS()              \
      for(size_t i=0;i<nsig;i++) {                                       \
@@ -153,4 +157,3 @@ the ROOT DIRECTORY of ``Corrfunc`` and re-install the entire packge.\n"
              fprintf(stderr,"Could not reset signal handler to default for signal = %d\n", signo); \
          }                                                              \
      }
-     
diff --git a/utils/tests_common.h b/utils/tests_common.h
new file mode 100644
index 00000000..5fcadf36
--- /dev/null
+++ b/utils/tests_common.h
@@ -0,0 +1,137 @@
+/* File: tests_common.h */
+/*
+  This file is a part of the Corrfunc package
+  Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com)
+  License: MIT LICENSE. See LICENSE file under the top-level
+  directory at https://github.com/manodeep/Corrfunc/
+*/
+
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <sys/time.h>
+#include <inttypes.h>
+
+#ifndef MAXLEN
+#define MAXLEN 500
+#endif
+
+#include "defs.h"
+#include "utils.h"
+
+#ifdef INTEGRATION_TESTS
+
+#warning "Running (SLOW) integration tests"
+
+/* Instruction sets to test: FALLBACK always, plus SSE42/AVX when the compiler defines __SSE4_2__/__AVX__ */
+const isa valid_instruction_sets[] = {FALLBACK
+#ifdef __SSE4_2__
+                                      ,SSE42
+#endif                                      
+#ifdef __AVX__                                      
+                                      ,AVX
+#endif                                      
+};
+
+/* Printable names for `valid_instruction_sets`; both arrays are indexed by the same `iset`, so keep entries and order in sync */
+const char isa_name[][20] = {"FALLBACK"
+#ifdef __SSE4_2__                             
+                             ,"SSE42"
+#endif
+#ifdef __AVX__                             
+                             , "AVX"
+#endif                             
+};
+
+/* Number of entries in `valid_instruction_sets`. sizeof(valid_instruction_sets) is the total number
+   of bytes required to store the whole array, so dividing by the size of one element gives the
+   element count (3 when FALLBACK, SSE42 and AVX are all compiled in) */
+const int num_instructions = sizeof(valid_instruction_sets)/sizeof(valid_instruction_sets[0]);
+
+/* The range of bin refine factors to probe. Each of the three (x, y, z) bin refinement factors is
+ looped over [min_bin_ref, max_bin_ref] (inclusive) */
+const int min_bin_ref = 1, max_bin_ref = 3;
+
+/* Opens the loops over instruction sets and (x, y, z) bin refine factors and starts the timer for one run of the test body.
+ Reads and modifies `options` from the enclosing scope; the open braces must be closed by END_INTEGRATION_TEST_SECTION */
+#define BEGIN_INTEGRATION_TEST_SECTION                                  \
+    do {                                                                \
+           int dotest = 1;                                              \
+           const isa old_isa = options.instruction_set;                 \
+           int fastest_bin_ref[] = {1, 1, 1};                           \
+           double fastest_time = 1e30;                                  \
+           struct timespec t0, t1;                                      \
+           for(int iset=0;iset<num_instructions;iset++) {                     \
+               options.instruction_set = valid_instruction_sets[iset];    \
+               for(int bfx=min_bin_ref;bfx<=max_bin_ref;bfx++) {         \
+                   for(int bfy=min_bin_ref;bfy<=max_bin_ref;bfy++) {     \
+                       for(int bfz=min_bin_ref;bfz<=max_bin_ref;bfz++) { \
+                           if(dotest == 1) {                            \
+                               const int bf[] = {bfx, bfy, bfz};        \
+                               set_custom_bin_refine_factors(&options, bf); \
+                               fprintf(stderr,"Running with bin refs = (%d, %d, %d) and instruction set = %s...", \
+                                       options.bin_refine_factors[0],   \
+                                       options.bin_refine_factors[1],   \
+                                       options.bin_refine_factors[2],   \
+                                       isa_name[iset]);                   \
+                               current_utc_time(&t0);
+
+
+/* Clean up the integration tests (stop the timer, report PASSED/FAILED from `ret`, close the loops, reset bin refine factors and restore the instruction set) */
+#define END_INTEGRATION_TEST_SECTION                                    \
+                               current_utc_time(&t1);                   \
+                               double time_to_run = REALTIME_ELAPSED_NS(t0, t1);\
+                               if(time_to_run < fastest_time) {         \
+                                   fastest_time = time_to_run;          \
+                                   memcpy(&fastest_bin_ref, &bf, sizeof(bf));\
+                               }                                        \
+                               if(ret != EXIT_SUCCESS) {                \
+                                   fprintf(stderr, ANSI_COLOR_RED "FAILED"); \
+                                   dotest = 0; /* first failure: skip every remaining combination */ \
+                               } else {                                 \
+                                   fprintf(stderr,ANSI_COLOR_GREEN "PASSED"); \
+                               }                                        \
+                               fprintf(stderr, ANSI_COLOR_RESET ". Time taken = %8.2lf seconds \n", time_to_run * 1e-9); \
+                           }/* close the dotest if condition*/          \
+                       }/*bin ref z*/                                   \
+                   }/*bin ref y*/                                       \
+               }/*bin ref x*/                                           \
+               if(ret == EXIT_SUCCESS) { /* NOTE(review): fastest_* is never reset per instruction set, so this is the fastest run seen so far overall */ \
+                   fprintf(stderr, ANSI_COLOR_MAGENTA "Fastest time = %8.2lf seconds with bin-ref = {%d, %d, %d}" ANSI_COLOR_RESET "\n", \
+                           fastest_time*1e-9,                           \
+                           fastest_bin_ref[0],                          \
+                           fastest_bin_ref[1],                          \
+                           fastest_bin_ref[2]);                         \
+               }                                                        \
+           } /*instruction set */                                       \
+           reset_bin_refine_factors(&options);                          \
+           options.instruction_set = old_isa;                           \
+    } while(0)
+#else
+/* Running regular tests -> no need for exhaustive testing; the section macros reduce to do { ... } while(0), so the body runs once */
+#define BEGIN_INTEGRATION_TEST_SECTION  do {                               
+#define END_INTEGRATION_TEST_SECTION    } while(0)
+
+#endif
+
+
+#ifdef _OPENMP
+const int nthreads=4; /* 4 when compiled with OpenMP support */
+#else
+const int nthreads=1; /* 1 otherwise */
+#endif
+
+const double maxdiff = 1e-9; /* passed, together with maxreldiff, as the tolerances of AlmostEqualRelativeAndAbs_double() */
+const double maxreldiff = 1e-6; /* when computed results are compared against the stored reference values */
+
+char binfile[]="../tests/bins"; /* relative path -- presumably resolved against the directory the test binary runs in; confirm */
+char angular_binfile[]="../tests/angular_bins";
+double pimax=40.0;
+double theory_mu_max=0.5;
+double mocks_mu_max=1.0;
+int nmu_bins=10;
+double boxsize=420.0;
+