diff --git a/examples/isdf/00-RHF_diamond.py b/examples/isdf/00-RHF_diamond.py new file mode 100644 index 000000000..411e1bdf8 --- /dev/null +++ b/examples/isdf/00-RHF_diamond.py @@ -0,0 +1,121 @@ +import numpy as np +from pyscf import lib +from pyscf.gto.mole import * + +from pyscf.isdf import isdf_tools_cell +from pyscf.isdf import isdf_local_k +from pyscf.isdf import isdf_jk +from pyscf.isdf import isdf_local + +from pyscf.lib.parameters import BOHR + +MOL_STRUCTURE = ''' + C 0. 0. 0. + C 0.8917 0.8917 0.8917 + C 1.7834 1.7834 0. + C 2.6751 2.6751 0.8917 + C 1.7834 0. 1.7834 + C 2.6751 0.8917 2.6751 + C 0. 1.7834 1.7834 + C 0.8917 2.6751 2.6751 + ''' + +#### NOTE: a full tests on combinations of parameters #### + +C_ARRAY = [15, 15, 20, 25, 30, 30] +RELA_CUTOFF = [3e-2, 1e-2, 3e-3, 1e-3, 3e-4, 1e-4] +SuperCell_ARRAY = [ + # [1, 1, 1], + [1, 1, 2], + # [1, 2, 2], + # [2, 2, 2], + # [3, 3, 3], + # [2, 4, 4], + # [3, 4, 4], + # [5, 5, 5], + # [6, 6, 6], + # [1, 1, 4], + # [1, 1, 8], + # [1, 1, 16], + # [1, 1, 32], +] + + +Ke_CUTOFF = [70] +boxlen = 3.5668 +Basis = ['gth-dzvp'] + +PARTITION = [ + [[0,1],[2,3],[4,5],[6,7]], + [[0,1,2,3],[4,5,6,7]], + [[0,1,2,3,4,5,6,7]], + [[0],[1],[2],[3],[4],[5],[6],[7]], +] + +if __name__ == '__main__': + + boxlen = 3.57371000 + prim_a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + atm = [ + ['C', (0. , 0. , 0. )], + ['C', (0.8934275 , 0.8934275 , 0.8934275)], + ['C', (1.786855 , 1.786855 , 0. )], + ['C', (2.6802825 , 2.6802825 , 0.8934275)], + ['C', (1.786855 , 0. , 1.786855)], + ['C', (2.6802825 , 0.8934275 , 2.6802825)], + ['C', (0. , 1.786855 , 1.786855)], + ['C', (0.8934275 , 2.6802825 , 2.6802825)], + ] + + for supercell in SuperCell_ARRAY: + ke_cutoff = Ke_CUTOFF[0] + for partition in PARTITION: ## test different partition of atoms + for basis in Basis: + for c, rela_cutoff in zip(C_ARRAY, RELA_CUTOFF): + # for c in C_ARRAY: + print('--------------------------------------------') + print('C = %.2e, supercell = %s, kc_cutoff = %d, basis = %s, partition = %s' % ( + c, str(supercell), ke_cutoff, basis, partition)) + + prim_cell = isdf_tools_cell.build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=ke_cutoff, basis=basis, pseudo="gth-pade", verbose=4) + prim_mesh = prim_cell.mesh + print("prim_mesh = ", prim_mesh) + + mesh = [supercell[0] * prim_mesh[0], supercell[1] * prim_mesh[1], supercell[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell, supercell_group = isdf_tools_cell.build_supercell_with_partition(atm, prim_a, partition=partition, Ls = supercell, ke_cutoff=ke_cutoff, mesh=mesh, basis=basis, pseudo="gth-pade", verbose=4) + + cell.incore_anyway = False + cell.max_memory = 200 # force to call with_df.get_jk + + t1 = (lib.logger.process_clock(),lib.logger.perf_counter()) + + pbc_isdf_info = isdf_local.PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, direct=False, rela_cutoff_QRCP=rela_cutoff) + pbc_isdf_info.build_IP_local(c=c, group=supercell_group, Ls=[supercell[0]*4, supercell[1]*4, supercell[2]*4]) + print("pbc_isdf_info.naux = ", pbc_isdf_info.naux) + print("effective c = ", float(pbc_isdf_info.naux) / pbc_isdf_info.nao) + pbc_isdf_info.build_auxiliary_Coulomb() + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + print(isdf_jk._benchmark_time(t1, t2, 'build_isdf', pbc_isdf_info)) + + # for bunch_size in BUNCHSIZE_IO: + ### perform scf ### + + from pyscf.pbc import scf + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + mf = scf.RHF(cell) + mf.with_df = pbc_isdf_info + 
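# NOTE: attaching the ISDF object as with_df replaces the default FFTDF integral engine, + # so the J/K builds in the SCF below are delegated to the ISDF object's get_jk (the + # standard PySCF with_df mechanism); any compatible DF object could be swapped in the same way. +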
mf.max_cycle = 32 + mf.conv_tol = 1e-7 + pbc_isdf_info.direct_scf = mf.direct_scf + mf.kernel() + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + print(isdf_jk._benchmark_time(t1, t2, 'scf_isdf', pbc_isdf_info)) + + del mf + del pbc_isdf_info + exit(1) \ No newline at end of file diff --git a/examples/isdf/01-KRHF_TiO2.py b/examples/isdf/01-KRHF_TiO2.py new file mode 100644 index 000000000..b9f612495 --- /dev/null +++ b/examples/isdf/01-KRHF_TiO2.py @@ -0,0 +1,115 @@ +from functools import reduce +import numpy as np +from pyscf import lib +import pyscf.pbc.gto as pbcgto +from pyscf.pbc.gto import Cell +from pyscf.pbc import tools +from pyscf.pbc.lib.kpts import KPoints +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, member +from pyscf.gto.mole import * + +from pyscf.isdf import isdf_tools_cell +from pyscf.isdf import isdf_local_k +from pyscf.isdf import isdf_jk + +MOL_STRUCTURE = ''' +Ti 2.3246330643 2.3246330643 1.4853414945 +Ti 0.0000000000 0.0000000000 -0.0000000000 +O 0.9065353261 3.7427308025 1.4853414945 +O 3.7427308025 0.9065353261 1.4853414945 +O 1.4180977382 1.4180977382 0.0000000000 +O 3.2311683903 3.2311683903 0.0000000000 +''' + +atm = [ +['Ti',(2.3246330643,2.3246330643, 1.4853414945)], +['Ti',(0.0000000000,0.0000000000, 0.0000000000)], +['O ',(0.9065353261,3.7427308025, 1.4853414945)], +['O ',(3.7427308025,0.9065353261, 1.4853414945)], +['O ',(1.4180977382,1.4180977382, 0.0000000000)], +['O ',(3.2311683903,3.2311683903, 0.0000000000)], +] +boxlen = [4.6492659759,4.6492659759,2.9706828877] + +C_ARRAY = [15,20,25,30] ## if rela_cutoff_QRCP is set, then c is used to when performing random projection, which can be relative large. +RELA_QR = [1e-2,1e-3,2e-4,1e-4] +SuperCell_ARRAY = [ + # [1,1,1], + [2,2,2], + [3,3,3], + [4,4,4], + [5,5,5], + [6,6,6], +] +Ke_CUTOFF = [128, 192] + +Basis = ['gth-cc-tzvp-Ye'] + +prim_partition = [[0],[1],[2],[3],[4],[5]] + +if __name__ == '__main__': + + prim_a = np.array([[boxlen[0],0.0,0.0],[0.0,boxlen[1],0.0],[0.0,0.0,boxlen[2]]]) + pseudo = 'gth-hf-rev' + + for supercell in SuperCell_ARRAY: + for basis in Basis: + for ke_cutoff in Ke_CUTOFF: + + DM_CACHED = None + + from pyscf.gto.basis import parse_nwchem + fbas="basis.dat" + atms = ['O', 'Ti'] + basis = {atm:parse_nwchem.load(fbas, atm) for atm in atms} + print("basis = ", basis) + + + prim_cell = isdf_tools_cell.build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=ke_cutoff, basis=basis, pseudo=pseudo, spin=0, verbose=10) + cell = prim_cell + + ### perform scf ### + + from pyscf.pbc import scf, dft + from pyscf.pbc.dft import multigrid + + nk = supercell + kpts = cell.make_kpts(nk) + + for c,rela_qr in list(zip(C_ARRAY,RELA_QR)): + + print('--------------------------------------------') + print('C = %d, QR=%f, supercell = %s, kc_cutoff = %d, basis = %s' % (c, rela_qr, str(supercell), ke_cutoff, basis)) + + ### create the isdf object ### + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + pbc_isdf_info = isdf_local_k.PBC_ISDF_Info_Quad_K(cell, + kmesh=nk, + with_robust_fitting=True, + rela_cutoff_QRCP=rela_qr, + direct=True, + limited_memory=True, + build_K_bunchsize=128, ## NOTE:control the memory cost in building K + # use_occ_RI_K=False + ) + pbc_isdf_info.verbose = 10 + pbc_isdf_info.build_IP_local(c=c, m=5, group=prim_partition) + print("effective c = ", float(pbc_isdf_info.naux) / pbc_isdf_info.nao) + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + print(isdf_jk._benchmark_time(t1, t2, 'build ISDF', pbc_isdf_info)) + + t1 = 
(lib.logger.process_clock(), lib.logger.perf_counter()) + mf = scf.KRHF(cell, kpts) + mf.with_df = pbc_isdf_info + mf.max_cycle = 100 + mf.conv_tol = 1e-8 + mf.conv_tol_grad = 1e-3 + if DM_CACHED is not None: + mf.kernel(DM_CACHED) + else: + mf.kernel() + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + print(isdf_jk._benchmark_time(t1, t2, 'RHF_bench', mf)) + DM_CACHED = mf.make_rdm1() \ No newline at end of file diff --git a/examples/isdf/02-UHF_CCO.py b/examples/isdf/02-UHF_CCO.py new file mode 100644 index 000000000..aa9dc62b5 --- /dev/null +++ b/examples/isdf/02-UHF_CCO.py @@ -0,0 +1,139 @@ +import numpy as np +from pyscf import lib +from pyscf.gto.mole import * + +from pyscf.isdf import isdf_tools_cell +from pyscf.isdf import isdf_local_k +from pyscf.isdf import isdf_jk +from pyscf.isdf import isdf_local + +from pyscf.lib.parameters import BOHR + +#### NOTE: a full tests on combinations of parameters #### + +prim_a = np.array( + [[14.572056092, 0.000000000, 0.000000000], + [0.000000000, 14.572056092, 0.000000000], + [0.000000000, 0.000000000, 6.010273939],]) * BOHR +atm = [ +['Cu', (1.927800, 1.927800, 1.590250)], +['Cu', (5.783400, 5.783400, 1.590250)], +['Cu', (1.927800, 5.783400, 1.590250)], +['Cu', (5.783400, 1.927800, 1.590250)], +['O', (1.927800, 3.855600, 1.590250)], +['O', (3.855600, 5.783400, 1.590250)], +['O', (5.783400, 3.855600, 1.590250)], +['O', (3.855600, 1.927800, 1.590250)], +['O', (0.000000, 1.927800, 1.590250)], +['O', (1.927800, 7.711200, 1.590250)], +['O', (7.711200, 5.783400, 1.590250)], +['O', (5.783400, 0.000000, 1.590250)], +['Ca', (0.000000, 0.000000, 0.000000)], +['Ca', (3.855600, 3.855600, 0.000000)], +['Ca', (7.711200, 3.855600, 0.000000)], +['Ca', (3.855600, 7.711200, 0.000000)], +] + +C_ARRAY = [25, 30, 35] +RELA_CUTOFF = [1e-3, 3e-4, 1e-4] +SuperCell_ARRAY = [ + [1, 1, 1], +] +Ke_CUTOFF = [256] +Basis = ['gth-dzvp'] + +PARTITION = [ + [[0], [1], [2], [3], + [4], [5], [6], [7], + [8], [9], [10], [11], + [12], [13], [14], [15]] +] + +if __name__ == '__main__': + + for supercell in SuperCell_ARRAY: + ke_cutoff = Ke_CUTOFF[0] + for partition in PARTITION: ## test different partition of atoms + for _basis_ in Basis: + + DM_CACHED = None + + from pyscf.gto.basis import parse_nwchem + fbas="basis2.dat" + atms = ['O', 'Cu', "Ca"] + basis = {atm:parse_nwchem.load(fbas, atm) for atm in atms} + # print("basis = ", basis) + + pseudo = {'Cu': 'gth-pbe-q19', 'O': 'gth-pbe', 'Ca': 'gth-pbe'} + + prim_cell = isdf_tools_cell.build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=ke_cutoff, basis=basis, pseudo=pseudo, verbose=4) + prim_mesh = prim_cell.mesh + # print("prim_mesh = ", prim_mesh) + + mesh = [supercell[0] * prim_mesh[0], supercell[1] * prim_mesh[1], supercell[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell, supercell_group = isdf_tools_cell.build_supercell_with_partition(atm, prim_a, + partition = partition, + Ls = supercell, + ke_cutoff = ke_cutoff, + mesh = mesh, + basis = basis, + pseudo = pseudo, + verbose = 4) + + cell.incore_anyway = False + cell.max_memory = 200 # force to call with_df.get_jk + + for c, rela_cutoff in zip(C_ARRAY, RELA_CUTOFF): + + print('--------------------------------------------') + print('C = %.2e, supercell = %s, kc_cutoff = %d, basis = %s, partition = %s' % ( + c, str(supercell), ke_cutoff, basis, partition)) + + t1 = (lib.logger.process_clock(),lib.logger.perf_counter()) + + pbc_isdf_info = isdf_local.PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, + direct=True, + 
rela_cutoff_QRCP=rela_cutoff, + limited_memory=True, build_K_bunchsize=56 + ) + pbc_isdf_info.build_IP_local(c=c, group=supercell_group) + print("pbc_isdf_info.naux = ", pbc_isdf_info.naux) + print("effective c = ", float(pbc_isdf_info.naux) / pbc_isdf_info.nao) + pbc_isdf_info.build_auxiliary_Coulomb() + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + print(isdf_jk._benchmark_time(t1, t2, 'build_isdf', pbc_isdf_info)) + + # for bunch_size in BUNCHSIZE_IO: + ### perform scf ### + + from pyscf.pbc import scf + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + mf = scf.UHF(cell) + mf.with_df = pbc_isdf_info + mf.max_cycle = 64 + mf.conv_tol = 1e-7 + pbc_isdf_info.direct_scf = mf.direct_scf + if DM_CACHED is not None: + mf.kernel(DM_CACHED) + else: + mf.kernel() + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + print(isdf_jk._benchmark_time(t1, t2, 'scf_isdf', pbc_isdf_info)) + + del mf + del pbc_isdf_info + + ### GDF benchmark ### + + mf = scf.UHF(cell).density_fit() + mf.max_cycle = 64 + mf.conv_tol = 1e-7 + # pbc_isdf_info.direct_scf = mf.direct_scf + mf.kernel(DM_CACHED) + + exit(1) \ No newline at end of file diff --git a/examples/isdf/03-GDF_CCO.py b/examples/isdf/03-GDF_CCO.py new file mode 100644 index 000000000..2a58105a8 --- /dev/null +++ b/examples/isdf/03-GDF_CCO.py @@ -0,0 +1,136 @@ +import numpy as np +from pyscf import lib +from pyscf.gto.mole import * + +from pyscf.isdf import isdf_tools_cell +from pyscf.isdf import isdf_local_k +from pyscf.isdf import isdf_jk +from pyscf.isdf import isdf_local + +from pyscf.lib.parameters import BOHR + +#### NOTE: a full tests on combinations of parameters #### + +prim_a = np.array( + [[14.572056092, 0.000000000, 0.000000000], + [0.000000000, 14.572056092, 0.000000000], + [0.000000000, 0.000000000, 6.010273939],]) * BOHR +atm = [ +['Cu', (1.927800, 1.927800, 1.590250)], +['Cu', (5.783400, 5.783400, 1.590250)], +['Cu', (1.927800, 5.783400, 1.590250)], +['Cu', (5.783400, 1.927800, 1.590250)], +['O', (1.927800, 3.855600, 1.590250)], +['O', (3.855600, 5.783400, 1.590250)], +['O', (5.783400, 3.855600, 1.590250)], +['O', (3.855600, 1.927800, 1.590250)], +['O', (0.000000, 1.927800, 1.590250)], +['O', (1.927800, 7.711200, 1.590250)], +['O', (7.711200, 5.783400, 1.590250)], +['O', (5.783400, 0.000000, 1.590250)], +['Ca', (0.000000, 0.000000, 0.000000)], +['Ca', (3.855600, 3.855600, 0.000000)], +['Ca', (7.711200, 3.855600, 0.000000)], +['Ca', (3.855600, 7.711200, 0.000000)], +] + +C_ARRAY = [25, 30, 35] +RELA_CUTOFF = [1e-3, 3e-4, 1e-4] +SuperCell_ARRAY = [ + [1, 1, 1], +] +Ke_CUTOFF = [256] +Basis = ['gth-dzvp'] + +PARTITION = [ + [[0], [1], [2], [3], + [4], [5], [6], [7], + [8], [9], [10], [11], + [12], [13], [14], [15]] +] + +if __name__ == '__main__': + + for supercell in SuperCell_ARRAY: + ke_cutoff = Ke_CUTOFF[0] + for partition in PARTITION: ## test different partition of atoms + for _basis_ in Basis: + + DM_CACHED = None + + from pyscf.gto.basis import parse_nwchem + fbas="basis2.dat" + atms = ['O', 'Cu', "Ca"] + basis = {atm:parse_nwchem.load(fbas, atm) for atm in atms} + # print("basis = ", basis) + + pseudo = {'Cu': 'gth-pbe-q19', 'O': 'gth-pbe', 'Ca': 'gth-pbe'} + + prim_cell = isdf_tools_cell.build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=ke_cutoff, basis=basis, pseudo=pseudo, verbose=4) + prim_mesh = prim_cell.mesh + # print("prim_mesh = ", prim_mesh) + + mesh = [supercell[0] * prim_mesh[0], supercell[1] * prim_mesh[1], supercell[2] * prim_mesh[2]] + mesh = np.array(mesh, 
dtype=np.int32) + + cell, supercell_group = isdf_tools_cell.build_supercell_with_partition(atm, prim_a, + partition = partition, + Ls = supercell, + ke_cutoff = ke_cutoff, + mesh = mesh, + basis = basis, + pseudo = pseudo, + verbose = 4) + + cell.incore_anyway = False + cell.max_memory = 200 # force to call with_df.get_jk + + for c, rela_cutoff in zip(C_ARRAY, RELA_CUTOFF): + + print('--------------------------------------------') + print('C = %.2e, supercell = %s, kc_cutoff = %d, basis = %s, partition = %s' % ( + c, str(supercell), ke_cutoff, basis, partition)) + + t1 = (lib.logger.process_clock(),lib.logger.perf_counter()) + + pbc_isdf_info = isdf_local.PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, direct=True, rela_cutoff_QRCP=rela_cutoff, + limited_memory=True, build_K_bunchsize=128) + pbc_isdf_info.build_IP_local(c=c, group=supercell_group) + print("pbc_isdf_info.naux = ", pbc_isdf_info.naux) + print("effective c = ", float(pbc_isdf_info.naux) / pbc_isdf_info.nao) + pbc_isdf_info.build_auxiliary_Coulomb() + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + print(isdf_jk._benchmark_time(t1, t2, 'build_isdf', pbc_isdf_info)) + + # for bunch_size in BUNCHSIZE_IO: + ### perform scf ### + + from pyscf.pbc import scf + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + mf = scf.GHF(cell) + mf.with_df = pbc_isdf_info + mf.max_cycle = 64 + mf.conv_tol = 1e-7 + pbc_isdf_info.direct_scf = mf.direct_scf + if DM_CACHED is not None: + mf.kernel(DM_CACHED) + else: + mf.kernel() + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + print(isdf_jk._benchmark_time(t1, t2, 'scf_isdf', pbc_isdf_info)) + + del mf + del pbc_isdf_info + + ### GDF benchmark ### + + mf = scf.GHF(cell).density_fit() + mf.max_cycle = 64 + mf.conv_tol = 1e-7 + # pbc_isdf_info.direct_scf = mf.direct_scf + mf.kernel(DM_CACHED) + + exit(1) \ No newline at end of file diff --git a/examples/isdf/basis.dat b/examples/isdf/basis.dat new file mode 100644 index 000000000..55c2727a0 --- /dev/null +++ b/examples/isdf/basis.dat @@ -0,0 +1,49 @@ +#BASIS SET: +O S +14.482841 -3.658934e-02 +6.284704 -1.303224e-01 +1.164884 3.769821e-01 +0.468441 5.431582e-01 +0.184961 2.084140e-01 +O S +0.221262 1.000000e+00 +O P +10.213949 6.086918e-02 +3.622324 1.870524e-01 +1.299051 3.714779e-01 +0.463791 4.256889e-01 +0.157848 2.088730e-01 +O P +0.274670 1.000000e+00 +O D +1.200187 1.000000e+00 +#BASIS SET: +Ti S +4.314400 1.000000e+00 +Ti S +1.211440 1.000000e+00 +Ti S +0.507273 1.000000e+00 +Ti S +0.083635 1.000000e+00 +Ti S +0.032238 1.000000e+00 +Ti P +6.628548 -1.876134e-01 -1.202160e-02 +2.469901 3.648950e-01 2.614988e-02 +1.068373 8.015420e-01 -1.647797e-02 +0.438086 4.330265e-01 7.362989e-02 +Ti P +0.157971 1.000000e+00 +Ti P +0.068125 1.000000e+00 +Ti D +5.692516 -2.291856e-01 +1.923332 -5.221114e-01 +Ti D +0.647040 1.000000e+00 +Ti D +0.199065 1.000000e+00 +Ti F +1.121189 -9.777480e-01 +0.284205 -2.097828e-01 \ No newline at end of file diff --git a/examples/isdf/basis2.dat b/examples/isdf/basis2.dat new file mode 100644 index 000000000..455dbef6e --- /dev/null +++ b/examples/isdf/basis2.dat @@ -0,0 +1,118 @@ +#BASIS SET: (5s,5p,5d) -> [3s,3p,2d] Ca +Ca S + 7.213557 -4.811677e-02 6.509227e-02 + 3.953199 2.795449e-01 -3.375291e-01 + 0.887945 -5.055939e-01 5.085903e-01 + 0.381928 -4.403097e-01 3.189670e-01 +Ca S + 0.044801 1.000000e+00 +Ca P + 5.522531 6.327636e-02 4.842917e-03 + 1.446307 -5.437431e-01 -3.188125e-03 + 0.605733 -7.882950e-01 -6.033881e-02 + 0.239083 -2.800260e-01 9.655471e-02 +Ca P + 
0.062765 1.000000e+00 +Ca D + 3.010924 4.928622e-02 + 1.064269 1.087188e-01 + 0.316680 2.101310e-01 + 0.175804 -3.737671e-02 +Ca D + 0.073715 1.000000e+00 +#BASIS SET: (6s,6p,1d) -> [2s,2p,1d] O +O S + 14.482841 -3.658934e-02 + 6.284704 -1.303224e-01 + 1.164884 3.769821e-01 + 0.468441 5.431582e-01 + 0.184961 2.084140e-01 +O S + 0.221262 1.000000e+00 +O P + 10.213949 6.086918e-02 + 3.622324 1.870524e-01 + 1.299051 3.714779e-01 + 0.463791 4.256889e-01 + 0.157848 2.088730e-01 +O P + 0.274670 1.000000e+00 +O D + 1.200187 1.000000e+00 +#BASIS SET: (5s,6p,4d,2f) -> [3s,3p,2d,1f] Cu +Cu S + 9.083669 -3.622183e-01 1.093857e-01 + 2.375895 7.613242e-01 -1.645715e-01 + 0.936687 5.169596e-01 3.951353e-02 + 0.116029 -8.984813e-02 -6.615456e-01 +Cu S + 0.041075 1.000000e+00 +Cu P + 11.566615 -1.354626e-01 2.324231e-02 + 4.918638 4.749338e-01 -7.531200e-02 + 2.290556 7.098130e-01 -2.250031e-02 + 1.043427 4.923407e-01 -1.498897e-01 + 0.429044 9.926123e-02 1.645767e-01 +Cu P + 0.139040 1.000000e+00 +Cu D + 8.082843 -3.663286e-01 + 3.149999 -5.868172e-01 + 1.067441 -6.124219e-01 +Cu D + 0.308911 1.000000e+00 +Cu F + 4.078302 -4.962922e-01 + 1.072255 -8.681555e-01 +#BASIS SET: (5s,5p,1d) -> [4s,3p,2d] Ba +Ba S + 4.860079 -2.877241e-02 2.157408e-02 + 1.399001 3.828487e-01 -2.577906e-01 + 0.402710 -7.842840e-01 3.891714e-01 +Ba S + 0.115922 1.000000e+00 +Ba S + 0.033369 1.000000e+00 +Ba P + 3.592628 4.674179e-02 + 1.573935 -3.519620e-01 + 0.689543 5.167405e-01 + 0.302090 7.478454e-01 +Ba P + 0.132346 1.000000e+00 +Ba P + 0.042656 1.000000e+00 +Ba D + 1.680976 5.054502e-02 + 0.883691 -7.506956e-02 + 0.464558 -5.025319e-02 + 0.244219 -4.749195e-01 + 0.128386 1.435246e-01 +Ba D + 0.067493 1.000000e+00 +#BASIS SET: (4s,4p,5d,2f) -> [3s,3p,3d,1f] Hg +Hg S + 1.915700 1.357890e-01 + 0.979308 -4.520937e-01 +Hg S + 0.150255 1.000000e+00 +Hg S + 0.054019 1.000000e+00 +Hg P + 2.173806 -8.329602e-03 + 0.867935 4.694467e-02 +Hg P + 0.346540 1.000000e+00 +Hg P + 0.138363 1.000000e+00 +Hg D + 3.674806 2.455544e-02 + 1.622342 -6.047205e-01 + 0.768788 -6.547334e-01 +Hg D + 0.339247 1.000000e+00 +Hg D + 0.136379 1.000000e+00 +Hg F + 1.330141 5.428473e-01 + 0.507359 8.398314e-01 diff --git a/pyscf/isdf/__init__.py b/pyscf/isdf/__init__.py new file mode 100644 index 000000000..392dd9021 --- /dev/null +++ b/pyscf/isdf/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2014-2018 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .isdf import ISDF +from .isdf_fast import PBC_ISDF_Info +from .isdf_local import PBC_ISDF_Info_Quad +from .isdf_local_k import PBC_ISDF_Info_Quad_K +from .isdf_tools_cell import build_supercell, build_supercell_with_partition \ No newline at end of file diff --git a/pyscf/isdf/_isdf_local_K_direct.py b/pyscf/isdf/_isdf_local_K_direct.py new file mode 100644 index 000000000..6aede0d47 --- /dev/null +++ b/pyscf/isdf/_isdf_local_K_direct.py @@ -0,0 +1,723 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# + +######## a unified driver for getting K directly for both ISDF with/without k-points + +############ sys module ############ + +import copy, sys +import ctypes +import numpy as np + +############ pyscf module ############ + +from pyscf import lib +from pyscf.lib import logger +from pyscf.pbc import tools +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point +from pyscf.pbc.df.df_jk import _ewald_exxdiv_for_G0 +from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks +libisdf = lib.load_library('libisdf') + +############ isdf utils ############ + +from pyscf.isdf.isdf_jk import _benchmark_time +from pyscf.isdf.isdf_tools_local import _pack_aoR_holder, _get_aoR_holders_memory +import pyscf.isdf.isdf_tools_linearop as lib_isdf + +############ profile ############ + +cputime_RgAO = 0.0 +cputime_V = 0.0 +cputime_W = 0.0 +cputime_RgR = 0.0 +cputime_Ktmp1 = 0.0 +cputime_Ktmp2 = 0.0 + +walltime_RgAO = 0.0 +walltime_V = 0.0 +walltime_W = 0.0 +walltime_RgR = 0.0 +walltime_Ktmp1 = 0.0 +walltime_Ktmp2 = 0.0 + +def add_cputime_RgAO(t1): + global cputime_RgAO + cputime_RgAO += t1 + +def add_walltime_RgAO(t1): + global walltime_RgAO + walltime_RgAO += t1 + +def reset_profile_buildK_time(): + + global cputime_RgAO, cputime_V, cputime_W, cputime_RgR, cputime_Ktmp1, cputime_Ktmp2 + global walltime_RgAO, walltime_V, walltime_W, walltime_RgR, walltime_Ktmp1, walltime_Ktmp2 + + cputime_RgAO = 0.0 + cputime_V = 0.0 + cputime_W = 0.0 + cputime_RgR = 0.0 + cputime_Ktmp1 = 0.0 + cputime_Ktmp2 = 0.0 + + walltime_RgAO = 0.0 + walltime_V = 0.0 + walltime_W = 0.0 + walltime_RgR = 0.0 + walltime_Ktmp1 = 0.0 + walltime_Ktmp2 = 0.0 + +def log_profile_buildK_time(mydf, use_mpi=False): + + global cputime_RgAO, cputime_V, cputime_W, cputime_RgR, cputime_Ktmp1, cputime_Ktmp2 + global walltime_RgAO, walltime_V, walltime_W, walltime_RgR, walltime_Ktmp1, walltime_Ktmp2 + + log = logger.Logger(mydf.stdout, mydf.verbose) + + if not use_mpi: + log.info('In _isdf_get_K_direct_kernel RgAO cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_RgAO, walltime_RgAO, cputime_RgAO/walltime_RgAO)) + log.info('In _isdf_get_K_direct_kernel RgR cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_RgR, walltime_RgR, cputime_RgR/walltime_RgR)) + log.info('In _isdf_get_K_direct_kernel V cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_V, walltime_V, cputime_V/walltime_V)) + log.info('In _isdf_get_K_direct_kernel W cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_W, walltime_W, cputime_W/walltime_W)) + log.info('In _isdf_get_K_direct_kernel Ktmp1 cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_Ktmp1, walltime_Ktmp1, cputime_Ktmp1/walltime_Ktmp1)) + log.info('In _isdf_get_K_direct_kernel Ktmp2 cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_Ktmp2, walltime_Ktmp2, cputime_Ktmp2/walltime_Ktmp2)) + else: + if rank == 0: + log.info('In _isdf_get_K_direct_kernel RgAO 
cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_RgAO, walltime_RgAO, cputime_RgAO/walltime_RgAO)) + log.info('In _isdf_get_K_direct_kernel RgR cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_RgR, walltime_RgR, cputime_RgR/walltime_RgR)) + log.info('In _isdf_get_K_direct_kernel V cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_V, walltime_V, cputime_V/walltime_V)) + log.info('In _isdf_get_K_direct_kernel W cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_W, walltime_W, cputime_W/walltime_W)) + log.info('In _isdf_get_K_direct_kernel Ktmp1 cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_Ktmp1, walltime_Ktmp1, cputime_Ktmp1/walltime_Ktmp1)) + log.info('In _isdf_get_K_direct_kernel Ktmp2 cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_Ktmp2, walltime_Ktmp2, cputime_Ktmp2/walltime_Ktmp2)) + comm.Barrier() + +############ GLOBAL PARAMETER ############ + +K_DIRECT_NAUX_BUNCHSIZE = 256 + +############ subroutines to keep ISDF w./w.o k-points consistent ############ + +def _add_kpnt_info(mydf): + if hasattr(mydf, "kmesh"): + assert mydf.kmesh is None or (mydf.kmesh[0] == 1 and mydf.kmesh[1] == 1 and mydf.kmesh[2] == 1) + + mydf.meshPrim = np.array(mydf.mesh) + mydf.natmPrim = mydf.cell.natm + mydf.primCell = mydf.cell + mydf.nao_prim = mydf.nao + mydf.nIP_Prim = mydf.naux + +def _permutation_box(mydf, kmesh): + permutation = [] + for kx in range(kmesh[0]): + for ky in range(kmesh[1]): + for kz in range(kmesh[2]): + + tmp = [] + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + ix_ = (ix + kx) % kmesh[0] + iy_ = (iy + ky) % kmesh[1] + iz_ = (iz + kz) % kmesh[2] + tmp.append(ix_*kmesh[1]*kmesh[2] + iy_*kmesh[2] + iz_) + + tmp = np.array(tmp, dtype=np.int32) + permutation.append(tmp) + mydf._permutation_box = permutation + return permutation + + +def construct_V(aux_basis:np.ndarray, + buf, + V, + ### some helper info ### + grid_ID, grid_ordering, + mesh, coulG_real): + fn = getattr(libisdf, "_construct_V_local_bas", None) + assert(fn is not None) + + nThread = buf.shape[0] + bufsize_per_thread = buf.shape[1] + nrow = aux_basis.shape[0] + ncol = aux_basis.shape[1] + shift_row = 0 + + fn(mesh.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nrow), + ctypes.c_int(ncol), + grid_ID.ctypes.data_as(ctypes.c_void_p), + aux_basis.ctypes.data_as(ctypes.c_void_p), + coulG_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(shift_row), + V.ctypes.data_as(ctypes.c_void_p), + grid_ordering.ctypes.data_as(ctypes.c_void_p), + buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bufsize_per_thread)) + +def _isdf_get_K_direct_kernel_1( + mydf, + coulG_real, + ##### input #### + group_id, ## the contribution of K from which group + dm_RgAO, + V_or_W_tmp, + construct_K1, + calculate_W, + ##### buffer #####, + buf_build_V_thread, + build_VW_buf, + offset_V_tmp, + Density_RgR_buf, + dm_RgAO_buf, + dm_RgAO_packed_offset, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + naux_bunchsize = K_DIRECT_NAUX_BUNCHSIZE, + ##### other info ##### + use_mpi =False, + begin_id=None, + end_id =None, + ##### out ##### + K1_or_2 = None +): + + log = logger.Logger(mydf.stdout, mydf.verbose) + + + ######### profile ######### + + global cputime_RgAO, cputime_V, cputime_W, cputime_RgR, cputime_Ktmp1, cputime_Ktmp2 + global walltime_RgAO, walltime_V, walltime_W, walltime_RgR, walltime_Ktmp1, walltime_Ktmp2 + + ######### cutoff ######### + + use_cutoff = False 
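+ # NOTE: optional screening for the direct K build; these attributes are read off mydf + # if present: a relative cutoff, an absolute cutoff (defaulting to 1e-9 when only the + # relative cutoff is given), and a distance-based cutoff using mydf.distance_matrix, + # which is mutually exclusive with the other two (enforced by the assert below).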
+ rela_cutoff = None + abs_cutoff = None + distance_cutoff = None + + if hasattr(mydf, "_build_K_rela_cutoff"): + rela_cutoff = mydf._build_K_rela_cutoff + if rela_cutoff is not None: + use_cutoff = True + if hasattr(mydf, "_build_K_abs_cutoff"): + abs_cutoff = mydf._build_K_abs_cutoff + if abs_cutoff is not None: + use_cutoff = True + if hasattr(mydf, "_build_K_distance_cutoff"): + distance_cutoff = mydf._build_K_distance_cutoff + if distance_cutoff is not None: + assert not use_cutoff + use_cutoff = True + + if use_cutoff and abs_cutoff is None: + if distance_cutoff is None: + abs_cutoff = 1.0e-9 + + distance_matrix = mydf.distance_matrix + + ######### info ######### + + assert K1_or_2 is not None + + if not construct_K1: + assert V_or_W_tmp is not None + + # if use_mpi: + # from pyscf.isdf.isdf_tools_mpi import rank, comm_size, comm, allgather, bcast + # size = comm.Get_size() + # if group_id % comm_size != rank: + # raise ValueError + + nao = mydf.nao + mesh = np.array(mydf.cell.mesh, dtype=np.int32) + ngrid = np.prod(mesh) + naux = mydf.naux + + ######### to be compatible with kmesh ######### + + if mydf.kmesh is None: + kmesh = [1,1,1] + else: + kmesh = mydf.kmesh + + nkpts = np.prod(kmesh) + + if not hasattr(mydf, "nao_prim"): + _add_kpnt_info(mydf) + natm_prim = mydf.natmPrim + nao_prim = mydf.nao_prim + + ngrid_prim = np.prod(mesh) // np.prod(kmesh) + nIP_prim = mydf.nIP_Prim + + assert np.prod(mesh) % np.prod(kmesh) == 0 + assert mesh[0] % kmesh[0] == 0 + assert mesh[1] % kmesh[1] == 0 + assert mesh[2] % kmesh[2] == 0 + + if hasattr(mydf, "_permutation_box"): + permutation = mydf._permutation_box + else: + permutation = _permutation_box(mydf, kmesh) + + ######### fetch ao values on grids or IPs ######### + + aoRg = mydf.aoRg + assert isinstance(aoRg, list) + aoR = mydf.aoR + assert isinstance(aoR, list) + + if hasattr(mydf, "aoR1"): + aoR1 = mydf.aoR1 + else: + aoR1 = aoR + + if hasattr(mydf, "aoRg1"): + aoRg1 = mydf.aoRg1 + else: + aoRg1 = aoRg + + ######### fetch the atm_ordering ######### + + group = mydf.group + + ngroup_prim = len(group) + + if hasattr(mydf, "atm_ordering"): + atm_ordering = mydf.atm_ordering + else: + atm_ordering = [] + for group_idx, atm_idx in enumerate(group): + atm_idx.sort() + atm_ordering.extend(atm_idx) + atm_ordering = np.array(atm_ordering, dtype=np.int32) + mydf.atm_ordering = atm_ordering + + aux_basis = mydf.aux_basis + assert len(group) == len(aux_basis) + + ### the number of aux basis involved ### + + naux_tmp = 0 + aoRg_packed = [] + IP_2_atm_id = [] + ILOC = 0 + for kx in range(kmesh[0]): + for ky in range(kmesh[1]): + for kz in range(kmesh[2]): + aoRg_holders = [] + naux_tmp = 0 + for atm_id in group[group_id]: + # print("atm_id = ", atm_id, "ILOC = ", ILOC, "shape = ", aoRg1[atm_id+ILOC*natm_prim].aoR.shape) + naux_tmp += aoRg1[atm_id+ILOC*natm_prim].aoR.shape[1] + IP_2_atm_id.extend([atm_id+ILOC*natm_prim] * aoRg1[atm_id+ILOC*natm_prim].aoR.shape[1]) + aoRg_holders.append(aoRg1[atm_id+ILOC*natm_prim]) + aoRg_packed.append(_pack_aoR_holder(aoRg_holders, nao)) + # print("naux_tmp = ", naux_tmp) + # print("aux_basis[group_id].shape[0] = ", aux_basis[group_id].shape[0]) + assert naux_tmp == aux_basis[group_id].shape[0] + ILOC += 1 + IP_2_atm_id = np.array(IP_2_atm_id, dtype=np.int32) + # print("IP_2_atm_id = ", IP_2_atm_id) + + # grid ID involved for the given group + + aux_basis_grip_ID = mydf.partition_group_to_gridID[group_id] + + # pack aoRg for loop over Rg # + + # aoRg_packed = _pack_aoR_holder(aoRg_holders, nao) + # memory =
_get_aoR_holders_memory(aoRg_holders) + + memory = _get_aoR_holders_memory(aoRg_packed) + + # log.info('In _isdf_get_K_direct_kernel1 aoRg_packed Memory = %d Bytes' % (memory)) + # log.info('In _isdf_get_K_direct_kernel1 group_id = %d, naux = %d' % (group_id, naux_tmp)) + # log.info('In _isdf_get_K_direct_kernel1 aoRg_holders Memory = %d Bytes' % (memory)) + # log.info('In _isdf_get_K_direct_kernel1 naux_bunchsize = %d' % (naux_bunchsize)) + + # assert aoRg_packed.ngrid_tot == naux_tmp + + ######### get involved C function ######### + + fn_packcol1 = getattr(libisdf, "_buildK_packcol", None) + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + fn_packadd_col = getattr(libisdf, "_buildK_packaddcol", None) + fn_packadd_row_k = getattr(libisdf, "_buildK_packaddrow_shift_col", None) + + assert fn_packcol1 is not None + assert fn_packcol2 is not None + assert fn_packadd_col is not None + assert fn_packadd_row_k is not None + + # determine bunchsize # + + bunchsize = min(naux_bunchsize, naux_tmp) + + if construct_K1: + + ### allocate buf ### + + V_tmp = np.ndarray((bunchsize, ngrid), + buffer=build_VW_buf, + offset=offset_V_tmp, + dtype =np.float64) + offset_after_V_tmp = offset_V_tmp + V_tmp.size * V_tmp.dtype.itemsize + + # buffer for W_tmp # + + W_tmp = np.ndarray((naux_tmp, naux), + buffer=build_VW_buf, + offset=offset_after_V_tmp, + dtype =np.float64) + W_tmp.ravel()[:] = 0.0 # clean + + else: + offset_after_V_tmp = offset_V_tmp + W_tmp = None + + ###### CUTOFF ###### + + # if use_cutoff: + # dm_RgAO_max = np.max(np.abs(dm_RgAO[:, :nao_prim])) + # log.info('In _isdf_get_K_direct_kernel1 dm_RgAO_max = %16.8e' % (dm_RgAO_max)) + + #################### + + if begin_id is None: + begin_id = 0 + if end_id is None: + end_id = naux_tmp + + #### loop over Rg #### + + for p0, p1 in lib.prange(begin_id, end_id, bunchsize): + + unique_elements = np.unique(IP_2_atm_id[p0:p1]) + + #### 2. build the V matrix if constructK1 #### + + if construct_K1: + + V_tmp = np.ndarray((p1 - p0, ngrid), + buffer=build_VW_buf, + offset=offset_V_tmp, + dtype =np.float64) + V_tmp.ravel()[:] = 0.0 # clean + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + construct_V(aux_basis[group_id][p0:p1, :], + buf_build_V_thread, + V_tmp, + aux_basis_grip_ID, + mydf.grid_ID_ordered, + mesh, + coulG_real) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cputime_V += t2[0] - t1[0] + walltime_V += t2[1] - t1[1] + + else: + + V_tmp = V_or_W_tmp[p0:p1, :] # W_tmp in fact + + #### 3. 
build the K1_or_2 matrix #### + + ###### 3.1 build density RgR + + if construct_K1: + Density_RgR_tmp = np.ndarray((p1 - p0, ngrid), + buffer=Density_RgR_buf, + offset=0, + dtype =np.float64) + else: + Density_RgR_tmp = np.ndarray((p1 - p0, naux), + buffer=Density_RgR_buf, + offset=0, + dtype =np.float64) + Density_RgR_tmp.ravel()[:] = 0.0 # clean + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + ILOC = 0 + for kx in range(kmesh[0]): + for ky in range(kmesh[1]): + for kz in range(kmesh[2]): + + if kx!=0 or ky!=0 or kz!=0: + if construct_K1: + col_permutation = mydf._get_permutation_column_aoR(kx, ky, kz) + else: + col_permutation = mydf._get_permutation_column_aoRg(kx, ky, kz) + + for atm_id in atm_ordering[:natm_prim]: + + if construct_K1: + aoR_holder = aoR[atm_id] + else: + aoR_holder = aoRg[atm_id] + + if aoR_holder is None: + raise ValueError("aoR_holder is None") + + ngrid_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + ###### CUTOFF ###### + + if use_cutoff: + if distance_cutoff is not None: + distance = np.min(distance_matrix[unique_elements, ILOC*natm_prim+atm_id]) + if distance > distance_cutoff: + continue + + #################### + + ##### packed involved DgAO ##### + + if kx == 0 and ky == 0 and kz == 0: + ao_permutation = aoR_holder.ao_involved + else: + ao_permutation = col_permutation[atm_id] + + if (nao_involved == nao) and (kx == 0 and ky == 0 and kz == 0): + Density_RgAO_packed = dm_RgAO[p0:p1, :] + else: + Density_RgAO_packed = np.ndarray((p1-p0, nao_involved), + buffer=dm_RgAO_buf, + offset=dm_RgAO_packed_offset, + dtype =np.float64) + + fn_packcol1( + Density_RgAO_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(p1-p0), + ctypes.c_int(nao_involved), + dm_RgAO[p0:p1, :].ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(p1-p0), + ctypes.c_int(nao), + ao_permutation.ctypes.data_as(ctypes.c_void_p) + ) + + ###### CUTOFF ###### + + if use_cutoff: + if distance_cutoff is None: + dm_RgAO_packed_max = np.max(np.abs(Density_RgAO_packed)) + if dm_RgAO_packed_max < abs_cutoff: + continue + + #################### + + if construct_K1: + grid_begin = aoR_holder.global_gridID_begin + ILOC*ngrid_prim + else: + grid_begin = aoR_holder.global_gridID_begin + ILOC*nIP_prim + + ddot_res_RgR = np.ndarray((p1-p0, ngrid_now), buffer=ddot_res_RgR_buf) + lib.ddot(Density_RgAO_packed, aoR_holder.aoR, c=ddot_res_RgR) + Density_RgR_tmp[:, grid_begin:grid_begin+ngrid_now] = ddot_res_RgR + + ILOC += 1 + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cputime_RgR += t2[0] - t1[0] + walltime_RgR += t2[1] - t1[1] + + Density_RgR = Density_RgR_tmp + + #### 3.2 V_tmp = Density_RgR * V + + lib_isdf.cwise_mul(V_tmp, Density_RgR, out=Density_RgR) + V2_tmp = Density_RgR + + + ###### CUTOFF ###### + + # if use_cutoff: + # if construct_K1: + # V2_tmp_max = np.max(np.abs(V2_tmp[:, :ngrid_prim])) + # else: + # V2_tmp_max = np.max(np.abs(V2_tmp[:, :nIP_prim])) + # log.info('In _isdf_get_K_direct_kernel1 V2_tmp_max = %16.8e' % (V2_tmp_max)) + + #################### + + #### 3.3 K1_tmp1 = V2_tmp * aoR.T + + K1_tmp1 = np.ndarray((p1-p0, nao), buffer=K1_tmp1_buf) + K1_tmp1.ravel()[:] = 0.0 + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + ILOC = 0 + for kx in range(kmesh[0]): + for ky in range(kmesh[1]): + for kz in range(kmesh[2]): + + if kx!=0 or ky!=0 or kz!=0: + if construct_K1: + col_permutation = mydf._get_permutation_column_aoR(kx, ky, kz) + else: + col_permutation = mydf._get_permutation_column_aoRg(kx, ky, kz) + + for 
atm_id in atm_ordering[:natm_prim]: + + if construct_K1: + aoR_holder = aoR[atm_id] + else: + aoR_holder = aoRg[atm_id] + + ngrid_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + ddot_res = np.ndarray((p1-p0, nao_involved), buffer=K1_tmp1_ddot_res_buf) + + if construct_K1: + grid_loc_begin = aoR_holder.global_gridID_begin + ILOC*ngrid_prim + else: + grid_loc_begin = aoR_holder.global_gridID_begin + ILOC*nIP_prim + + ###### CUTOFF ###### + + if use_cutoff: + if distance_cutoff is None: + V2_tmp_max2 = np.max(np.abs(V2_tmp[:, grid_loc_begin:grid_loc_begin+ngrid_now])) + if V2_tmp_max2 < abs_cutoff: + continue + else: + distance = np.min(distance_matrix[unique_elements, ILOC*natm_prim+atm_id]) + if distance > distance_cutoff: + continue + + #################### + + lib.ddot(V2_tmp[:, grid_loc_begin:grid_loc_begin+ngrid_now], + aoR_holder.aoR.T, + c=ddot_res) + + if kx == 0 and ky == 0 and kz == 0: + ao_permutation = aoR_holder.ao_involved + else: + ao_permutation = col_permutation[atm_id] + assert col_permutation[atm_id].shape[0] == nao_involved + + if (nao_involved == nao) and (kx == 0 and ky == 0 and kz == 0): + K1_tmp1 += ddot_res + else: + fn_packadd_col( + K1_tmp1.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K1_tmp1.shape[0]), + ctypes.c_int(K1_tmp1.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + ao_permutation.ctypes.data_as(ctypes.c_void_p) + ) + + ILOC += 1 + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cputime_Ktmp1 += t2[0] - t1[0] + walltime_Ktmp1 += t2[1] - t1[1] + + #### 3.4 K1_or_2 += aoRg * K1_tmp1 + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + ILOC = 0 + for kx in range(kmesh[0]): + for ky in range(kmesh[1]): + for kz in range(kmesh[2]): + + box_permutation = permutation[ILOC] + + nao_involved = aoRg_packed[ILOC].nao_involved + ddot_res = np.ndarray((nao_involved, nao), buffer=K1_final_ddot_buf) + lib.ddot(aoRg_packed[ILOC].aoR[:,p0:p1], K1_tmp1, c=ddot_res) + fn_packadd_row_k( + K1_or_2.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K1_or_2.shape[0]), + ctypes.c_int(K1_or_2.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoRg_packed[ILOC].ao_involved.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nkpts), + ctypes.c_int(nao_prim), + box_permutation.ctypes.data_as(ctypes.c_void_p) + ) + + ILOC += 1 + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cputime_Ktmp2 += t2[0] - t1[0] + walltime_Ktmp2 += t2[1] - t1[1] + + #### 4. 
build the W matrix #### + + if calculate_W: + + aux_ket_shift = 0 + grid_shift = 0 + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + ILOC = 0 + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + + skip = False + if use_cutoff: + if distance_cutoff is not None: + distance = np.min(distance_matrix[unique_elements, ILOC*natm_prim:(ILOC+1)*natm_prim]) + if distance > distance_cutoff: + skip = True + + for j in range(len(group)): + aux_basis_ket = mydf.aux_basis[j] + ngrid_now = aux_basis_ket.shape[1] + naux_ket = aux_basis_ket.shape[0] + if not skip: + W_tmp[p0:p1, aux_ket_shift:aux_ket_shift+naux_ket] = lib.ddot( + V_tmp[:, grid_shift:grid_shift+ngrid_now], aux_basis_ket.T) + aux_ket_shift += naux_ket + grid_shift += ngrid_now + ILOC += 1 + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cputime_W += t2[0] - t1[0] + walltime_W += t2[1] - t1[1] + + assert grid_shift == ngrid + + return W_tmp + \ No newline at end of file diff --git a/pyscf/isdf/fft.c b/pyscf/isdf/fft.c new file mode 100644 index 000000000..d7f0f6cfb --- /dev/null +++ b/pyscf/isdf/fft.c @@ -0,0 +1,261 @@ +/* Copyright 2021- The PySCF Developers. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + * + * Author: Xing Zhang + */ + +#include +#include +#include "fft.h" +#include "config.h" + +#define BLKSIZE 128 +#define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) + +fftw_plan fft_create_r2c_plan(double *in, complex double *out, int rank, int *mesh) +{ + fftw_plan p; + p = fftw_plan_dft_r2c(rank, mesh, in, out, FFTW_ESTIMATE); + return p; +} + +fftw_plan fft_create_c2r_plan(complex double *in, double *out, int rank, int *mesh) +{ + fftw_plan p; + p = fftw_plan_dft_c2r(rank, mesh, in, out, FFTW_ESTIMATE); + return p; +} + +void fft_execute(fftw_plan p) +{ + fftw_execute(p); +} + +void fft_destroy_plan(fftw_plan p) +{ + fftw_destroy_plan(p); +} + +void _complex_fft(complex double *in, complex double *out, int *mesh, int rank, int sign) +{ + int i; + int nx = mesh[0]; + int nyz = 1; + for (i = 1; i < rank; i++) + { + nyz *= mesh[i]; + } + int nmax = nyz / BLKSIZE * BLKSIZE; + fftw_plan p_2d = fftw_plan_dft(rank - 1, mesh + 1, in, out, sign, FFTW_ESTIMATE); + int nn[BLKSIZE] = {nx}; + fftw_plan p_3d_x = fftw_plan_many_dft(1, nn, BLKSIZE, + out, NULL, nyz, 1, + out, NULL, nyz, 1, + sign, FFTW_ESTIMATE); + +#pragma omp parallel private(i) + { + int off; +#pragma omp for schedule(dynamic) + for (i = 0; i < nx; i++) + { + off = i * nyz; + fftw_execute_dft(p_2d, in + off, out + off); + } + +#pragma omp for schedule(dynamic) + for (i = 0; i < nmax; i += BLKSIZE) + { + fftw_execute_dft(p_3d_x, out + i, out + i); + } + } + fftw_destroy_plan(p_2d); + fftw_destroy_plan(p_3d_x); + + int nres = nyz - nmax; + if (nres > 0) + { + fftw_plan p_3d_x = fftw_plan_many_dft(1, nn, nres, + out + nmax, NULL, nyz, 1, + out + nmax, NULL, nyz, 1, + sign, FFTW_ESTIMATE); + fftw_execute(p_3d_x); + fftw_destroy_plan(p_3d_x); + } +} + +void fft(complex double *in, complex double *out, int *mesh, int rank) +{ + _complex_fft(in, out, mesh, rank, FFTW_FORWARD); +} + +void ifft(complex double *in, complex double *out, int *mesh, int rank) +{ + _complex_fft(in, out, mesh, rank, FFTW_BACKWARD); + size_t i, n = 1; + for (i = 0; i < rank; i++) + { + n *= mesh[i]; + } + double fac = 1. / (double)n; +#pragma omp parallel for schedule(static) + for (i = 0; i < n; i++) + { + out[i] *= fac; + } +} + +void rfft(double *in, complex double *out, int *mesh, int rank) +{ + fftw_plan p = fftw_plan_dft_r2c(rank, mesh, in, out, FFTW_ESTIMATE); + fftw_execute(p); + fftw_destroy_plan(p); +} + +void rfft_3d(double *in, complex double *out, int *mesh, int rank) +{ + fftw_plan p = fftw_plan_dft_r2c_3d(mesh[0], mesh[1], mesh[2], in, out, FFTW_ESTIMATE); + fftw_execute(p); + fftw_destroy_plan(p); +} + +void irfft(complex double *in, double *out, int *mesh, int rank) +{ + fftw_plan p = fftw_plan_dft_c2r(rank, mesh, in, out, FFTW_ESTIMATE); + fftw_execute(p); + fftw_destroy_plan(p); + size_t i, n = 1; + for (i = 0; i < rank; i++) + { + n *= mesh[i]; + } + double fac = 1. / (double)n; +#pragma omp parallel for schedule(static) + for (i = 0; i < n; i++) + { + out[i] *= fac; + } +} + +void irfft_3d(complex double *in, double *out, int *mesh, int rank) +{ + fftw_plan p = fftw_plan_dft_c2r_3d(mesh[0], mesh[1], mesh[2], in, out, FFTW_ESTIMATE); + fftw_execute(p); + fftw_destroy_plan(p); + size_t i, n = 1; + for (i = 0; i < rank; i++) + { + n *= mesh[i]; + } + double fac = 1. 
/ (double)n; +#pragma omp parallel for schedule(static) + for (i = 0; i < n; i++) + { + out[i] *= fac; + } +} + +//// the following subroutines are designed for the 3D FFT for ISDF //// + +void _rfft_3d_ISDF(double *in, complex double *out, int *mesh, int nTransform) /// single thread mode +{ + fftw_plan p = fftw_plan_dft_r2c_3d(mesh[0], mesh[1], mesh[2], in, out, FFTW_ESTIMATE); + int n_in = mesh[0] * mesh[1] * mesh[2]; + int n_out = mesh[0] * mesh[1] * (mesh[2] / 2 + 1); + for (int i = 0; i < nTransform; i++) + { + fftw_execute_dft_r2c(p, in + i * n_in, out + i * n_out); + } + fftw_destroy_plan(p); +} + +void _rfft_3d_ISDF_manydft(double *in, complex double *out, int *mesh, int nTransform) /// not to be very efficient +{ + int n_in = mesh[0] * mesh[1] * mesh[2]; + int n_out = mesh[0] * mesh[1] * (mesh[2] / 2 + 1); + int mesh_out[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + fftw_plan p = fftw_plan_many_dft_r2c( + 3, mesh, nTransform, in, mesh, 1, n_in, out, mesh_out, 1, n_out, FFTW_ESTIMATE); + fftw_execute(p); + fftw_destroy_plan(p); +} + +void _rfft_3d_ISDF_parallel(double *in, complex double *out, int *mesh, int nTransform) /// parallel thread mode +{ + fftw_plan p = fftw_plan_dft_r2c_3d(mesh[0], mesh[1], mesh[2], in, out, FFTW_ESTIMATE); + int n_in = mesh[0] * mesh[1] * mesh[2]; + int n_out = mesh[0] * mesh[1] * (mesh[2] / 2 + 1); +#pragma omp parallel for schedule(static) + for (int i = 0; i < nTransform; i++) + { + fftw_execute_dft_r2c(p, in + i * n_in, out + i * n_out); + } + fftw_destroy_plan(p); +} + +void _irfft_3d_ISDF(complex double *in, double *out, int *mesh, int nTransform) /// single thread mode +{ + fftw_plan p = fftw_plan_dft_c2r_3d(mesh[0], mesh[1], mesh[2], in, out, FFTW_ESTIMATE); + int n_in = mesh[0] * mesh[1] * (mesh[2] / 2 + 1); + int n_out = mesh[0] * mesh[1] * mesh[2]; + double fac = 1. / (double)n_out; + + for (int i = 0; i < nTransform; i++) + { + fftw_execute_dft_c2r(p, in + i * n_in, out + i * n_out); + for (int j = 0; j < n_out; j++) + { + out[i * n_out + j] *= fac; + } + } + + fftw_destroy_plan(p); +} + +void _irfft_3d_ISDF_manydft(complex double *in, double *out, int *mesh, int nTransform) /// not to be very efficient +{ + int n_in = mesh[0] * mesh[1] * (mesh[2] / 2 + 1); + int n_out = mesh[0] * mesh[1] * mesh[2]; + double fac = 1. / (double)n_out; + int mesh_in[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + fftw_plan p = fftw_plan_many_dft_c2r( + 3, mesh, nTransform, in, mesh_in, 1, n_in, out, mesh, 1, n_out, FFTW_ESTIMATE); + fftw_execute(p); + fftw_destroy_plan(p); + for (int i = 0; i < nTransform * n_out; i++) + { + out[i] *= fac; + } +} + +void _irfft_3d_ISDF_parallel(complex double *in, double *out, int *mesh, int nTransform) /// parallel thread mode +{ + fftw_plan p = fftw_plan_dft_c2r_3d(mesh[0], mesh[1], mesh[2], in, out, FFTW_ESTIMATE); + int n_in = mesh[0] * mesh[1] * (mesh[2] / 2 + 1); + int n_out = mesh[0] * mesh[1] * mesh[2]; + double fac = 1. / (double)n_out; + +#pragma omp parallel for schedule(static) + for (int i = 0; i < nTransform; i++) + { + fftw_execute_dft_c2r(p, in + i * n_in, out + i * n_out); + for (int j = 0; j < n_out; j++) + { + out[i * n_out + j] *= fac; + } + } + + fftw_destroy_plan(p); +} \ No newline at end of file diff --git a/pyscf/isdf/fft.h b/pyscf/isdf/fft.h new file mode 100644 index 000000000..995a914e6 --- /dev/null +++ b/pyscf/isdf/fft.h @@ -0,0 +1,27 @@ +/* Copyright 2021- The PySCF Developers. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + * + * Author: Xing Zhang + */ + +#include <complex.h> +#include <fftw3.h> + +#define FFT_PLAN fftw_plan + +FFT_PLAN fft_create_r2c_plan(double* in, double __complex__ * out, int rank, int* mesh); +FFT_PLAN fft_create_c2r_plan(double __complex__ * in, double* out, int rank, int* mesh); +void fft_execute(FFT_PLAN p); +void fft_destroy_plan(FFT_PLAN p); diff --git a/pyscf/isdf/isdf.py b/pyscf/isdf/isdf.py new file mode 100644 index 000000000..e3b1493be --- /dev/null +++ b/pyscf/isdf/isdf.py @@ -0,0 +1,415 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# Xing Zhang +# + +############ sys module ############ + +import os +import sys +import numpy as np +import numpy +import scipy + +############ pyscf module ############ + +import pyscf +from pyscf import lib +from pyscf.lib import logger +from pyscf import pbc +from pyscf.pbc import gto as pbcgto +from pyscf.pbc import scf as pbcscf +from pyscf.pbc import dft as pbcdft +from pyscf.pbc import tools +from pyscf.pbc import df +from pyscf.pbc.dft import gen_grid +from pyscf.pbc.dft import multigrid +from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band +from pyscf.pbc.dft.multigrid.multigrid_pair import MultiGridFFTDF2, _eval_rhoG + +import pyscf.isdf.isdf_ao2mo as isdf_ao2mo +import pyscf.isdf.isdf_jk as isdf_jk +from pyscf.isdf.isdf_jk import _benchmark_time + +############ subroutines ############ + +def _get_rhoR(mydf, dm_kpts, hermi=1): + ''' + get the electron density in real space (on grids) + + ''' + + cell = mydf.cell + kpts = np.zeros((1,3)) + kpts_band = None + + ### step 1, evaluate ao_values on the grid + + grids = mydf.grids + coords = np.asarray(grids.coords).reshape(-1,3) + mesh = grids.mesh + ngrids = np.prod(mesh) + assert ngrids == coords.shape[0] + + ### step 2, evaluate the density on the grid as the weight for k-means + ### TODO: make it linear scaling + + dm_kpts = np.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, _ = dms.shape[:3] + assert nset == 1 + assert nkpts == 1 # only gamma point for now + kpts_band = _format_kpts_band(kpts_band, kpts) + + # density in grid space $\rho(G)=\int_\Omega \rho(R) e^{-iGr} dr$ + rhoG = _eval_rhoG(mydf, dm_kpts, hermi, kpts, deriv=0) + + weight = cell.vol / ngrids + # *(1./weight) because rhoR is scaled by weight in _eval_rhoG. When + # computing rhoR with IFFT, the weight factor is not needed.
+ # the above comment is from pyscf/pbc/dft/multigrid_pair.py + # $\rho(R) = 1/\Omega \int_BZ \rho(G) e^{iGr} dG$ ??? + rhoR = tools.ifft(rhoG.reshape(-1,ngrids), mesh).real * (1./weight) + rhoR = rhoR.flatten() + assert rhoR.size == ngrids + + return rhoR + +def isdf(mydf, dm_kpts, hermi=1, naux=None, c=5, max_iter=100, kpts=np.zeros((1,3)), kpts_band=None, verbose=None): + + ''' + + Args: + mydf : the DF object + dm_kpts (np.ndarray): (nset, nkpts, nao, nao) density matrix in k-space + hermi (int) : int, optional + If :math:`hermi=1`, the task list is built only for + the upper triangle of the matrix. Default is 0. + naux (int) : number of auxiliary basis functions + c (int) : the ratio between the number of auxiliary basis functions and the number of atomic basis functions + if naux is none, then naux is set to c * cell.nao + max_iter (int) : max number of iterations for kmean + verbose (int) : verbosity level + kpts (np.ndarray) : + + Returns: + W (np.ndarray) : (naux,naux) matrix of the ISDF potential + aoRg (np.ndarray) : (naux,ngrids) matrix of the auxiliary basis + aoR (np.ndarray) : (nao, ngrids) matrix of the (scaled) atomic basis in real space + V_R (np.ndarray) : (naux,ngrids) matrix of the ISDF potential in real space + idx (np.ndarray) : (naux,) index of the auxiliary basis in the real space grid + + Ref: + (1) Lu2015 + (2) Hu2023 10.1021/acs.jctc.2c00927 + (3) Sandeep2022 https://pubs.acs.org/doi/10.1021/acs.jctc.2c00720 + + ''' + + t1 = (logger.process_clock(), logger.perf_counter()) + + ### step 1 , evaluate ao_values on the grid + + cell = mydf.cell + grids = mydf.grids + coords = np.asarray(grids.coords).reshape(-1,3) + mesh = grids.mesh + ngrids = np.prod(mesh) + assert ngrids == coords.shape[0] + + log = logger.Logger(sys.stdout, 4) + cput0 = (logger.process_clock(), logger.perf_counter()) + aoR = mydf._numint.eval_ao(cell, coords)[0] + + aoR *= np.sqrt(cell.vol / ngrids) ## NOTE: scaled ! + + print("aoR.shape = ", aoR.shape) + + cput1 = log.timer('eval_ao', *cput0) + if naux is None: + naux = cell.nao * c # number of auxiliary basis functions + + ### step 2, evaluate the density on the grid as the weight for kmean + ### TODO: make it linear scaling + + dm_kpts = np.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + assert nset == 1 + assert nkpts == 1 # only gamma point for now + # kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + kpts_band = _format_kpts_band(kpts_band, kpts) + + # density in grid space $\rho(G)=\int_\Omega \rho(R) e^{-iGr} dr$ + rhoG = _eval_rhoG(mydf, dm_kpts, hermi, kpts, deriv=0) + + weight = cell.vol / ngrids + # *(1./weight) because rhoR is scaled by weight in _eval_rhoG. When + # computing rhoR with IFFT, the weight factor is not needed. + # the above comment is from pyscf/pbc/dft/multigrid_pair.py + # $\rho(R) = 1/\Omega \int_BZ \rho(G) e^{iGr} dG$ ??? 
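+ # i.e. the rhoG returned by _eval_rhoG already carries the quadrature weight vol/ngrids, + # so multiplying the inverse FFT by 1/weight recovers rho(R) on the grid; consistently, + # np.sum(rhoR) * cell.vol / ngrids reproduces the electron count (checked in __main__ below).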
+ rhoR = tools.ifft(rhoG.reshape(-1,ngrids), mesh).real * (1./weight) + rhoR = rhoR.flatten() + assert rhoR.size == ngrids + + ### step 3, k-means clustering to get the interpolation points (IP) + ### TODO: implement QRCP as an option + + cput1 = log.timer('eval_rhoR', *cput1) + from sklearn.cluster import KMeans + # from cuml.cluster import KMeans + kmeans_float = KMeans(n_clusters=naux, + max_iter=max_iter, + # max_samples_per_batch=32768*8//naux, + # output_type='numpy' + ) + kmeans_float.fit(coords, sample_weight=rhoR) + centers = kmeans_float.cluster_centers_ + + cput1 = log.timer('kmeans', *cput1) + + t2 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t1, t2, "kmeans", mydf) + t1 = t2 + + ### step 4, get the auxiliary basis + + a = cell.lattice_vectors() + scaled_centers = np.dot(centers, np.linalg.inv(a)) + + idx = (np.rint(scaled_centers*mesh[None,:]) + mesh[None,:]) % (mesh[None,:]) + idx = idx[:,2] + idx[:,1]*mesh[2] + idx[:,0]*(mesh[1]*mesh[2]) + idx = idx.astype(int) + idx = list(set(idx)) + idx.sort() + idx = np.asarray(idx) + print("idx = ", idx) + + cput1 = log.timer('get idx', *cput1) + + aoRg = aoR[idx] # (nIP, nao), nIP = naux + # A = numpy.dot(aoRg, aoRg.T) ** 2 # (Naux, Naux) + A = np.asarray(lib.dot(aoRg, aoRg.T), order='C') + A = A ** 2 + cput1 = log.timer('get A', *cput1) + + X = np.empty((naux,ngrids)) + blksize = int(10*1e9/8/naux) + for p0, p1 in lib.prange(0, ngrids, blksize): + # B = numpy.dot(aoRg, aoR[p0:p1].T) ** 2 + B = np.asarray(lib.dot(aoRg, aoR[p0:p1].T), order='C') + B = B ** 2 + X[:,p0:p1] = scipy.linalg.lstsq(A, B)[0] + B = None + A = None + + cput1 = log.timer('least squares fit', *cput1) + + t2 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t1, t2, "Construct Xg", mydf) + t1 = t2 + + ### step 5, get the ISDF potential, V(R_g, R') + + V_R = np.empty((naux,ngrids)) + coulG = tools.get_coulG(cell, mesh=mesh) + + blksize1 = int(5*1e9/8/ngrids) + for p0, p1 in lib.prange(0, naux, blksize1): + X_freq = numpy.fft.fftn(X[p0:p1].reshape(-1,*mesh), axes=(1,2,3)).reshape(-1,ngrids) + V_G = X_freq * coulG[None,:] + X_freq = None + V_R[p0:p1] = numpy.fft.ifftn(V_G.reshape(-1,*mesh), axes=(1,2,3)).real.reshape(-1,ngrids) + V_G = None + coulG = None + # V_R *= 2 * np.pi + + cput1 = log.timer('fft', *cput1) + + t2 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t1, t2, "Construct VR", mydf) + t1 = t2 + + W = np.zeros((naux,naux)) + for p0, p1 in lib.prange(0, ngrids, blksize): + W += numpy.dot(X[:,p0:p1], V_R[:,p0:p1].T) + + # for i in range(naux): + # for j in range(i): + # print("W[%5d, %5d] = %15.8e" % (i, j, W[i,j])) + + cput1 = log.timer('get W', *cput1) + + t2 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t1, t2, "Construct WR", mydf) + + return W, aoRg.T, aoR.T, V_R, idx, X + +class ISDF(df.fft.FFTDF): + def __init__(self, cell): + super().__init__(cell=cell) + + def build(self, dm=None, naux=None, c=8, max_iter=128): + ''' + Args: + dm (np.ndarray): (nset, nkpts, nao, nao) density matrix in k-space + naux (int) : number of auxiliary basis functions + c (int) : the ratio between the number of auxiliary basis functions and the number of atomic basis functions; + if naux is None, then naux is set to c * cell.nao + max_iter (int) : max number of iterations for k-means + + Returns: + + ''' + + if naux is None and c is None: + c = 8 + + self.c = c + self.naux = naux + + ## dm provides the density weights for k-means + + if dm is None: + + mf =
pbcdft.RKS(self.cell) + mf.xc = "PBE,PBE" + mf.init_guess = 'atom' # atom guess is fast + mf.with_df = multigrid.MultiGridFFTDF2(self.cell) + dm = mf.get_init_guess(self.cell, 'atom') + + df_tmp = MultiGridFFTDF2(self.cell) + self.W, self.aoRg, self.aoR, self.V_R, _, aux_basis = isdf( + df_tmp, dm, naux=naux, c=c, max_iter=max_iter, verbose=self.cell.verbose) + + ## WARNING: self.aoRG, self.aoR is scaled by a factor of sqrt(cell.vol / ngrids) + + self.naux = self.W.shape[0] + + if self.cell.verbose >= logger.INFO: + logger.info(self, 'naux = %d', self.naux) + print("naux = ", self.naux) + + self.check_sanity() + + ##### functions defined in isdf_ao2mo.py ##### + + get_eri = get_ao_eri = isdf_ao2mo.get_eri + ao2mo = get_mo_eri = isdf_ao2mo.general + ao2mo_7d = isdf_ao2mo.ao2mo_7d # seems to be only called in kadc and kccsd, NOT implemented! + + ##### functions defined in isdf_jk.py ##### + + get_jk = isdf_jk.get_jk_dm + +if __name__ == "__main__": + + cell = pbcgto.Cell() + boxlen = 3.5668 + cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + + cell.atom = ''' + C 0. 0. 0. + C 0.8917 0.8917 0.8917 + C 1.7834 1.7834 0. + C 2.6751 2.6751 0.8917 + C 1.7834 0. 1.7834 + C 2.6751 0.8917 2.6751 + C 0. 1.7834 1.7834 + C 0.8917 2.6751 2.6751 + ''' + cell.basis = 'gth-szv' + cell.pseudo = 'gth-pade' + cell.verbose = 4 + + cell.ke_cutoff = 128 + cell.max_memory = 800 # 800 Mb + cell.precision = 1e-8 # integral precision + cell.use_particle_mesh_ewald = True + + print(cell.energy_nuc()) + print(cell.enuc) + + cell.build() + + print("Number of electrons: ", cell.nelectron) + print("Number of atoms : ", cell.natm) + print("Number of basis : ", cell.nao) + print("Number of images : ", cell.nimgs) + + # make a super cell + + cell = tools.super_cell(cell, [1,1,1]) + + print("Number of electrons: ", cell.nelectron) + print("Number of atoms : ", cell.natm) + print("Number of basis : ", cell.nao) + print("Number of images : ", cell.nimgs) + + # construct DF object + + mf = pbcdft.RKS(cell) + mf.xc = "PBE,PBE" + mf.init_guess = 'atom' # atom guess is fast + mf.with_df = multigrid.MultiGridFFTDF2(cell) + + dm1 = mf.get_init_guess(cell, 'atom') + mydf = MultiGridFFTDF2(cell) + + s1e = mf.get_ovlp(cell) + + print(s1e.shape) + print(dm1.shape) + print(mydf.grids.mesh) + print(mydf.grids.coords.shape) + + # perform ISDF + + rhoR = _get_rhoR(mydf, dm1) + print("rhoR.shape = ", rhoR.shape) + print("nelec from rhoR is ", np.sum(rhoR) * cell.vol / np.prod(cell.mesh)) + + W, aoRg, aoR, V_R, idx, _ = isdf(mydf, dm1, naux=cell.nao*10, max_iter=100, verbose=4) + + print("W.shape = ", W.shape) + print("aoRg.shape = ", aoRg.shape) + print("aoR.shape = ", aoR.shape) + print("V_R.shape = ", V_R.shape) + print("idx.shape = ", idx.shape) + + # check norm + + print(np.sum(aoR[0, :] ** 2)) + ovlp = cell.pbc_intor('cint1e_ovlp_sph') + print(ovlp[0, 0]) + + mydf_eri = df.FFTDF(cell) + eri = mydf_eri.get_eri(compact=False).reshape(cell.nao, cell.nao, cell.nao, cell.nao) + print("eri.shape = ", eri.shape) + + eri_isdf = isdf_ao2mo.isdf_eri_robust_fit(mydf, W, aoRg, aoR, V_R, verbose=4) + + print("eri_isdf.shape = ", eri_isdf.shape) + + for i in range(cell.nao): + for j in range(cell.nao): + for k in range(cell.nao): + for l in range(cell.nao): + if abs(eri[i,j,k,l] - eri_isdf[i,j,k,l]) > 1e-6: + print("eri[{}, {}, {}, {}] = {} != {}".format(i,j,k,l,eri[i,j,k,l], eri_isdf[i,j,k,l]), + "ration = ", eri[i,j,k,l]/eri_isdf[i,j,k,l]) diff --git a/pyscf/isdf/isdf_ao2mo.py b/pyscf/isdf/isdf_ao2mo.py new file mode 100644 
index 000000000..d28b16029
--- /dev/null
+++ b/pyscf/isdf/isdf_ao2mo.py
@@ -0,0 +1,1213 @@
+#!/usr/bin/env python
+# Copyright 2014-2020 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Ning Zhang
+#
+
+############ sys module ############
+
+import numpy, scipy
+import numpy as np
+import ctypes
+
+############ pyscf module ############
+
+from pyscf import lib
+from pyscf import ao2mo
+from pyscf.ao2mo.incore import iden_coeffs
+from pyscf.pbc import tools
+from pyscf.pbc.lib import kpts_helper
+from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, unique
+from pyscf import __config__
+from pyscf.pbc.df.fft_ao2mo import _format_kpts, _iskconserv, _contract_compact
+libisdf = lib.load_library('libisdf')
+
+############ isdf utils ############
+
+from pyscf.isdf.isdf_tools_local import aoR_Holder
+from pyscf.isdf.isdf_jk import _benchmark_time
+import pyscf.isdf.isdf_tools_linearop as lib_isdf
+
+############ subroutines ---- AO2MO ############
+
+def isdf_eri_robust_fit(mydf, W, aoRg, aoR, V_r, verbose=None):
+    r'''
+    Get (AO) electron repulsion integrals (ERI) from ISDF with robust fitting.
+    Illustrates the idea of ISDF with robust fitting in a human-readable way.
+
+    Args:
+        mydf : ISDF object
+        W    : W matrix in Sandeep2022 eq 13
+        aoR  : AO values on the grids (typically a uniform mesh)
+        aoRg : AO values on the interpolation points
+        V_r  : V matrix in Sandeep2022 eq 13
+
+    Return: ERI with s1 symmetry
+
+    NOTE: deprecated reference implementation, kept only for illustration
+
+    Ref:
+
+    (1) Sandeep2022 https://pubs.acs.org/doi/10.1021/acs.jctc.2c00720
+
+    '''
+
+    cell = mydf.cell
+    nao = cell.nao
+    ngrid = np.prod(cell.mesh)
+    vol = cell.vol
+
+    eri = numpy.zeros((nao,nao,nao,nao))
+
+    pair_Rg = np.einsum('ix,jx->ijx', aoRg, aoRg)
+    pair_R  = np.einsum('ix,jx->ijx', aoR, aoR)
+
+    ### step 1, term1
+
+    path = np.einsum_path('ijx,xy,kly->ijkl', pair_Rg, V_r, pair_R, optimize='optimal')[0]
+    eri_tmp = np.einsum('ijx,xy,kly->ijkl', pair_Rg, V_r, pair_R, optimize=path)
+
+    ### step 2, term2
+
+    eri = eri_tmp + eri_tmp.transpose(2,3,0,1)
+
+    ### step 3, term3
+
+    path = np.einsum_path('ijx,xy,kly->ijkl', pair_Rg, W, pair_Rg, optimize='optimal')[0]
+    eri -= np.einsum('ijx,xy,kly->ijkl', pair_Rg, W, pair_Rg, optimize=path)
+
+    return eri * ngrid / vol
+
+
+def isdf_eri(mydf, mo_coeff = None, verbose=None):
+
+    """
+    Perform the AO2MO transformation of the ISDF ERI (with robust fitting),
+    returning integrals with s4 symmetry. Locality is exploited if available.
+
+    Args:
+        mydf : ISDF object
+        mo_coeff : Molecular orbital coefficients.
+
+    Returns:
+        eri : MO-ERI with s4 symmetry.
+
+    TODO:
+        when eri is very small, use DGEMM!
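+
+    Example:
+        A minimal usage sketch (illustrative only; assumes ``mydf`` is a fully
+        built ISDF object and ``mo_coeff`` a (nao, nmo) coefficient array):
+
+        >>> eri_s4 = isdf_eri(mydf, mo_coeff)       # (npair, npair), npair = nmo*(nmo+1)//2
+        >>> eri_s1 = ao2mo.restore(1, eri_s4, nmo)  # unpack to (nmo, nmo, nmo, nmo)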
+ + """ + + #### basic info #### + + direct = mydf.direct + if direct is True: + raise NotImplementedError("direct is not supported in isdf_eri_robust") + with_robust_fitting = mydf.with_robust_fitting + + nao = mydf.cell.nao + naux = mydf.naux + vol = mydf.cell.vol + ngrid = np.prod(mydf.cell.mesh) + natm = mydf.cell.natm + + if mo_coeff is not None: + assert mo_coeff.shape[0] == nao + nmo = mo_coeff.shape[1] + else: + nmo = nao + + size = nmo * (nmo + 1) // 2 + eri = numpy.zeros((size, size)) + + aoR = mydf.aoR + aoRg = mydf.aoRg + assert isinstance(aoR, list) + assert isinstance(aoRg, list) + + if mo_coeff is not None: + + moR = [] + moRg = [] + + for i in range(natm): + + if with_robust_fitting: + ao_involved = aoR[i].ao_involved + mo_coeff_packed = mo_coeff[ao_involved,:].copy() + _moR = lib.ddot(mo_coeff_packed.T, aoR[i].aoR) + mo_involved = np.arange(nmo) + moR.append( + aoR_Holder( + aoR = _moR, + ao_involved = mo_involved, + local_gridID_begin = aoR[i].local_gridID_begin, + local_gridID_end = aoR[i].local_gridID_end, + global_gridID_begin = aoR[i].global_gridID_begin, + global_gridID_end = aoR[i].global_gridID_end) + ) + else: + moR.append(None) + + ao_involved = aoRg[i].ao_involved + mo_coeff_packed = mo_coeff[ao_involved,:].copy() + _moRg = lib.ddot(mo_coeff_packed.T, aoRg[i].aoR) + mo_involved = np.arange(nmo) + moRg.append( + aoR_Holder( + aoR = _moRg, + ao_involved = mo_involved, + local_gridID_begin = aoRg[i].local_gridID_begin, + local_gridID_end = aoRg[i].local_gridID_end, + global_gridID_begin = aoRg[i].global_gridID_begin, + global_gridID_end = aoRg[i].global_gridID_end) + ) + else: + moR = aoR + moRg = aoRg + + if with_robust_fitting: + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in moR if aoR_holder is not None]) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in moR if aoR_holder is not None]) + else: + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in moRg if aoR_holder is not None]) + max_ngrid_involved = None + max_nIP_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in moRg if aoR_holder is not None]) + + ###### loop over basic info to allocate the buf ###### + + aoPairRg_buf = np.zeros((max_nao_involved, max_nao_involved, max_nIP_involved)) + aoPairRg_buf2 = np.zeros((max_nao_involved, max_nao_involved, max_nIP_involved)) + if with_robust_fitting: + aoPairR_buf = np.zeros((max_nao_involved, max_nao_involved, max_ngrid_involved)) + else: + aoPairR_buf = None + + if with_robust_fitting: + V_W_pack_buf = np.zeros((max_nIP_involved, max_ngrid_involved)) + else: + V_W_pack_buf = np.zeros((max_nIP_involved, max_nIP_involved)) + + max_npair = (max_nao_involved * (max_nao_involved + 1)) // 2 + suberi_buf = np.zeros((max_npair, max_npair)) + ddot_res_buf = np.zeros((max_nIP_involved, max_npair)) + + #### involved function #### + + fn_packcol = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol is not None + + fn_unpack_suberi_to_eri = getattr(libisdf, "_unpack_suberi_to_eri", None) + assert fn_unpack_suberi_to_eri is not None + + fn_pack_aoR_to_aoPairR = getattr(libisdf, "_pack_aoR_to_aoPairR_same", None) + assert fn_pack_aoR_to_aoPairR is not None + + ### V_R term ### + + V_R = mydf.V_R + + if with_robust_fitting: + + for partition_i in range(natm): + + aoRg_i = moRg[partition_i] + ao_involved_i = aoRg_i.ao_involved + nao_i = aoRg_i.aoR.shape[0] + global_IP_begin_i = aoRg_i.global_gridID_begin + nIP_i = aoRg_i.aoR.shape[1] + nPair_i = (nao_i * (nao_i + 1)) // 2 + aoPair_i = 
np.ndarray((nPair_i, nIP_i), dtype=np.float64, buffer=aoPairRg_buf) + + fn_pack_aoR_to_aoPairR( + aoRg_i.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(nIP_i) + ) + + for partition_j in range(natm): + + aoR_j = moR[partition_j] + ao_involved_j = aoR_j.ao_involved + nao_j = aoR_j.aoR.shape[0] + global_IP_begin_j = aoR_j.global_gridID_begin + ngrid_j = aoR_j.aoR.shape[1] + nPair_j = (nao_j * (nao_j + 1)) // 2 + aoPair_j = np.ndarray((nPair_j, ngrid_j), dtype=np.float64, buffer=aoPairR_buf) + + fn_pack_aoR_to_aoPairR( + aoR_j.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_j.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_j), + ctypes.c_int(ngrid_j) + ) + + V_packed = np.ndarray((nIP_i, ngrid_j), dtype=np.float64, buffer=V_W_pack_buf) + + fn_packcol( + V_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(ngrid_j), + V_R[global_IP_begin_i:global_IP_begin_i+nIP_i, :].ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(V_R.shape[1]), + ctypes.c_int(global_IP_begin_j), + ctypes.c_int(global_IP_begin_j+ngrid_j) + ) + + ddot_res = np.ndarray((nIP_i, nPair_j), dtype=np.float64, buffer=ddot_res_buf) + lib.ddot(V_packed, aoPair_j.T, c=ddot_res) + sub_eri = np.ndarray((nPair_i, nPair_j), dtype=np.float64, buffer=suberi_buf) + lib.ddot(aoPair_i, ddot_res, c=sub_eri) + + transpose = 1 + fn_unpack_suberi_to_eri( + eri.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nmo), + sub_eri.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ao_involved_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_j), + ao_involved_j.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(transpose) + ) + + ### W term ### + + W = mydf.W + + for partition_i in range(natm): + + aoRg_i = moRg[partition_i] + ao_involved_i = aoRg_i.ao_involved + nao_i = aoRg_i.aoR.shape[0] + global_IP_begin_i = aoRg_i.global_gridID_begin + nIP_i = aoRg_i.aoR.shape[1] + nPair_i = (nao_i * (nao_i + 1)) // 2 + aoPair_i = np.ndarray((nPair_i, nIP_i), dtype=np.float64, buffer=aoPairRg_buf) + + fn_pack_aoR_to_aoPairR( + aoRg_i.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(nIP_i) + ) + + for partition_j in range(partition_i+1): + + aoRg_j = moRg[partition_j] + ao_involved_j = aoRg_j.ao_involved + nao_j = aoRg_j.aoR.shape[0] + global_IP_begin_j = aoRg_j.global_gridID_begin + nIP_j = aoRg_j.aoR.shape[1] + nPair_j = (nao_j * (nao_j + 1)) // 2 + aoPair_j = np.ndarray((nPair_j, nIP_j), dtype=np.float64, buffer=aoPairRg_buf2) + + fn_pack_aoR_to_aoPairR( + aoRg_j.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_j.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_j), + ctypes.c_int(nIP_j) + ) + + ## pack_W ## + + W_packed = np.ndarray((nIP_i, nIP_j), dtype=np.float64, buffer=V_W_pack_buf) + + fn_packcol( + W_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(nIP_j), + W[global_IP_begin_i:global_IP_begin_i+nIP_i, :].ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(W.shape[1]), + ctypes.c_int(global_IP_begin_j), + ctypes.c_int(global_IP_begin_j+nIP_j) + ) + + ddot_res = np.ndarray((nIP_i, nPair_j), dtype=np.float64, buffer=ddot_res_buf) + lib.ddot(W_packed, aoPair_j.T, c=ddot_res) + sub_eri = np.ndarray((nPair_i, nPair_j), dtype=np.float64, buffer=suberi_buf) + + alpha = 1 + if with_robust_fitting: + alpha = -1 + lib.ddot(aoPair_i, ddot_res, c=sub_eri, alpha=alpha) + + transpose = 1 + if partition_i == partition_j: + transpose 
= 0 + + fn_unpack_suberi_to_eri( + eri.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nmo), + sub_eri.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ao_involved_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_j), + ao_involved_j.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(transpose) + ) + + ### del buf ### + + # assert np.allclose(eri, eri.T) + + del aoPairRg_buf + del aoPairRg_buf2 + del aoPairR_buf + + return eri * ngrid / vol + +def isdf_eri_2(mydf, mo_coeff = None, verbose=None): + + """ + Perform AO2MO transformation from ISDF with robust fitting with s4 symmetry + Locality is supported if explored! + + Args: + mydf : + mo_coeff : Molecular orbital coefficients. + + Returns: + eri : MO-ERI with s4 symmetry. + + NOTE: + + For small eri case + + """ + + #### basic info #### + + assert mo_coeff is not None + + direct = mydf.direct + if direct is True: + raise NotImplementedError("direct is not supported in isdf_eri_robust") + with_robust_fitting = mydf.with_robust_fitting + + nao = mydf.cell.nao + naux = mydf.naux + vol = mydf.cell.vol + ngrid = np.prod(mydf.cell.mesh) + natm = mydf.cell.natm + + if mo_coeff is not None: + assert mo_coeff.shape[0] == nao + nmo = mo_coeff.shape[1] + else: + nmo = nao + + size = nmo * (nmo + 1) // 2 + eri = numpy.zeros((size, size)) + + aoR = mydf.aoR + aoRg = mydf.aoRg + assert isinstance(aoR, list) + assert isinstance(aoRg, list) + + if mo_coeff is not None: + + moR = [] + moRg = [] + + for i in range(natm): + + if with_robust_fitting: + ao_involved = aoR[i].ao_involved + mo_coeff_packed = mo_coeff[ao_involved,:].copy() + _moR = lib.ddot(mo_coeff_packed.T, aoR[i].aoR) + mo_involved = np.arange(nmo) + moR.append( + aoR_Holder( + aoR = _moR, + ao_involved = mo_involved, + local_gridID_begin = aoR[i].local_gridID_begin, + local_gridID_end = aoR[i].local_gridID_end, + global_gridID_begin = aoR[i].global_gridID_begin, + global_gridID_end = aoR[i].global_gridID_end) + ) + else: + moR.append(None) + + ao_involved = aoRg[i].ao_involved + mo_coeff_packed = mo_coeff[ao_involved,:].copy() + _moRg = lib.ddot(mo_coeff_packed.T, aoRg[i].aoR) + mo_involved = np.arange(nmo) + moRg.append( + aoR_Holder( + aoR = _moRg, + ao_involved = mo_involved, + local_gridID_begin = aoRg[i].local_gridID_begin, + local_gridID_end = aoRg[i].local_gridID_end, + global_gridID_begin = aoRg[i].global_gridID_begin, + global_gridID_end = aoRg[i].global_gridID_end) + ) + else: + moR = aoR + moRg = aoRg + + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in moR if aoR_holder is not None]) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in moR if aoR_holder is not None]) + max_nIP_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in moRg if aoR_holder is not None]) + + ###### loop over basic info to allocate the buf ###### + + #max_npair = (max_nao_involved * (max_nao_involved + 1)) // 2 + #ddot_res_buf = np.zeros((max_nIP_involved, max_npair)) + max_npair = nmo * (nmo + 1) // 2 + npair = max_npair + suberi = np.zeros((npair, npair)) + ddot_res_buf = np.zeros((naux, npair)) + + aoPairRg_buf = np.zeros((nmo, nmo, max_nIP_involved)) + #aoPairRg_buf2 = np.zeros((max_nao_involved, max_nao_involved, max_nIP_involved)) + aoPairRg = np.zeros((npair, naux)) + + if with_robust_fitting: + aoPairR_buf = np.zeros((nmo, nmo, max_ngrid_involved)) + aoPairR = np.zeros((npair, ngrid)) + else: + aoPairR_buf = None + + #### involved function #### + + fn_packcol = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol is not None + + 
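+    # NOTE (behaviour of the C helpers, inferred from their call sites in
+    # this file; see libisdf for the actual implementations):
+    #   _buildK_packcol2           copy a contiguous column block [b, e) of a
+    #                              row-major matrix into a packed buffer
+    #   _unpack_suberi_to_eri      scatter a (nPair_i, nPair_j) sub-ERI block
+    #                              into the full s4-packed ERI via the
+    #                              involved-AO index maps
+    #   _pack_aoR_to_aoPairR_same  form the packed pair values
+    #                              phi_i(r)*phi_j(r), i >= j, on a grid block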
fn_unpack_suberi_to_eri = getattr(libisdf, "_unpack_suberi_to_eri", None) + assert fn_unpack_suberi_to_eri is not None + + fn_pack_aoR_to_aoPairR = getattr(libisdf, "_pack_aoR_to_aoPairR_same", None) + assert fn_pack_aoR_to_aoPairR is not None + + ### construct aoPairRg, aoPairR ### + + for partition_i in range(natm): + + aoRg_i = moRg[partition_i] + ao_involved_i = aoRg_i.ao_involved + nao_i = aoRg_i.aoR.shape[0] + global_IP_begin_i = aoRg_i.global_gridID_begin + nIP_i = aoRg_i.aoR.shape[1] + nPair_i = (nao_i * (nao_i + 1)) // 2 + assert nPair_i == npair + aoPair_i = np.ndarray((nPair_i, nIP_i), dtype=np.float64, buffer=aoPairRg_buf) + + fn_pack_aoR_to_aoPairR( + aoRg_i.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(nIP_i) + ) + + aoPairRg[:, global_IP_begin_i:global_IP_begin_i+nIP_i] = aoPair_i + + if with_robust_fitting: + + aoR_i = moR[partition_i] + ao_involved_i = aoR_i.ao_involved + nao_i = aoR_i.aoR.shape[0] + global_IP_begin_i = aoR_i.global_gridID_begin + ngrid_i = aoR_i.aoR.shape[1] + nPair_i = (nao_i * (nao_i + 1)) // 2 + assert nPair_i == npair + aoPair_i = np.ndarray((nPair_i, ngrid_i), dtype=np.float64, buffer=aoPairR_buf) + + fn_pack_aoR_to_aoPairR( + aoR_i.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(ngrid_i) + ) + + aoPairR[:, global_IP_begin_i:global_IP_begin_i+ngrid_i] = aoPair_i + + ### V_R term ### + + V_R = mydf.V_R + + if with_robust_fitting: + + lib.ddot(V_R, aoPairR.T, c=ddot_res_buf) + lib.ddot(aoPairRg, ddot_res_buf, c=suberi) + eri += suberi + eri += suberi.T + + ### W term ### + + W = mydf.W + + lib.ddot(W, aoPairRg.T, c=ddot_res_buf) + lib.ddot(aoPairRg, ddot_res_buf, c=suberi) + if with_robust_fitting: + eri -= suberi + else: + eri += suberi + + ### del buf ### + + # assert np.allclose(eri, eri.T) + + del aoPairRg_buf + #del aoPairRg_buf2 + del aoPairR_buf + del aoPairRg + del aoPairR + + return eri * ngrid / vol + +def isdf_eri_ovov(mydf, mo_coeff_o: np.ndarray = None, mo_coeff_v: np.ndarray = None, verbose=None): + + """ + Perform AO2MO transformation from ISDF for specific orbital types (ovov), for MP2 calculation + Locality is supported if explored! + + Args: + mydf : ISDF objects. 
+ mo_coeff_o : Molecular orbital coefficients for occupied orbitals + mo_coeff_v : Molecular orbital coefficients for virtual orbitals + + Return: + eri : ovov part of MO-ERI + + """ + + #### basic info #### + + direct = mydf.direct + if direct is True: + raise NotImplementedError("direct is not supported in isdf_eri_robust") + with_robust_fitting = mydf.with_robust_fitting + + nao = mydf.cell.nao + naux = mydf.naux + vol = mydf.cell.vol + ngrid = np.prod(mydf.cell.mesh) + natm = mydf.cell.natm + + nao_o = mo_coeff_o.shape[1] + nao_v = mo_coeff_v.shape[1] + + size = nao_o * nao_v + eri = numpy.zeros((size, size)) + + aoR = mydf.aoR + aoRg = mydf.aoRg + assert isinstance(aoR, list) + assert isinstance(aoRg, list) + + ############ transformation of moRg/moR ############ + + moR_o = [] + moRg_o = [] + + moR_v = [] + moRg_v = [] + + for i in range(natm): + + if with_robust_fitting: + ao_involved = aoR[i].ao_involved + mo_coeff_packed = mo_coeff_o[ao_involved,:].copy() + _moR = lib.ddot(mo_coeff_packed.T, aoR[i].aoR) + mo_involved = np.arange(nao_o) + moR_o.append( + aoR_Holder( + aoR = _moR, + ao_involved = mo_involved, + local_gridID_begin = aoR[i].local_gridID_begin, + local_gridID_end = aoR[i].local_gridID_end, + global_gridID_begin = aoR[i].global_gridID_begin, + global_gridID_end = aoR[i].global_gridID_end) + ) + + mo_coeff_packed = mo_coeff_v[ao_involved,:].copy() + _moR = lib.ddot(mo_coeff_packed.T, aoR[i].aoR) + mo_involved = np.arange(nao_v) + moR_v.append( + aoR_Holder( + aoR = _moR, + ao_involved = mo_involved, + local_gridID_begin = aoR[i].local_gridID_begin, + local_gridID_end = aoR[i].local_gridID_end, + global_gridID_begin = aoR[i].global_gridID_begin, + global_gridID_end = aoR[i].global_gridID_end) + ) + + else: + moR_o.append(None) + moR_v.append(None) + + ao_involved = aoRg[i].ao_involved + mo_coeff_packed = mo_coeff_o[ao_involved,:].copy() + _moRg = lib.ddot(mo_coeff_packed.T, aoRg[i].aoR) + mo_involved = np.arange(nao_o) + moRg_o.append( + aoR_Holder( + aoR = _moRg, + ao_involved = mo_involved, + local_gridID_begin = aoRg[i].local_gridID_begin, + local_gridID_end = aoRg[i].local_gridID_end, + global_gridID_begin = aoRg[i].global_gridID_begin, + global_gridID_end = aoRg[i].global_gridID_end) + ) + + mo_coeff_packed = mo_coeff_v[ao_involved,:].copy() + _moRg = lib.ddot(mo_coeff_packed.T, aoRg[i].aoR) + mo_involved = np.arange(nao_v) + moRg_v.append( + aoR_Holder( + aoR = _moRg, + ao_involved = mo_involved, + local_gridID_begin = aoRg[i].local_gridID_begin, + local_gridID_end = aoRg[i].local_gridID_end, + global_gridID_begin = aoRg[i].global_gridID_begin, + global_gridID_end = aoRg[i].global_gridID_end) + ) + + ######################################################## + + max_nao_involved = max(nao_o, nao_v) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in moR_o if aoR_holder is not None]) + max_nIP_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in moRg_o if aoR_holder is not None]) + + ###### loop over basic info to allocate the buf ###### + + aoPairRg_buf = np.zeros((nao_o, nao_v, max_nIP_involved)) + aoPairRg_buf2 = np.zeros((nao_o, nao_v, max_nIP_involved)) + if with_robust_fitting: + aoPairR_buf = np.zeros((nao_o, nao_v, max_ngrid_involved)) + else: + aoPairR_buf = None + + if with_robust_fitting: + V_W_pack_buf = np.zeros((max_nIP_involved, max_ngrid_involved)) + else: + V_W_pack_buf = np.zeros((max_nIP_involved, max_nIP_involved)) + + max_npair = nao_o * nao_v + suberi_buf = np.zeros((max_npair, max_npair)) + ddot_res_buf = 
np.zeros((max_nIP_involved, max_npair)) + + #### involved function #### + + fn_packcol = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol is not None + + fn_unpack_suberi_to_eri = getattr(libisdf, "_unpack_suberi_to_eri_ovov", None) + assert fn_unpack_suberi_to_eri is not None + + fn_pack_aoR_to_aoPairR = getattr(libisdf, "_pack_aoR_to_aoPairR_diff", None) + assert fn_pack_aoR_to_aoPairR is not None + + ### V_R term ### + + V_R = mydf.V_R + + if with_robust_fitting: + + for partition_i in range(natm): + + aoRg_i_o = moRg_o[partition_i] + nocc_i = aoRg_i_o.aoR.shape[0] + + aoRg_i_v = moRg_v[partition_i] + nvir_i = aoRg_i_v.aoR.shape[0] + + global_IP_begin_i = aoRg_i_o.global_gridID_begin + nIP_i = aoRg_i_o.aoR.shape[1] + + nPair_i = nocc_i * nvir_i + aoPair_i = np.ndarray((nPair_i, nIP_i), dtype=np.float64, buffer=aoPairRg_buf) + + fn_pack_aoR_to_aoPairR( + aoRg_i_o.aoR.ctypes.data_as(ctypes.c_void_p), + aoRg_i_v.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc_i), + ctypes.c_int(nvir_i), + ctypes.c_int(nIP_i) + ) + + for partition_j in range(natm): + + aoR_j_o = moR_o[partition_j] + nocc_j = aoR_j_o.aoR.shape[0] + + aoR_j_v = moR_v[partition_j] + nvir_j = aoR_j_v.aoR.shape[0] + + global_IP_begin_j = aoR_j_o.global_gridID_begin + ngrid_j = aoR_j_o.aoR.shape[1] + + nPair_j = nocc_j * nvir_j + aoPair_j = np.ndarray((nPair_j, ngrid_j), dtype=np.float64, buffer=aoPairR_buf) + + fn_pack_aoR_to_aoPairR( + aoR_j_o.aoR.ctypes.data_as(ctypes.c_void_p), + aoR_j_v.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_j.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc_j), + ctypes.c_int(nvir_j), + ctypes.c_int(ngrid_j) + ) + + V_packed = np.ndarray((nIP_i, ngrid_j), dtype=np.float64, buffer=V_W_pack_buf) + + fn_packcol( + V_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(ngrid_j), + V_R[global_IP_begin_i:global_IP_begin_i+nIP_i, :].ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(V_R.shape[1]), + ctypes.c_int(global_IP_begin_j), + ctypes.c_int(global_IP_begin_j+ngrid_j) + ) + + ddot_res = np.ndarray((nIP_i, nPair_j), dtype=np.float64, buffer=ddot_res_buf) + lib.ddot(V_packed, aoPair_j.T, c=ddot_res) + sub_eri = np.ndarray((nPair_i, nPair_j), dtype=np.float64, buffer=suberi_buf) + lib.ddot(aoPair_i, ddot_res, c=sub_eri) + + assert nPair_i == nPair_j == (nao_o * nao_v) + + transpose = 1 + fn_unpack_suberi_to_eri( + eri.ctypes.data_as(ctypes.c_void_p), + sub_eri.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nPair_i), + ctypes.c_int(transpose) + ) + + ### W term ### + + W = mydf.W + + for partition_i in range(natm): + + aoRg_i_o = moRg_o[partition_i] + nocc_i = aoRg_i_o.aoR.shape[0] + + aoRg_i_v = moRg_v[partition_i] + nvir_i = aoRg_i_v.aoR.shape[0] + + global_IP_begin_i = aoRg_i_o.global_gridID_begin + nIP_i = aoRg_i_o.aoR.shape[1] + + nPair_i = nocc_i * nvir_i + aoPair_i = np.ndarray((nPair_i, nIP_i), dtype=np.float64, buffer=aoPairRg_buf) + + fn_pack_aoR_to_aoPairR( + aoRg_i_o.aoR.ctypes.data_as(ctypes.c_void_p), + aoRg_i_v.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc_i), + ctypes.c_int(nvir_i), + ctypes.c_int(nIP_i) + ) + + for partition_j in range(partition_i+1): + + aoRg_j_o = moRg_o[partition_j] + nocc_j = aoRg_j_o.aoR.shape[0] + + aoRg_j_v = moRg_v[partition_j] + nvir_j = aoRg_j_v.aoR.shape[0] + + global_IP_begin_j = aoRg_j_o.global_gridID_begin + nIP_j = aoRg_j_o.aoR.shape[1] + + nPair_j = nocc_j * nvir_j + aoPair_j = 
np.ndarray((nPair_j, nIP_j), dtype=np.float64, buffer=aoPairRg_buf2) + + fn_pack_aoR_to_aoPairR( + aoRg_j_o.aoR.ctypes.data_as(ctypes.c_void_p), + aoRg_j_v.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_j.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc_j), + ctypes.c_int(nvir_j), + ctypes.c_int(nIP_j) + ) + + ## pack_W ## + + W_packed = np.ndarray((nIP_i, nIP_j), dtype=np.float64, buffer=V_W_pack_buf) + + fn_packcol( + W_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(nIP_j), + W[global_IP_begin_i:global_IP_begin_i+nIP_i, :].ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(W.shape[1]), + ctypes.c_int(global_IP_begin_j), + ctypes.c_int(global_IP_begin_j+nIP_j) + ) + + ddot_res = np.ndarray((nIP_i, nPair_j), dtype=np.float64, buffer=ddot_res_buf) + lib.ddot(W_packed, aoPair_j.T, c=ddot_res) + sub_eri = np.ndarray((nPair_i, nPair_j), dtype=np.float64, buffer=suberi_buf) + + assert nPair_i == nPair_j == (nao_o * nao_v) + + alpha = 1 + if with_robust_fitting: + alpha = -1 + lib.ddot(aoPair_i, ddot_res, c=sub_eri, alpha=alpha) + + transpose = 1 + if partition_i == partition_j: + transpose = 0 + + fn_unpack_suberi_to_eri( + eri.ctypes.data_as(ctypes.c_void_p), + sub_eri.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nPair_i), + ctypes.c_int(transpose) + ) + + ### del buf ### + + assert np.allclose(eri, eri.T) + + del aoPairRg_buf + del aoPairRg_buf2 + del aoPairR_buf + + return eri.reshape(nao_o, nao_v, nao_o, nao_v) * ngrid / vol + +def get_eri(mydf, kpts=None, + compact=getattr(__config__, 'pbc_df_ao2mo_get_eri_compact', True)): + + cell = mydf.cell + nao = cell.nao_nr() + kptijkl = _format_kpts(kpts) + if not _iskconserv(cell, kptijkl): + lib.logger.warn(cell, 'isdf_ao2mo: momentum conservation not found in ' + 'the given k-points %s', kptijkl) + return numpy.zeros((nao,nao,nao,nao)) + + # kpti, kptj, kptk, kptl = kptijkl + # q = kptj - kpti + # coulG = tools.get_coulG(cell, q, mesh=mydf.mesh) + # coords = cell.gen_uniform_grids(mydf.mesh) + # max_memory = mydf.max_memory - lib.current_memory()[0] + +#################### + +# gamma point, the integral is real and with s4 symmetry + + if gamma_point(kptijkl): + + eri = isdf_eri(mydf, verbose=mydf.cell.verbose) + + if compact: + return eri + else: + return ao2mo.restore(1, eri, nao) + +#################### +# aosym = s1, complex integrals + + else: + raise NotImplementedError + + +def general(mydf, mo_coeffs, kpts=None, + compact=getattr(__config__, 'pbc_df_ao2mo_general_compact', True)): + '''General MO integral transformation''' + + from pyscf.pbc.df.df_ao2mo import warn_pbc2d_eri + warn_pbc2d_eri(mydf) + cell = mydf.cell + nao = cell.nao_nr() + kptijkl = _format_kpts(kpts) + kpti, kptj, kptk, kptl = kptijkl + if isinstance(mo_coeffs, numpy.ndarray) and mo_coeffs.ndim == 2: + mo_coeffs = (mo_coeffs,) * 4 + mo_coeffs = [numpy.asarray(mo, order='F') for mo in mo_coeffs] + if not _iskconserv(cell, kptijkl): + lib.logger.warn(cell, 'fft_ao2mo: momentum conservation not found in ' + 'the given k-points %s', kptijkl) + return numpy.zeros([mo.shape[1] for mo in mo_coeffs]) + + allreal = not any(numpy.iscomplexobj(mo) for mo in mo_coeffs) + q = kptj - kpti + # coulG = tools.get_coulG(cell, q, mesh=mydf.mesh) + # coords = cell.gen_uniform_grids(mydf.mesh) + max_memory = mydf.max_memory - lib.current_memory()[0] + + if hasattr(mydf, "W2") or (hasattr(mydf, "force_LS_THC") and mydf.force_LS_THC == True): # NOTE: this means that LS_THC_recompression is called, we do not perform ao2mo with robust 
fitting, as it is very expensive! + #print("use_LS_THC_anyway") + use_LS_THC_anyway = True + else: + #print("no_use_LS_THC_anyway") + use_LS_THC_anyway = False + + IsMOERI = (iden_coeffs(mo_coeffs[0], mo_coeffs[1]) and + iden_coeffs(mo_coeffs[0], mo_coeffs[2]) and + iden_coeffs(mo_coeffs[0], mo_coeffs[3])) + if not IsMOERI: + IsOVOV = False + IsGeneral = False + else: + IsOVOV = (iden_coeffs(mo_coeffs[0], mo_coeffs[2]) and + iden_coeffs(mo_coeffs[1], mo_coeffs[3])) + if IsOVOV: + IsGeneral = False + else: + IsGeneral = True + + if gamma_point(kptijkl) and allreal: + + ##### check whether LS-THC anyway ##### + + if use_LS_THC_anyway: + + vol = mydf.cell.vol + ngrid = np.prod(mydf.cell.mesh) + + if hasattr(mydf, "W2"): + eri = LS_THC_moeri(mydf, mydf.W2, mydf.aoRg2, mo_coeffs) * ngrid / vol + else: + eri = LS_THC_moeri(mydf, mydf.W, mydf.aoRg, mo_coeffs) * ngrid / vol + if compact: + if IsMOERI: + return ao2mo.restore(4, eri, nao) + else: + return eri + else: + return eri + + if ((iden_coeffs(mo_coeffs[0], mo_coeffs[1]) and + iden_coeffs(mo_coeffs[0], mo_coeffs[2]) and + iden_coeffs(mo_coeffs[0], mo_coeffs[3]))): + + #### Full MO-ERI #### + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + eri = isdf_eri(mydf, mo_coeffs[0].copy(), verbose=mydf.cell.verbose) + # eri = isdf_eri_2(mydf, mo_coeffs[0].copy(), verbose=mydf.cell.verbose) # requires aoPairR, which is very expensive + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + _benchmark_time(t1, t2, 'isdf_eri', mydf) + + if compact: + return eri + else: + return ao2mo.restore(1, eri, nao) + else: + + #### ovov MO-ERI #### + + if ((iden_coeffs(mo_coeffs[0], mo_coeffs[2]) and + iden_coeffs(mo_coeffs[1], mo_coeffs[3]))): + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + eri = isdf_eri_ovov(mydf, mo_coeffs[0].copy(), mo_coeffs[1].copy(), verbose=mydf.cell.verbose) + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + _benchmark_time(t1, t2, 'isdf_eri_ovov', mydf) + + if compact: + print("compact is not supported in general with ov ov mode") + return eri + else: + return eri + + else: + raise NotImplementedError + + else: + raise NotImplementedError + + return + +def ao2mo_7d(mydf, mo_coeff_kpts, kpts=None, factor=1, out=None): + raise NotImplementedError + +############ subroutines ---- LS-THC ############ + +def LS_THC(mydf, R:np.ndarray): + ''' + Least-Square Tensorhypercontraction decomposition of ERI. + Given an R matrix, compute the Z matrix such that the electron repulsion integral (ERI) can be expressed as eri ~ R R Z R R. + Supports both ISDF w./w.o. robust fitting. + + Args: + mydf : ISDF objects. + R : A matrix used in the computation of the ERI. + + Returns: + Z : eri = R R Z R R. + + Ref: + (1) Martinez2012: Parrish, Hohenstein, Martinez and Sherill. J. Chem. Phys. 
137, 224106 (2012), DOI: https://doi.org/10.1063/1.4768233 + + ''' + + log = lib.logger.Logger(mydf.stdout, mydf.verbose) + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + nGrid_R = R.shape[1] + nao = R.shape[0] + + assert nao == mydf.cell.nao + + ngrid = np.prod(mydf.cell.mesh) + nIP = mydf.naux + naux = mydf.naux + vol = mydf.cell.vol + natm = mydf.cell.natm + + Z = np.zeros((nGrid_R, nGrid_R)) + + #### step 1 construct #### + + RR = lib.ddot(R.T, R) + lib_isdf.square_inPlace(RR) + + # diag RR # + + D_RR, U_RR = scipy.linalg.eigh(RR) + D_RR_inv = (1.0/D_RR).copy() + + ## for debug ## + + log.debug4("***** LS_THC ***** ") + log.debug4("max D_RR = %f", np.max(D_RR)) + log.debug4("min D_RR = %f", np.min(D_RR)) + log.debug4("condition number = %f", np.max(D_RR)/np.min(D_RR)) + + #### step 2 construct R R ERI R R with O(N^3) cost #### + + # build (RX)^{PA} = \sum_mu R_mu^P X_\mu^A with X = aoRg # + + RX = np.zeros((nGrid_R, nIP)) + + aoRg = mydf.aoRg + + if isinstance(aoRg, np.ndarray): + + RX = lib.ddot(R.T, aoRg) + + else: + + assert isinstance(aoRg, list) + + for partition_i in range(natm): + + aoRg_i = aoRg[partition_i] + ao_involved_i = aoRg_i.ao_involved + nao_i = aoRg_i.aoR.shape[0] + global_IP_begin_i = aoRg_i.global_gridID_begin + nIP_i = aoRg_i.aoR.shape[1] + + R_packed = R[ao_involved_i,:].copy() + RX_tmp = lib.ddot(R_packed.T, aoRg_i.aoR) + + RX[:,global_IP_begin_i:global_IP_begin_i+nIP_i] = RX_tmp + + RX = lib_isdf.square_inPlace(RX) + + # build (RY)^{PB} = \sum_mu R_mu^P Y_\mu^B with Y = aoR # + + if mydf.with_robust_fitting: + + if isinstance(mydf.aoR, np.ndarray): + + RY = lib.ddot(R.T, mydf.aoR) + + else: + + assert isinstance(mydf.aoR, list) + + aoR = mydf.aoR + RY = np.zeros((nGrid_R, ngrid)) + for partition_i in range(natm): + + aoR_i = aoR[partition_i] + ao_involved_i = aoR_i.ao_involved + nao_i = aoR_i.aoR.shape[0] + global_gridID_i = aoR_i.global_gridID_begin + ngrid_i = aoR_i.aoR.shape[1] + + R_packed = R[ao_involved_i,:].copy() + RY_tmp = lib.ddot(R_packed.T, aoR_i.aoR) + + RY[:,global_gridID_i:global_gridID_i+ngrid_i] = RY_tmp + + RY = lib_isdf.square_inPlace(RY) + else: + RY = None + + #### V term #### + + with_robust_fitting = mydf.with_robust_fitting + + if with_robust_fitting: + V_R = mydf.V_R + Z_tmp1 = lib.ddot(V_R, RY.T) + lib.ddot(RX, Z_tmp1, c=Z) + Z += Z.T + del Z_tmp1 + + #### W term #### + + W = mydf.W + Z_tmp2 = lib.ddot(W, RX.T) + if with_robust_fitting: + lib.ddot(RX, Z_tmp2, c=Z, alpha=-1, beta=1) + else: + lib.ddot(RX, Z_tmp2, c=Z) + del Z_tmp2 + + Z1 = lib.ddot(U_RR.T, Z) + Z2 = lib.ddot(Z1, U_RR, c=Z) + Z = Z2 + + lib_isdf.d_i_ij_ij(D_RR_inv, Z, out=Z) + lib_isdf.d_ij_j_ij(Z, D_RR_inv, out=Z) + lib.ddot(U_RR, Z, c=Z1) + lib.ddot(Z1, U_RR.T, c=Z) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + log.timer('LS_THC fitting', *t1) + + return Z * ngrid / vol + +def LS_THC_eri(Z:np.ndarray, R:np.ndarray): + + einsum_str = "iP,jP,PQ,kQ,lQ->ijkl" + + path_info = np.einsum_path(einsum_str, R,R,Z,R,R, optimize='optimal') + + return np.einsum(einsum_str,R,R,Z,R,R,optimize=path_info[0]) + +def LS_THC_moeri(mydf, Z:np.ndarray, R:np.ndarray, mo_coeff:np.ndarray): + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + assert len(mo_coeff) == 4 + moRg = [lib.ddot(x.T, R) for x in mo_coeff] + einsum_str = "iP,jP,PQ,kQ,lQ->ijkl" + path_info = np.einsum_path(einsum_str, moRg[0], moRg[1], Z, moRg[2], moRg[3], optimize='optimal') + res = np.einsum(einsum_str, moRg[0], moRg[1], Z, moRg[2], moRg[3], optimize=path_info[0]) 
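+    # The contraction above evaluates, for MO coefficient sets C1..C4
+    # (moRg[i] = Ci.T @ R),
+    #   (pq|rs) = \sum_{PQ} (C1.T R)_{pP} (C2.T R)_{qP} Z_{PQ} (C3.T R)_{rQ} (C4.T R)_{sQ}
+    # i.e. the THC factorization eri ~ R R Z R R projected into the MO basis.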
+ log = lib.logger.Logger(mydf.stdout, mydf.verbose) + log.timer('LS_THC MOERI', *t1) + return res \ No newline at end of file diff --git a/pyscf/isdf/isdf_eval_gto.py b/pyscf/isdf/isdf_eval_gto.py new file mode 100644 index 000000000..2b6d2b568 --- /dev/null +++ b/pyscf/isdf/isdf_eval_gto.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# + +import ctypes +import numpy +from pyscf import lib +from pyscf.gto import moleintor +from pyscf.gto.eval_gto import _get_intor_and_comp, BLKSIZE +from pyscf.pbc.gto import _pbcintor +from pyscf import __config__ + +EXTRA_PREC = getattr(__config__, 'pbc_gto_eval_gto_extra_precision', 1e-2) + +libpbc = _pbcintor.libpbc +libisdf = lib.load_library('libisdf') + +def z2d_InPlace(z): + '''Convert complex array to double array in-place''' + assert(z.dtype == numpy.complex128) + + fn = getattr(libisdf, "NPz2d_InPlace") + assert(fn is not None) + fn(z.ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(z.size)) + z_real = numpy.ndarray(shape=z.shape, dtype=numpy.double, buffer=z) + return z_real + +def _estimate_rcut(cell): + '''Cutoff raidus, above which each shell decays to a value less than the + required precsion''' + log_prec = numpy.log(cell.precision * EXTRA_PREC) + rcut = [] + for ib in range(cell.nbas): + l = cell.bas_angular(ib) + es = cell.bas_exp(ib) + cs = abs(cell.bas_ctr_coeff(ib)).max(axis=1) + r = 5. + r = (((l+2)*numpy.log(r)+numpy.log(cs) - log_prec) / es)**.5 + r[r < 1.] = 1. + r = (((l+2)*numpy.log(r)+numpy.log(cs) - log_prec) / es)**.5 + rcut.append(r.max()) + return numpy.array(rcut) + +def ISDF_eval_gto(cell, eval_name=None, coords=None, comp=None, kpts=numpy.zeros((1,3)), kpt=None, + shls_slice=None, non0tab=None, ao_loc=None, cutoff=None, + out=None, Ls=None, rcut=None): + r'''Evaluate PBC-AO function value on the given grids, + + Args: + eval_name : str + + ========================== ======================= + Function Expression + ========================== ======================= + "GTOval_sph" \sum_T exp(ik*T) |AO> + "GTOval_ip_sph" nabla \sum_T exp(ik*T) |AO> + "GTOval_cart" \sum_T exp(ik*T) |AO> + "GTOval_ip_cart" nabla \sum_T exp(ik*T) |AO> + ========================== ======================= + + atm : int32 ndarray + libcint integral function argument + bas : int32 ndarray + libcint integral function argument + env : float64 ndarray + libcint integral function argument + + coords : 2D array, shape (N,3) + The coordinates of the grids. + + Kwargs: + shls_slice : 2-element list + (shl_start, shl_end). + If given, only part of AOs (shl_start <= shell_id < shl_end) are + evaluated. By default, all shells defined in cell will be evaluated. + non0tab : 2D bool array + mask array to indicate whether the AO values are zero. The mask + array can be obtained by calling :func:`dft.gen_grid.make_mask` + cutoff : float + AO values smaller than cutoff will be set to zero. 
The default + cutoff threshold is ~1e-22 (defined in gto/grid_ao_drv.h) + out : ndarray + If provided, results are written into this array. + + Returns: + A list of 2D (or 3D) arrays to hold the AO values on grids. + + WARNING : only support gamma point calculation !!!! + + ''' + + if eval_name is None: + if cell.cart: + eval_name = 'GTOval_cart_deriv%d' % 0 + else: + eval_name = 'GTOval_sph_deriv%d' % 0 + + if eval_name[:3] == 'PBC': # PBCGTOval_xxx + eval_name, comp = _get_intor_and_comp(cell, eval_name[3:], comp) + else: + eval_name, comp = _get_intor_and_comp(cell, eval_name, comp) + eval_name = 'PBC' + eval_name + + assert comp == 1 + + atm = numpy.asarray(cell._atm, dtype=numpy.int32, order='C') + bas = numpy.asarray(cell._bas, dtype=numpy.int32, order='C') + env = numpy.asarray(cell._env, dtype=numpy.double, order='C') + natm = atm.shape[0] + nbas = bas.shape[0] + if kpts is None: + if kpt is not None: + raise RuntimeError('kpt should be a list of k-points') + kpts_lst = numpy.reshape(kpt, (1,3)) + else: + kpts_lst = numpy.zeros((1,3)) + else: + kpts_lst = numpy.reshape(kpts, (-1,3)) + nkpts = len(kpts_lst) + ngrids = len(coords) + + assert kpts_lst.shape[0] == 1 + + # print("kpts_lst = ", kpts_lst) + + if non0tab is None: + non0tab = numpy.empty(((ngrids+BLKSIZE-1)//BLKSIZE, nbas), + dtype=numpy.uint8) +# non0tab stores the number of images to be summed in real space. +# Initializing it to 255 means all images should be included + non0tab[:] = 0xff + + if ao_loc is None: + ao_loc = moleintor.make_loc(bas, eval_name) + if shls_slice is None: + shls_slice = (0, nbas) + sh0, sh1 = shls_slice + nao = ao_loc[sh1] - ao_loc[sh0] + + if out is None: + out = numpy.empty((nkpts,comp,nao,ngrids), dtype=numpy.complex128) # NOTE THE definition of the shape! + else: + # print("out is given") + out = numpy.ndarray((nkpts,comp,nao,ngrids), dtype=numpy.complex128, + buffer=out) + coords = numpy.asarray(coords, order='F') + + # For atoms near the boundary of the cell, it is necessary (even in low- + # dimensional systems) to include lattice translations in all 3 dimensions. + if Ls is None: + if cell.dimension < 2 or cell.low_dim_ft_type == 'inf_vacuum': + Ls = cell.get_lattice_Ls(dimension=cell.dimension) + else: + Ls = cell.get_lattice_Ls(dimension=3) + Ls = Ls[numpy.argsort(lib.norm(Ls, axis=1))] + expLk = numpy.exp(1j * numpy.asarray(numpy.dot(Ls, kpts_lst.T), order='C')) + if rcut is None: + rcut = _estimate_rcut(cell) + + with cell.with_integral_screen(cutoff): + drv = getattr(libpbc, eval_name) + drv(ctypes.c_int(ngrids), + (ctypes.c_int*2)(*shls_slice), ao_loc.ctypes.data_as(ctypes.c_void_p), + Ls.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(len(Ls)), + expLk.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nkpts), + out.ctypes.data_as(ctypes.c_void_p), + coords.ctypes.data_as(ctypes.c_void_p), + rcut.ctypes.data_as(ctypes.c_void_p), + non0tab.ctypes.data_as(ctypes.c_void_p), + atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(natm), + bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nbas), + env.ctypes.data_as(ctypes.c_void_p)) + + out = out[0] + out = z2d_InPlace(out) + return out[0] + + diff --git a/pyscf/isdf/isdf_fast.py b/pyscf/isdf/isdf_fast.py new file mode 100644 index 000000000..0ec0950dc --- /dev/null +++ b/pyscf/isdf/isdf_fast.py @@ -0,0 +1,1218 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Ning Zhang
+#
+
+############ sys module ############
+
+import copy
+import numpy as np
+import ctypes
+
+############ pyscf module ############
+
+from pyscf import lib
+from pyscf.lib import logger
+import pyscf.pbc.gto as pbcgto
+from pyscf.pbc.gto import Cell
+from pyscf.pbc import tools
+from pyscf.pbc.lib.kpts import KPoints
+from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, member
+from pyscf.gto.mole import *
+from pyscf.pbc.dft import multigrid
+
+############ isdf utils ############
+
+from pyscf.isdf.isdf_jk import _benchmark_time
+import pyscf.isdf.isdf_ao2mo as isdf_ao2mo
+import pyscf.isdf.isdf_jk as isdf_jk
+from pyscf.isdf.isdf_eval_gto import ISDF_eval_gto
+from pyscf.isdf.isdf_tools_kSampling import _kmesh_to_Kpoints
+libisdf = lib.load_library('libisdf')
+
+############ global variables ############
+
+BASIS_CUTOFF = 1e-18  # too small a value may lead to numerical instability
+CRITERION_CALL_PARALLEL_QR = 256
+
+############ subroutines --- select IP and build aux basis ############
+
+def _select_IP_direct(mydf, c:int, m:int, first_natm=None, global_IP_selection=True,
+                      aoR_cutoff = None,
+                      rela_cutoff = 0.0,
+                      no_retriction_on_nIP = False,
+                      use_mpi=False):
+    r''' Select the interpolation points (IP) based on the given criteria.
+
+    Args:
+        mydf : object
+            The interpolative separable density fitting (ISDF) object.
+
+        c : int
+            If rela_cutoff is None or 0.0, controls the number of IPs:
+            at most c * nao points are selected.
+
+        m : int
+            Controls the oversampling of the randomized projection: for each
+            atom, naux_now = int(sqrt(c * nao_atm)) + m random projections
+            are used when screening candidate points.
+
+        rela_cutoff : float
+            The relative cutoff value for IP selection.
+            IPs with values smaller than rela_cutoff * max_QR_value will not be selected.
+            Default is 0.0 (no control via QR values).
+
+    Kwargs:
+        first_natm : int
+            The number of atoms to be considered for IP selection.
+            If not given, all atoms will be considered.
+            It should only be set by the ISDF class with k-sampling, where
+            first_natm is the number of atoms in the first (primitive) cell.
+
+        global_IP_selection : bool
+            Whether to perform global IP selection.
+            If True, IPs will be re-selected after the individual selection of each atom.
+            Default is True.
+
+        aoR_cutoff : float
+            The cutoff value for AO values.
+            Points whose max AO value is smaller than this cutoff will not be
+            considered for IP selection. Default is None.
+
+        no_retriction_on_nIP : bool
+            Whether to remove the restriction on the number of IPs.
+            If True, there will be no limit on the number of selected IPs.
+            Default is False.
+
+        use_mpi : bool
+            Whether to use MPI for parallel computation.
+            Default is False.
+
+    Returns:
+        selected_IP : list
+            The list of selected interpolation points.
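+
+    Example:
+        An illustrative sketch (assuming ``mydf`` is a built ISDF object whose
+        real-space grid partition is available):
+
+        >>> IP_ID = _select_IP_direct(mydf, c=15, m=5, rela_cutoff=1e-3)
+        >>> aoRg = ISDF_eval_gto(mydf.cell, coords=mydf.coords[IP_ID])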
+ + Ref: + + (1) Sandeep2022 https://pubs.acs.org/doi/10.1021/acs.jctc.2c00720 + + ''' + + if use_mpi: + from isdf_tools_mpi import rank, comm_size, comm, allgather, bcast + if rank == 0: + logger.debug4(mydf, "_select_IP_direct: num_threads = %d", lib.num_threads()) + else: + rank = 0 + logger.debug4(mydf, "_select_IP_direct: num_threads = %d", lib.num_threads()) + + ### determine the largest grids point of one atm ### + + natm = mydf.cell.natm + nao = mydf.nao + naux_max = 0 + + nao_per_atm = np.zeros((natm), dtype=np.int32) + for i in range(mydf.nao): + atm_id = mydf.ao2atomID[i] + nao_per_atm[atm_id] += 1 + + for nao_atm in nao_per_atm: + naux_max = max(naux_max, int(np.sqrt(c*nao_atm)) + m) + + nthread = lib.num_threads() + + buf_size_per_thread = mydf.get_buffer_size_in_IP_selection(c, m) + buf_size = buf_size_per_thread + + if hasattr(mydf, "IO_buf"): + buf = mydf.IO_buf + else: + buf = np.zeros((buf_size), dtype=np.float64) + mydf.IO_buf = buf + + if buf.size < buf_size: + mydf.IO_buf = np.zeros((buf_size), dtype=np.float64) + buf = mydf.IO_buf + buf_tmp = np.ndarray((buf_size), dtype=np.float64, buffer=buf) + + ### loop over atm ### + + from pyscf.pbc.dft.multigrid.multigrid_pair import MultiGridFFTDF2 + + df_tmp = MultiGridFFTDF2(mydf.cell) + grids = df_tmp.grids + coords = np.asarray(grids.coords).reshape(-1,3) + assert coords is not None + + results = [] + + fn_colpivot_qr = getattr(libisdf, "ColPivotQRRelaCut", None) + assert(fn_colpivot_qr is not None) + fn_ik_jk_ijk = getattr(libisdf, "NP_d_ik_jk_ijk", None) + assert(fn_ik_jk_ijk is not None) + + weight = np.sqrt(mydf.cell.vol / coords.shape[0]) + + for p0, p1 in lib.prange(0, 1, 1): + + taskinfo = [] + + # clear buffer + + if first_natm is None: + first_natm = natm + + for atm_id in range(first_natm): + + if use_mpi: + if atm_id % comm_size != rank: + continue + + buf_tmp[:buf_size_per_thread] = 0.0 + + grid_ID = np.where(mydf.partition == atm_id)[0] + + offset = 0 + aoR_atm = np.ndarray((nao, grid_ID.shape[0]), dtype=np.complex128, buffer=buf_tmp, offset=offset) + aoR_atm = ISDF_eval_gto(mydf.cell, coords=coords[grid_ID], out=aoR_atm) * weight + + nao_tmp = nao + + if aoR_cutoff is not None: + logger.debug4(mydf, "_select_IP_direct: aoR_cutoff = %12.6e", aoR_cutoff) + max_row = np.max(np.abs(aoR_atm), axis=1) + where = np.where(max_row > mydf.aoR_cutoff)[0] + aoR_atm = aoR_atm[where] + nao_tmp = aoR_atm.shape[0] + + # create buffer for this atm + + dtypesize = buf.dtype.itemsize + + offset += nao_tmp*grid_ID.shape[0] * dtypesize + + nao_atm = nao_per_atm[atm_id] + naux_now = int(np.sqrt(c*nao_atm)) + m + naux2_now = naux_now * naux_now + + R = np.ndarray((naux2_now, grid_ID.shape[0]), dtype=np.float64) + offset += naux2_now*grid_ID.shape[0] * dtypesize + + aoR_atm1 = np.ndarray((naux_now, grid_ID.shape[0]), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += naux_now*grid_ID.shape[0] * dtypesize + + aoR_atm2 = np.ndarray((naux_now, grid_ID.shape[0]), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += naux_now*grid_ID.shape[0] * dtypesize + + aoPairBuffer = np.ndarray( + (naux_now*naux_now, grid_ID.shape[0]), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += naux_now*naux_now*grid_ID.shape[0] * dtypesize + + G1 = np.random.rand(nao_tmp, naux_now) + G1, _ = numpy.linalg.qr(G1) + G1 = G1.T + G2 = np.random.rand(nao_tmp, naux_now) + G2, _ = numpy.linalg.qr(G2) + G2 = G2.T + + lib.dot(G1, aoR_atm, c=aoR_atm1) + lib.dot(G2, aoR_atm, c=aoR_atm2) + + 
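+            # aoPairBuffer[(a,b), g] = (G1 @ aoR_atm)[a,g] * (G2 @ aoR_atm)[b,g]
+            # is a randomized sketch of the AO-pair matrix on this atom's grid
+            # points; the column-pivoted QR below selects its pivot columns
+            # (i.e. grid points) as the interpolation points (cf. Sandeep2022).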
fn_ik_jk_ijk(aoR_atm1.ctypes.data_as(ctypes.c_void_p), + aoR_atm2.ctypes.data_as(ctypes.c_void_p), + aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux_now), + ctypes.c_int(naux_now), + ctypes.c_int(grid_ID.shape[0])) + if global_IP_selection: + if no_retriction_on_nIP: + max_rank = min(naux2_now, grid_ID.shape[0]) + else: + max_rank = min(naux2_now, grid_ID.shape[0], nao_atm * c + m) + else: + if no_retriction_on_nIP: + max_rank = min(naux2_now, grid_ID.shape[0]) + else: + max_rank = min(naux2_now, grid_ID.shape[0], nao_atm * c) + + npt_find = ctypes.c_int(0) + pivot = np.arange(grid_ID.shape[0], dtype=np.int32) + thread_buffer = np.ndarray((nthread+1, grid_ID.shape[0]+1), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += (nthread+1)*(grid_ID.shape[0]+1) * dtypesize + global_buffer = np.ndarray((1, grid_ID.shape[0]), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += grid_ID.shape[0] * dtypesize + + fn_colpivot_qr(aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux2_now), + ctypes.c_int(grid_ID.shape[0]), + ctypes.c_int(max_rank), + ctypes.c_double(1e-14), + ctypes.c_double(rela_cutoff), + pivot.ctypes.data_as(ctypes.c_void_p), + R.ctypes.data_as(ctypes.c_void_p), + ctypes.byref(npt_find), + thread_buffer.ctypes.data_as(ctypes.c_void_p), + global_buffer.ctypes.data_as(ctypes.c_void_p)) + + npt_find = npt_find.value + + cutoff = abs(R[npt_find-1, npt_find-1]) + pivot = pivot[:npt_find] + pivot.sort() + results.extend(list(grid_ID[pivot])) + + logger.debug4(mydf, "_select_IP_direct: ngrid = %d, npt_find = %d, cutoff = %12.6e", grid_ID.shape[0], npt_find, cutoff) + + if use_mpi: + comm.Barrier() + results = allgather(results) + results.sort() + + ### global IP selection, we can use this step to avoid numerical issue ### + + ### but this step is not necessary if locality is explored ### + + if global_IP_selection and rank == 0: + + #if mydf.verbose: + # print("global IP selection") + + bufsize = mydf.get_buffer_size_in_global_IP_selection(len(results), c, m) + + if buf.size < bufsize: + mydf.IO_buf = np.zeros((bufsize), dtype=np.float64) + buf = mydf.IO_buf + if mydf.verbose: + print("reallocate buf of size = ", bufsize) + + dtypesize = buf.dtype.itemsize + + buf_tmp = np.ndarray((bufsize), dtype=np.float64, buffer=buf) + + offset = 0 + aoRg = np.ndarray((nao, len(results)), dtype=np.complex128, buffer=buf_tmp) + aoRg = ISDF_eval_gto(mydf.cell, coords=coords[results], out=aoRg) * weight + + offset += nao*len(results) * dtypesize + + naux_now = int(np.sqrt(c*nao)) + m + naux2_now = naux_now * naux_now + + R = np.ndarray((naux2_now, len(results)), dtype=np.float64) + offset += naux2_now*len(results) * dtypesize + + aoRg1 = np.ndarray((naux_now, len(results)), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += naux_now*len(results) * dtypesize + + aoRg2 = np.ndarray((naux_now, len(results)), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += naux_now*len(results) * dtypesize + + aoPairBuffer = np.ndarray( + (naux_now*naux_now, len(results)), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += naux_now*naux_now*len(results) * dtypesize + + G1 = np.random.rand(nao, naux_now) + G1, _ = numpy.linalg.qr(G1) + G1 = G1.T + G2 = np.random.rand(nao, naux_now) + G2, _ = numpy.linalg.qr(G2) + G2 = G2.T + + lib.dot(G1, aoRg, c=aoRg1) + lib.dot(G2, aoRg, c=aoRg2) + + fn_ik_jk_ijk(aoRg1.ctypes.data_as(ctypes.c_void_p), + aoRg2.ctypes.data_as(ctypes.c_void_p), + aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux_now), 
+ ctypes.c_int(naux_now), + ctypes.c_int(len(results))) + + nao_first = np.sum(nao_per_atm[:first_natm]) + + if no_retriction_on_nIP: + max_rank = min(naux2_now, len(results)) + else: + max_rank = min(naux2_now, len(results), nao_first * c) + + npt_find = ctypes.c_int(0) + pivot = np.arange(len(results), dtype=np.int32) + thread_buffer = np.ndarray((nthread+1, len(results)+1), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += (nthread+1)*(len(results)+1) * dtypesize + global_buffer = np.ndarray((1, len(results)), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += len(results) * dtypesize + + fn_colpivot_qr(aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux2_now), + ctypes.c_int(len(results)), + ctypes.c_int(max_rank), + ctypes.c_double(1e-14), + ctypes.c_double(rela_cutoff), + pivot.ctypes.data_as(ctypes.c_void_p), + R.ctypes.data_as(ctypes.c_void_p), + ctypes.byref(npt_find), + thread_buffer.ctypes.data_as(ctypes.c_void_p), + global_buffer.ctypes.data_as(ctypes.c_void_p)) + npt_find = npt_find.value + + cutoff = abs(R[npt_find-1, npt_find-1]) + pivot = pivot[:npt_find] + + pivot.sort() + + results = np.array(results, dtype=np.int32) + results = list(results[pivot]) + + logger.debug4(mydf, "_select_IP_direct: ngrid = %d, npt_find = %d, cutoff = %12.6e", len(results), npt_find, cutoff) + + if global_IP_selection and use_mpi: + results = bcast(results) + + return results + +def build_aux_basis(mydf, debug=True, use_mpi=False): + '''build the auxiliary basis for ISDF given IP_ID and aoR. + ''' + + if use_mpi: + from isdf_tools_mpi import rank, bcast, comm + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + # allocate memory for the auxiliary basis + + naux = mydf.IP_ID.shape[0] + mydf.naux = naux + mydf._allocate_jk_buffer(datatype=np.double) + buffer1 = np.ndarray((mydf.naux , mydf.naux), dtype=np.double, buffer=mydf.jk_buffer, offset=0) + + nao = mydf.nao + IP_ID = mydf.IP_ID + aoR = mydf.aoR + + if not hasattr(mydf, "aoRg") or mydf.aoRg is None: + aoRg = numpy.empty((mydf.nao, mydf.IP_ID.shape[0])) + lib.dslice(aoR, IP_ID, out=aoRg) + else: + aoRg = mydf.aoRg + + e = None + h = None + + if not use_mpi or (use_mpi and rank == 0): + A = np.asarray(lib.ddot(aoRg.T, aoRg, c=buffer1), order='C') # buffer 1 size = naux * naux + lib.square_inPlace(A) + + t11 = (lib.logger.process_clock(), lib.logger.perf_counter()) + e, h = scipy.linalg.eigh(A) + t12 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + _benchmark_time(t11, t12, "diag_A", mydf) + + logger.debug4(mydf, "build_aux_basis: condition number = %12.6e", e[-1]/e[0]) + + where = np.where(e > e[-1]*1e-16)[0] + e = e[where] + h = h[:, where] + + if use_mpi: + e = bcast(e) + h = bcast(h) + + mydf.aux_basis = np.asarray(lib.ddot(aoRg.T, aoR), order='C') # buffer 2 size = naux * ngrids + lib.square_inPlace(mydf.aux_basis) + + #fn_build_aux = getattr(libisdf, "Solve_LLTEqualB_Parallel", None) + #assert(fn_build_aux is not None) + + nThread = lib.num_threads() + nGrids = aoR.shape[1] + Bunchsize = nGrids // nThread + + buffer2 = np.ndarray((e.shape[0] , mydf.aux_basis.shape[1]), dtype=np.double, buffer=mydf.jk_buffer, + offset=mydf.naux * mydf.naux * mydf.jk_buffer.dtype.itemsize) + B = np.asarray(lib.ddot(h.T, mydf.aux_basis, c=buffer2), order='C') + lib.d_i_ij_ij(1.0/e, B, out=B) + np.asarray(lib.ddot(h, B, c=mydf.aux_basis), order='C') + + if use_mpi: + comm.Barrier() + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + _benchmark_time(t1, t2, 
"build_auxiliary_basis", mydf) + + mydf.naux = naux + mydf.aoRg = aoRg + +from pyscf.pbc import df + +class PBC_ISDF_Info(df.fft.FFTDF): + ''' Interpolative separable density fitting (ISDF) for periodic systems. + Not recommended as the locality is not explored! + + Examples: + + >>> #### code to construct aoR ommited ### + >>> aoR *= np.sqrt(cell.vol / ngrids) + >>> pbc_isdf = PBC_ISDF_Info(cell, aoR=aoR) + >>> pbc_isdf.build_IP_Sandeep(build_global_basis=True, c=C, global_IP_selection=False) + >>> pbc_isdf.build_auxiliary_Coulomb() + >>> from pyscf.pbc import scf + >>> mf = scf.RHF(cell) + >>> pbc_isdf.direct_scf = mf.direct_scf + >>> mf.with_df = pbc_isdf + >>> mf.verbose = 0 + >>> mf.kernel() + + ''' + + def __init__(self, mol:Cell, + aoR: np.ndarray = None, ## convention: aoR is scaled by np.sqrt(mol.vol / ngrids) + with_robust_fitting=True, + kmesh=None, + get_partition=True, + verbose = None + ): + + if kmesh == None: + kmesh = numpy.asarray([1,1,1], dtype=numpy.int32) + KPoints = _kmesh_to_Kpoints(mol, kmesh) ### WARNING: this subroutine is not correct ! + + super().__init__(cell=mol, kpts=KPoints) + + if verbose is not None: + self.verbose = verbose + + ## the following variables are used in build_sandeep + + self.with_robust_fitting = with_robust_fitting + + self.IP_ID = None + self.aux_basis = None + self.c = None + self.naux = None + self.W = None + self.aoRg = None + self.aoR = aoR + self.grid_begin = 0 + if aoR is not None: + self.aoRT = aoR.T + else: + self.aoRT = None + self.V_R = None + self.cell = mol + self.mesh = mol.mesh + + self.partition = None + + self.natm = mol.natm + self.nao = mol.nao_nr() + + from pyscf.pbc.dft.multigrid.multigrid_pair import MultiGridFFTDF2 + + logger.info(self, "PBC_ISDF_Info: mol.ke_cutoff = %f", mol.ke_cutoff) + + df_tmp = MultiGridFFTDF2(mol) + + if aoR is None: + # df_tmp = MultiGridFFTDF2(mol) + self.coords = np.asarray(df_tmp.grids.coords).reshape(-1,3) + self.ngrids = self.coords.shape[0] + else: + self.ngrids = aoR.shape[1] + assert self.nao == aoR.shape[0] + + self.grid_end = self.ngrids + + ## preallocated buffer for parallel calculation + + self.jk_buffer = None + self.ddot_buf = None + + ao2atomID = np.zeros(self.nao, dtype=np.int32) + ao2atomID = np.zeros(self.nao, dtype=np.int32) + + # only valid for spherical GTO + + ao_loc = 0 + for i in range(mol._bas.shape[0]): + atm_id = mol._bas[i, ATOM_OF] + nctr = mol._bas[i, NCTR_OF] + angl = mol._bas[i, ANG_OF] + nao_now = nctr * (2 * angl + 1) # NOTE: sph basis assumed! 
+ ao2atomID[ao_loc:ao_loc+nao_now] = atm_id + ao_loc += nao_now + + self.ao2atomID = ao2atomID + + # given aoG, determine at given grid point, which ao has the maximal abs value + + if aoR is not None: + self.partition = np.argmax(np.abs(aoR), axis=0) + # map aoID to atomID + self.partition = np.asarray([ao2atomID[x] for x in self.partition]) + grids = df_tmp.grids + self.coords = np.asarray(grids.coords).reshape(-1,3) + self._numints = df_tmp._numint + else: + grids = df_tmp.grids + coords = np.asarray(grids.coords).reshape(-1,3) + NumInts = df_tmp._numint + + coords_now = coords + + if kmesh is not None: + + mesh = mol.mesh + meshPrim = np.array(mesh, dtype=np.int32) // kmesh + coords_now = coords_now.reshape(kmesh[0], meshPrim[0], kmesh[1], meshPrim[1], kmesh[2], meshPrim[2], 3) + coords_now = coords_now.transpose(0, 2, 4, 1, 3, 5, 6).reshape(-1, 3) + coords_now = coords_now[:np.prod(meshPrim), :] + + self.partition = np.zeros(coords_now.shape[0], dtype=np.int32) + + from pyscf.isdf.isdf_eval_gto import ISDF_eval_gto + + if hasattr(self, "IO_buf"): + logger.debug4(self, "PBC_ISDF_Info: IO_buf is already allocated") + else: + logger.debug4(self, "PBC_ISDF_Info: IO_buf is not allocated") + max_memory = max(2000, self.max_memory-lib.current_memory()[0]) + self.IO_buf = np.zeros((int(max_memory*1e6//8),), dtype=np.double) + + logger.debug4(self, "PBC_ISDF_Info: IO_buf.size = %d", self.IO_buf.size) + logger.debug4(self, "PBC_ISDF_Info: coords.shape[0] = %d", coords_now.shape[0]) + logger.debug4(self, "PBC_ISDF_Info: self.nao = %d", self.nao) + + bufsize = min(self.IO_buf.size, 4*1e9/8) // 2 + bunchsize = int(bufsize / (self.nao)) + + assert bunchsize > 0 + + if get_partition and aoR is None: + for p0, p1 in lib.prange(0, coords_now.shape[0], bunchsize): + AoR_Buf = np.ndarray((self.nao, p1-p0), dtype=np.complex128, buffer=self.IO_buf, offset=0) + AoR_Buf = ISDF_eval_gto(self.cell, coords=coords_now[p0:p1], out=AoR_Buf) + res = np.argmax(np.abs(AoR_Buf), axis=0) + self.partition[p0:p1] = np.asarray([ao2atomID[x] for x in res]) + AoR_Buf = None + else: + self.partition = None + + res = None + + self.coords = coords + self._numints = NumInts + + ########### attr used in build K directly with cutoff ########### + + self._build_K_rela_cutoff = None + self._build_K_abs_cutoff = None + self._build_K_distance_cutoff = None + + def _allocate_jk_buffer(self, datatype): + + if self.jk_buffer is None: + + nao = self.nao + ngrids = self.ngrids + naux = self.naux + + logger.debug4(self, "_allocate_jk_buffer: nao = %d, ngrids = %d, naux = %d", nao, ngrids, naux) + buffersize_k = nao * ngrids + naux * ngrids + naux * naux + nao * nao + buffersize_j = nao * ngrids + ngrids + nao * naux + naux + naux + nao * nao + + nThreadsOMP = lib.num_threads() + size_ddot_buf = max((naux*naux)+2, ngrids) * nThreadsOMP + + if hasattr(self, "IO_buf"): + + if self.IO_buf.size < (max(buffersize_k, buffersize_j) + size_ddot_buf): + self.IO_buf = np.zeros((max(buffersize_k, buffersize_j) + size_ddot_buf,), dtype=datatype) + + self.jk_buffer = np.ndarray((max(buffersize_k, buffersize_j),), + dtype=datatype, buffer=self.IO_buf, offset=0) + offset = max(buffersize_k, buffersize_j) * self.jk_buffer.dtype.itemsize + self.ddot_buf = np.ndarray((nThreadsOMP, max((nao*nao)+2, ngrids)), + dtype=datatype, buffer=self.IO_buf, offset=offset) + + else: + + self.jk_buffer = np.ndarray((max(buffersize_k, buffersize_j),), dtype=datatype) + self.ddot_buf = np.zeros((nThreadsOMP, max((nao*nao)+2, ngrids)), dtype=datatype) + + else: + assert 
self.jk_buffer.dtype == datatype
+            assert self.ddot_buf.dtype == datatype
+
+    def set_build_K_cutoff(self, rela_cutoff=None, abs_cutoff=None):
+        ''' set the relative/absolute cutoffs for building the K matrix directly.
+        '''
+        self._build_K_rela_cutoff = rela_cutoff
+        self._build_K_abs_cutoff = abs_cutoff
+
+    def set_build_K_distance_cutoff(self, distance_cutoff=None):
+        ''' set the distance cutoff for building the K matrix directly.
+        '''
+        self._build_K_distance_cutoff = distance_cutoff
+
+    def build(self):
+        raise NotImplementedError
+
+    def build_only_partition(self):
+        raise NotImplementedError
+
+    def get_buffer_size_in_IP_selection(self, c, m=5):
+        natm = self.cell.natm
+        nao_per_atm = np.zeros((natm), dtype=np.int32)
+        for i in range(self.nao):
+            atm_id = self.ao2atomID[i]
+            nao_per_atm[atm_id] += 1
+
+        naux_max = 0
+        for nao_atm in nao_per_atm:
+            naux_max = max(naux_max, int(np.sqrt(c*nao_atm)) + m)
+
+        ngrid_on_atm = np.zeros((self.cell.natm), dtype=np.int32)
+        for atm_id in self.partition:
+            ngrid_on_atm[atm_id] += 1
+
+        naux_max2 = naux_max * naux_max
+
+        ngrid_on_atm = np.max(ngrid_on_atm)
+
+        nThread = lib.num_threads()
+
+        buf_size  = self.nao*ngrid_on_atm               # aoR_atm
+        buf_size += naux_max2*ngrid_on_atm              # R
+        buf_size += naux_max*ngrid_on_atm*2             # aoR_atm1, aoR_atm2
+        buf_size += naux_max*naux_max*ngrid_on_atm      # aoPairBuffer
+        buf_size += (nThread+1)*(ngrid_on_atm+1)
+        buf_size += ngrid_on_atm
+
+        return max(buf_size, 2*self.nao*ngrid_on_atm)
+
+    def get_buffer_size_in_global_IP_selection(self, ngrids_possible, c, m=5):
+
+        nao = self.nao
+        naux_max = int(np.sqrt(c*nao)) + m
+        ngrids_now = ngrids_possible
+        naux_max2 = naux_max * naux_max
+
+        nThread = lib.num_threads()
+
+        buf_size  = self.nao*ngrids_now                 # aoR_atm
+        buf_size += naux_max2*ngrids_now                # R
+        buf_size += naux_max*ngrids_now*2               # aoR_atm1, aoR_atm2
+        buf_size += naux_max*naux_max*ngrids_now        # aoPairBuffer
+        buf_size += (nThread+1)*(ngrids_now+1)
+        buf_size += ngrids_now
+
+        return max(buf_size, 2*self.nao*ngrids_now)
+
+    def get_A_B(self):
+        '''The auxiliary basis is constructed by solving AX = B.
+        '''
+
+        aoR = self.aoR
+        IP_ID = self.IP_ID
+        aoRG = aoR[:, IP_ID]
+
+        A = np.asarray(lib.dot(aoRG.T, aoRG), order='C')
+        A = A ** 2
+        B = np.asarray(lib.dot(aoRG.T, aoR), order='C')
+        B = B ** 2
+
+        return A, B
+
+    def build_IP_Sandeep(self, c=5, m=5,
+                         global_IP_selection=True,
+                         build_global_basis=True,
+                         IP_ID=None,
+                         debug=True):
+        ''' Select the interpolation points (IPs) with Sandeep's method.
+        Ref:
+            (1) Sandeep2022 https://pubs.acs.org/doi/10.1021/acs.jctc.2c00720
+        '''
+
+        # build partition
+
+        ao2atomID = self.ao2atomID
+        partition = self.partition
+        aoR = self.aoR
+        natm = self.natm
+        nao = self.nao
+
+        # for each atm
+
+        if not hasattr(self, "use_mpi"):
+            self.use_mpi = False
+        rank = 0
+
+        t1 = (lib.logger.process_clock(), lib.logger.perf_counter())
+
+        if IP_ID is None:
+            IP_ID = _select_IP_direct(self, c, m, global_IP_selection=global_IP_selection, use_mpi=self.use_mpi)
+            IP_ID.sort()
+            IP_ID = np.array(IP_ID, dtype=np.int32)
+        self.IP_ID = np.array(IP_ID, dtype=np.int32)
+
+        t2 = (lib.logger.process_clock(), lib.logger.perf_counter())
+        if rank == 0:
+            _benchmark_time(t1, t2, "build_IP", self)
+        t1 = t2
+
+        # build the auxiliary basis
+
+        self.c = c
+        build_aux_basis(self)
+
+    def build_auxiliary_Coulomb(self, cell:Cell = None, mesh=None, debug=True):
+        ''' Build the V and W matrices; see eq. (13) of Sandeep2022.
+
+        Ref:
+            (1) Sandeep2022 https://pubs.acs.org/doi/10.1021/acs.jctc.2c00720
+        '''
+
+        self._allocate_jk_buffer(datatype=np.double)
+
+        # build the ddot buffer
+
+        naux = self.naux
+
+        if cell is None:
+            cell = self.cell
+        if mesh is None:
+            mesh = self.cell.mesh
+
+        def construct_V_CCode(aux_basis:np.ndarray, mesh, coul_G):
+
+            coulG_real = coul_G.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1)
+            nThread = lib.num_threads()
+            bunchsize = naux // (2*nThread)
+            bufsize_per_thread = bunchsize * coulG_real.shape[0] * 2
+            bufsize_per_thread = (bufsize_per_thread + 15) // 16 * 16
+            nAux = aux_basis.shape[0]
+            ngrids = aux_basis.shape[1]
+            mesh_int32 = np.array(mesh, dtype=np.int32)
+
+            V = np.zeros((nAux, ngrids), dtype=np.double)
+
+            fn = getattr(libisdf, "_construct_V", None)
+            assert(fn is not None)
+
+            fn(mesh_int32.ctypes.data_as(ctypes.c_void_p),
+               ctypes.c_int(nAux),
+               aux_basis.ctypes.data_as(ctypes.c_void_p),
+               coulG_real.ctypes.data_as(ctypes.c_void_p),
+               V.ctypes.data_as(ctypes.c_void_p),
+               ctypes.c_int(bunchsize),
+               self.jk_buffer.ctypes.data_as(ctypes.c_void_p),
+               ctypes.c_int(bufsize_per_thread))
+
+            return V
+
+        t1 = (lib.logger.process_clock(), lib.logger.perf_counter())
+
+        coulG = tools.get_coulG(cell, mesh=mesh)
+
+        V_R = construct_V_CCode(self.aux_basis, mesh, coulG)
+
+        t2 = (lib.logger.process_clock(), lib.logger.perf_counter())
+        if debug:
+            _benchmark_time(t1, t2, "build_auxiliary_Coulomb_V_R", self)
+        t1 = t2
+
+        W = lib.ddot(a=self.aux_basis, b=V_R.T)
+
+        self.coulG = coulG.copy()
+
+        t2 = (lib.logger.process_clock(), lib.logger.perf_counter())
+        if debug:
+            _benchmark_time(t1, t2, "build_auxiliary_Coulomb_W", self)
+
+        self.V_R = V_R
+        self.W = W
+        self.mesh = mesh
+
+    def check_AOPairError(self):
+
+        assert(self.aoR is not None)
+        assert(self.IP_ID is not None)
+        assert(self.aux_basis is not None)
+
+        aoR = self.aoR
+        aoRg = aoR[:, self.IP_ID]
+        nao = self.nao
+
+        logger.debug4(self, "check_AOPairError")
+
+        for i in range(nao):
+
+            coeff = numpy.einsum('k,jk->jk', aoRg[i, :], aoRg).reshape(-1, self.IP_ID.shape[0])
+            aoPair = numpy.einsum('k,jk->jk', aoR[i, :], aoR).reshape(-1, aoR.shape[1])
+            aoPair_approx = coeff @ self.aux_basis
+
+            diff = aoPair - aoPair_approx
+            diff_pair_abs_max = np.max(np.abs(diff), axis=1)
+
+            for j in range(diff_pair_abs_max.shape[0]):
+                logger.debug4(self, "(%5d, %5d, %15.8e)", i, j, diff_pair_abs_max[j])
+
+    def __del__(self):
+        return
+
+    @property
+    def kpt(self):
+        return np.zeros(3)
+
+    def get_pp(self, kpts=None):
+        if hasattr(self, "PP") and self.PP is not None:
+            return self.PP
+        else:
+
+            use_super_pp = False
+
+            if hasattr(self, "_use_super_pp"):
+                if self._use_super_pp:
+                    use_super_pp = True
+                    t0 = (lib.logger.process_clock(), lib.logger.perf_counter())
+                    self.PP = super().get_pp(kpts=np.zeros(3))
+                    t1 = (lib.logger.process_clock(), lib.logger.perf_counter())
+            if not use_super_pp:
+                t0 = (lib.logger.process_clock(), lib.logger.perf_counter())
+                cell = self.cell.copy()
+                cell.omega = 0.0
+                if hasattr(self, "ke_cutoff_pp"):
+                    cell.ke_cutoff = self.ke_cutoff_pp
+                cell.build()
+                df_tmp = multigrid.MultiGridFFTDF2(cell)
+                v_pp_loc2_nl = df_tmp.get_pp()
+                v_pp_loc1_G = df_tmp.vpplocG_part1
+                v_pp_loc1 = multigrid.multigrid_pair._get_j_pass2(df_tmp, v_pp_loc1_G)
+                self.PP = (v_pp_loc1 + v_pp_loc2_nl)[0]
+                t1 = (lib.logger.process_clock(), lib.logger.perf_counter())
+
+            if self.use_mpi:
+                from pyscf.isdf.isdf_tools_mpi import
rank + if rank == 0: + _benchmark_time(t0, t1, "get_pp", self) + else: + _benchmark_time(t0, t1, "get_pp", self) + + #### kpts #### + + if kpts is not None: + + nkpts = kpts.shape[0] + + if hasattr(self, "kmesh") and self.kmesh is not None: + pass + else: + self.kmesh = np.asarray([1,1,1], dtype=np.int32) + kmesh = np.asarray(self.kmesh, dtype=np.int32) + assert kpts.shape[0] == np.prod(self.kmesh, dtype=np.int32) or kpts.shape[0] == 1 or kpts.ndim == 1 + is_single_kpt = kpts.shape[0] == 1 or kpts.ndim == 1 + + if is_single_kpt: + #### use the calculated one by default #### + if self.use_mpi: + from pyscf.isdf.isdf_tools_mpi import bcast + self.PP = bcast(self.PP, root = 0) + return self.PP + + #### the following is used to test KRHF #### + + ### info used in super().get_pp() ### + + assert hasattr(self, "prim_cell") + + nao_prim = self.cell.nao_nr() // nkpts + assert self.cell.nao_nr() % nkpts == 0 + self.PP = self.PP[:nao_prim, :].copy() + + n_complex = self.kmesh[0] * self.kmesh[1] * (self.kmesh[2]//2+1) + n_cell = np.prod(self.kmesh) + + PP_complex = np.zeros((nao_prim, n_complex * nao_prim), dtype=np.complex128) + PP_real = np.ndarray((nao_prim, n_cell * nao_prim), dtype=np.double, buffer=PP_complex) + PP_real.ravel()[:] = self.PP.ravel() + buf_fft = np.zeros((nao_prim, n_complex, nao_prim), dtype=np.complex128) + + fn1 = getattr(libisdf, "_FFT_Matrix_Col_InPlace", None) + assert fn1 is not None + + fn1( + PP_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + del buf_fft + + from pyscf.isdf.isdf_tools_densitymatrix import pack_JK_in_FFT_space + + PP_complex = PP_complex.conj().copy() + self.PP = pack_JK_in_FFT_space(PP_complex, kmesh, nao_prim) + + if self.use_mpi: + from pyscf.isdf.isdf_tools_mpi import bcast + self.PP = bcast(self.PP, root = 0) + + return self.PP + + def get_nuc(self, kpts=None): + if hasattr(self, "nuc") and self.nuc is not None: + return self.nuc + else: + + t0 = (lib.logger.process_clock(), lib.logger.perf_counter()) + self.nuc = super().get_nuc(kpts=np.zeros(3)) + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + if self.verbose: + _benchmark_time(t0, t1, "get_nuc", self) + + #### kpts #### + + if kpts is not None: + + nkpts = kpts.shape[0] + + if hasattr(self, "kmesh") and self.kmesh is not None: + pass + else: + self.kmesh = np.asarray([1,1,1], dtype=np.int32) + kmesh = np.asarray(self.kmesh, dtype=np.int32) + + assert kpts.shape[0] == np.prod(self.kmesh, dtype=np.int32) or kpts.shape[0] == 1 or kpts.ndim == 1 + + is_single_kpt = kpts.shape[0] == 1 or kpts.ndim == 1 + + if is_single_kpt: + if self.use_mpi: + from pyscf.isdf.isdf_tools_mpi import bcast + self.nuc = bcast(self.nuc, root = 0) + return self.nuc + + #### the following is used in KRHF #### + + ### info used in super().get_pp() ### + + assert hasattr(self, "prim_cell") + + nao_prim = self.cell.nao_nr() // nkpts + assert self.cell.nao_nr() % nkpts == 0 + self.nuc = self.nuc[:nao_prim, :].copy() + + n_complex = self.kmesh[0] * self.kmesh[1] * (self.kmesh[2]//2+1) + n_cell = np.prod(self.kmesh) + + nuc_complex = np.zeros((nao_prim, n_complex * nao_prim), dtype=np.complex128) + nuc_real = np.ndarray((nao_prim, n_cell * nao_prim), dtype=np.double, buffer=nuc_complex) + nuc_real.ravel()[:] = self.nuc.ravel() + buf_fft = np.zeros((nao_prim, n_complex, nao_prim), dtype=np.complex128) + + fn1 = getattr(libisdf, "_FFT_Matrix_Col_InPlace", None) + assert fn1 is not None + 
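+            # the same column-wise FFT packing already applied to self.PP in get_pp above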
+ fn1( + nuc_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + del buf_fft + + from pyscf.isdf.isdf_tools_densitymatrix import pack_JK_in_FFT_space + + nuc_complex = nuc_complex.conj().copy() + self.nuc = pack_JK_in_FFT_space(nuc_complex, kmesh, nao_prim) + + if self.use_mpi: + from pyscf.isdf.isdf_tools_mpi import bcast + self.nuc = bcast(self.nuc, root = 0) + + return self.nuc + + def LS_THC_recompression(self, X:np.ndarray, force_LS_THC=True): + + from isdf_ao2mo import LS_THC + + if force_LS_THC: + self.with_robust_fitting = False + self.force_LS_THC = True + self.W = LS_THC(self, X) / (self.ngrids/self.cell.vol) + self.aoRg = X + self.V_R = None + else: + self.force_LS_THC = False + self.W2 = LS_THC(self, X) / (self.ngrids/self.cell.vol) + self.aoRg2 = X + + def aoRg_full(self): + return self.aoRg, None + + ##### functions defined in isdf_ao2mo.py ##### + + get_eri = get_ao_eri = isdf_ao2mo.get_eri + ao2mo = get_mo_eri = isdf_ao2mo.general + ao2mo_7d = isdf_ao2mo.ao2mo_7d # seems to be only called in kadc and kccsd, NOT implemented! + + ##### functions defined in isdf_jk.py ##### + + get_jk = isdf_jk.get_jk_dm + + + +if __name__ == '__main__': + + C = 15 + + cell = pbcgto.Cell() + boxlen = 3.5668 + cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + cell.atom = ''' + C 0. 0. 0. + C 0.8917 0.8917 0.8917 + C 1.7834 1.7834 0. + C 2.6751 2.6751 0.8917 + C 1.7834 0. 1.7834 + C 2.6751 0.8917 2.6751 + C 0. 1.7834 1.7834 + C 0.8917 2.6751 2.6751 + ''' + +# boxlen = 4.2 +# cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) +# cell.atom = ''' +# Li 0.0 0.0 0.0 +# Li 2.1 2.1 0.0 +# Li 0.0 2.1 2.1 +# Li 2.1 0.0 2.1 +# H 0.0 0.0 2.1 +# H 0.0 2.1 0.0 +# H 2.1 0.0 0.0 +# H 2.1 2.1 2.1 +# ''' + + cell.basis = 'gth-dzvp' + # cell.basis = 'gth-tzvp' + cell.pseudo = 'gth-pade' + cell.verbose = 10 + + # cell.ke_cutoff = 128 # kinetic energy cutoff in a.u. 
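+    # ke_cutoff = 70 matches the setting used in the accompanying isdf example scripts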
+ cell.ke_cutoff = 70 + cell.max_memory = 800 # 800 Mb + cell.precision = 1e-8 # integral precision + cell.use_particle_mesh_ewald = True + + cell.build() + + cell = tools.super_cell(cell, [1, 1, 1]) + + from pyscf.pbc.dft.multigrid.multigrid_pair import MultiGridFFTDF2, _eval_rhoG + + df_tmp = MultiGridFFTDF2(cell) + + grids = df_tmp.grids + coords = np.asarray(grids.coords).reshape(-1,3) + nx = grids.mesh[0] + + mesh = grids.mesh + ngrids = np.prod(mesh) + assert ngrids == coords.shape[0] + + aoR = df_tmp._numint.eval_ao(cell, coords)[0].T # the T is important + aoR *= np.sqrt(cell.vol / ngrids) + + pbc_isdf_info = PBC_ISDF_Info(cell, aoR=aoR) + pbc_isdf_info.build_IP_Sandeep(build_global_basis=True, c=C, global_IP_selection=False) + pbc_isdf_info.build_auxiliary_Coulomb() + + ### perform scf ### + + from pyscf.pbc import scf + + mf = scf.RHF(cell) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 100 + mf.conv_tol = 1e-7 + + print("mf.direct_scf = ", mf.direct_scf) + + mf.kernel() + + print("mf.with_df.IP_ID = ", mf.with_df.IP_ID) + print("mf.with_df.partition = ", mf.with_df.partition) + + for i in range(cell.natm): + print("i = ", i, "partition = ", mf.with_df.partition[mf.with_df.partition == i].shape[0]) + + #exit(1) + + # without robust fitting + + pbc_isdf_info.with_robust_fitting = False + + mf = scf.RHF(cell) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 100 + mf.conv_tol = 1e-7 + mf.kernel() + + mf = scf.RHF(cell) + mf.max_cycle = 100 + mf.conv_tol = 1e-8 + #mf.kernel() + pbc_isdf_info.with_robust_fitting = True + + ##### test the LS_THC_recompression ##### + + _pbc_isdf_info = PBC_ISDF_Info(cell, aoR) + _pbc_isdf_info.build_IP_Sandeep(build_global_basis=True, c=12, global_IP_selection=False) + + pbc_isdf_info.LS_THC_recompression(_pbc_isdf_info.aoRg, force_LS_THC=False) + + mf = scf.RHF(cell) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 10 + mf.conv_tol = 1e-7 + mf.kernel() + + pbc_isdf_info.LS_THC_recompression(_pbc_isdf_info.aoRg) + + mf = scf.RHF(cell) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 10 + mf.conv_tol = 1e-7 + mf.kernel() \ No newline at end of file diff --git a/pyscf/isdf/isdf_jk.py b/pyscf/isdf/isdf_jk.py new file mode 100644 index 000000000..7ebcd5ff5 --- /dev/null +++ b/pyscf/isdf/isdf_jk.py @@ -0,0 +1,598 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +import copy +import numpy as np +import numpy +import ctypes + +from pyscf import lib +from pyscf.lib import logger +from pyscf.pbc import tools +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point +libpbc = lib.load_library('libpbc') + +################################################## +# +# only Gamma Point +# +################################################## + +######### tools ######### + +def _benchmark_time(t1, t2, label, rec): + lib.logger.debug4(rec, "%20s wall time: %12.6f CPU time: %12.6f" % (label, t2[1] - t1[1], t2[0] - t1[0])) + +def _contract_j_dm(mydf, dm, with_robust_fitting=True, use_mpi=False): + ''' + + Args: + mydf : density fitting object + dm : the density matrix + + ''' + + assert use_mpi == False + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + + W = mydf.W + aoRg = mydf.aoRg + aoR = mydf.aoR + ngrid = aoR.shape[1] + + if hasattr(mydf, "V_R"): + V_R = mydf.V_R + else: + V_R = None + naux = aoRg.shape[1] + IP_ID = mydf.IP_ID + + #### step 2. get J term1 and term2 + + buffer = mydf.jk_buffer + buffer1 = np.ndarray((nao,ngrid), dtype=dm.dtype, buffer=buffer, offset=0) + buffer2 = np.ndarray((ngrid), dtype=dm.dtype, buffer=buffer, offset=nao * ngrid * dm.dtype.itemsize) + buffer3 = np.ndarray((nao,naux), dtype=dm.dtype, buffer=buffer, + offset=(nao * ngrid + ngrid) * dm.dtype.itemsize) + buffer4 = np.ndarray((naux), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + ngrid + nao * naux) * dm.dtype.itemsize) + buffer5 = np.ndarray((naux), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + ngrid + nao * naux + naux) * dm.dtype.itemsize) + buffer6 = np.ndarray((nao,nao), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + ngrid + nao * naux + naux + naux) * dm.dtype.itemsize) + buffer7 = np.ndarray((nao,naux), dtype=dm.dtype, buffer=buffer, offset=0) + buffer8 = np.ndarray((naux), dtype=dm.dtype, buffer=buffer, offset=nao * ngrid * dm.dtype.itemsize) + + ## constract dm and aoR + + # need allocate memory, size = nao * ngrid, (buffer 1) + + lib.ddot(dm, aoR, c=buffer1) + tmp1 = buffer1 + + # need allocate memory, size = ngrid, (buffer 2) + + density_R = np.asarray(lib.multiply_sum_isdf(aoR, tmp1, out=buffer2), order='C') + + # need allocate memory, size = nao * naux, (buffer 3) + + # lib.dslice(tmp1, IP_ID, buffer3) + # tmp1 = buffer3 + tmp1 = lib.ddot(dm, aoRg) + + density_Rg = np.asarray(lib.multiply_sum_isdf(aoRg, tmp1, out=buffer4), + order='C') # need allocate memory, size = naux, (buffer 4) + + # This should be the leading term of the computation cost in a single-thread mode. 
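For orientation, the two buffered contractions above assemble the grid densities that drive the Coulomb build. A plain-NumPy reference for the same quantity (an illustrative sketch only; the helper name is hypothetical, and no preallocated buffers are used):

```python
import numpy as np

def grid_density(dm, ao):
    # rho(g) = sum_{i,j} dm[i, j] * ao[i, g] * ao[j, g]
    tmp = dm @ ao                          # (nao, ngrid); the leading cost
    return np.einsum('ig,ig->g', ao, tmp)  # pointwise contraction over AOs

# density_R  corresponds to grid_density(dm, aoR)   (full grid)
# density_Rg corresponds to grid_density(dm, aoRg)  (interpolation points only)
```

Computing `dm @ ao` first keeps the cost at O(nao^2 * ngrid), which is the leading single-thread term noted in the comment above.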
+ + # need allocate memory, size = naux, (buffer 5) + + J = None + + if with_robust_fitting: + J = np.asarray(lib.ddot_withbuffer(V_R, density_R.reshape(-1,1), c=buffer5.reshape(-1,1), buf=mydf.ddot_buf), order='C').reshape(-1) # with buffer, size + + # do not need allocate memory, use buffer 3 + + J = np.asarray(lib.d_ij_j_ij(aoRg, J, out=buffer3), order='C') + + # need allocate memory, size = nao * nao, (buffer 6) + + J = np.asarray(lib.ddot_withbuffer(aoRg, J.T, c=buffer6, buf=mydf.ddot_buf), order='C') + + # do not need allocate memory, use buffer 2 + + J2 = np.asarray(lib.dot(V_R.T, density_Rg.reshape(-1,1), c=buffer2.reshape(-1,1)), order='C').reshape(-1) + + # do not need allocate memory, use buffer 1 + + # J2 = np.einsum('ij,j->ij', aoR, J2) + J2 = np.asarray(lib.d_ij_j_ij(aoR, J2, out=buffer1), order='C') + + # do not need allocate memory, use buffer 6 + + # J += np.asarray(lib.dot(aoR, J2.T), order='C') + lib.ddot_withbuffer(aoR, J2.T, c=J, beta=1, buf=mydf.ddot_buf) + + #### step 3. get J term3 + + # do not need allocate memory, use buffer 2 + + tmp = np.asarray(lib.dot(W, density_Rg.reshape(-1,1), c=buffer8.reshape(-1,1)), order='C').reshape(-1) + + # do not need allocate memory, use buffer 1 but viewed as buffer 7 + + tmp = np.asarray(lib.d_ij_j_ij(aoRg, tmp, out=buffer7), order='C') + + # do not need allocate memory, use buffer 6 + + if with_robust_fitting: + lib.ddot_withbuffer(aoRg, -tmp.T, c=J, beta=1, buf=mydf.ddot_buf) + else: + J = buffer6 + lib.ddot_withbuffer(aoRg, tmp.T, c=J, beta=0, buf=mydf.ddot_buf) + + t2 = (logger.process_clock(), logger.perf_counter()) + + if mydf.verbose: + _benchmark_time(t1, t2, "_contract_j_dm", mydf) + + return J * ngrid / vol + +def _contract_j_dm_fast(mydf, dm, with_robust_fitting=True, use_mpi=False): + + assert use_mpi == False + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + ngrid = np.prod(cell.mesh) + assert ngrid == mydf.ngrids + vol = cell.vol + + W = mydf.W + aoRg = mydf.aoRg + aoR = mydf.aoR + ngrid = aoR.shape[1] + if hasattr(mydf, "V_R"): + V_R = mydf.V_R + else: + V_R = None + naux = mydf.naux + IP_ID = mydf.IP_ID + + mesh = np.array(cell.mesh, dtype=np.int32) + + #### step 0. allocate buffer + + buffer = mydf.jk_buffer + buffer1 = np.ndarray((nao,ngrid), dtype=dm.dtype, buffer=buffer, offset=0) + buffer2 = np.ndarray((ngrid), dtype=dm.dtype, buffer=buffer, offset=nao * ngrid * dm.dtype.itemsize) + buffer3 = np.ndarray((nao,naux), dtype=dm.dtype, buffer=buffer, + offset=(nao * ngrid + ngrid) * dm.dtype.itemsize) + buffer4 = np.ndarray((naux), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + ngrid + nao * naux) * dm.dtype.itemsize) + buffer5 = np.ndarray((naux), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + ngrid + nao * naux + naux) * dm.dtype.itemsize) + buffer6 = np.ndarray((nao,nao), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + ngrid + nao * naux + naux + naux) * dm.dtype.itemsize) + buffer7 = np.ndarray((nao,naux), dtype=dm.dtype, buffer=buffer, offset=0) + buffer8 = np.ndarray((naux), dtype=dm.dtype, buffer=buffer, offset=nao * ngrid * dm.dtype.itemsize) + + #### step 1. 
get density value on real space grid and IPs + + lib.ddot(dm, aoR, c=buffer1) + tmp1 = buffer1 + density_R = np.asarray(lib.multiply_sum_isdf(aoR, tmp1, out=buffer2), order='C') + + if hasattr(mydf, "grid_ID_ordered"): + if (use_mpi and rank == 0) or (use_mpi == False): + density_R_original = np.zeros_like(density_R) + + fn_order = getattr(libpbc, "_Reorder_Grid_to_Original_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(density_R.size), + mydf.grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + density_R.ctypes.data_as(ctypes.c_void_p), + density_R_original.ctypes.data_as(ctypes.c_void_p), + ) + + density_R = density_R_original.copy() + + J = None + + if (use_mpi and rank == 0) or (use_mpi == False): + + fn_J = getattr(libpbc, "_construct_J", None) + assert(fn_J is not None) + + J = np.zeros_like(density_R) + + fn_J( + mesh.ctypes.data_as(ctypes.c_void_p), + density_R.ctypes.data_as(ctypes.c_void_p), + mydf.coulG.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + ) + + if hasattr(mydf, "grid_ID_ordered"): + + J_ordered = np.zeros_like(J) + + fn_order = getattr(libpbc, "_Original_Grid_to_Reorder_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(J.size), + mydf.grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + J_ordered.ctypes.data_as(ctypes.c_void_p), + ) + + J = J_ordered.copy() + + #### step 3. get J + + J = np.asarray(lib.d_ij_j_ij(aoR, J, out=buffer1), order='C') + J = lib.ddot_withbuffer(aoR, J.T, buf=mydf.ddot_buf) + + t2 = (logger.process_clock(), logger.perf_counter()) + + if mydf.verbose: + _benchmark_time(t1, t2, "_contract_j_dm_fast", mydf) + + return J * ngrid / vol + +def _contract_j_dm_wo_robust_fitting(mydf, dm, with_robust_fitting=False, use_mpi=False): + + assert with_robust_fitting == False + assert use_mpi == False + + if use_mpi: + raise NotImplementedError("MPI is not supported in this function") + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(cell.mesh) + + W = mydf.W + aoRg = mydf.aoRg + + naux = aoRg.shape[1] + + tmp1 = lib.ddot(dm, aoRg) + density_Rg = np.asarray(lib.multiply_sum_isdf(aoRg, tmp1), + order='C') + tmp = np.asarray(lib.dot(W, density_Rg.reshape(-1,1)), order='C').reshape(-1) + tmp = np.asarray(lib.d_ij_j_ij(aoRg, tmp), order='C') + + J = lib.ddot(aoRg, tmp.T) + + del tmp1 + tmp1 = None + del tmp + tmp = None + del density_Rg + density_Rg = None + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t1, t2, "_contract_j_dm_wo_robust_fitting", mydf) + + return J * ngrid / vol + +def _contract_k_dm(mydf, dm, with_robust_fitting=True, use_mpi=False): + ''' + + Args: + mydf : + mo_coeffs : the occupied MO coefficients + + ''' + + assert use_mpi == False + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + + cell = mydf.cell + assert cell.nao == nao + ngrid = np.prod(cell.mesh) + assert ngrid == mydf.ngrids + vol = cell.vol + + W = mydf.W + aoRg = mydf.aoRg + aoR = mydf.aoR + ngrid = aoR.shape[1] + if hasattr(mydf, "V_R"): + V_R = mydf.V_R + else: + V_R = None + # naux = aoRg.shape[1] + naux = mydf.naux + IP_ID = mydf.IP_ID + + buffer = mydf.jk_buffer + buffer1 = np.ndarray((nao,ngrid), dtype=dm.dtype, buffer=buffer, 
offset=0) + buffer2 = np.ndarray((naux,ngrid), dtype=dm.dtype, buffer=buffer, offset=nao * ngrid * dm.dtype.itemsize) + buffer3 = np.ndarray((naux,naux), dtype=dm.dtype, buffer=buffer, + offset=(nao * ngrid + naux * ngrid) * dm.dtype.itemsize) + buffer4 = np.ndarray((nao,nao), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + naux * ngrid + naux * naux) * dm.dtype.itemsize) + buffer5 = np.ndarray((naux,nao), dtype=dm.dtype, buffer=buffer, offset=0) + buffer6 = np.ndarray((naux,nao), dtype=dm.dtype, buffer=buffer, offset=nao * ngrid * dm.dtype.itemsize) + + #### step 1. get density value on real space grid and IPs + + # need allocate memory, size = nao * ngrid, this buffer does not need anymore (buffer 1) + + density_RgR = np.asarray(lib.dot(dm, aoR, c=buffer1), order='C') + + # need allocate memory, size = naux * ngrid (buffer 2) + + # density_RgR = np.asarray(lib.dot(aoRg.T, density_RgR, c=buffer2), order='C') + lib.ddot(aoRg.T, density_RgR, c=buffer2) + density_RgR = buffer2 + + # need allocate memory, size = naux * naux (buffer 3) + + density_RgRg = lib.ddot(dm, aoRg) + density_RgRg = lib.ddot(aoRg.T, density_RgRg) + + #### step 2. get K term1 and term2 + + ### todo: optimize the following 4 lines, it seems that they may not parallize! + + # tmp = V_R * density_RgR # pointwise multiplication, TODO: this term should be parallized + # do not need allocate memory, size = naux * ngrid, (buffer 2) + + # tmp = np.asarray(lib.cwise_mul(V_R, density_RgR, out=buffer2), order='C') + + # lib.cwise_mul(V_R, density_RgR, out=buffer2) + + K = None + + if with_robust_fitting: + lib.cwise_mul(V_R, density_RgR, out=buffer2) + tmp = buffer2 + + # do not need allocate memory, size = naux * nao, (buffer 1, but viewed as buffer5) + + K = np.asarray(lib.ddot_withbuffer(tmp, aoR.T, c=buffer5, buf=mydf.ddot_buf), order='C') + + ### the order due to the fact that naux << ngrid # need allocate memory, size = nao * nao, (buffer 4) + + K = np.asarray(lib.ddot_withbuffer(aoRg, K, c=buffer4, buf=mydf.ddot_buf), order='C') + + K += K.T + + #### step 3. get K term3 + + ### todo: optimize the following 4 lines, it seems that they may not parallize! 
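The todo above refers to the pointwise W contraction that follows. As a reference, the exchange matrix assembled by this routine reduces to the dense expression below (a sketch assuming plain NumPy arrays; the function name is illustrative, and the trailing ngrid/vol scaling applied at the end of the routine is omitted):

```python
import numpy as np

def k_isdf_reference(dm, aoR, aoRg, V_R, W, with_robust_fitting=True):
    # dm: (nao, nao), aoR: (nao, ngrid), aoRg: (nao, naux)
    # V_R: (naux, ngrid), W: (naux, naux); '*' is elementwise
    D_RgR  = aoRg.T @ dm @ aoR                  # density on (IP, grid) pairs
    D_RgRg = aoRg.T @ dm @ aoRg                 # density on (IP, IP) pairs
    if with_robust_fitting:
        K  = aoRg @ ((V_R * D_RgR) @ aoR.T)     # V term
        K += K.T
        K -= aoRg @ ((W * D_RgRg) @ aoRg.T)     # W correction
    else:
        K  = aoRg @ ((W * D_RgRg) @ aoRg.T)
    return K
```

Both terms share the pattern `aoRg @ (potential * pair_density) @ ao.T`; only the potential matrix (V_R or W) and the right-hand AO factor differ.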
+ # pointwise multiplication, do not need allocate memory, size = naux * naux, use buffer for (buffer 3) + # tmp = W * density_RgRg + + lib.cwise_mul(W, density_RgRg, out=density_RgRg) + tmp = density_RgRg + + # do not need allocate memory, size = naux * nao, use buffer 2 but viewed as buffer 6 + + tmp = np.asarray(lib.dot(tmp, aoRg.T, c=buffer6), order='C') + + # K -= np.asarray(lib.dot(aoRg, tmp, c=K, beta=1), order='C') # do not need allocate memory, size = nao * nao, (buffer 4) + + if with_robust_fitting: + lib.ddot_withbuffer(aoRg, -tmp, c=K, beta=1, buf=mydf.ddot_buf) + else: + K = buffer4 + lib.ddot_withbuffer(aoRg, tmp, c=K, beta=0, buf=mydf.ddot_buf) + + t2 = (logger.process_clock(), logger.perf_counter()) + + if mydf.verbose: + _benchmark_time(t1, t2, "_contract_k_dm", mydf) + + if K is None: + K = np.zeros((nao, nao)) + + return K * ngrid / vol + +def _contract_k_dm_wo_robust_fitting(mydf, dm, with_robust_fitting=False, use_mpi=False): + + assert with_robust_fitting == False + + if use_mpi: + raise NotImplementedError("MPI is not supported in this function") + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(cell.mesh) + + W = mydf.W + aoRg = mydf.aoRg + + naux = aoRg.shape[1] + + density_RgRg = lib.ddot(dm, aoRg) + density_RgRg = lib.ddot(aoRg.T, density_RgRg) + + lib.cwise_mul(W, density_RgRg, out=density_RgRg) + tmp = density_RgRg + tmp = np.asarray(lib.dot(tmp, aoRg.T), order='C') + if hasattr(mydf, "ddot_buf") and mydf.ddot_buf is not None: + K = lib.ddot_withbuffer(aoRg, tmp, buf=mydf.ddot_buf) + else: + K = lib.ddot(aoRg, tmp) + + t2 = (logger.process_clock(), logger.perf_counter()) + + # if mydf.verbose: + _benchmark_time(t1, t2, "_contract_k_dm_wo_robust_fitting", mydf) + + del tmp + tmp = None + del density_RgRg + density_RgRg = None + + return K * ngrid / vol # take care this factor + +def get_jk_dm(mydf, dm, hermi=1, kpt=np.zeros(3), + kpts_band=None, with_j=True, with_k=True, omega=None, + use_mpi = False, **kwargs): + + '''JK for given k-point''' + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 or dm.shape[0] == 2 + #dm = dm[0] + else: + assert dm.ndim == 2 + dm = dm.reshape(1, dm.shape[0], dm.shape[1]) + + nset = dm.shape[0] + + if hasattr(mydf, 'Ls'): + from pyscf.pbc.df.isdf.isdf_tools_densitymatrix import symmetrize_dm + dm = symmetrize_dm(dm, mydf.Ls) + else: + if hasattr(mydf, 'kmesh'): + from pyscf.pbc.df.isdf.isdf_tools_densitymatrix import symmetrize_dm + dm = symmetrize_dm(dm, mydf.kmesh) + + #### perform the calculation #### + + if mydf.jk_buffer is None: # allocate the buffer for get jk + mydf._allocate_jk_buffer(dm.dtype) + + if "exxdiv" in kwargs: + exxdiv = kwargs["exxdiv"] + else: + exxdiv = None + + #vj = vk = None + vj = np.zeros_like(dm) + vk = np.zeros_like(dm) + + if kpts_band is not None and abs(kpt-kpts_band).sum() > 1e-9: + raise NotImplementedError("ISDF does not support kpts_band != kpt") + + log = logger.Logger(mydf.stdout, mydf.verbose) + t1 = (logger.process_clock(), logger.perf_counter()) + + j_real = gamma_point(kpt) + k_real = gamma_point(kpt) and not np.iscomplexobj(dm) + + assert j_real + assert k_real + + mem_now = lib.current_memory()[0] + max_memory = max(2000, (mydf.max_memory - mem_now)) + + log.debug1('max_memory = %d MB (%d in use)', max_memory, mem_now) + + for iset in range(nset): + + if with_j: + if 
mydf.with_robust_fitting: + vj[iset] = _contract_j_dm_fast(mydf, dm[iset], mydf.with_robust_fitting, use_mpi) + else: + vj[iset] = _contract_j_dm_wo_robust_fitting(mydf, dm[iset], mydf.with_robust_fitting, use_mpi) + if with_k: + if mydf.with_robust_fitting: + vk[iset] = _contract_k_dm(mydf, dm[iset], mydf.with_robust_fitting, use_mpi) + else: + vk[iset] = _contract_k_dm_wo_robust_fitting(mydf, dm[iset], mydf.with_robust_fitting, use_mpi) + if exxdiv == 'ewald': + print("WARNING: ISDF does not support ewald") + + ##### the following code is added to deal with _ewald_exxdiv_for_G0 ##### + + from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks, _ewald_exxdiv_for_G0 + + kpts = kpt.reshape(1,3) + kpts = np.asarray(kpts) + dm_kpts = dm.copy() + dm_kpts = lib.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + if nset > 2: + logger.warn(mydf, 'nset > 2, please confirm what you are doing, for RHF nset == 1, for UHF nset == 2') + assert nkpts == 1 + + kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + nband = len(kpts_band) + assert nband == 1 + + if is_zero(kpts_band) and is_zero(kpts): + vk = vk.reshape(nset,nband,nao,nao) + else: + raise NotImplementedError("ISDF does not support kpts_band != 0") + + if exxdiv == 'ewald': + _ewald_exxdiv_for_G0(mydf.cell, kpts, dms, vk, kpts_band=kpts_band) + + vk = vk.reshape(nset,nao,nao) + + t1 = log.timer('sr jk', *t1) + + return vj, vk \ No newline at end of file diff --git a/pyscf/isdf/isdf_libdmet_tran_2e.py b/pyscf/isdf/isdf_libdmet_tran_2e.py new file mode 100644 index 000000000..22c169bbe --- /dev/null +++ b/pyscf/isdf/isdf_libdmet_tran_2e.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +############ sys module ############ + +import numpy, scipy +import numpy as np +import ctypes + +############ pyscf module ############ + +from pyscf import lib +from pyscf import ao2mo +from pyscf.ao2mo.incore import iden_coeffs +from pyscf.pbc import tools +from pyscf.pbc.lib import kpts_helper +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, unique +from pyscf import __config__ +from pyscf.pbc.df.fft_ao2mo import _format_kpts, _iskconserv, _contract_compact +libisdf = lib.load_library('libisdf') + +############ isdf utils ############ + +from pyscf.isdf.isdf_tools_local import aoR_Holder +from pyscf.isdf.isdf_jk import _benchmark_time +from pyscf.isdf.isdf_local_k import PBC_ISDF_Info_Quad_K + + +def _aoR_full_col(mydf): + ''' + return aoR[:, :ngrid_prim] for the supercell system + ''' + + assert isinstance(mydf, PBC_ISDF_Info_Quad_K) + + fn_pack = getattr(libisdf, "_Pack_Matrix_SparseRow_DenseCol", None) + assert fn_pack is not None + + prim_cell = mydf.primCell + prim_mesh = prim_cell.mesh + prim_ngrid = np.prod(prim_mesh) + prim_natm = mydf.natmPrim + + assert len(mydf.aoR) == prim_natm + + res = np.zeros((mydf.nao, prim_ngrid), dtype=np.float64) + + for i in range(prim_natm): + aoR_i = mydf.aoR[i] + ao_involved_i = aoR_i.ao_involved + nao_i = aoR_i.aoR.shape[0] + global_grid_begin_i = aoR_i.global_gridID_begin + ngrid_i = aoR_i.aoR.shape[1] + + fn_pack( + res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(res.shape[0]), + ctypes.c_int(res.shape[1]), + aoR_i.aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(ngrid_i), + ao_involved_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(global_grid_begin_i), + ctypes.c_int(global_grid_begin_i+ngrid_i) + ) + + return res + +def _aoRg_full_col(mydf): + ''' + return aoR[:, :ngrid_prim] for the supercell system + ''' + + assert isinstance(mydf, PBC_ISDF_Info_Quad_K) + + fn_pack = getattr(libisdf, "_Pack_Matrix_SparseRow_DenseCol", None) + assert fn_pack is not None + + prim_cell = mydf.primCell + prim_mesh = prim_cell.mesh + prim_ngrid = np.prod(prim_mesh) + prim_natm = mydf.natmPrim + prim_nIP = mydf.nIP_Prim + + assert len(mydf.aoR) == prim_natm + + res = np.zeros((mydf.nao, prim_nIP), dtype=np.float64) + + for i in range(mydf.natmPrim): + aoRg_i = mydf.aoRg[i] + ao_involved_i = aoRg_i.ao_involved + nao_i = aoRg_i.aoR.shape[0] + global_IP_begin_i = aoRg_i.global_gridID_begin + nIP_i = aoRg_i.aoR.shape[1] + + fn_pack( + res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(res.shape[0]), + ctypes.c_int(res.shape[1]), + aoRg_i.aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(nIP_i), + ao_involved_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(global_IP_begin_i), + ctypes.c_int(global_IP_begin_i+nIP_i) + ) + + return res + +######## copy from libdmet ######## + +def eri_restore(eri, symmetry, nemb): + """ + Restore eri with given permutation symmetry. 
+ """ + spin_pair = eri.shape[0] + if spin_pair == 1: + eri_res = ao2mo.restore(symmetry, eri[0].real, nemb) + else: + if symmetry == 4: + nemb_pair = nemb*(nemb+1) // 2 + if eri.size == spin_pair * nemb_pair * nemb_pair: + eri_res = eri.real.reshape(spin_pair, nemb_pair, nemb_pair) + else: + eri_res = np.empty((spin_pair, nemb_pair, nemb_pair)) + for s in range(spin_pair): + eri_res[s] = ao2mo.restore(symmetry, eri[s].real, nemb) + elif symmetry == 1: + if eri.size == spin_pair * nemb**4: + eri_res = eri.real.reshape(spin_pair, nemb, nemb, nemb, nemb) + else: + eri_res = np.empty((spin_pair, nemb, nemb, nemb, nemb)) + for s in range(spin_pair): + eri_res[s] = ao2mo.restore(symmetry, eri[s].real, nemb) + else: + raise ValueError("Spin unrestricted ERI does not support 8-fold symmetry.") + eri_res = np.asarray(eri_res, order='C') + return eri_res + +def get_emb_eri_isdf(mydf, C_ao_emb:np.ndarray=None, symmetry=4): + + ''' + get eri for embedding system + ''' + + #### preprocess #### + + assert isinstance(mydf, PBC_ISDF_Info_Quad_K) + assert not mydf.direct + + if C_ao_emb.ndim == 2: + C_ao_emb = C_ao_emb.reshape(1, *C_ao_emb.shape) + assert C_ao_emb.ndim == 3 + assert C_ao_emb.dtype == np.float64 ## supercell basis + + nspin, nao_full, nemb = C_ao_emb.shape + + print("nspin = ", nspin) + print("nao_full = ", nao_full) + print("nemb = ", nemb) + + supercell = mydf.cell + print("supercell.nao = ", supercell.nao) + assert supercell.nao == nao_full + + ngrid = mydf.ngrids + vol = supercell.vol + mesh_prim = mydf.primCell.mesh + ngrid_prim = np.prod(mesh_prim) + nao_prim = mydf.nao_prim + nIP_prim = mydf.nIP_Prim + kmesh = mydf.kmesh + nkpts = np.prod(kmesh) + nIP = mydf.naux + + with_robust_fitting = mydf.with_robust_fitting + + #eri = np.zeros((nspin*(nspin+1)//2, nemb, nemb, nemb, nemb), dtype=np.float64) ## the ordering of spin is aa, bb, ab + eri = np.zeros((nspin*(nspin+1)//2, nemb**2, nemb**2), dtype=np.float64) ## the ordering of spin is aa, bb, ab + + ### emb values on grid and IPs ### + + emb_R = [] + emb_Rg= [] + for i in range(nspin): + emb_R.append([]) + emb_Rg.append([]) + + if with_robust_fitting: + aoR_fullcol = _aoR_full_col(mydf) + assert aoR_fullcol.shape == (nao_full, ngrid_prim) + aoRg_fullcol = _aoRg_full_col(mydf) + assert aoRg_fullcol.shape == (nao_full, nIP_prim) + + aoR_tmp = np.zeros_like(aoR_fullcol) + aoRg_tmp = np.zeros_like(aoRg_fullcol) + + for kx in range(kmesh[0]): + for ky in range(kmesh[1]): + for kz in range(kmesh[2]): + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + + ILOC = ix*kmesh[1]*kmesh[2] + iy*kmesh[2] + iz + ix_ = (ix + kx) % kmesh[0] + iy_ = (iy + ky) % kmesh[1] + iz_ = (iz + kz) % kmesh[2] + ILOC_ = ix_*kmesh[1]*kmesh[2] + iy_*kmesh[2] + iz_ + + if with_robust_fitting: + aoR_tmp[ILOC_*nao_prim:(ILOC_+1)*nao_prim,:] = aoR_fullcol[ILOC*nao_prim:(ILOC+1)*nao_prim,:] + aoRg_tmp[ILOC_*nao_prim:(ILOC_+1)*nao_prim,:] = aoRg_fullcol[ILOC*nao_prim:(ILOC+1)*nao_prim,:] + + for i in range(nspin): + if with_robust_fitting: + emb_R[i].append(np.dot(C_ao_emb[i].T, aoR_tmp)) + emb_Rg[i].append(np.dot(C_ao_emb[i].T, aoRg_tmp)) + + + ### V_R term ### + + #V_R = mydf.V_R + #assert V_R.shape == (nIP_prim, ngrid) + + tmp_V = np.zeros((nspin, nIP, nemb*nemb), dtype=np.float64) + + def _construct_tmp_V_W(Is_V=False): + + tmp_V.ravel()[:] = 0.0 + + if Is_V: + V = mydf.V_R + ngrid_per_box = ngrid_prim + _emb_R = emb_R + else: + V = mydf.W + ngrid_per_box = nIP_prim + _emb_R = emb_Rg + + for kx in range(kmesh[0]): + for ky in 
range(kmesh[1]): + for kz in range(kmesh[2]): + + ILOC = kx*kmesh[1]*kmesh[2] + ky*kmesh[2] + kz + + for i in range(nspin): + + _emb_pair = np.einsum('iP,jP->ijP', _emb_R[i][ILOC], _emb_R[i][ILOC]) + _emb_pair = _emb_pair.reshape(nemb*nemb, ngrid_per_box) + # _tmp_V = lib.ddot(V[:,ILOC*ngrid_per_box:(ILOC+1)*ngrid_per_box],_emb_pair.T) + + ## another pass to account for the transposition ## + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + + ix_ = (kx-ix+kmesh[0]) % kmesh[0] + iy_ = (ky-iy+kmesh[1]) % kmesh[1] + iz_ = (kz-iz+kmesh[2]) % kmesh[2] + + ILOC_ = ix_*kmesh[1]*kmesh[2] + iy_*kmesh[2] + iz_ + ILOC = ix *kmesh[1]*kmesh[2] + iy *kmesh[2] + iz + + lib.ddot( + a=V[:,ILOC_*ngrid_per_box:(ILOC_+1)*ngrid_per_box], + b=_emb_pair.T, + alpha=1.0, + c=tmp_V[i][ILOC*nIP_prim:(ILOC+1)*nIP_prim,:], + beta=1.0) + + def _the_last_pass(plus): + + if plus: + alpha = 1 + else: + alpha =-1 + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + + ILOC = ix*kmesh[1]*kmesh[2] + iy*kmesh[2] + iz + + if nspin == 1: + + emb_pair_Rg = np.einsum('iP,jP->ijP', emb_Rg[0][ILOC], emb_Rg[0][ILOC]) + emb_pair_Rg = emb_pair_Rg.reshape(nemb*nemb, nIP_prim) + + lib.ddot( + a = emb_pair_Rg, + b = tmp_V[0][ILOC*nIP_prim:(ILOC+1)*nIP_prim,:], + alpha = alpha, + c = eri[0], + beta = 1 + ) + else: + if nspin == 2: + + emb_pair_Rg_alpha = np.einsum('iP,jP->ijP', emb_Rg[0][ILOC], emb_Rg[0][ILOC]) + emb_pair_Rg_beta = np.einsum('iP,jP->ijP', emb_Rg[1][ILOC], emb_Rg[1][ILOC]) + + lib.ddot( + a = emb_pair_Rg_alpha, + b = tmp_V[0][ILOC*nIP_prim:(ILOC+1)*nIP_prim,:], + alpha = alpha, + c = eri[0], + beta = 1 + ) + + lib.ddot( + a = emb_pair_Rg_beta, + b = tmp_V[1][ILOC*nIP_prim:(ILOC+1)*nIP_prim,:], + alpha = alpha, + c = eri[1], + beta = 1 + ) + + lib.ddot( + a = emb_pair_Rg_alpha, + b = tmp_V[1][ILOC*nIP_prim:(ILOC+1)*nIP_prim,:], + alpha = alpha, + c = eri[2], + beta = 1 + ) + + else: + raise ValueError("nspin > 2 is not supported") + + if with_robust_fitting: + + _construct_tmp_V_W(True) + _the_last_pass(plus=True) + nspinpair = nspin*(nspin+1)//2 + + for i in range(nspinpair): + eri[i] += eri[i].T + + ### W term ### + + _construct_tmp_V_W(False) + if with_robust_fitting: + _the_last_pass(plus=False) + else: + _the_last_pass(plus=True) + + #### post process #### + + # reshape the eri + + eri = eri.reshape(nspin*(nspin+1)//2, nemb, nemb, nemb, nemb) + eri = eri_restore(eri, symmetry, nemb) + + return eri * ngrid / vol + + +if __name__ == "__main__": + + from isdf_tools_cell import build_supercell, build_supercell_with_partition + C = 25 + + verbose = 10 + import pyscf.pbc.gto as pbcgto + + cell = pbcgto.Cell() + boxlen = 3.5668 + cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + prim_a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + atm = [ + ['C', (0. , 0. , 0. )], + ['C', (0.8917 , 0.8917 , 0.8917)], + ['C', (1.7834 , 1.7834 , 0. )], + ['C', (2.6751 , 2.6751 , 0.8917)], + ['C', (1.7834 , 0. , 1.7834)], + ['C', (2.6751 , 0.8917 , 2.6751)], + ['C', (0. 
, 1.7834 , 1.7834)], + ['C', (0.8917 , 2.6751 , 2.6751)], + ] + + KE_CUTOFF = 70 + basis = 'gth-szv' + + prim_cell = build_supercell(atm, prim_a, Ls = [1,1,1], basis=basis, ke_cutoff=KE_CUTOFF) + prim_mesh = prim_cell.mesh + # prim_partition = [[0], [1], [2], [3], [4], [5], [6], [7]] + # prim_partition = [[0,1,2,3,4,5,6,7]] + prim_partition = [[0,1],[2,3],[4,5],[6,7]] + + Ls = [1, 2, 2] + kpts = prim_cell.make_kpts(Ls) + Ls = np.array(Ls, dtype=np.int32) + mesh = [Ls[0] * prim_mesh[0], Ls[1] * prim_mesh[1], Ls[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell, group_partition = build_supercell_with_partition(atm, prim_a, mesh=mesh, + Ls=Ls, + basis=basis, + #pseudo=pseudo, + partition=prim_partition, ke_cutoff=KE_CUTOFF, verbose=verbose) + + pbc_isdf_info = PBC_ISDF_Info_Quad_K(prim_cell, kmesh=Ls, with_robust_fitting=True, aoR_cutoff=1e-8, + # direct=True, + direct=False, + rela_cutoff_QRCP=1e-4, + limited_memory=True, build_K_bunchsize=32) + pbc_isdf_info.build_IP_local(c=C, m=5, group=prim_partition, Ls=[Ls[0]*10, Ls[1]*10, Ls[2]*10]) + pbc_isdf_info.verbose = 10 + pbc_isdf_info.build_auxiliary_Coulomb(debug=True) + + # print("grid_segment = ", pbc_isdf_info.grid_segment) + + from pyscf.pbc import scf + + mf = scf.KRHF(prim_cell, kpts) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 16 + mf.conv_tol = 1e-7 + + mf.kernel() + + nao_full = pbc_isdf_info.cell.nao + nao_emb = nao_full // 5 + C_ao_emb = np.random.rand(nao_full, nao_emb) + + eri_emb = get_emb_eri_isdf(pbc_isdf_info, C_ao_emb, symmetry=4) + + supercell = pbc_isdf_info.cell + + from pyscf.isdf.isdf_local import PBC_ISDF_Info_Quad + + pbc_isdf_info2 = PBC_ISDF_Info_Quad(supercell, with_robust_fitting=True, + aoR_cutoff=1e-8, + direct=False, + # direct=True, + limited_memory=True, build_K_bunchsize=32, + use_occ_RI_K=False, rela_cutoff_QRCP=1e-4) + + pbc_isdf_info2.build_IP_local(c=C, m=5, group=group_partition) + pbc_isdf_info2.build_auxiliary_Coulomb() + + eri_emb_benchmark = pbc_isdf_info2.ao2mo(C_ao_emb) + + assert eri_emb.shape == eri_emb_benchmark.shape + + diff = np.linalg.norm(eri_emb - eri_emb_benchmark) + print("diff = ", diff) + max_diff = np.max(np.abs(eri_emb - eri_emb_benchmark)) + print("max_diff = ", max_diff) + + # print("eri_emb.shape = ", eri_emb.shape) + # print("eri_emb = ", eri_emb[0,0],eri_emb[0,1]) + # print("eri_emb_benchmark = ", eri_emb_benchmark[0,0], eri_emb_benchmark[0,1]) + # for i in range(eri_emb.shape[0]): + # for j in range(eri_emb.shape[1]): + # print(eri_emb[i,j], eri_emb_benchmark[i,j], eri_emb[i,j]/eri_emb_benchmark[i,j]) diff --git a/pyscf/isdf/isdf_local.py b/pyscf/isdf/isdf_local.py new file mode 100644 index 000000000..8c6b39ed4 --- /dev/null +++ b/pyscf/isdf/isdf_local.py @@ -0,0 +1,1692 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +############ sys module ############ + +import copy +import numpy as np +import scipy +import ctypes, sys + +############ pyscf module ############ + +from pyscf import lib +from pyscf.pbc.gto import Cell +from pyscf.pbc import tools +from pyscf.gto.mole import * +libisdf = lib.load_library('libisdf') + +############ isdf utils ############ + +from pyscf.isdf.isdf_jk import _benchmark_time +import pyscf.isdf.isdf_fast as ISDF +from pyscf.isdf.isdf_eval_gto import ISDF_eval_gto +import pyscf.isdf.isdf_tools_local as ISDF_Local_Utils +import pyscf.isdf.isdf_local_jk as ISDF_Local_JK +import pyscf.isdf.isdf_tools_linearop as lib_isdf + +##### all the involved algorithm in ISDF based on aoR_Holder ##### + +USE_SCIPY_QR = False ## true for single-thread mode to compare with Kori's code +USE_SCIPY_CHOLESKY = True +assert USE_SCIPY_CHOLESKY == True + +############ subroutines --- select IP ############ + +############ ls refers to linear scaling ############ + +def select_IP_atm_ls(mydf, + c:int, m:int, + first_natm = None, + rela_cutoff = 0.0, + no_retriction_on_nIP = False, + use_mpi = False): + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size + else: + rank = 0 + comm = None + comm_size = 1 + + assert isinstance(mydf.aoR, list) + assert isinstance(mydf.partition, list) + + ### determine the largest grids point of one atm ### + + log = lib.logger.Logger(mydf.cell.stdout, mydf.cell.verbose) + + natm = mydf.cell.natm + nao = mydf.nao + naux_max = 0 + + nao_per_atm = np.zeros((natm), dtype=np.int32) + for i in range(mydf.nao): + atm_id = mydf.ao2atomID[i] + nao_per_atm[atm_id] += 1 + + for nao_atm in nao_per_atm: + naux_max = max(naux_max, int(np.sqrt(c*nao_atm)) + m) + + nthread = lib.num_threads() + + ### loop over atm ### + + coords = mydf.coords + assert coords is not None + + results = [] + + fn_colpivot_qr = getattr(libisdf, "ColPivotQRRelaCut", None) + assert(fn_colpivot_qr is not None) + fn_ik_jk_ijk = getattr(libisdf, "NP_d_ik_jk_ijk", None) + assert(fn_ik_jk_ijk is not None) + + weight = np.sqrt(mydf.cell.vol / coords.shape[0]) + + if first_natm is None: + first_natm = natm + + group_begin, group_end = ISDF_Local_Utils._range_partition(first_natm, rank, comm_size, use_mpi) + + for i in range(first_natm): + results.append(None) + + aoR_atm1 = None + aoR_atm2 = None + aoPairBuffer = None + R = None + thread_buffer = None + global_buffer = None + + log.debug4("-------------------------------------------") + + for atm_id in range(group_begin, group_end): + + aoR = mydf.aoR[atm_id] + if aoR is None: # it is used to split the task when using MPI + continue + + grid_ID = mydf.partition[atm_id] + aoR_atm = mydf.aoR[atm_id].aoR + nao_tmp = aoR_atm.shape[0] + + # create buffer for this atm + + dtypesize = aoR_atm.dtype.itemsize + nao_atm = nao_per_atm[atm_id] + naux_now = int(np.sqrt(c*nao_atm)) + m + naux2_now = naux_now * naux_now + + R = np.ndarray((naux2_now, grid_ID.shape[0]), dtype=np.float64) + + aoR_atm1 = np.ndarray((naux_now, grid_ID.shape[0]), dtype=np.float64) + aoR_atm2 = np.ndarray((naux_now, grid_ID.shape[0]), dtype=np.float64) + + aoPairBuffer = np.ndarray( + (naux_now*naux_now, grid_ID.shape[0]), dtype=np.float64) + + G1 = np.random.rand(nao_tmp, naux_now) + G1, _ = numpy.linalg.qr(G1) + G1 = G1.T + G2 = np.random.rand(nao_tmp, naux_now) + G2, _ = numpy.linalg.qr(G2) + G2 = G2.T + + lib.dot(G1, aoR_atm, c=aoR_atm1) + lib.dot(G2, aoR_atm, c=aoR_atm2) + + fn_ik_jk_ijk(aoR_atm1.ctypes.data_as(ctypes.c_void_p), + 
aoR_atm2.ctypes.data_as(ctypes.c_void_p), + aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux_now), + ctypes.c_int(naux_now), + ctypes.c_int(grid_ID.shape[0])) + if no_retriction_on_nIP: + max_rank = min(naux2_now, grid_ID.shape[0]) + log.debug4("In select_IP_atm_ls, no_retriction_on_nIP") + else: + max_rank = min(naux2_now, grid_ID.shape[0], nao_atm * c + m) + log.debug4("In select_IP_atm_ls, retriction_on_nIP") + npt_find = ctypes.c_int(0) + pivot = np.arange(grid_ID.shape[0], dtype=np.int32) + thread_buffer = np.ndarray((nthread+1, grid_ID.shape[0]+1), dtype=np.float64) + global_buffer = np.ndarray((1, grid_ID.shape[0]), dtype=np.float64) + + log.debug4("In select_IP_atm_ls, max_rank = %d" % (max_rank)) + log.debug4("In select_IP_atm_ls, naux2_now = %d" % (naux2_now)) + log.debug4("In select_IP_atm_ls, grid_ID.shape = %s" % (grid_ID.shape)) + log.debug4("In select_IP_atm_ls, rela_cutoff = %e" % (rela_cutoff)) + + if USE_SCIPY_QR: + R, pivot = scipy.linalg.qr(aoPairBuffer, pivoting=True, mode='r', check_finite=False, overwrite_a=True) + npt_find = nao_atm * c + m + else: + fn_colpivot_qr(aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux2_now), + ctypes.c_int(grid_ID.shape[0]), + ctypes.c_int(max_rank), + ctypes.c_double(1e-14), + ctypes.c_double(rela_cutoff), + pivot.ctypes.data_as(ctypes.c_void_p), + R.ctypes.data_as(ctypes.c_void_p), + ctypes.byref(npt_find), + thread_buffer.ctypes.data_as(ctypes.c_void_p), + global_buffer.ctypes.data_as(ctypes.c_void_p)) + npt_find = npt_find.value + cutoff = abs(R[npt_find-1, npt_find-1]) + log.debug4("ngrid = %d, npt_find = %d, cutoff = %12.6e" % (grid_ID.shape[0], npt_find, cutoff)) + + pivot = pivot[:npt_find] + pivot.sort() + + atm_IP = grid_ID[pivot] + atm_IP = np.array(atm_IP, dtype=np.int32) + atm_IP.sort() + results[atm_id] = atm_IP + + log.debug4("In select_IP_atm_ls, npt_find = %d" %(npt_find)) + log.debug4("-------------------------------------------") + + del aoR_atm1 + del aoR_atm2 + del aoPairBuffer + del R + del thread_buffer + del global_buffer + + if use_mpi: + results = ISDF_Local_Utils._sync_list(results, first_natm) + + assert len(results) == first_natm + + return results + +def select_IP_group_ls(mydf, aoRg_possible, c:int, m:int, group=None, atm_2_IP_possible = None): + + assert isinstance(aoRg_possible, list) + assert isinstance(group, list) or isinstance(group, np.ndarray) + assert isinstance(atm_2_IP_possible, list) + + assert len(aoRg_possible) == len(atm_2_IP_possible) + # assert len(aoRg_possible) == mydf.natm + + log = lib.logger.Logger(mydf.cell.stdout, mydf.cell.verbose) + + if group is None: + raise ValueError("group must be specified") + + #if mydf.verbose: + # print("In select_IP, num_threads = ", lib.num_threads()) + + nthread = lib.num_threads() + + coords = mydf.coords + + fn_colpivot_qr = getattr(libisdf, "ColPivotQRRelaCut", None) + assert(fn_colpivot_qr is not None) + fn_ik_jk_ijk = getattr(libisdf, "NP_d_ik_jk_ijk", None) + assert(fn_ik_jk_ijk is not None) + + weight = np.sqrt(mydf.cell.vol / coords.shape[0]) + + #### perform QRCP #### + + nao_group = 0 + for atm_id in group: + shl_begin = mydf.shl_atm[atm_id][0] + shl_end = mydf.shl_atm[atm_id][1] + nao_atm = mydf.aoloc_atm[shl_end] - mydf.aoloc_atm[shl_begin] + nao_group += nao_atm + + ##### random projection ##### + + nao = mydf.nao + + # aoR_atm = ISDF_eval_gto(mydf.cell, coords=coords[IP_possible]) * weight + + aoRg_unpacked = [] + for atm_id in group: + aoRg_unpacked.append(aoRg_possible[atm_id]) + if len(aoRg_unpacked) == 
1: + aoRg_packed = aoRg_unpacked[0].aoR + else: + aoRg_packed = ISDF_Local_Utils._pack_aoR_holder(aoRg_unpacked, nao).aoR + + nao = aoRg_packed.shape[0] + + log.debug4("In select_IP_group_ls, nao_group = %d" % (nao_group)) + log.debug4("In select_IP_group_ls, nao = %d" % (nao)) + log.debug4("In select_IP_group_ls, c = %d, m = %d" % (c, m)) + log.debug4("In select_IP_group_ls, rela_cutoff = %e" % (mydf.rela_cutoff_QRCP)) + + # naux_now = int(np.sqrt(c*nao)) + m # seems to be too large + naux_now = int(np.sqrt(c*nao_group)) + m + G1 = np.random.rand(nao, naux_now) + G1, _ = numpy.linalg.qr(G1) + G1 = G1.T + + G2 = np.random.rand(nao, naux_now) + G2, _ = numpy.linalg.qr(G2) + G2 = G2.T + # naux_now = nao + + aoR_atm1 = lib.ddot(G1, aoRg_packed) + naux_now1 = aoR_atm1.shape[0] + aoR_atm2 = lib.ddot(G2, aoRg_packed) + naux_now2 = aoR_atm2.shape[0] + + naux2_now = naux_now1 * naux_now2 + + R = np.ndarray((naux2_now, aoRg_packed.shape[1]), dtype=np.float64) + + aoPairBuffer = np.ndarray((naux2_now, aoRg_packed.shape[1]), dtype=np.float64) + + fn_ik_jk_ijk(aoR_atm1.ctypes.data_as(ctypes.c_void_p), + aoR_atm2.ctypes.data_as(ctypes.c_void_p), + aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux_now1), + ctypes.c_int(naux_now2), + ctypes.c_int(aoRg_packed.shape[1])) + + aoR_atm1 = None + aoR_atm2 = None + del aoR_atm1 + del aoR_atm2 + + IP_possible = [] + for atm_id in group: + if atm_2_IP_possible[atm_id] is None: + continue + IP_possible.extend(atm_2_IP_possible[atm_id]) + IP_possible = np.array(IP_possible, dtype=np.int32) + + if mydf.no_restriction_on_nIP: + max_rank = min(naux2_now, IP_possible.shape[0]) + log.debug4("In select_IP_group_ls, no_restriction_on_nIP") + else: + max_rank = min(naux2_now, IP_possible.shape[0], nao_group * c) + log.debug4("In select_IP_group_ls, restriction_on_nIP") + + log.debug4("In select_IP_group_ls, naux2_now = %d, max_rank = %d" % (naux2_now, max_rank)) + log.debug4("In select_IP_group_ls, IP_possible.shape = %s" % (IP_possible.shape)) + log.debug4("In select_IP_group_ls, nao_group = %d" % (nao_group)) + log.debug4("In select_IP_group_ls, c = %d" % (c)) + log.debug4("In select_IP_group_ls, nao_group * c = %d" % (nao_group * c)) + + npt_find = ctypes.c_int(0) + pivot = np.arange(IP_possible.shape[0], dtype=np.int32) + + thread_buffer = np.ndarray((nthread+1, IP_possible.shape[0]+1), dtype=np.float64) + global_buffer = np.ndarray((1, IP_possible.shape[0]), dtype=np.float64) + + + if not USE_SCIPY_QR: + fn_colpivot_qr(aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux2_now), + ctypes.c_int(IP_possible.shape[0]), + ctypes.c_int(max_rank), + ctypes.c_double(1e-14), + ctypes.c_double(mydf.rela_cutoff_QRCP), + pivot.ctypes.data_as(ctypes.c_void_p), + R.ctypes.data_as(ctypes.c_void_p), + ctypes.byref(npt_find), + thread_buffer.ctypes.data_as(ctypes.c_void_p), + global_buffer.ctypes.data_as(ctypes.c_void_p)) + npt_find = npt_find.value + cutoff = abs(R[npt_find-1, npt_find-1]) + log.debug4("ngrid = %d, npt_find = %d, cutoff = %12.6e" % (IP_possible.shape[0], npt_find, cutoff)) + else: + # pivot, rankc = scipy.linalg.lapack.dpstrf(aoPairBuffer)[1:3] + # pivot = pivot[:rankc]-1 + # npt_find = nao_group * c + R, pivot = scipy.linalg.qr(aoPairBuffer, pivoting=True, mode='r', check_finite=False, overwrite_a=True) + npt_find = nao_group * c + + log.debug4("In select_IP_group_ls, npt_find = %d" % (npt_find)) + + pivot = pivot[:npt_find] + pivot.sort() + results = list(IP_possible[pivot]) + results = np.array(results, dtype=np.int32) + + ### clean up 
### + + del aoPairBuffer + del R + del thread_buffer + del global_buffer + del G1 + del G2 + del aoRg_packed + del IP_possible + aoRg_packed = None + IP_possible = None + aoPairBuffer = None + R = None + pivot = None + thread_buffer = None + global_buffer = None + + return results + +def select_IP_local_ls_drive(mydf, c, m, IP_possible_atm, group, use_mpi=False): + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size + else: + rank = 0 + comm = None + comm_size = 1 + + IP_group = [] + + aoRg_possible = mydf.aoRg_possible + + ######### allocate buffer ######### + + natm = mydf.natm + + for i in range(len(group)): + IP_group.append(None) + + if len(group) < natm: + + if use_mpi == False: + for i in range(len(group)): + IP_group[i] = select_IP_group_ls(mydf, aoRg_possible, c, m, group=group[i], atm_2_IP_possible=IP_possible_atm) + else: + group_begin, group_end = ISDF_Local_Utils._range_partition(len(group), rank, comm_size, use_mpi) + for i in range(group_begin, group_end): + IP_group[i] = select_IP_group_ls(mydf, aoRg_possible, c, m, group=group[i], atm_2_IP_possible=IP_possible_atm) + # allgather(IP_group) + + IP_group = ISDF_Local_Utils._sync_list(IP_group, len(group)) + + else: + IP_group = IP_possible_atm + + mydf.IP_group = IP_group + + mydf.IP_flat = [] + mydf.IP_segment = [0] + nIP_now = 0 + for x in IP_group: + mydf.IP_flat.extend(x) + nIP_now += len(x) + mydf.IP_segment.append(nIP_now) + mydf.IP_flat = np.array(mydf.IP_flat, dtype=np.int32) + mydf.naux = mydf.IP_flat.shape[0] + + gridID_2_atmID = mydf.gridID_2_atmID + + partition_IP = [] + for i in range(natm): + partition_IP.append([]) + + for _ip_id_ in mydf.IP_flat: + atm_id = gridID_2_atmID[_ip_id_] + partition_IP[atm_id].append(_ip_id_) + + for i in range(natm): + partition_IP[i] = np.array(partition_IP[i], dtype=np.int32) + partition_IP[i].sort() + + mydf.partition_IP = partition_IP + + ### build ### + + if len(group) < natm: + + coords = mydf.coords + weight = np.sqrt(mydf.cell.vol / mydf.coords.shape[0]) + + del mydf.aoRg_possible + mydf.aoRg_possible = None + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + mydf.aoRg = mydf._construct_build_aoRg(partition_IP, group) + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + _benchmark_time(t1, t2, "build_aoRg", mydf) + + else: + if use_mpi: + mydf.aoRg = mydf.aoRg_possible + else: + mydf.aoRg = mydf.aoRg_possible + + if rank == 0: + memory = ISDF_Local_Utils._get_aoR_holders_memory(mydf.aoRg) + log = lib.logger.Logger(mydf.cell.stdout, mydf.cell.verbose) + log.info("memory to store aoRg is %d " %(memory)) + + return IP_group + +############ subroutines --- build aux bas ############ + +def find_common_elements_positions(arr1, arr2): + position1 = [] + position2 = [] + i, j = 0, 0 + while i < len(arr1) and j < len(arr2): + if arr1[i] < arr2[j]: + i += 1 + elif arr1[i] > arr2[j]: + j += 1 + else: + # positions.append(((i, arr1[i]), (j, arr2[j]))) + position1.append(i) + position2.append(j) + i += 1 + j += 1 + return np.array(position1, dtype=np.int32), np.array(position2, dtype=np.int32) + +def build_aux_basis_ls(mydf, group, IP_group, debug=True, use_mpi=False): + + log = lib.logger.Logger(mydf.cell.stdout, mydf.cell.verbose) + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size + else: + rank = 0 + comm = None + comm_size = 1 + + ###### split task ###### + + ngroup = len(group) + nthread = lib.num_threads() + assert len(IP_group) == ngroup + + group_begin, group_end = 
ISDF_Local_Utils._range_partition(ngroup, rank, comm_size, use_mpi) + + ngroup_local = group_end - group_begin + + if ngroup_local == 0: + log.warn(" WARNING : rank = %d, ngroup_local = 0" % rank) + + mydf.group_begin = group_begin + mydf.group_end = group_end + + ###### build grid_ID_local ###### + + coords = mydf.coords + + ###### build aux basis ###### + + mydf.aux_basis = [] + + for i in range(ngroup): + mydf.aux_basis.append(None) + + if not USE_SCIPY_CHOLESKY: + fn_cholesky = getattr(libisdf, "Cholesky", None) + assert (fn_cholesky is not None) + fn_build_aux = getattr(libisdf, "Solve_LLTEqualB_Parallel", None) + assert(fn_build_aux is not None) + + for i in range(group_begin, group_end): + + aoRg_unpacked = [] + aoR_unpacked = [] + + for atm_id in group[i]: + aoRg_unpacked.append(mydf.aoRg[atm_id]) + aoR_unpacked.append(mydf.aoR[atm_id]) + + aoRg1 = ISDF_Local_Utils._pack_aoR_holder(aoRg_unpacked, mydf.nao) + aoR1 = ISDF_Local_Utils._pack_aoR_holder(aoR_unpacked, mydf.nao) + + if aoRg1.aoR.shape[0] == aoR1.aoR.shape[0]: + aoRg1 = aoRg1.aoR + aoR1 = aoR1.aoR + else: + pos1, pos2 = find_common_elements_positions(aoRg1.ao_involved, aoR1.ao_involved) + assert len(pos1) == aoRg1.aoR.shape[0] + aoRg1 = aoRg1.aoR + aoR1 = aoR1.aoR[pos2,:] + + + A = lib.ddot(aoRg1.T, aoRg1) + lib_isdf.square_inPlace(A) + grid_ID = mydf.partition_group_to_gridID[i] + B = lib.ddot(aoRg1.T, aoR1) + lib_isdf.square_inPlace(B) + + if not USE_SCIPY_CHOLESKY: + print("SCIPY is not called") + fn_cholesky( + A.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(A.shape[0]), + ) + nThread = lib.num_threads() + bunchsize = B.shape[1]//nThread + fn_build_aux( + ctypes.c_int(B.shape[0]), + A.ctypes.data_as(ctypes.c_void_p), + B.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(B.shape[1]), + ctypes.c_int(bunchsize) + ) + else: + # print("SCIPY is called") + C = scipy.linalg.cholesky(A, lower=True, overwrite_a=True, check_finite=False) + B = scipy.linalg.cho_solve((C, True), B, overwrite_b=True, check_finite=False) + + mydf.aux_basis[i] = B.copy() + # exit(1) + + ### sync aux_basis ### + + if use_mpi: + mydf.aux_basis = ISDF_Local_Utils._sync_list(mydf.aux_basis, ngroup) + + del A + A = None + del B + B = None + del aoRg1 + aoRg1 = None + del aoR1 + aoR1 = None + +def build_auxiliary_Coulomb_local_bas_wo_robust_fitting(mydf, debug=True, use_mpi=False): + + if use_mpi: + raise NotImplementedError("use_mpi = True is not supported") + #### NOTE: one should bcast aux_basis first! 
#### + + + t0 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cell = mydf.cell + mesh = cell.mesh + mesh_int32 = np.array(mesh, dtype=np.int32) + + naux = mydf.naux + + ncomplex = mesh[0] * mesh[1] * (mesh[2] // 2 + 1) * 2 + + group_begin = mydf.group_begin + group_end = mydf.group_end + ngroup = len(mydf.group) + + grid_ordering = mydf.grid_ID_ordered + + if mydf.omega is not None: + assert mydf.omega >= 0.0 + + coulG = mydf.coulG.copy() + coulG_real = coulG.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1).copy() + + def construct_V(aux_basis:np.ndarray, buf, V, grid_ID, grid_ordering): + fn = getattr(libisdf, "_construct_V_local_bas", None) + assert(fn is not None) + + nThread = buf.shape[0] + bufsize_per_thread = buf.shape[1] + nrow = aux_basis.shape[0] + ncol = aux_basis.shape[1] + shift_row = 0 + + fn(mesh_int32.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nrow), + ctypes.c_int(ncol), + grid_ID.ctypes.data_as(ctypes.c_void_p), + aux_basis.ctypes.data_as(ctypes.c_void_p), + coulG_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(shift_row), + V.ctypes.data_as(ctypes.c_void_p), + grid_ordering.ctypes.data_as(ctypes.c_void_p), + buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bufsize_per_thread)) + + ####### allocate buf for V ######## + + nThread = lib.num_threads() + bufsize_per_thread = (coulG_real.shape[0] * 2 + mesh[0] * mesh[1] * mesh[2]) + buf = np.zeros((nThread, bufsize_per_thread), dtype=np.double) + + assert len(mydf.aux_basis) == ngroup + + naux_local = 0 + max_naux_bunch = 0 + for i in range(group_begin, group_end): + naux_local += mydf.aux_basis[i].shape[0] + max_naux_bunch = max(max_naux_bunch, mydf.aux_basis[i].shape[0]) + + if hasattr(mydf, "grid_pnt_near_atm"): + max_naux_bunch = max(max_naux_bunch, len(mydf.grid_pnt_near_atm)) + if use_mpi == False or (use_mpi and rank == comm_size - 1): + naux_local += len(mydf.grid_pnt_near_atm) + + V = np.zeros((max_naux_bunch, np.prod(mesh_int32)), dtype=np.double) + + naux = mydf.naux + + W = np.zeros((naux_local, naux), dtype=np.double) + + aux_row_loc = 0 + + if hasattr(mydf, "grid_pnt_near_atm"): + grid_ID_near_atm = mydf.grid_pnt_near_atm + else: + grid_ID_near_atm = [] + grid_ID_near_atm = np.array(grid_ID_near_atm, dtype=np.int32) + for i in range(group_begin, group_end): + + aux_basis_now = mydf.aux_basis[i] + naux_bra = aux_basis_now.shape[0] + grid_ID = mydf.partition_group_to_gridID[i] + + construct_V(aux_basis_now, buf, V, grid_ID, grid_ordering) + + grid_shift = 0 + aux_col_loc = 0 + for j in range(0, ngroup): + grid_ID_now = mydf.partition_group_to_gridID[j] + aux_bas_ket = mydf.aux_basis[j] + naux_ket = aux_bas_ket.shape[0] + ngrid_now = grid_ID_now.size + W[aux_row_loc:aux_row_loc+naux_bra, aux_col_loc:aux_col_loc+naux_ket] = lib.ddot(V[:naux_bra, grid_shift:grid_shift+ngrid_now], aux_bas_ket.T) + grid_shift += ngrid_now + aux_col_loc += naux_ket + print("aux_row_loc = %d, aux_col_loc = %d" % (aux_row_loc, aux_col_loc)) + print("V.shape = ", V[:naux_bra,:].shape) + W[aux_row_loc:aux_row_loc+naux_bra, aux_col_loc:] = V[:naux_bra, grid_shift:] + aux_row_loc += aux_basis_now.shape[0] + + if (use_mpi == False or (use_mpi and rank == comm_size - 1)) and len(grid_ID_near_atm) != 0: + ### construct the final row ### + grid_ID = grid_ID_near_atm + aux_basis_now = np.identity(len(grid_ID), dtype=np.double) + construct_V(aux_basis_now, buf, V, grid_ID, grid_ordering) + grid_shift = 0 + aux_col_loc = 0 + naux_bra = len(grid_ID) + for j in range(0, ngroup): + grid_ID_now = 
mydf.partition_group_to_gridID[j] + aux_bas_ket = mydf.aux_basis[j] + naux_ket = aux_bas_ket.shape[0] + ngrid_now = grid_ID_now.size + W[aux_row_loc:aux_row_loc+naux_bra, aux_col_loc:aux_col_loc+naux_ket] = lib.ddot(V[:naux_bra, grid_shift:grid_shift+ngrid_now], aux_bas_ket.T) + grid_shift += ngrid_now + aux_col_loc += naux_ket + assert aux_row_loc == aux_col_loc + W[aux_row_loc:, aux_col_loc:] = V[:naux_bra, grid_shift:] + + del buf + buf = None + del V + V = None + + mydf.W = W + + if use_mpi: + comm.Barrier() + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if mydf.verbose > 0: + _benchmark_time(t0, t1, 'build_auxiliary_Coulomb', mydf) + +def build_auxiliary_Coulomb_local_bas(mydf, debug=True, use_mpi=False): + + if hasattr(mydf, "grid_pnt_near_atm") and len(mydf.grid_pnt_near_atm) != 0 : + raise NotImplementedError("grid_pnt_near_atm is not supported") + + log = lib.logger.Logger(mydf.cell.stdout, mydf.cell.verbose) + + t0 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cell = mydf.cell + mesh = cell.mesh + + naux = mydf.naux + + ncomplex = mesh[0] * mesh[1] * (mesh[2] // 2 + 1) * 2 + + group_begin = mydf.group_begin + group_end = mydf.group_end + + grid_ordering = mydf.grid_ID_ordered + + def construct_V_CCode(aux_basis:list[np.ndarray], mesh, coul_G): + + coulG_real = coul_G.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1).copy() + nThread = lib.num_threads() + bufsize_per_thread = int((coulG_real.shape[0] * 2 + mesh[0] * mesh[1] * mesh[2]) * 1.1) + bufsize_per_thread = (bufsize_per_thread + 15) // 16 * 16 + + buf = np.zeros((nThread, bufsize_per_thread), dtype=np.double) + + # nAux = aux_basis.shape[0] + + nAux = 0 + for x in aux_basis: + nAux += x.shape[0] + + ngrids = mesh[0] * mesh[1] * mesh[2] + mesh_int32 = np.array(mesh, dtype=np.int32) + V = np.zeros((nAux, ngrids), dtype=np.double) + + fn = getattr(libisdf, "_construct_V_local_bas", None) + assert(fn is not None) + + shift_row = 0 + ngrid_now = 0 + for i in range(len(aux_basis)): + + aux_basis_now = aux_basis[i] + grid_ID = mydf.partition_group_to_gridID[group_begin+i] + assert aux_basis_now.shape[1] == grid_ID.size + ngrid_now += grid_ID.size + + fn(mesh_int32.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(aux_basis_now.shape[0]), + ctypes.c_int(aux_basis_now.shape[1]), + grid_ID.ctypes.data_as(ctypes.c_void_p), + aux_basis_now.ctypes.data_as(ctypes.c_void_p), + coulG_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(shift_row), + V.ctypes.data_as(ctypes.c_void_p), + grid_ordering.ctypes.data_as(ctypes.c_void_p), + buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bufsize_per_thread)) + + shift_row += aux_basis_now.shape[0] + + del buf + buf = None + + return V + + ########### construct V ########### + + if mydf.omega is not None: + assert mydf.omega >= 0.0 + coulG = mydf.coulG.copy() + V = construct_V_CCode(mydf.aux_basis, mesh, coulG) + + if use_mpi: + + ############# the only communication ############# + + grid_segment = mydf.grid_segment + assert len(grid_segment) == comm_size + 1 + + t0_comm = (lib.logger.process_clock(), lib.logger.perf_counter()) + + sendbuf = [] + for i in range(comm_size): + p0 = grid_segment[i] + p1 = grid_segment[i+1] + sendbuf.append(V[:, p0:p1]) + del V + V = None + V_fullrow = np.vstack(alltoall(sendbuf, split_recvbuf=True)) + del sendbuf + sendbuf = None + + mydf.V_R = V_fullrow + + t1_comm = (lib.logger.process_clock(), lib.logger.perf_counter()) + t_comm = t1_comm[1] - t0_comm[1] + + if mydf.verbose > 0: + log.info("rank = %d, t_comm = %12.6e" % 
(rank, t_comm)) + else: + t_comm = 0.0 + mydf.V_R = V + + ########### construct W ########### + + aux_group_shift = [0] + naux_now = 0 + for i in range(len(mydf.IP_group)): + IP_group_now = mydf.IP_group[i] + naux_now += len(IP_group_now) + aux_group_shift.append(naux_now) + + mydf.W = np.zeros((mydf.naux, mydf.naux), dtype=np.float64) + + grid_shift = 0 + for i in range(group_begin, group_end): + aux_begin = aux_group_shift[i] + aux_end = aux_group_shift[i+1] + ngrid_now = mydf.partition_group_to_gridID[i].size + sys.stdout.flush() + mydf.W[:, aux_begin:aux_end] = lib.ddot(mydf.V_R[:, grid_shift:grid_shift+ngrid_now], mydf.aux_basis[i-group_begin].T) + grid_shift += ngrid_now + + if use_mpi: + comm.Barrier() + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if mydf.verbose > 0: + _benchmark_time(t0, t1, 'build_auxiliary_Coulomb', mydf) + + sys.stdout.flush() + + +class PBC_ISDF_Info_Quad(ISDF.PBC_ISDF_Info): + + ''' Interpolative separable density fitting (ISDF) for periodic systems. + The locality is explored! + + Fitting aux basis is linear scaling! + + Quad stands for quadratic scaling for constructing V and W matrix as well as build K matrix! + + Examples: + + >>> pbc_isdf = PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, aoR_cutoff=1e-8, direct=False, use_occ_RI_K=False) + >>> pbc_isdf.build_IP_local(c=C, m=5) + >>> pbc_isdf.build_auxiliary_Coulomb() + >>> from pyscf.pbc import scf + >>> mf = scf.RHF(cell) + >>> pbc_isdf.direct_scf = mf.direct_scf + >>> mf.with_df = pbc_isdf + >>> mf.verbose = 0 + >>> mf.kernel() + + ''' + + # group_partition refer to the group of atoms to perform local fitting + # if not set then each atom is treated as a group + + def __init__(self, mol:Cell, + with_robust_fitting = True, + kmesh = None, + verbose = None, + rela_cutoff_QRCP = None, + aoR_cutoff = 1e-8, + direct = False, + use_occ_RI_K = False, + limited_memory = False, + build_K_bunchsize = None): + + assert use_occ_RI_K == False + + if verbose is None: + verbose = mol.verbose + + super().__init__( + mol=mol, + aoR=None, + with_robust_fitting=with_robust_fitting, + kmesh=kmesh, + get_partition=False, + verbose=verbose + ) + + self.cell = mol.copy() + cell = self.cell + + #### get other info #### + + shl_atm = [] + + for i in range(self.natm): + shl_atm.append([None, None]) + + for i in range(cell.nbas): + atm_id = cell.bas_atom(i) + if shl_atm[atm_id][0] is None: + shl_atm[atm_id][0] = i + shl_atm[atm_id][1] = i+1 + + self.shl_atm = shl_atm + self.aoloc_atm = cell.ao_loc_nr() + + self.use_mpi = False + + self.aoR_cutoff = aoR_cutoff + + if rela_cutoff_QRCP is None: + self.no_restriction_on_nIP = False + self.rela_cutoff_QRCP = 0.0 + else: + self.no_restriction_on_nIP = True + self.rela_cutoff_QRCP = rela_cutoff_QRCP + + self.aoR = None + self.partition = None + + self.V_W_cutoff = None + + self.direct = direct # whether to use direct method to calculate J and K, if True, the memory usage will be reduced, V W will not be stored + if self.direct: + self.with_robust_fitting = True + + self.with_translation_symmetry = False + self.kmesh = None + + ######### default setting for range separation ######### + + # WARNING: not a good design pattern to write this code here! 
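+        # (Hedged reading, inferred from how get_coulG() and
+        #  build_partition_aoR() below consume these fields: omega is the
+        #  range-separation parameter -- None means the full Coulomb kernel
+        #  is used with no SR/LR splitting; use_aft_ao switches to
+        #  analytically Fourier-transformed AOs; the two ke_cutoff fields
+        #  and ft_ao_mesh simply inherit the cell's values here.)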
+ + self.omega = None + self.use_aft_ao = False + self.ke_cutoff_pp = self.cell.ke_cutoff + self.ke_cutoff_ft_ao = self.cell.ke_cutoff + self.ft_ao_mesh = self.mesh.copy() + #self.rsjk = None + #self.cell_rsjk = None + + ########## coul kernel ########## + + self.get_coulG() + self.ovlp = self.cell.pbc_intor('int1e_ovlp') + self.occ_tol = 1e-9 + self.occ_RI_K = use_occ_RI_K + + ########## limited memory ########## + + self._limited_memory = limited_memory + self._build_K_bunchsize = build_K_bunchsize + if build_K_bunchsize is None: + if limited_memory: + from _isdf_local_K_direct import K_DIRECT_NAUX_BUNCHSIZE + self._build_K_bunchsize = K_DIRECT_NAUX_BUNCHSIZE + else: + self._build_K_bunchsize = 10000 * 10000 # infinite in practice + + @property + def first_natm(self): + if self.kmesh is not None: + return self.cell.natm // np.prod(self.kmesh) + else: + return self.cell.natm + + def build_partition_aoR(self, Ls=None): + + if self.aoR is not None and self.partition is not None: + return + + ##### build cutoff info ##### + + self.distance_matrix = ISDF_Local_Utils.get_cell_distance_matrix(self.cell) + weight = np.sqrt(self.cell.vol / self.coords.shape[0]) + precision = self.aoR_cutoff + rcut = ISDF_Local_Utils._estimate_rcut(self.cell, self.coords.shape[0], precision) + rcut_max = np.max(rcut) + atm2_bas = ISDF_Local_Utils._atm_to_bas(self.cell) + self.AtmConnectionInfo = [] + + for i in range(self.cell.natm): + tmp = ISDF_Local_Utils.AtmConnectionInfo(self.cell, i, self.distance_matrix, precision, rcut, rcut_max, atm2_bas) + self.AtmConnectionInfo.append(tmp) + + ##### build partition ##### + + if Ls is None: + lattice_x = self.cell.lattice_vectors()[0][0] + lattice_y = self.cell.lattice_vectors()[1][1] + lattice_z = self.cell.lattice_vectors()[2][2] + + Ls = [int(lattice_x)/3+6, int(lattice_y)/3+6, int(lattice_z)/3+6] + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + # if self.rsjk is not None and self.cell_rsjk is not None: + # self.partition = ISDF_Local_Utils.get_partition(self.cell_rsjk, self.coords, self.AtmConnectionInfo, + # Ls, + # self.with_translation_symmetry, + # self.kmesh, + # self.use_mpi) + # else: + self.partition = ISDF_Local_Utils.get_partition(self.cell, self.coords, self.AtmConnectionInfo, + Ls, + self.with_translation_symmetry, + self.kmesh, + self.use_mpi) + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if not self.use_mpi: + rank = 0 + else: + from pyscf.isdf.isdf_tools_mpi import rank + + if rank == 0: + _benchmark_time(t1, t2, "build_partition", self) + + for i in range(self.natm): + self.partition[i] = np.array(self.partition[i], dtype=np.int32) + self.partition[i].sort() + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + sync_aoR = False + if self.direct: + sync_aoR = True + + ## deal with translation symmetry ## + + first_natm = self.first_natm + + #################################### + + for x in range(self.natm): + # print("len of partition[%d] = %d" % (x, len(self.partition[x]))) + logger.debug4(self, "len of partition[%d] = %d" % (x, len(self.partition[x]))) + + if self.use_aft_ao: + self.aoR = ISDF_Local_Utils.get_aoR_analytic(self.cell, self.coords, self.partition, + None, + first_natm, + self.group, + self.distance_matrix, + self.AtmConnectionInfo, + self.use_mpi, self.use_mpi, sync_aoR) + else: + # assert self.rsjk is None and self.cell_rsjk is None + self.aoR = ISDF_Local_Utils.get_aoR(self.cell, self.coords, self.partition, + None, + first_natm, + self.group, + self.distance_matrix, + 
self.AtmConnectionInfo, + self.use_mpi, self.use_mpi, sync_aoR) + + memory = ISDF_Local_Utils._get_aoR_holders_memory(self.aoR) + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if rank == 0: + _benchmark_time(t1, t2, "build_aoR", self) + + def _allocate_jk_buffer(self, datatype, ngrids_local): + pass + + @property + def max_nao_involved(self): + return np.max([aoR_holder.aoR.shape[0] for aoR_holder in self.aoR if aoR_holder is not None]) + + @property + def max_ngrid_involved(self): + return np.max([aoR_holder.aoR.shape[1] for aoR_holder in self.aoR if aoR_holder is not None]) + + @property + def max_nIP_involved(self): + return np.max([aoR_holder.aoR.shape[1] for aoR_holder in self.aoRg if aoR_holder is not None]) + + @property + def maxsize_group_naux(self): + maxsize_group_naux = 0 + for group_id, atm_ids in enumerate(self.group): + naux_tmp = 0 + for atm_id in atm_ids: + naux_tmp += self.aoRg[atm_id].aoR.shape[1] + maxsize_group_naux = max(maxsize_group_naux, naux_tmp) + return maxsize_group_naux + + def deallocate_k_buffer(self): + if hasattr(self, "build_k_buf") and self.build_k_buf is not None: + del self.build_k_buf + self.build_k_buf = None + if hasattr(self, "build_VW_in_k_buf") and self.build_VW_in_k_buf is not None: + del self.build_VW_in_k_buf + self.build_VW_in_k_buf = None + + def allocate_k_buffer(self, nset=1): + + log = lib.logger.Logger(self.cell.stdout, self.cell.verbose) + + ### TODO: split grid again to reduce the size of buf when robust fitting is true! + # TODO: try to calculate the size when direct is true + + max_nao_involved = self.max_nao_involved + max_ngrid_involved = self.max_ngrid_involved + max_nIP_involved = self.max_nIP_involved + maxsize_group_naux = self.maxsize_group_naux + + allocated = False + + if self.direct: + if hasattr(self, "build_k_buf") and self.build_k_buf is not None: + if hasattr(self, "build_VW_in_k_buf") and self.build_VW_in_k_buf is not None: + allocated = True + else: + if hasattr(self, "build_k_buf") and self.build_k_buf is not None: + allocated = True + + if allocated: + pass + else: + + if self.direct: + + if self._limited_memory: + build_K_bunchsize = min(maxsize_group_naux, self._build_K_bunchsize) + else: + build_K_bunchsize = maxsize_group_naux + + #### compare build_K_bunchsize with those buf used for W matrix #### + + size1 = maxsize_group_naux * self.nao * nset + size2 = maxsize_group_naux * max_nao_involved + self.Density_RgAO_buf = np.zeros((size1+size2,), dtype=np.float64) + + #### allocate build_VW_in_k_buf #### + + mesh = self.cell.mesh + ngrid= np.prod(mesh) + ncomplex = mesh[0] * mesh[1] * (mesh[2]//2+1) + nthread = lib.num_threads() + + build_K_bunchsize = max(maxsize_group_naux * self.naux//ngrid+2, build_K_bunchsize) + build_K_bunchsize = max(maxsize_group_naux * max_nIP_involved//max_ngrid_involved+2, build_K_bunchsize) + self._build_K_bunchsize = build_K_bunchsize + + size0 = (np.prod(self.cell.mesh) + 2 * ncomplex) * nthread + size1 = build_K_bunchsize * np.prod(self.cell.mesh) + size2 = maxsize_group_naux * self.naux + self.build_VW_in_k_buf = np.zeros((size0+size1+size2,), dtype=np.float64) + + #### allocate build_k_buf #### + + size1 = build_K_bunchsize * np.prod(self.cell.mesh) # density RgR + size2 = build_K_bunchsize * max_ngrid_involved # ddot_res_RgR + size3 = maxsize_group_naux * self.nao # K1_tmp1 + #size4 = max_ngrid_involved * max_nao_involved # K1_tmp1_ddot_res + size4 = maxsize_group_naux * self.nao + #size5 = max_ngrid_involved * max_ngrid_involved + size5 = 0 + size6 = self.nao * 
self.nao                                     # K1_final_ddot
+
+                size = size1 + size2 + size3 + size4 + size5 + size6
+
+                self.build_k_buf = np.zeros((size,), dtype=np.float64)
+
+                log.info("In allocate_k_buffer, Density_RgAO_buf  memory = %d bytes" % (self.Density_RgAO_buf.nbytes))
+                log.info("In allocate_k_buffer, build_VW_in_k_buf memory = %d bytes" % (self.build_VW_in_k_buf.nbytes))
+                log.info("In allocate_k_buffer, build_k_buf       memory = %d bytes" % (self.build_k_buf.nbytes))
+
+            else:
+
+                self.Density_RgAO_buf = np.zeros((self.naux, self.nao), dtype=np.float64)
+                max_dim = max(max_nao_involved, max_ngrid_involved, self.nao)
+
+                ### size0 in getting W part of K ###
+
+                size0 = self.naux * max_nIP_involved + self.naux * max_nao_involved + self.naux * max(max_nIP_involved, max_nao_involved)
+
+                ### size1 in getting Density Matrix ###
+
+                size11 = self.nao * max_nIP_involved + self.nao * self.nao
+                size1  = self.naux * self.nao + self.naux * max_dim + self.nao * self.nao
+                size1 += self.naux * max_nao_involved
+                size1  = max(size1, size11)
+
+                ### size2 in getting K ###
+
+                size2 = self.naux * max_nao_involved
+                if self.with_robust_fitting:
+                    size2 += self.naux * max_ngrid_involved + self.naux * max_nao_involved
+                    size2 += self.naux * max_ngrid_involved
+                self.build_k_buf = np.zeros((max(size0, size1, size2),), dtype=np.float64)
+
+    def _construct_build_aoRg(self, IP_group, group=None):
+
+        if group is None:
+            group = []
+            for i in range(self.natm):
+                group.append([i])
+        for i in range(len(group)):
+            group[i] = np.array(group[i], dtype=np.int32)
+            group[i].sort()
+        assert self.natm == len(IP_group)
+
+        aoR_holders_res = []
+        for i in range(self.natm):
+            aoR_holders_res.append(None)
+
+        assert hasattr(self, "partition")
+        assert hasattr(self, "aoR")
+
+        atm_ordering = []
+        for i in range(len(group)):
+            atm_ordering.extend(group[i])
+
+        IP_ID_NOW   = 0
+        GRID_ID_NOW = 0
+
+        IP_loc_in_ordered_grids = []
+
+        for atm_id in atm_ordering:
+            aoR_holder = self.aoR[atm_id]
+            if aoR_holder is None:
+                if IP_group[atm_id] is None:
+                    continue
+                else:
+                    IP_ID_NOW += len(IP_group[atm_id])
+                    continue
+            nIP = len(IP_group[atm_id])
+
+            idx = np.searchsorted(self.partition[atm_id], IP_group[atm_id])
+
+            ao_involved = aoR_holder.ao_involved.copy()
+            aoR = aoR_holder.aoR[:, idx].copy()
+            aoR_holders_res[atm_id] = ISDF_Local_Utils.aoR_Holder(aoR, ao_involved, IP_ID_NOW, IP_ID_NOW+nIP, IP_ID_NOW, IP_ID_NOW+nIP)
+
+            IP_loc_in_ordered_grids.extend(idx+GRID_ID_NOW)
+
+            IP_ID_NOW   += nIP
+            GRID_ID_NOW += len(self.partition[atm_id])
+
+        self.IP_loc_in_ordered_grids = np.array(IP_loc_in_ordered_grids, dtype=np.int32)
+        assert self.IP_loc_in_ordered_grids.ndim == 1
+
+        return aoR_holders_res
+
+    def _determine_c(self):
+        '''
+        Empirical rule used by build_IP_local to choose c when it is not given:
+        the tighter rela_cutoff_QRCP is, the larger c (interpolation points per
+        AO) is allowed to be.
+        '''
+
+        DEFAULT = 15
+        SEGMENT = [1e-2, 1e-3, 1e-4, 1e-5]
+        C       = [10, 20, 30, 35, 40]
+
+        if self.rela_cutoff_QRCP is None:
+            return DEFAULT
+        else:
+            if self.rela_cutoff_QRCP > SEGMENT[0]:
+                return C[0]
+            else:
+                for i in range(1, len(SEGMENT)):
+                    if self.rela_cutoff_QRCP > SEGMENT[i]:
+                        return C[i]
+                return C[-1]
+
+    def build_IP_local(self, c=None, m=5, first_natm=None, group=None, Ls=None, debug=True):
+
+        if c is None:
+            c = self._determine_c()
+
+        if first_natm is None:
+            first_natm = self.natm
+
+        if group is None:
+            group = []
+            for i in range(self.natm):
+                group.append([i])
+
+        self.group = group
+
+        for i in range(len(group)):
+            group[i] = np.array(group[i], dtype=np.int32)
+            group[i].sort()
+
+        # build partition and aoR #
+
+        t1 = (lib.logger.process_clock(),
lib.logger.perf_counter()) + + self.build_partition_aoR(Ls) + + ao2atomID = self.ao2atomID + partition = self.partition + aoR = self.aoR + natm = self.natm + nao = self.nao + + self.partition_atmID_to_gridID = partition + + self.partition_group_to_gridID = [] + for i in range(len(group)): + self.partition_group_to_gridID.append([]) + for atm_id in group[i]: + self.partition_group_to_gridID[i].extend(partition[atm_id]) + self.partition_group_to_gridID[i] = np.array(self.partition_group_to_gridID[i], dtype=np.int32) + # self.partition_group_to_gridID[i].sort() + + ngrids = self.coords.shape[0] + + gridID_2_atmID = np.zeros((ngrids), dtype=np.int32) + + for atm_id in range(natm): + gridID_2_atmID[partition[atm_id]] = atm_id + + self.gridID_2_atmID = gridID_2_atmID + self.grid_ID_ordered = ISDF_Local_Utils._get_grid_ordering(self.partition, self.group, self.use_mpi) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + # if self.verbose and debug: + _benchmark_time(t1, t2, "build_partition_aoR", self) + + t1 = t2 + + if len(group) < first_natm: + IP_Atm = select_IP_atm_ls(self, c+1, m, first_natm, + rela_cutoff=self.rela_cutoff_QRCP, + no_retriction_on_nIP=self.no_restriction_on_nIP, + use_mpi=self.use_mpi) + else: + IP_Atm = select_IP_atm_ls(self, c, m, first_natm, + rela_cutoff=self.rela_cutoff_QRCP, + no_retriction_on_nIP=self.no_restriction_on_nIP, + use_mpi=self.use_mpi) + t3 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + self.aoRg_possible = self._construct_build_aoRg(IP_Atm, None) + + t4 = (lib.logger.process_clock(), lib.logger.perf_counter()) + if self.verbose and debug: + _benchmark_time(t3, t4, "build_aoRg_possible", self) + + select_IP_local_ls_drive(self, c, m, IP_Atm, group, use_mpi=self.use_mpi) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if self.verbose and debug: + _benchmark_time(t1, t2, "select_IP", self) + + t1 = t2 + + build_aux_basis_ls(self, group, self.IP_group, debug=debug, use_mpi=self.use_mpi) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if self.verbose and debug: + _benchmark_time(t1, t2, "build_aux_basis", self) + + sys.stdout.flush() + + def get_coulG(self): + if hasattr(self, "rsjk") and self.rsjk is not None: + + ##### construct coulG_LR , copy from rsjk.py ##### + + if self.rsjk.cell.dimension!=3: + raise NotImplementedError('3D only') + + _, _, kws = self.rsjk.cell.get_Gv_weights(self.mesh) + coulG_SR_at_G0 = np.pi/self.rsjk.omega**2 * kws + kpt = np.zeros(3) + with lib.temporary_env(self.rsjk.cell, dimension=3): + coulG_SR = self.rsjk.weighted_coulG_SR(kpt, False, self.mesh) + G0_idx = 0 + coulG_SR[G0_idx] += coulG_SR_at_G0 + coulG_full = self.rsjk.weighted_coulG(kpt, None, self.mesh, omega=0.0) + self.coulG = coulG_full - coulG_SR + + coulG_bench = tools.get_coulG(self.cell_rsjk, mesh=self.cell_rsjk.mesh, omega=0.0) + + ### find coulG_full with values larger than 1e-6 ### + + idx = np.where(np.abs(coulG_full) > 1e-6) + + G1 = coulG_full[idx].copy() + G2 = coulG_bench[idx].copy() + ratio = G2/G1 + fac = ratio[0] + assert fac == 1.0/kws + assert np.allclose(ratio, fac) + self.coulG *= fac + + else: + self.coulG = tools.get_coulG(self.cell, mesh=self.cell.mesh) + + def diag_dm(self, dm, linear_dep_threshold=1e-16): + '''Solver for generalized eigenvalue problem + + .. math:: HC = SCE + + used only for occ-RI-K, better not merge into PySCF first! 
+ + ''' + # print("ovlp = ", self.ovlp) + + # diagonalize overlap matrix + e, v = scipy.linalg.eigh(self.ovlp) + + mask = e > linear_dep_threshold * e[-1] + e = e[mask] + v = v[:,mask] + v*= np.sqrt(e) + + dm_new_basis = np.dot(v.T, np.dot(dm, v)) + + mo_occ, mo_coeff = scipy.linalg.eigh(dm_new_basis) + + mo_coeff = np.dot(v, mo_coeff) # SC = mocoeff + + v /= np.sqrt(e) + + mo_coeff = np.dot(v.T, mo_coeff) + mo_coeff = (1.0/e).reshape(-1,1) * mo_coeff + mo_coeff = np.dot(v, mo_coeff) + + return mo_occ[::-1], mo_coeff[:,::-1] + + def build_auxiliary_Coulomb(self, debug=True): + + if self.direct == True: + return # do nothing + + ### the cutoff based on distance for V and W is used only for testing now ! ### + + distance_max = np.max(self.distance_matrix) + if self.V_W_cutoff is not None and self.V_W_cutoff > distance_max: + logger.warn(self, "WARNING : V_W_cutoff is larger than the maximum distance in the cell") + self.V_W_cutoff = None # no cutoff indeed + if self.V_W_cutoff is not None: + logger.debug4(self, "PBC_ISDF_Info_Quad:->build_auxiliary_Coulomb: V_W_cutoff = %12.6e" % self.V_W_cutoff) + logger.debug4(self, "PBC_ISDF_Info_Quad:->build_auxiliary_Coulomb: distance_max = %12.6e" % distance_max) + + if self.with_robust_fitting: + build_auxiliary_Coulomb_local_bas(self, debug=debug, use_mpi=self.use_mpi) + else: + build_auxiliary_Coulomb_local_bas_wo_robust_fitting(self, debug=debug, use_mpi=self.use_mpi) + + if self.V_W_cutoff is not None: + + if hasattr(self, "V_R"): + V = self.V_R + + bra_loc = 0 + for atm_i, aoRg_holder in enumerate(self.aoRg): + nbra = aoRg_holder.aoR.shape[1] + ket_loc = 0 + for atm_j, aoR_holder in enumerate(self.aoR): + nket = aoR_holder.aoR.shape[1] + if self.distance_matrix[atm_i, atm_j] > self.V_W_cutoff: + V[bra_loc:bra_loc+nbra, ket_loc:ket_loc+nket] = 0.0 + ket_loc += nket + bra_loc += nbra + + self.V_R = V + + W = self.W + + bra_loc = 0 + for atm_i, aoRg_holder_bra in enumerate(self.aoRg): + nbra = aoRg_holder.aoR.shape[1] + ket_loc = 0 + for atm_j, aoRg_holder_ket in enumerate(self.aoRg): + nket = aoRg_holder.aoR.shape[1] + if self.distance_matrix[atm_i, atm_j] > self.V_W_cutoff: + W[bra_loc:bra_loc+nbra, ket_loc:ket_loc+nket] = 0.0 + ket_loc += nket + bra_loc += nbra + + self.W = W + + get_jk = ISDF_Local_JK.get_jk_dm_quadratic + + def aoR_RangeSeparation(self, CompactAO): + + self.CompactAOList = np.array(CompactAO, dtype=np.int32) + DiffuseAO = [] + for i in range(self.nao): + if i not in CompactAO: + DiffuseAO.append(i) + self.DiffuseAOList = np.array(DiffuseAO, dtype=np.int32) + + IsCompact = np.zeros((self.nao), dtype=bool) + IsCompact[CompactAO] = True + IsCompact[DiffuseAO] = False + self.IsCompact = IsCompact + + for aoR in self.aoR: + aoR.RangeSeparation(IsCompact) + for aoRg in self.aoRg: + aoRg.RangeSeparation(IsCompact) + + def aoRg_full(self): + + fn_pack = getattr(libisdf, "_Pack_Matrix_SparseRow_DenseCol", None) + assert fn_pack is not None + + partition = [] + + res = np.zeros((self.nao, self.naux), dtype=np.float64) + for i in range(self.natm): + aoRg_i = self.aoRg[i] + ao_involved_i = aoRg_i.ao_involved + nao_i = aoRg_i.aoR.shape[0] + global_IP_begin_i = aoRg_i.global_gridID_begin + nIP_i = aoRg_i.aoR.shape[1] + + fn_pack( + res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(res.shape[0]), + ctypes.c_int(res.shape[1]), + aoRg_i.aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(nIP_i), + ao_involved_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(global_IP_begin_i), + ctypes.c_int(global_IP_begin_i+nIP_i) + 
) + + partition.append([global_IP_begin_i, global_IP_begin_i+nIP_i]) + + return res, partition + + ### LS_THC fit ### + + def LS_THC_recompression(self, X:np.ndarray, force_LS_THC=True): + + from isdf_ao2mo import LS_THC + + if force_LS_THC: + self.with_robust_fitting = False + self.force_LS_THC = True + self.W = LS_THC(self, X) / (self.ngrids/self.cell.vol) + self.aoRg = X + self.aoR = None + self.V_R = None + else: + self.force_LS_THC = False + self.W2 = LS_THC(self, X) / (self.ngrids/self.cell.vol) + self.aoRg2 = X + + ### check aoR value ### + + def check_aoR(self): + for aoR_holder in self.aoR: + max_abs_index = np.unravel_index(np.argmax(np.abs(aoR_holder.aoR)), aoR_holder.aoR.shape) + value = aoR_holder.aoR[max_abs_index[0]][max_abs_index[1]] + ao_indx = aoR_holder.ao_involved[max_abs_index[0]] + print("max_abs_value = ", value, " with indx = ", ao_indx, max_abs_index[1]+aoR_holder.global_gridID_begin) + + +if __name__ == '__main__': + + C = 15 + from pyscf.lib.parameters import BOHR + from isdf_tools_cell import build_supercell, build_supercell_with_partition + import pyscf.pbc.gto as pbcgto + + verbose = 10 + + cell = pbcgto.Cell() + boxlen = 3.5668 + cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + prim_a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + atm = [ + ['C', (0. , 0. , 0. )], + ['C', (0.8917 , 0.8917 , 0.8917)], + ['C', (1.7834 , 1.7834 , 0. )], + ['C', (2.6751 , 2.6751 , 0.8917)], + ['C', (1.7834 , 0. , 1.7834)], + ['C', (2.6751 , 0.8917 , 2.6751)], + ['C', (0. , 1.7834 , 1.7834)], + ['C', (0.8917 , 2.6751 , 2.6751)], + ] + KE_CUTOFF = 70 + # basis = 'unc-gth-cc-tzvp' + # pseudo = "gth-hf" + basis = 'gth-dzvp' + pseudo = "gth-pade" + prim_cell = build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=KE_CUTOFF, basis=basis, pseudo=pseudo) + prim_partition = [[0,1],[2,3],[4,5],[6,7]] + # prim_partition = [[0], [1], [2], [3], [4], [5], [6], [7]] + # prim_partition = [[0,1,2,3,4,5,6,7]] + # prim_partition = [[0,1,2,3],[4,5,6,7]] + + prim_mesh = prim_cell.mesh + + Ls = [1, 1, 2] + # Ls = [2, 2, 2] + Ls = np.array(Ls, dtype=np.int32) + mesh = [Ls[0] * prim_mesh[0], Ls[1] * prim_mesh[1], Ls[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + # mesh = None ### NOTE: magically, use None will be much slower ? 
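+    # Why the mesh is given explicitly (a sketch, not verified here): taking
+    #     mesh[d] = Ls[d] * prim_mesh[d],  d = 0, 1, 2
+    # keeps the supercell FFT grid commensurate with the primitive-cell grid,
+    # so every periodic image carries an identical set of grid points; with
+    # mesh=None the builder would re-derive the mesh from ke_cutoff alone,
+    # which is presumably the source of the slowdown noted above.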
+ + cell, group_partition = build_supercell_with_partition(atm, prim_a, mesh=mesh, + Ls=Ls, + basis=basis, + pseudo=pseudo, + partition=prim_partition, ke_cutoff=KE_CUTOFF, verbose=verbose) + print("group_partition = ", group_partition) + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + # pbc_isdf_info = PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, aoR_cutoff=1e-8, direct=False, use_occ_RI_K=False) + pbc_isdf_info = PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, + aoR_cutoff=1e-8, + # direct=False, + direct=True, + limited_memory=True, build_K_bunchsize=32, + use_occ_RI_K=False, rela_cutoff_QRCP=3e-3) + pbc_isdf_info.build_IP_local(c=C, m=5, group=group_partition) + pbc_isdf_info.build_auxiliary_Coulomb() + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + _benchmark_time(t1, t2, "build isdf", pbc_isdf_info) + + # pbc_isdf_info.check_aoR() + # exit(1) + + from pyscf.pbc import scf + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + mf = scf.RHF(cell) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 6 + mf.conv_tol = 1e-7 + + mf.kernel() + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + _benchmark_time(t1, t2, "scf", pbc_isdf_info) + sys.stdout.flush() \ No newline at end of file diff --git a/pyscf/isdf/isdf_local_MPI.py b/pyscf/isdf/isdf_local_MPI.py new file mode 100644 index 000000000..320897409 --- /dev/null +++ b/pyscf/isdf/isdf_local_MPI.py @@ -0,0 +1,213 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+# Author: Ning Zhang
+#
+
+import numpy as np
+
+from pyscf import lib
+import pyscf.pbc.gto as pbcgto
+from pyscf.pbc.gto import Cell
+from pyscf.gto.mole import *
+
+from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, allgather, bcast
+import pyscf.isdf.isdf_local as isdf_local
+import pyscf.isdf.isdf_local_k as isdf_local_k
+from pyscf.isdf.isdf_tools_local import flatten_aoR_holder
+
+###############################################################
+
+# debug code #
+
+def dump_attributes(mydf, attr_lst:list[str], dtype=np.int32, filename:str=None):
+
+    res = []
+
+    for attr in attr_lst:
+        assert hasattr(mydf, attr)
+        tmp = getattr(mydf, attr)
+        if isinstance(tmp, list):
+            if all([isinstance(x, np.ndarray) for x in tmp]):
+                tmp = np.concatenate([x.ravel() for x in tmp])
+            else:
+                tmp = np.asarray(tmp, dtype=dtype)
+        else:
+            tmp = np.asarray(tmp, dtype=dtype)
+        res.append(tmp.flatten().astype(dtype))
+
+    res = np.concatenate(res)
+    print("rank = ", rank, res.shape)
+    res.tofile(filename)
+
+def dump_aoR(mydf, filename:str=None):
+
+    res_int   = []
+    res_float = []
+
+    for attr in ["aoR", "aoR1", "aoRg"]:
+        if hasattr(mydf, attr):
+            tmp = getattr(mydf, attr)
+            if tmp is None:
+                print("%s is None" % (attr))
+                continue
+            tmp1, tmp2 = flatten_aoR_holder(tmp)
+            res_int.append(tmp1)
+            res_float.append(tmp2)
+
+    res_int   = np.concatenate(res_int)
+    res_float = np.concatenate(res_float)
+
+    print("rank = ", rank, res_int.shape, res_float.shape)
+    res_int.tofile(filename + "_int.dat")
+    res_float.tofile(filename + "_float.dat")
+
+
+############## MPI version of PBC_ISDF_Info_Quad ##############
+
+class PBC_ISDF_Info_Quad_MPI(isdf_local.PBC_ISDF_Info_Quad):
+    ''' Interpolative separable density fitting (ISDF) for periodic systems, MPI-parallel version.
+
+    Locality of the atomic orbitals is exploited.
+
+    k-point sampling is not supported by this class; use PBC_ISDF_Info_Quad_K_MPI instead.
+
+    '''
+
+    # Quad stands for quadratic scaling
+
+    def __init__(self, mol:Cell,
+                 kmesh             = None,
+                 verbose           = None,
+                 rela_cutoff_QRCP  = None,
+                 aoR_cutoff        = 1e-8,
+                 limited_memory    = False,
+                 build_K_bunchsize = None):
+
+        super().__init__(mol, True, kmesh, verbose, rela_cutoff_QRCP, aoR_cutoff, True,
+                         use_occ_RI_K     = False,
+                         limited_memory   = limited_memory,
+                         build_K_bunchsize = build_K_bunchsize)
+        self.use_mpi = True
+        assert self.use_aft_ao == False
+
+    dump_attributes = dump_attributes
+    dump_aoR        = dump_aoR
+
+###############################################################
+
+############## MPI version of PBC_ISDF_Info_Quad_K ##############
+
+class PBC_ISDF_Info_Quad_K_MPI(isdf_local_k.PBC_ISDF_Info_Quad_K):
+    ''' Interpolative separable density fitting (ISDF) for periodic systems, MPI-parallel version.
+
+    Locality of the atomic orbitals is exploited.
+
+    k-point sampling is supported through the kmesh argument.
+ + ''' + + # Quad stands for quadratic scaling + + def __init__(self, mol:Cell, + kmesh = None, + verbose = None, + rela_cutoff_QRCP = None, + aoR_cutoff = 1e-8, + limited_memory = False, + build_K_bunchsize = None): + + super().__init__(mol, True, kmesh, verbose, rela_cutoff_QRCP, aoR_cutoff, True, + # use_occ_RI_K = False, + limited_memory = limited_memory, + build_K_bunchsize = build_K_bunchsize) + self.use_mpi = True + assert self.use_aft_ao == False + + dump_attributes = dump_attributes + dump_aoR = dump_aoR + +################################################################# + +if __name__ == '__main__': + + C = 15 + from pyscf.lib.parameters import BOHR + from isdf_tools_cell import build_supercell, build_supercell_with_partition + + verbose = 6 + if rank != 0: + verbose = 0 + + prim_a = np.array( + [[14.572056092/2, 0.000000000, 0.000000000], + [0.000000000, 14.572056092/2, 0.000000000], + [0.000000000, 0.000000000, 6.010273939],]) * BOHR + atm = [ +['Cu1', (1.927800, 1.927800, 1.590250)], +['O1', (1.927800, 0.000000, 1.590250)], +['O1', (0.000000, 1.927800, 1.590250)], +['Ca', (0.000000, 0.000000, 0.000000)], + ] + from pyscf.gto.basis import parse_nwchem + fbas="basis2.dat" ## NOTE: you should copy it from examples/isdf to run this scripts + atms = ['O', 'Cu', "Ca"] + basis = {atm:parse_nwchem.load(fbas, atm) for atm in atms} + pseudo = {'Cu1': 'gth-pbe-q19', 'Cu2': 'gth-pbe-q19', 'O1': 'gth-pbe', 'Ca': 'gth-pbe'} + ke_cutoff = 128 + prim_cell = build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=ke_cutoff, basis=basis, pseudo=pseudo) + prim_mesh = prim_cell.mesh + KE_CUTOFF = 128 + + prim_mesh = prim_cell.mesh + prim_partition = [[0], [1], [2], [3]] + + Ls = [2, 2, 1] + Ls = np.array(Ls, dtype=np.int32) + mesh = [Ls[0] * prim_mesh[0], Ls[1] * prim_mesh[1], Ls[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell, group_partition = build_supercell_with_partition(atm, prim_a, mesh=mesh, + Ls=Ls, + basis=basis, pseudo=pseudo, + partition=prim_partition, ke_cutoff=KE_CUTOFF, verbose=verbose) + if rank == 0: + print("group_partition = ", group_partition) + + pbc_isdf_info = PBC_ISDF_Info_Quad_MPI(cell, aoR_cutoff=1e-8, verbose=verbose, limited_memory=True, build_K_bunchsize=16) + pbc_isdf_info.build_IP_local(c=C, m=5, group=group_partition) + pbc_isdf_info.Ls = Ls + pbc_isdf_info.build_auxiliary_Coulomb(debug=True) + + from pyscf.pbc import scf + + if comm_size > 1: + comm.Barrier() + + mf = scf.RHF(cell) + mf = scf.addons.smearing_(mf, sigma=0.2, method='fermi') + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 16 + mf.conv_tol = 0.0 + + dm = mf.init_guess_by_atom() + + if comm_size > 1: + dm = bcast(dm, root=0) + + mf.kernel(dm) + + comm.Barrier() \ No newline at end of file diff --git a/pyscf/isdf/isdf_local_jk.py b/pyscf/isdf/isdf_local_jk.py new file mode 100644 index 000000000..c414df9e1 --- /dev/null +++ b/pyscf/isdf/isdf_local_jk.py @@ -0,0 +1,2112 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# + +############ sys module ############ + +import copy, sys +import ctypes +import numpy as np + +############ pyscf module ############ + +from pyscf import lib +from pyscf.lib import logger +from pyscf.pbc import tools +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point +from pyscf.pbc.df.df_jk import _ewald_exxdiv_for_G0 +from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks +libisdf = lib.load_library('libisdf') + +############ isdf utils ############ + +from pyscf.isdf.isdf_jk import _benchmark_time +from pyscf.isdf._isdf_local_K_direct import _isdf_get_K_direct_kernel_1 +import pyscf.isdf.isdf_tools_linearop as lib_isdf + +############ GLOBAL PARAMETER ############ + +J_MAX_GRID_BUNCHSIZE = 8192 + +################################################## +# +# only Gamma Point +# +################################################## + +### ls = linear scaling + +def _half_J(mydf, dm, use_mpi=False, + first_pass = None, + short_range = False): + + if use_mpi: + assert mydf.direct == True + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce + size = comm.Get_size() + + ######### prepare the parameter ######### + + assert first_pass in [None, "only_dd", "only_cc", "exclude_cc", "all"] + + if first_pass is None: + first_pass = "all" + + first_pass_all = first_pass == "all" + first_pass_has_dd = first_pass in ["all", "only_dd", "exclude_cc"] + first_pass_has_cc = first_pass in ["all", "only_cc"] + first_pass_has_cd = first_pass in ["all", "exclude_cc"] + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(mesh) + + aoR = mydf.aoR + assert isinstance(aoR, list) + naux = mydf.naux + + #### step 0. allocate buffer + + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoR if aoR_holder is not None]) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + ngrids_local = np.sum([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + + density_R = np.zeros((ngrid,), dtype=np.float64) + + dm_buf = np.zeros((max_nao_involved, max_nao_involved), dtype=np.float64) + max_col_buf = min(max_ngrid_involved, J_MAX_GRID_BUNCHSIZE) + ddot_buf = np.zeros((max_nao_involved, max_col_buf), dtype=np.float64) + + fn_multiplysum = getattr(libisdf, "_fn_J_dmultiplysum", None) + assert fn_multiplysum is not None + + ##### get the involved C function ##### + + fn_extract_dm = getattr(libisdf, "_extract_dm_involved_ao", None) + assert fn_extract_dm is not None + + fn_extract_dm2 = getattr(libisdf, "_extract_dm_involved_ao_RS", None) + assert fn_extract_dm is not None + + fn_packadd_dm = getattr(libisdf, "_packadd_local_dm", None) + assert fn_packadd_dm is not None + + #### step 1. 
get density value on real space grid and IPs + + group = mydf.group + ngroup = len(group) + + density_R_tmp = None + + density_R_tmp_buf = np.zeros((max_ngrid_involved,), dtype=np.float64) + + def _get_rhoR( + bra_aoR, + bra_ao_involved, + ket_aoR, + ket_ao_involved, + bra_type, + ket_type + ): + + nbra_ao = bra_aoR.shape[0] + nket_ao = ket_aoR.shape[0] + if bra_type == ket_type: + dm_now = np.ndarray((nbra_ao, nbra_ao), buffer=dm_buf) + fn_extract_dm( + dm.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao), + dm_now.ctypes.data_as(ctypes.c_void_p), + bra_ao_involved.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbra_ao), + ) + + # _density_R_tmp = np.zeros((ket_aoR.shape[1],), dtype=np.float64) + _density_R_tmp = np.ndarray((ket_aoR.shape[1],), buffer=density_R_tmp_buf) + + for p0, p1 in lib.prange(0, ket_aoR.shape[1], J_MAX_GRID_BUNCHSIZE): + ddot_res = np.ndarray((nbra_ao, p1-p0), buffer=ddot_buf) + lib.ddot(dm_now, ket_aoR[:,p0:p1], c=ddot_res) + _res_tmp = np.ndarray((p1-p0,), + dtype =_density_R_tmp.dtype, + buffer=_density_R_tmp, + offset=p0*_density_R_tmp.dtype.itemsize) + fn_multiplysum( + _res_tmp.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbra_ao), + ctypes.c_int(p1-p0), + bra_aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bra_aoR.shape[0]), + ctypes.c_int(bra_aoR.shape[1]), + ctypes.c_int(0), + ctypes.c_int(p0), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbra_ao), + ctypes.c_int(p1-p0), + ctypes.c_int(0), + ctypes.c_int(0)) + return _density_R_tmp + else: + dm_now = np.ndarray((nbra_ao, nket_ao), buffer=dm_buf) + fn_extract_dm2( + dm.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao), + dm_now.ctypes.data_as(ctypes.c_void_p), + bra_ao_involved.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bra_ao_involved.shape[0]), + ket_ao_involved.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ket_ao_involved.shape[0]), + ) + # _density_R_tmp = np.zeros((ket_aoR.shape[1],), dtype=np.float64) + _density_R_tmp = np.ndarray((ket_aoR.shape[1],), buffer=density_R_tmp_buf) + + for p0, p1 in lib.prange(0, ket_aoR.shape[1], J_MAX_GRID_BUNCHSIZE): + ddot_res = np.ndarray((nbra_ao, p1-p0), buffer=ddot_buf) + lib.ddot(dm_now, ket_aoR[:,p0:p1], c=ddot_res) + _res_tmp = np.ndarray((p1-p0,), + dtype =_density_R_tmp.dtype, + buffer=_density_R_tmp, + offset=p0*_density_R_tmp.dtype.itemsize) + fn_multiplysum( + _res_tmp.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbra_ao), + ctypes.c_int(p1-p0), + bra_aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bra_aoR.shape[0]), + ctypes.c_int(bra_aoR.shape[1]), + ctypes.c_int(0), + ctypes.c_int(p0), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbra_ao), + ctypes.c_int(p1-p0), + ctypes.c_int(0), + ctypes.c_int(0)) + + return _density_R_tmp * 2.0 + + for atm_id, aoR_holder in enumerate(aoR): + + if aoR_holder is None: + continue + + if use_mpi: + if atm_id % comm_size != rank: + continue + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + global_gridID_begin = aoR_holder.global_gridID_begin + nCompact = aoR_holder.nCompact + + if first_pass_all: + density_R_tmp = _get_rhoR( + aoR_holder.aoR, + aoR_holder.ao_involved, + aoR_holder.aoR, + aoR_holder.ao_involved, + "all", + "all" + ) + + density_R[global_gridID_begin:global_gridID_begin+ngrids_now] = density_R_tmp + else: + + if first_pass_has_cc: + density_R_tmp = _get_rhoR( + aoR_holder.aoR[:nCompact,:], + aoR_holder.ao_involved[:nCompact], + aoR_holder.aoR[:nCompact,:], + aoR_holder.ao_involved[:nCompact], + "compact", + "compact" + ) 
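+                    # The density is assembled blockwise over the AO partition:
+                    #   rho_xy(r) = sum_{mu in x, nu in y} D_{mu nu} phi_mu(r) phi_nu(r)
+                    # with x, y in {compact, diffuse}. _get_rhoR already doubles
+                    # the mixed (bra != ket) block to cover both D_cd and D_dc,
+                    # so the cc contribution above and the dd/cd contributions
+                    # below are each accumulated exactly once.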
+ + density_R[global_gridID_begin:global_gridID_begin+ngrids_now] += density_R_tmp + + if first_pass_has_dd: + density_R_tmp = _get_rhoR( + aoR_holder.aoR[nCompact:,:], + aoR_holder.ao_involved[nCompact:], + aoR_holder.aoR[nCompact:,:], + aoR_holder.ao_involved[nCompact:], + "diffuse", + "diffuse" + ) + + density_R[global_gridID_begin:global_gridID_begin+ngrids_now] += density_R_tmp + + if first_pass_has_cd: + density_R_tmp = _get_rhoR( + aoR_holder.aoR[:nCompact,:], + aoR_holder.ao_involved[:nCompact], + aoR_holder.aoR[nCompact:,:], + aoR_holder.ao_involved[nCompact:], + "compact", + "diffuse" + ) + density_R[global_gridID_begin:global_gridID_begin+ngrids_now] += density_R_tmp + + # assert local_grid_loc == ngrids_local + + if use_mpi: + density_R = reduce(density_R, root=0) + else: + assert ngrids_local == np.prod(mesh) + + grid_ID_ordered = mydf.grid_ID_ordered + + if (use_mpi and rank == 0) or (use_mpi == False): + density_R_original = np.zeros_like(density_R) + + fn_order = getattr(libisdf, "_Reorder_Grid_to_Original_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(density_R.size), + mydf.grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + density_R.ctypes.data_as(ctypes.c_void_p), + density_R_original.ctypes.data_as(ctypes.c_void_p), + ) + + density_R = density_R_original.copy() + + J = None + + if (use_mpi and rank == 0) or (use_mpi == False): + + fn_J = getattr(libisdf, "_construct_J", None) + assert(fn_J is not None) + + J = np.zeros_like(density_R) + + if short_range: + coulG = mydf.coulG_SR + else: + coulG = mydf.coulG + + fn_J( + mesh.ctypes.data_as(ctypes.c_void_p), + density_R.ctypes.data_as(ctypes.c_void_p), + coulG.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + ) + + J_ordered = np.zeros_like(J) + + fn_order = getattr(libisdf, "_Original_Grid_to_Reorder_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(J.size), + mydf.grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + J_ordered.ctypes.data_as(ctypes.c_void_p), + ) + + J = J_ordered.copy() + + if use_mpi: + J = bcast(J, root=0) + + t2 = (logger.process_clock(), logger.perf_counter()) + + del dm_buf, ddot_buf, density_R + del density_R_tmp + + _benchmark_time(t1, t2, "half_J", mydf) + + return J + +def _contract_j_dm_ls(mydf, dm, + use_mpi = False, + first_pass = None, + second_pass = None, + short_range = False): + + if use_mpi: + assert mydf.direct == True + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce + size = comm.Get_size() + + ###### Prepocess parameter for RS ###### + + assert first_pass in [None, "only_dd", "only_cc", "exclude_cc", "all"] + assert second_pass in [None, "only_dd", "only_cc", "exclude_cc", "all"] + + if short_range: + assert first_pass == "only_dd" + assert second_pass == "only_dd" + + if first_pass is None: + first_pass = "all" + if second_pass is None: + second_pass = "all" + + second_pass_all = second_pass == "all" + second_pass_has_dd = second_pass in ["all", "only_dd", "exclude_cc"] + second_pass_has_cc = second_pass in ["all", "only_cc"] + second_pass_has_cd = second_pass in ["all", "exclude_cc"] + + ####### judge whether to call the original one ####### + + if isinstance(mydf.aoRg, np.ndarray): + has_aoR = False + if hasattr(mydf, "aoR") and mydf.aoR is not None: + assert isinstance(mydf.aoR, np.ndarray) + has_aoR = True + ### call the original get_j ### + from isdf_jk import _contract_j_dm_fast, _contract_j_dm_wo_robust_fitting + if has_aoR: + return 
_contract_j_dm_fast(mydf, dm, use_mpi=use_mpi) + else: + return _contract_j_dm_wo_robust_fitting(mydf, dm, use_mpi=use_mpi) + + ####### Start the calculation ######## + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(mesh) + + aoR = mydf.aoR + assert isinstance(aoR, list) + naux = mydf.naux + + #### step 0. allocate buffer + + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoR if aoR_holder is not None]) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + ngrids_local = np.sum([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + + density_R = np.zeros((ngrid,), dtype=np.float64) + + # max_dim_buf = max(max_ngrid_involved, max_nao_involved) + max_dim_buf = max_nao_involved + ddot_buf = np.zeros((max_dim_buf, max_dim_buf), dtype=np.float64) + aoR_buf1 = np.zeros((max_nao_involved, max_ngrid_involved), dtype=np.float64) + + ##### get the involved C function ##### + + fn_packadd_dm = getattr(libisdf, "_packadd_local_dm", None) + assert fn_packadd_dm is not None + + fn_packadd_dm2 = getattr(libisdf, "_packadd_local_RS", None) + assert fn_packadd_dm2 is not None + + #### step 1 2. get density value on real space grid and IPs + + group = mydf.group + ngroup = len(group) + + J = _half_J(mydf, dm, use_mpi, first_pass, short_range) + + #### step 3. get J + + J_Res = np.zeros((nao, nao), dtype=np.float64) + + ordered_ao_ind = np.arange(nao) + + def _get_j_pass2_ls(_aoR_bra, + _ao_involved_bra, + _aoR_ket, + _ao_involved_ket, + _bra_type, + _ket_type, + _potential, + _Res): + + nao_bra = _aoR_bra.shape[0] + nao_ket = _aoR_ket.shape[0] + + if _bra_type == _ket_type: + + aoR_J_res = np.ndarray(_aoR_ket.shape, buffer=aoR_buf1) + lib_isdf.d_ij_j_ij(_aoR_ket, _potential, out=aoR_J_res) + ddot_res = np.ndarray((nao_ket, nao_ket), buffer=ddot_buf) + lib.ddot(_aoR_ket, aoR_J_res.T, c=ddot_res) + + if nao_ket == nao and np.allclose(_ao_involved_ket, ordered_ao_ind): + _Res += ddot_res + else: + fn_packadd_dm( + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_ket), + _ao_involved_ket.ctypes.data_as(ctypes.c_void_p), + _Res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(_Res.shape[0]) + ) + else: + + ### J_Res = ddot_res + ddot_res.T + + aoR_J_res = np.ndarray(_aoR_ket.shape, buffer=aoR_buf1) + lib_isdf.d_ij_j_ij(_aoR_ket, _potential, out=aoR_J_res) + ddot_res = np.ndarray((nao_bra, nao_ket), buffer=ddot_buf) + lib.ddot(_aoR_bra, aoR_J_res.T, c=ddot_res) + + fn_packadd_dm2( + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_bra), + _ao_involved_bra.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_ket), + _ao_involved_ket.ctypes.data_as(ctypes.c_void_p), + _Res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(_Res.shape[0]) + ) + + + for atm_id, aoR_holder in enumerate(aoR): + + if aoR_holder is None: + continue + + if use_mpi: + if atm_id % comm_size != rank: + continue + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.nao_involved + nao_compact = aoR_holder.nCompact + nao_diffuse = nao_involved - nao_compact + + global_gridID_begin = aoR_holder.global_gridID_begin + + J_tmp = J[global_gridID_begin:global_gridID_begin+ngrids_now] + + if second_pass_all: ### with RS case ### + + _get_j_pass2_ls( + aoR_holder.aoR, + aoR_holder.ao_involved, + 
aoR_holder.aoR,
+ aoR_holder.ao_involved,
+ "all",
+ "all",
+ J_tmp,
+ J_Res
+ )
+
+ else:
+
+ if second_pass_has_cc:
+ _get_j_pass2_ls(
+ aoR_holder.aoR[:nao_compact,:],
+ aoR_holder.ao_involved[:nao_compact],
+ aoR_holder.aoR[:nao_compact,:],
+ aoR_holder.ao_involved[:nao_compact],
+ "compact",
+ "compact",
+ J_tmp,
+ J_Res
+ )
+
+ if second_pass_has_dd:
+ _get_j_pass2_ls(
+ aoR_holder.aoR[nao_compact:,:],
+ aoR_holder.ao_involved[nao_compact:],
+ aoR_holder.aoR[nao_compact:,:],
+ aoR_holder.ao_involved[nao_compact:],
+ "diffuse",
+ "diffuse",
+ J_tmp,
+ J_Res
+ )
+
+ if second_pass_has_cd:
+ _get_j_pass2_ls(
+ aoR_holder.aoR[:nao_compact,:],
+ aoR_holder.ao_involved[:nao_compact],
+ aoR_holder.aoR[nao_compact:,:],
+ aoR_holder.ao_involved[nao_compact:],
+ "compact",
+ "diffuse",
+ J_tmp,
+ J_Res
+ )
+
+ J = J_Res
+
+ if use_mpi:
+ J = reduce(J, root=0)
+
+ t2 = (logger.process_clock(), logger.perf_counter())
+
+ _benchmark_time(t1, t2, "_contract_j_dm_ls", mydf)
+
+ ######### delete the buffer #########
+
+ del ddot_buf
+ del aoR_buf1
+
+ return J * ngrid / vol
+
+def _contract_j_dm_wo_robust_fitting(mydf, dm, use_mpi=False):
+
+ if use_mpi:
+ assert mydf.direct == True
+ from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce
+ size = comm.Get_size()
+
+ ####### judge whether to call the original one #######
+
+ if isinstance(mydf.aoRg, np.ndarray):
+ from pyscf.isdf.isdf_jk import _contract_j_dm_wo_robust_fitting
+ return _contract_j_dm_wo_robust_fitting(mydf, dm, use_mpi=use_mpi)
+
+ ######## start the calculation ########
+
+ t1 = (logger.process_clock(), logger.perf_counter())
+
+ if len(dm.shape) == 3:
+ assert dm.shape[0] == 1
+ dm = dm[0]
+
+ nao = dm.shape[0]
+ cell = mydf.cell
+ assert cell.nao == nao
+ vol = cell.vol
+ mesh = np.array(cell.mesh, dtype=np.int32)
+ ngrid = np.prod(mesh)
+
+ aoRg = mydf.aoRg
+ assert isinstance(aoRg, list)
+ naux = mydf.naux
+ W = mydf.W
+
+ #### step 0. allocate buffer
+
+ max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoRg if aoR_holder is not None])
+ max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoRg if aoR_holder is not None])
+ ngrids_local = np.sum([aoR_holder.aoR.shape[1] for aoR_holder in aoRg if aoR_holder is not None])
+
+ density_Rg = np.zeros((naux,), dtype=np.float64)
+
+ dm_buf = np.zeros((max_nao_involved, max_nao_involved), dtype=np.float64)
+ max_dim_buf = max(max_ngrid_involved, max_nao_involved)
+ ddot_buf = np.zeros((max_dim_buf, max_dim_buf), dtype=np.float64)
+ aoR_buf1 = np.zeros((max_nao_involved, max_ngrid_involved), dtype=np.float64)
+
+ ##### get the involved C function #####
+
+ fn_extract_dm = getattr(libisdf, "_extract_dm_involved_ao", None)
+ assert fn_extract_dm is not None
+
+ fn_packadd_dm = getattr(libisdf, "_packadd_local_dm", None)
+ assert fn_packadd_dm is not None
+
+ #### step 1.
get density value on real space grid and IPs
+
+ group = mydf.group
+ ngroup = len(group)
+
+ density_R_tmp = None
+ ordered_ao_ind = np.arange(nao)
+
+ for atm_id, aoR_holder in enumerate(aoRg):
+
+ if aoR_holder is None:
+ continue
+
+ if use_mpi:
+ if atm_id % comm_size != rank:
+ continue
+
+ ngrids_now = aoR_holder.aoR.shape[1]
+ nao_involved = aoR_holder.aoR.shape[0]
+
+ if nao_involved < nao or (nao_involved == nao and not np.allclose(aoR_holder.ao_involved, ordered_ao_ind)):
+ fn_extract_dm(
+ dm.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_int(nao),
+ dm_buf.ctypes.data_as(ctypes.c_void_p),
+ aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_int(nao_involved),
+ )
+ else:
+ dm_buf.ravel()[:] = dm.ravel()
+
+ dm_now = np.ndarray((nao_involved, nao_involved), buffer=dm_buf)
+
+ ddot_res = np.ndarray((nao_involved, ngrids_now), buffer=ddot_buf)
+
+ lib.ddot(dm_now, aoR_holder.aoR, c=ddot_res)
+ density_R_tmp = lib.multiply_sum_isdf(aoR_holder.aoR, ddot_res)
+
+ global_gridID_begin = aoR_holder.global_gridID_begin
+
+ density_Rg[global_gridID_begin:global_gridID_begin+ngrids_now] = density_R_tmp
+
+ if use_mpi == False:
+ assert ngrids_local == naux
+
+ if use_mpi:
+ density_Rg = reduce(density_Rg, root=0)
+
+ #### step 3. get J
+
+ J = np.asarray(lib.dot(W, density_Rg.reshape(-1,1)), order='C').reshape(-1)
+
+ if use_mpi:
+ J = bcast(J, root=0)
+
+ J_Res = np.zeros((nao, nao), dtype=np.float64)
+
+ for atm_id, aoR_holder in enumerate(aoRg):
+
+ if aoR_holder is None:
+ continue
+
+ if use_mpi:
+ if atm_id % comm_size != rank:
+ continue
+
+ ngrids_now = aoR_holder.aoR.shape[1]
+ nao_involved = aoR_holder.aoR.shape[0]
+
+ global_gridID_begin = aoR_holder.global_gridID_begin
+
+ J_tmp = J[global_gridID_begin:global_gridID_begin+ngrids_now]
+
+ aoR_J_res = np.ndarray(aoR_holder.aoR.shape, buffer=aoR_buf1)
+ lib_isdf.d_ij_j_ij(aoR_holder.aoR, J_tmp, out=aoR_J_res)
+ ddot_res = np.ndarray((nao_involved, nao_involved), buffer=ddot_buf)
+ lib.ddot(aoR_holder.aoR, aoR_J_res.T, c=ddot_res)
+
+ if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind):
+ J_Res += ddot_res
+ else:
+ fn_packadd_dm(
+ ddot_res.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_int(nao_involved),
+ aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p),
+ J_Res.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_int(nao)
+ )
+
+ J = J_Res
+
+ if use_mpi:
+ J = reduce(J, root=0)
+
+ t2 = (logger.process_clock(), logger.perf_counter())
+
+ _benchmark_time(t1, t2, "_contract_j_dm_wo_robust_fitting", mydf)
+
+ ######### delete the buffer #########
+
+ del dm_buf, ddot_buf, density_Rg
+ del density_R_tmp
+ del aoR_buf1
+
+ return J * ngrid / vol
+
+############# quadratic scaling (not cubic!)
############# + +def __get_DensityMatrixonRgAO_qradratic(mydf, dm, + bra_aoR_holder, + bra_type = None, + _res:np.ndarray = None, + verbose = 1): + + assert bra_type in [None, "all", "compact", "diffuse"] + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] <= 4 + # dm = dm[0] + else: + dm = dm.reshape(1, *dm.shape) + + assert dm.shape[1] == dm.shape[2] + nset, nao = dm.shape[0], dm.shape[1] + + ngrid_bra = np.sum([aoR_holder.aoR.shape[1] for aoR_holder in bra_aoR_holder if aoR_holder is not None]) + + max_ngrid_bra = np.max([aoR_holder.aoR.shape[1] for aoR_holder in bra_aoR_holder if aoR_holder is not None]) + max_ao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in bra_aoR_holder if aoR_holder is not None]) + + if _res is None: + res = np.zeros((nset, ngrid_bra, nao), dtype=np.float64) + else: + res = np.ndarray((nset, ngrid_bra, nao), buffer=_res, dtype=np.float64) + + ### allocate buf ### + + offset = 0 + ddot_buf = np.ndarray((max_ngrid_bra, nao), buffer=mydf.build_k_buf, offset=offset) + offset += ddot_buf.size * ddot_buf.dtype.itemsize + dm_pack_buf = np.ndarray((dm.shape[1], dm.shape[2]), buffer=mydf.build_k_buf, offset=offset) + + ### get pack fn ### + + fn_packrow = getattr(libisdf, "_buildK_packrow", None) + assert fn_packrow is not None + fn_packcol = getattr(libisdf, "_buildK_packcol", None) + assert fn_packcol is not None + + ### perform aoR_bra.T * dm + + ordered_ao_ind = np.arange(nao) + grid_shift = None + ngrid_loc = 0 + + for aoR_holder in bra_aoR_holder: + + if aoR_holder is None: + continue + + ngrid_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + nao_compact = aoR_holder.nCompact + + ao_begin_indx = 0 + ao_end_indx = nao_involved + if bra_type == "compact": + ao_end_indx = nao_compact + elif bra_type == "diffuse": + ao_begin_indx = nao_compact + + nao_at_work = ao_end_indx - ao_begin_indx + + for iset in range(nset): + if (nao_at_work) == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + dm_packed = dm[iset] + else: + dm_packed = np.ndarray((nao_at_work, nao), buffer=dm_pack_buf) + fn_packrow( + dm_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_at_work), + ctypes.c_int(nao), + dm[iset].ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao), + ctypes.c_int(nao), + aoR_holder.ao_involved[ao_begin_indx:ao_end_indx].ctypes.data_as(ctypes.c_void_p) + ) + + ddot_res = np.ndarray((ngrid_now, nao), buffer=ddot_buf) + lib.ddot(aoR_holder.aoR[ao_begin_indx:ao_end_indx,:].T, dm_packed, c=ddot_res) + grid_loc_begin = aoR_holder.global_gridID_begin + + if grid_shift is None: + grid_shift = grid_loc_begin + else: + assert grid_loc_begin>=grid_shift + + res[iset, grid_loc_begin-grid_shift:grid_loc_begin-grid_shift+ngrid_now, :] = ddot_res + + t2 = (logger.process_clock(), logger.perf_counter()) + return res + +def _contract_k_dm_quadratic(mydf, dm, with_robust_fitting=True, use_mpi=False): + + if use_mpi: + raise NotImplementedError("MPI is not supported yet.") + + ####### judge whether to call the original one ####### + + if isinstance(mydf.aoRg, np.ndarray): + from isdf_jk import _contract_k_dm, _contract_k_dm_wo_robust_fitting + if mydf.aoR is None: + return _contract_k_dm_wo_robust_fitting(mydf, dm, False, use_mpi=use_mpi) + else: + return _contract_k_dm(mydf, dm, with_robust_fitting, use_mpi=use_mpi) + + ######## start the calculation ######## + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + 
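+ # [editorial sketch] Dense-matrix view of the quadratic-scaling exchange
+ # assembled below, with A = aoRg viewed as an (nao, naux) matrix of AO values
+ # on the interpolation points and * an elementwise product (illustration
+ # only; the real code works on packed per-atom blocks):
+ #
+ #   D_RgAO = A.T @ dm                  # step 1, (naux, nao)
+ #   D_RgRg = D_RgAO @ A                # (naux, naux)
+ #   K      = A @ (W * D_RgRg) @ A.T    # W part, step 2
+ #
+ # With robust fitting, the analogous V_R terms over the full grid are added
+ # in step 3 and the W part enters with a minus sign.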
nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(mesh) + + aoRg = mydf.aoRg + assert isinstance(aoRg, list) + aoR = mydf.aoR + assert isinstance(aoR, list) + + naux = mydf.naux + nao = cell.nao + + #### step 0. allocate buffer + + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoR if aoR_holder is not None]) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + max_nIP_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoRg if aoR_holder is not None]) + + mydf.allocate_k_buffer() + + # ddot_res_buf = np.zeros((naux, max_nao_involved), dtype=np.float64) + ddot_res_buf = mydf.build_k_buf + + ##### get the involved C function ##### + + fn_packadd_row = getattr(libisdf, "_buildK_packaddrow", None) + assert fn_packadd_row is not None + fn_packadd_col = getattr(libisdf, "_buildK_packaddcol", None) + assert fn_packadd_col is not None + + fn_packcol1 = getattr(libisdf, "_buildK_packcol", None) + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol1 is not None + assert fn_packcol2 is not None + + #### step 1. get density matrix value on real space grid and IPs + + Density_RgAO = __get_DensityMatrixonRgAO_qradratic(mydf, dm, aoRg, "all", mydf.Density_RgAO_buf) + Density_RgAO = Density_RgAO[0] + + #### step 2. get K, those part which W is involved + + W = mydf.W + assert W is not None + assert isinstance(W, np.ndarray) + + K1 = np.zeros((naux, nao), dtype=np.float64) + + ####### buf for the first loop ####### + + offset = 0 + ddot_buf1 = np.ndarray((naux, max_nIP_involved), buffer=ddot_res_buf, offset=offset, dtype=np.float64) + offset = ddot_buf1.size * ddot_res_buf.dtype.itemsize + pack_buf = np.ndarray((naux, max_nao_involved), buffer=ddot_res_buf, offset=offset, dtype=np.float64) + offset+= pack_buf.size * pack_buf.dtype.itemsize + ddot_buf2 = np.ndarray((naux, max(max_nIP_involved, max_nao_involved)), buffer=ddot_res_buf, offset=offset, dtype=np.float64) + + ordered_ao_ind = np.arange(nao) + + ### TODO: consider MPI + + nIP_loc = 0 + for aoRg_holder in aoRg: + + if aoRg_holder is None: + continue + + nIP_now = aoRg_holder.aoR.shape[1] + nao_involved = aoRg_holder.aoR.shape[0] + + #### pack the density matrix #### + + if nao_involved == nao and np.allclose(aoRg_holder.ao_involved, ordered_ao_ind): + Density_RgAO_packed = Density_RgAO + else: + # Density_RgAO_packed = Density_RgAO[:, aoRg_holder.ao_involved] + Density_RgAO_packed = np.ndarray((naux, nao_involved), buffer=pack_buf) + fn_packcol1( + Density_RgAO_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(nao_involved), + Density_RgAO.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(nao), + aoRg_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + # W_tmp = Density_RgRg[:, nIP_loc:nIP_loc+nIP_now] * W[:, nIP_loc:nIP_loc+nIP_now] + + ddot_res1 = np.ndarray((naux, nIP_now), buffer=ddot_buf1) + lib.ddot(Density_RgAO_packed, aoRg_holder.aoR, c=ddot_res1) + Density_RgRg = ddot_res1 + W_packed = np.ndarray((naux, nIP_now), buffer=ddot_buf2) + fn_packcol2( + W_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(nIP_now), + W.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(naux), + ctypes.c_int(nIP_loc), + ctypes.c_int(nIP_loc+nIP_now) + ) + lib_isdf.cwise_mul(W_packed, Density_RgRg, out=Density_RgRg) + W_tmp = Density_RgRg + + # ddot + + ddot_res = 
np.ndarray((naux, nao_involved), buffer=ddot_buf2) + lib.ddot(W_tmp, aoRg_holder.aoR.T, c=ddot_res) + + if nao_involved == nao and np.allclose(aoRg_holder.ao_involved, ordered_ao_ind): + K1 += ddot_res + else: + # K1[: , aoRg_holder.ao_involved] += ddot_res + fn_packadd_col( + K1.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K1.shape[0]), + ctypes.c_int(K1.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoRg_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + nIP_loc += nIP_now + # del W_tmp + assert nIP_loc == naux + + K = np.zeros((nao, nao), dtype=np.float64) + + nIP_loc = 0 + for aoRg_holder in aoRg: + + if aoRg_holder is None: + continue + + nIP_now = aoRg_holder.aoR.shape[1] + nao_involved = aoRg_holder.aoR.shape[0] + + K_tmp = K1[nIP_loc:nIP_loc+nIP_now, :] + + ddot_res = np.ndarray((nao_involved, nao), buffer=ddot_res_buf) + lib.ddot(aoRg_holder.aoR, K_tmp, c=ddot_res) + + if nao_involved == nao and np.allclose(aoRg_holder.ao_involved, ordered_ao_ind): + K += ddot_res + else: + # K[aoRg_holder.ao_involved, :] += ddot_res + fn_packadd_row( + K.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K.shape[0]), + ctypes.c_int(K.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoRg_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + nIP_loc += nIP_now + # del K_tmp + assert nIP_loc == naux + + #### step 3. get K, those part which W is not involved, with robust fitting + + if with_robust_fitting: + + K = -K + + ### calcualte those parts where V is involved + + V_R = mydf.V_R + assert V_R is not None + assert isinstance(V_R, np.ndarray) + + # lib_isdf.cwise_mul(V_R, Density_RgR, out=Density_RgR) + + K2 = K1 + K2.ravel()[:] = 0.0 + + # fn_packcol = getattr(libisdf, "_buildK_packcol2", None) + # assert fn_packcol is not None + + ddot_buf1 = np.ndarray((naux, max_nao_involved), buffer=ddot_res_buf) + offset = naux * max_nao_involved * ddot_res_buf.dtype.itemsize + V_tmp_buf = np.ndarray((naux, max_ngrid_involved), buffer=ddot_res_buf, offset=offset) + offset += V_tmp_buf.size * V_tmp_buf.dtype.itemsize + pack_buf = np.ndarray((naux, max_nao_involved), buffer=ddot_res_buf, offset=offset) + offset += pack_buf.size * pack_buf.dtype.itemsize + ddot_buf2 = np.ndarray((naux, max_ngrid_involved), buffer=ddot_res_buf, offset=offset) + + ngrid_loc = 0 + + for aoR_holder in aoR: + + if aoR_holder is None: + continue + + ngrid_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + #### pack the density matrix #### + + if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + Density_RgAO_packed = Density_RgAO + else: + # Density_RgAO_packed = Density_RgAO[:, aoR_holder.ao_involved] + Density_RgAO_packed = np.ndarray((naux, nao_involved), buffer=pack_buf) + fn_packcol1( + Density_RgAO_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(nao_involved), + Density_RgAO.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(nao), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + # V_tmp = Density_RgR[:, ngrid_loc:ngrid_loc+ngrid_now] * V_R[:, ngrid_loc:ngrid_loc+ngrid_now] + + ddot_res2 = np.ndarray((naux, ngrid_now), buffer=ddot_buf2) + lib.ddot(Density_RgAO_packed, aoR_holder.aoR, c=ddot_res2) + Density_RgR = ddot_res2 + V_packed = np.ndarray((naux, ngrid_now), buffer=V_tmp_buf) + fn_packcol2( + V_packed.ctypes.data_as(ctypes.c_void_p), + 
ctypes.c_int(naux), + ctypes.c_int(ngrid_now), + V_R.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(ngrid), + ctypes.c_int(ngrid_loc), + ctypes.c_int(ngrid_loc+ngrid_now) + ) + lib_isdf.cwise_mul(V_packed, Density_RgR, out=Density_RgR) + V_tmp = Density_RgR + + ddot_res = np.ndarray((naux, nao_involved), buffer=ddot_buf1) + lib.ddot(V_tmp, aoR_holder.aoR.T, c=ddot_res) + + if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + K2 += ddot_res + else: + # K2[: , aoR_holder.ao_involved] += ddot_res + fn_packadd_col( + K2.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K2.shape[0]), + ctypes.c_int(K2.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + ngrid_loc += ngrid_now + # del V_tmp + + assert ngrid_loc == ngrid + + K_add = np.zeros((nao, nao), dtype=np.float64) + + nIP_loc = 0 + for aoRg_holder in aoRg: + + if aoRg_holder is None: + continue + + nIP_now = aoRg_holder.aoR.shape[1] + nao_involved = aoRg_holder.aoR.shape[0] + + K_tmp = K2[nIP_loc:nIP_loc+nIP_now, :] # no need to pack, continguous anyway + + ddot_res = np.ndarray((nao_involved, nao), buffer=ddot_res_buf) + lib.ddot(aoRg_holder.aoR, K_tmp, c=ddot_res) + + if nao == nao_involved and np.allclose(aoRg_holder.ao_involved, ordered_ao_ind): + K_add += ddot_res + else: + # K_add[aoRg_holder.ao_involved, :] += ddot_res + fn_packadd_row( + K_add.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K_add.shape[0]), + ctypes.c_int(K_add.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoRg_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + nIP_loc += nIP_now + # del K_tmp + assert nIP_loc == naux + + K_add += K_add.T + + K += K_add + + ######### finally delete the buffer ######### + + del K1 + + t2 = (logger.process_clock(), logger.perf_counter()) + + # if mydf.verbose: + _benchmark_time(t1, t2, "_contract_k_dm_quadratic", mydf) + + return K * ngrid / vol + +def _contract_k_dm_quadratic_direct(mydf, dm, use_mpi=False): + + if use_mpi: + assert mydf.direct == True + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce + size = comm.Get_size() + + t1 = (logger.process_clock(), logger.perf_counter()) + + if dm.ndim == 3: + assert dm.shape[0] <= 4 + # dm = dm[0] + else: + dm = dm.reshape(1, *dm.shape) + + aoR = mydf.aoR + aoRg = mydf.aoRg + + max_nao_involved = mydf.max_nao_involved + max_ngrid_involved = mydf.max_ngrid_involved + max_nIP_involved = mydf.max_nIP_involved + maxsize_group_naux = mydf.maxsize_group_naux + + ####### preparing the data ####### + + nset, nao = dm.shape[0], dm.shape[1] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + mesh_int32 = mesh + ngrid = np.prod(mesh) + + aoRg = mydf.aoRg + assert isinstance(aoRg, list) + aoR = mydf.aoR + assert isinstance(aoR, list) + + naux = mydf.naux + nao = cell.nao + aux_basis = mydf.aux_basis + + grid_ordering = mydf.grid_ID_ordered + + if hasattr(mydf, "coulG") == False: + if mydf.omega is not None: + assert mydf.omega >= 0.0 + # mydf.coulG = tools.get_coulG(cell, mesh=mesh, omega=mydf.omega) + raise NotImplementedError("coulG is not implemented yet.") + + coulG = mydf.coulG + coulG_real = coulG.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1).copy() + + mydf.allocate_k_buffer(nset) + build_k_buf = mydf.build_k_buf + build_VW_buf = 
mydf.build_VW_in_k_buf + + group = mydf.group + assert len(group) == len(aux_basis) + + ######### allocate buffer ######### + + Density_RgAO_buf = mydf.Density_RgAO_buf + + nThread = lib.num_threads() + bufsize_per_thread = (coulG_real.shape[0] * 2 + np.prod(mesh)) + buf_build_V = np.ndarray((nThread, bufsize_per_thread), dtype=np.float64, buffer=build_VW_buf) + + offset_now = buf_build_V.size * buf_build_V.dtype.itemsize + + build_K_bunchsize = min(maxsize_group_naux, mydf._build_K_bunchsize) + + offset_build_now = 0 + offset_Density_RgR_buf = 0 + Density_RgR_buf = np.ndarray((build_K_bunchsize, ngrid), buffer=build_k_buf, offset=offset_build_now) + + offset_build_now += Density_RgR_buf.size * Density_RgR_buf.dtype.itemsize + offset_ddot_res_RgR_buf = offset_build_now + ddot_res_RgR_buf = np.ndarray((build_K_bunchsize, max_ngrid_involved), buffer=build_k_buf, offset=offset_ddot_res_RgR_buf) + + offset_build_now += ddot_res_RgR_buf.size * ddot_res_RgR_buf.dtype.itemsize + offset_K1_tmp1_buf = offset_build_now + K1_tmp1_buf = np.ndarray((maxsize_group_naux, nao), buffer=build_k_buf, offset=offset_K1_tmp1_buf) + + offset_build_now += K1_tmp1_buf.size * K1_tmp1_buf.dtype.itemsize + offset_K1_tmp1_ddot_res_buf = offset_build_now + K1_tmp1_ddot_res_buf = np.ndarray((maxsize_group_naux, nao), buffer=build_k_buf, offset=offset_K1_tmp1_ddot_res_buf) + + offset_build_now += K1_tmp1_ddot_res_buf.size * K1_tmp1_ddot_res_buf.dtype.itemsize + + offset_K1_final_ddot_buf = offset_build_now + K1_final_ddot_buf = np.ndarray((nao, nao), buffer=build_k_buf, offset=offset_K1_final_ddot_buf) + + ########### get involved C function ########### + + fn_packcol1 = getattr(libisdf, "_buildK_packcol", None) + assert fn_packcol1 is not None + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol2 is not None + fn_packadd_col = getattr(libisdf, "_buildK_packaddcol", None) + assert fn_packadd_col is not None + fn_packadd_row = getattr(libisdf, "_buildK_packaddrow", None) + assert fn_packadd_row is not None + + ordered_ao_ind = np.arange(nao) + + ######### begin work ######### + + K1 = np.zeros((nset, nao, nao), dtype=np.float64) # contribution from V matrix + K2 = np.zeros((nset, nao, nao), dtype=np.float64) # contribution from W matrix + + for group_id, atm_ids in enumerate(group): + + if use_mpi: + if group_id % comm_size != rank: + continue + + naux_tmp = 0 + aoRg_holders = [] + for atm_id in atm_ids: + naux_tmp += aoRg[atm_id].aoR.shape[1] + aoRg_holders.append(aoRg[atm_id]) + assert naux_tmp == aux_basis[group_id].shape[0] + + aux_basis_tmp = aux_basis[group_id] + + #### 1. build the involved DM_RgR #### + + Density_RgAO_tmp = np.ndarray((nset, naux_tmp, nao), buffer=Density_RgAO_buf) + offset_density_RgAO_buf = Density_RgAO_tmp.size * Density_RgAO_buf.dtype.itemsize + Density_RgAO_tmp.ravel()[:] = 0.0 + Density_RgAO_tmp = __get_DensityMatrixonRgAO_qradratic(mydf, dm, aoRg_holders, "all", Density_RgAO_tmp, verbose=mydf.verbose) + + #### 2. 
build the V matrix #### + + W_tmp = None + + for iset in range(nset): + + calculate_W_tmp = (iset == 0) + + _W_tmp = _isdf_get_K_direct_kernel_1( + mydf, coulG_real, + group_id, Density_RgAO_tmp[iset], + None, True, calculate_W_tmp, + ##### buffer ##### + buf_build_V, + build_VW_buf, + offset_now, + Density_RgR_buf, + Density_RgAO_buf, + offset_density_RgAO_buf, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + build_K_bunchsize, + ##### other info ##### + use_mpi=use_mpi, + ##### out ##### + K1_or_2=K1[iset]) + + if calculate_W_tmp: + W_tmp = _W_tmp.copy() + + _isdf_get_K_direct_kernel_1( + mydf, coulG_real, + group_id, Density_RgAO_tmp[iset], + W_tmp, False, False, + ##### buffer ##### + buf_build_V, + build_VW_buf, + offset_now, + Density_RgR_buf, + Density_RgAO_buf, + offset_density_RgAO_buf, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + build_K_bunchsize, + ##### other info ##### + use_mpi=use_mpi, + ##### out ##### + K1_or_2=K2[iset]) + + ######### finally delete the buffer ######### + + if use_mpi: + comm.Barrier() + + if use_mpi: + K1 = reduce(K1, root = 0) + K2 = reduce(K2, root = 0) + K = np.zeros_like(K1) + if rank == 0: + for iset in range(nset): + K[iset] = K1[iset] + K1[iset].T - K2[iset] + else: + K = None + K = bcast(K, root = 0) + else: + K = np.zeros_like(K1) + for iset in range(nset): + K[iset] = K1[iset] + K1[iset].T - K2[iset] + + del K1 + del K2 + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t1, t2, "_contract_k_dm_quadratic_direct", mydf) + + return K * ngrid / vol + +############# occ RI ############# + +def get_jk_occRI(mydf, dm, use_mpi=False, with_j=True, with_k=True): + + assert mydf.omega is None or mydf.omega == 0.0 + # assert with_j_occRI is False + + t1 = (logger.process_clock(), logger.perf_counter()) + t0 = t1 + + if mydf.direct: + raise NotImplementedError("get_jk_occRI does not support robust fitting or direct=True") + + if use_mpi: + raise NotImplementedError("get_jk_occRI does not support use_mpi=True") + + # print("dm.shape = ", dm.shape) + + if getattr(dm, 'mo_coeff', None) is not None: + mo_coeff = dm.mo_coeff + mo_occ = dm.mo_occ + else: + raise NotImplementedError("mo_coeff is not provided yet") + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + ##### fetch the basic info ##### + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(mesh) + + aoR = mydf.aoR + aoRg = mydf.aoRg + assert isinstance(aoR, list) + naux = mydf.naux + + weight = np.sqrt(cell.vol/ngrid) + + ######### weighted mo_coeff ######### + + occ_tol = mydf.occ_tol + nocc = np.count_nonzero(mo_occ > occ_tol) + occ_weight = np.sqrt(mo_occ[mo_occ > occ_tol]) + # print("occ_weight = ", occ_weight) + mo_coeff_full = mo_coeff.copy() + mo_coeff_original = mo_coeff[:,mo_occ > occ_tol].copy() + mo_coeff = mo_coeff[:,mo_occ > occ_tol] * occ_weight ## NOTE: it is a weighted mo_coeff + mo_coeff = mo_coeff.copy() ## NOTE: nonsense thing in python + assert mo_coeff.shape[1] == nocc + assert mo_coeff.shape[0] == nao + + # dm2 = np.dot(mo_coeff, mo_coeff.T) + # assert np.allclose(dm, dm2) + + # print("mo_coeff_original = ", mo_coeff_original[:,0]) + # print("mo_coeff = ", mo_coeff[:,0]) + + ####### determine whether to construct moR ####### + + construct_moR = with_j or (with_k and mydf.with_robust_fitting is True) + construct_dmRgRg = with_k + 
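+ # [editorial sketch] occ-RI-K works with occupied-MO values on the grid
+ # instead of the full AO density matrix: with the occupation-weighted C_occ,
+ # dm = C_occ @ C_occ.T, so the grid-space intermediates factorize, e.g.
+ #
+ #   moRg   = C_occ.T @ aoRg     # (nocc, naux)
+ #   dmRgRg = moRg.T @ moRg      # = aoRg.T @ dm @ aoRg
+ #
+ # The construct_* flags around this note only request the intermediates the
+ # chosen J/K path actually needs (dmRgR only for robust fitting).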
construct_dmRgR = with_k and mydf.with_robust_fitting is True + + #### step -2. allocate buffer + + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoR if aoR_holder is not None]) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + ngrids_local = np.sum([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + max_dim_buf = max(max_ngrid_involved, max_nao_involved) + max_nIP_involved = np.max([aoRg_holder.aoR.shape[1] for aoRg_holder in aoRg if aoRg_holder is not None]) + + mydf.deallocate_k_buffer() + + if hasattr(mydf, "moRg") is False: + mydf.moRg = np.zeros((nocc, naux), dtype=np.float64) + else: + if nocc != mydf.moRg.shape[0]: + mydf.moRg = np.zeros((nocc, naux), dtype=np.float64) + + if hasattr(mydf, "K1_packbuf") is False: + mydf.K1_packbuf = np.zeros((nocc, max_ngrid_involved), dtype=np.float64) + else: + if nocc != mydf.K1_packbuf.shape[0]: + mydf.K1_packbuf = np.zeros((nocc, max_ngrid_involved), dtype=np.float64) + + if construct_moR: + if hasattr(mydf, "moR") is False: + mydf.moR = np.zeros((nocc, ngrid), dtype=np.float64) + else: + if nocc != mydf.moR.shape[0]: + mydf.moR = np.zeros((nocc, ngrid), dtype=np.float64) + + if construct_dmRgR: + if hasattr(mydf, "dmRgR") is False: + mydf.dmRgR = np.zeros((naux, ngrid), dtype=np.float64) + if construct_dmRgRg: + if hasattr(mydf, "dmRgRg") is False: + mydf.dmRgRg = np.zeros((naux, naux), dtype=np.float64) + + ddot_buf = np.zeros((max_dim_buf, max_dim_buf), dtype=np.float64) + aoR_buf1 = np.zeros((max_nao_involved, max_ngrid_involved), dtype=np.float64) + moR_buf = np.zeros((nocc, max_ngrid_involved), dtype=np.float64) # which can generated on the fly + mo_coeff_pack_buf = np.zeros((nao, max_nao_involved), dtype=np.float64) + + ####### involved functions ####### + + fn_packrow = getattr(libisdf, "_buildK_packrow", None) + assert fn_packrow is not None + + fn_packadd_row = getattr(libisdf, "_buildK_packaddrow", None) + assert fn_packadd_row is not None + + fn_packcol = getattr(libisdf, "_buildK_packcol", None) + assert fn_packcol is not None + + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol2 is not None + + fn_packcol3 = getattr(libisdf, "_buildK_packcol3", None) + assert fn_packcol3 is not None + + fn_packadd_col = getattr(libisdf, "_buildK_packaddcol", None) + assert fn_packadd_col is not None + + fn_packadd_dm = getattr(libisdf, "_packadd_local_dm", None) + assert fn_packadd_dm is not None + + #### step -1. 
construct moR, moRg, dmRgRg, dmRg #### + + IP_loc_in_ordered_grids = mydf.IP_loc_in_ordered_grids + + def _get_mo_values_on_grids(_aoR_holders, out_): + + for aoR_holder in _aoR_holders: + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + mo_coeff_packed = np.ndarray((nao_involved, nocc), buffer=mo_coeff_pack_buf) + # assert mo_coeff_packed.shape[0] == aoR_holder.ao_involved.shape[0] + # assert mo_coeff_packed.shape[1] == mo_coeff.shape[1] + + fn_packrow( + mo_coeff_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(mo_coeff_packed.shape[0]), + ctypes.c_int(mo_coeff_packed.shape[1]), + mo_coeff.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(mo_coeff.shape[0]), + ctypes.c_int(mo_coeff.shape[1]), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + moR_now = np.ndarray((nocc, ngrids_now), buffer=moR_buf) + lib.ddot(mo_coeff_packed.T, aoR_holder.aoR, c=moR_now) + global_gridID_begin = aoR_holder.global_gridID_begin + fn_packcol3( + out_.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(out_.shape[0]), + ctypes.c_int(out_.shape[1]), + ctypes.c_int(global_gridID_begin), + ctypes.c_int(global_gridID_begin+ngrids_now), + moR_now.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(moR_now.shape[0]), + ctypes.c_int(moR_now.shape[1]) + ) + + + t3 = (logger.process_clock(), logger.perf_counter()) + + if hasattr(mydf, "moR"): + moR = mydf.moR + else: + moR = None + moRg = mydf.moRg + + if construct_moR: + _get_mo_values_on_grids(aoR, moR) + fn_packcol( + moRg.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(moRg.shape[0]), + ctypes.c_int(moRg.shape[1]), + moR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(moR.shape[0]), + ctypes.c_int(moR.shape[1]), + IP_loc_in_ordered_grids.ctypes.data_as(ctypes.c_void_p) + ) + + else: + moR = None + _get_mo_values_on_grids(aoRg, moRg) + + t4 = (logger.process_clock(), logger.perf_counter()) + + #if mydf.verbose: + _benchmark_time(t3, t4, "get_mo over grids", mydf) + #sys.stdout.flush() + + t3 = (logger.process_clock(), logger.perf_counter()) + + if construct_dmRgR: + dmRgR = mydf.dmRgR + lib.ddot(moRg.T, moR, c=dmRgR) + dmRgRg = mydf.dmRgRg + fn_packcol( + dmRgRg.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(naux), + dmRgR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(ngrid), + IP_loc_in_ordered_grids.ctypes.data_as(ctypes.c_void_p) + ) + else: + dmRgR = None + dmRgRg = mydf.dmRgRg + lib.ddot(moRg.T, moRg, c=dmRgRg) + + t4 = (logger.process_clock(), logger.perf_counter()) + + #if mydf.verbose: + _benchmark_time(t3, t4, "get_dm over grids", mydf) + + #### step 0 get_half_J #### + + if with_j: + + # weighted moR to densityR + + rhoR = np.zeros((ngrid), dtype=np.float64) + + fn_rhoR = getattr(libisdf, "moR_to_Density", None) + assert fn_rhoR is not None + + fn_rhoR( + ctypes.c_int(ngrid), + ctypes.c_int(nocc), + moR.ctypes.data_as(ctypes.c_void_p), + rhoR.ctypes.data_as(ctypes.c_void_p) + ) + + # from rhoG to the potential # + + rhoR_original = np.zeros_like(rhoR) + + fn_order = getattr(libisdf, "_Reorder_Grid_to_Original_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(ngrid), + mydf.grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + rhoR.ctypes.data_as(ctypes.c_void_p), + rhoR_original.ctypes.data_as(ctypes.c_void_p) + ) + + rhoR = rhoR_original + + fn_J = getattr(libisdf, "_construct_J", None) + assert fn_J is not None + + if hasattr(mydf, "coulG") == False: + if mydf.omega is not None: + assert mydf.omega >= 0.0 + print("mydf.omega = ", mydf.omega) + 
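+ # [editorial sketch] fn_J (= _construct_J, fetched above) applies the
+ # Coulomb kernel in reciprocal space; conceptually, on the original grid
+ # ordering (NumPy reference, illustration only):
+ #
+ #   rhoG = np.fft.fftn(rhoR.reshape(mesh))
+ #   J    = np.fft.ifftn(rhoG * coulG.reshape(mesh)).real.ravel()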
# mydf.coulG = tools.get_coulG(cell, mesh=mesh, omega=mydf.omega) + raise ValueError("mydf.coulG is not found.") + + J = np.zeros_like(rhoR) + + fn_J( + mesh.ctypes.data_as(ctypes.c_void_p), + rhoR.ctypes.data_as(ctypes.c_void_p), + mydf.coulG.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p) + ) + + J_ordered = np.zeros_like(J) + + fn_order = getattr(libisdf, "_Original_Grid_to_Reorder_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(ngrid), + mydf.grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + J_ordered.ctypes.data_as(ctypes.c_void_p) + ) + + rhoR = J_ordered.copy() + + else: + rhoR = None + + J_Res = np.zeros((nao, nao), dtype=np.float64) + + ordered_ao_ind = np.arange(nao, dtype=np.int32) + + #### step 1 get_J #### + + t1 = (logger.process_clock(), logger.perf_counter()) + + for aoR_holder in aoR: + + if with_j is False: + continue + + if aoR_holder is None: + continue + + if use_mpi: + if atm_id % comm_size != rank: + continue + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + global_gridID_begin = aoR_holder.global_gridID_begin + rhoR_tmp = rhoR[global_gridID_begin:global_gridID_begin+ngrids_now] + + aoR_rhoR_res = np.ndarray((nao_involved, ngrids_now), buffer=aoR_buf1) + lib_isdf.d_ij_j_ij(aoR_holder.aoR, rhoR_tmp, out=aoR_rhoR_res) + ddot_res = np.ndarray((nao_involved, nao_involved), buffer=ddot_buf) + lib.ddot(aoR_rhoR_res, aoR_holder.aoR.T, c=ddot_res) + + if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + J_Res += ddot_res + else: + fn_packadd_dm( + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_involved), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p), + J_Res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao) + ) + + J = J_Res + + if with_j is False: + J = None + + t2 = (logger.process_clock(), logger.perf_counter()) + + if with_j: + _benchmark_time(t1, t2, "get_j", mydf) + + t1 = (logger.process_clock(), logger.perf_counter()) + + if with_k is False: + K = None + return J * ngrid / vol, K + + K = np.zeros((nocc, nao), dtype=np.float64) + + #### in the following steps, mo should not be weighted #### + + occ_weight_inv = (1.0 / occ_weight).copy() + if moR is not None: + lib.d_i_ij_ij(occ_weight_inv, moR, out=moR) + if moRg is not None: + lib.d_i_ij_ij(occ_weight_inv, moRg, out=moRg) + + #### step 2 get moRg and dmRgRg #### + + ### step 3. 
get_K ### + + lib_isdf.cwise_mul(mydf.W, dmRgRg, out=dmRgRg) + W2 = dmRgRg + if construct_dmRgR: + lib_isdf.cwise_mul(mydf.V_R, dmRgR, out=dmRgR) + V2 = dmRgR + else: + V2 = None + + K1 = lib.ddot(moRg, W2) ### moRg * W2 * aoRg.T + K1_res = np.zeros((nocc, nao), dtype=np.float64) + if mydf.with_robust_fitting: + K2 = lib.ddot(moRg, V2) ### moRg * V2 * aoR.T + K3 = lib.ddot(V2, moR.T) ### aoRg * V2 * moR.T + K2_res = np.zeros((nocc, nao), dtype=np.float64) + K3_res = np.zeros((nao, nocc), dtype=np.float64) + else: + K2 = None + K3 = None + + K = np.zeros((nocc, nao), dtype=np.float64) + K1_packbuf = mydf.K1_packbuf + + ##### construct with aoRg ##### + + for aoR_holder in mydf.aoRg: + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + ########## for (moRg * W2) * aoRg.T ########## + + K1_pack = np.ndarray((nocc, ngrids_now), buffer=K1_packbuf) + + grid_loc_now = aoR_holder.global_gridID_begin + + fn_packcol2( + K1_pack.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc), + ctypes.c_int(ngrids_now), + K1.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc), + ctypes.c_int(naux), + ctypes.c_int(grid_loc_now), + ctypes.c_int(grid_loc_now+ngrids_now) + ) + + ddot_res = np.ndarray((nocc, nao_involved), buffer=ddot_buf) + + lib.ddot(K1_pack, aoR_holder.aoR.T, c=ddot_res) + + if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + K1_res += ddot_res + else: + fn_packadd_col( + K1_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K1_res.shape[0]), + ctypes.c_int(K1_res.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + ########## aoRg * (V2 * moR.T) ########## + + if mydf.with_robust_fitting: + K3_pack = K3[grid_loc_now:grid_loc_now+ngrids_now, :] + ddot_res = np.ndarray((nao_involved, nocc), buffer=ddot_buf) + lib.ddot(aoR_holder.aoR, K3_pack, c=ddot_res) + if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + K3_res += ddot_res + else: + fn_packadd_row( + K3_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K3_res.shape[0]), + ctypes.c_int(K3_res.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + grid_loc_now += ngrids_now + + + if mydf.with_robust_fitting: + + for aoR_holder in mydf.aoR: + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + ########## (moRg * V2) * aoR.T ########## + + K2_pack = np.ndarray((nocc, ngrids_now), buffer=K1_packbuf) + + grid_loc_now = aoR_holder.global_gridID_begin + + fn_packcol2( + K2_pack.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc), + ctypes.c_int(ngrids_now), + K2.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc), + ctypes.c_int(ngrid), + ctypes.c_int(grid_loc_now), + ctypes.c_int(grid_loc_now+ngrids_now) + ) + + ddot_res = np.ndarray((nocc, nao_involved), buffer=ddot_buf) + + lib.ddot(K2_pack, aoR_holder.aoR.T, c=ddot_res) + + if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + K2_res += ddot_res + else: + fn_packadd_col( + K2_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K2_res.shape[0]), + ctypes.c_int(K2_res.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + if 
mydf.with_robust_fitting: + K1 = K1_res + K2 = K2_res + K3 = K3_res + K = -K1 + K2 + K3.T + else: + K1 = K1_res + K = K1 + + ### delete buf ### + + del ddot_buf, aoR_buf1, moR_buf, mo_coeff_pack_buf + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t1, t2, "get_k_occRI", mydf) + + # Kiv = K.copy() # for debug + + ##### final step from Kiv -> kuv #### + + t1 = (logger.process_clock(), logger.perf_counter()) + + ovlp = mydf.ovlp + K1 = lib.ddot(mo_coeff_original, K) + K1 = lib.ddot(ovlp, K1) + # print("K.shape = ", K.shape) + # print("mo_coeff_original.shape = ", mo_coeff_original.shape) + Kij = lib.ddot(K, mo_coeff_original) + assert np.allclose(Kij, Kij.T) + K2 = lib.ddot(mo_coeff_original, Kij) + K2 = lib.ddot(ovlp, K2) + K2 = lib.ddot(K2, mo_coeff_original.T) + K2 = lib.ddot(K2, ovlp) + K = K1 + K1.T - K2 + + # Kip = lib.ddot(K, mo_coeff_full) + # Kpq = np.zeros((nao, nao), dtype=np.float64) + # Kpq[:nocc, :] = Kip + # Kpq[nocc:, :nocc] = Kip[:,nocc:].T + # K = lib.ddot(mo_coeff_full, Kpq) + # K = lib.ddot(K, mo_coeff_full.T) + + t2 = (logger.process_clock(), logger.perf_counter()) + t00 = t2 + + _benchmark_time(t1, t2, "get_k_iv_2_uv", mydf) + _benchmark_time(t0, t00, "get_jk_occ-RI-K", mydf) + + del K1, K2, K3 + + return J * ngrid / vol, K * ngrid / vol + + +def get_jk_dm_quadratic(mydf, dm, hermi=1, kpt=np.zeros(3), + kpts_band=None, with_j=True, with_k=True, omega=None, + **kwargs): + + '''JK''' + + ############ deal with occ-RI-K ############ + + use_occ_RI_K = False + + if getattr(mydf, "occ_RI_K", None) is not None: + use_occ_RI_K = mydf.occ_RI_K + + if getattr(dm, '__dict__', None) is not None: + mo_coeff = dm.__dict__['mo_coeff'] + mo_occ = dm.__dict__['mo_occ'] + if mo_coeff is not None: + assert mo_occ is not None + if mo_coeff.ndim == 3: + assert mo_coeff.shape[2] == mo_occ.shape[1] + assert mo_occ.ndim == 2 + else: + assert mo_coeff.shape[1] == mo_occ.shape[0] + assert mo_coeff.ndim == 2 + assert mo_occ.ndim == 1 + # if use_occ_RI_K and mo_coeff is None: + # dm = np.asarray(dm) + # if len(dm.shape) == 3: + # assert dm.shape[0] == 1 + # dm = dm[0] + # mo_occ, mo_coeff = mydf.diag_dm(dm) + # dm = dm.reshape(1, dm.shape[0], dm.shape[1]) + # dm = lib.tag_array(dm, mo_coeff=mo_coeff, mo_occ=mo_occ) + else: + dm = np.asarray(dm) + if len(dm.shape) == 3: + assert dm.shape[0] <= 4 + # if use_occ_RI_K: + # assert dm.shape[0] == 1 + # dm = dm[0] + # mo_occ, mo_coeff = mydf.diag_dm(dm) + # dm = dm.reshape(1, dm.shape[0], dm.shape[1]) + # dm = lib.tag_array(dm, mo_coeff=mo_coeff, mo_occ=mo_occ) + # else: + # mo_occ = None + # mo_coeff = None + mo_occ = None + mo_coeff = None + + # if use_occ_RI_K: + # if mydf.direct == True: + # raise ValueError("ISDF does not support direct=True for occ-RI-K") + + if dm.ndim == 2: + dm = dm.reshape(1, *dm.shape) + + assert dm.ndim == 3 + + ############ end deal with occ-RI-K ############ + + direct = mydf.direct + use_mpi = mydf.use_mpi + + if use_mpi and direct == False: + raise NotImplementedError("ISDF does not support use_mpi and direct=False") + + if len(dm.shape) == 3: + assert dm.shape[0] <= 4 + ## NOTE: 1 for RHF 2 for UHF 3/4 for GHF + + if hasattr(mydf, 'Ls') and mydf.Ls is not None: + from pyscf.isdf.isdf_tools_densitymatrix import symmetrize_dm + dm = symmetrize_dm(dm, mydf.Ls) + else: + if hasattr(mydf, 'kmesh') and mydf.kmesh is not None: + from pyscf.isdf.isdf_tools_densitymatrix import symmetrize_dm + dm = symmetrize_dm(dm, mydf.kmesh) + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, bcast + dm = 
bcast(dm, root=0) + if mo_coeff is not None: + mo_coeff = bcast(mo_coeff, root=0) + if mo_occ is not None: + mo_occ = bcast(mo_occ, root=0) + + dm = lib.tag_array(dm, mo_coeff=mo_coeff, mo_occ=mo_occ) + + nset, nao = dm.shape[:2] + + ############ end deal with dm with tags ############ + + #### perform the calculation #### + + if "exxdiv" in kwargs: + exxdiv = kwargs["exxdiv"] + kwargs.pop("exxdiv") + else: + exxdiv = None + + assert exxdiv in ["ewald", None] + + vj = vk = None + + if kpts_band is not None and abs(kpt-kpts_band).sum() > 1e-9: + raise NotImplementedError("ISDF does not support kpts_band != kpt") + + log = logger.Logger(mydf.stdout, mydf.verbose) + t1 = (logger.process_clock(), logger.perf_counter()) + + j_real = gamma_point(kpt) + k_real = gamma_point(kpt) and not np.iscomplexobj(dm) + + assert j_real + assert k_real + + mem_now = lib.current_memory()[0] + max_memory = max(2000, (mydf.max_memory - mem_now)) + + log.debug1('max_memory = %d MB (%d in use)', max_memory, mem_now) + + # if use_occ_RI_K: + # vj, vk = get_jk_occRI(mydf, dm, use_mpi, with_j, with_k) + # else: + + ### TODO: improve the efficiency ### + + vj = np.zeros_like(dm) + vk = np.zeros_like(dm) + for iset in range(nset): + if with_j and iset<=1: + from pyscf.isdf.isdf_jk import _contract_j_dm + vj[iset] = _contract_j_dm_ls(mydf, dm[iset], use_mpi) + if with_k: + if mydf.direct: + if iset == 0: + vk = _contract_k_dm_quadratic_direct(mydf, dm, use_mpi=use_mpi) + # vk[iset] = _contract_k_dm_quadratic_direct(mydf, dm[iset], use_mpi=use_mpi) + else: + vk[iset] = _contract_k_dm_quadratic(mydf, dm[iset], mydf.with_robust_fitting, use_mpi=use_mpi) + + ##### the following code is added to deal with _ewald_exxdiv_for_G0 ##### + + if not use_mpi or (use_mpi and rank == 0): + + kpts = kpt.reshape(1,3) + kpts = np.asarray(kpts) + dm_kpts = dm.reshape(-1, dm.shape[0], dm.shape[1]).copy() + dm_kpts = lib.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + + assert nset <= 4 + assert nkpts == 1 + + kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + nband = len(kpts_band) + + assert nband == 1 + + if is_zero(kpts_band) and is_zero(kpts): + vk = vk.reshape(nset,nband,nao,nao) + else: + raise NotImplementedError("ISDF does not support kpts_band != 0") + + if exxdiv == 'ewald': + _ewald_exxdiv_for_G0(mydf.cell, kpts, dms, vk, kpts_band=kpts_band) + + vk = vk[:,0,:,:] + + if use_mpi: + vj = bcast(vj, root=0) + vk = bcast(vk, root=0) + + ##### end of dealing with _ewald_exxdiv_for_G0 ##### + + t1 = log.timer('sr jk', *t1) + + return vj, vk + +############# linear scaling implementation ############# \ No newline at end of file diff --git a/pyscf/isdf/isdf_local_k.py b/pyscf/isdf/isdf_local_k.py new file mode 100644 index 000000000..a440ee1a3 --- /dev/null +++ b/pyscf/isdf/isdf_local_k.py @@ -0,0 +1,1378 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +############ sys module ############ + +import copy +from copy import deepcopy +import numpy as np +import ctypes + +############ pyscf module ############ + +from pyscf import lib +from pyscf.pbc.gto import Cell +from pyscf.pbc import tools +from pyscf.gto.mole import * +libisdf = lib.load_library('libisdf') + +############ isdf utils ############ + +from pyscf.isdf.isdf_eval_gto import ISDF_eval_gto +import pyscf.isdf.isdf_local as ISDF_Local +import pyscf.isdf.isdf_tools_local as ISDF_Local_Utils +from pyscf.isdf.isdf_local_k_jk import get_jk_dm_translation_symmetry +from pyscf.isdf.isdf_jk import _benchmark_time + +############ subroutines --- deal with translation symmetry ############ + +### WARNING: the unit cell must be put in the first cell !! ### + +def _expand_partition_prim(partition_prim, kmesh, mesh): + + meshPrim = np.array(mesh) // np.array(kmesh) + + partition = [] + + for i in range(kmesh[0]): + for j in range(kmesh[1]): + for k in range(kmesh[2]): + shift = i * meshPrim[0] * mesh[1] * mesh[2] + j * meshPrim[1] * mesh[2] + k * meshPrim[2] + for data in partition_prim: + partition.append(data + shift) + + return partition + +def _expand_primlist_2_superlist(primlist, kmesh, mesh): + + meshPrim = np.array(mesh) // np.array(kmesh) + + superlist = [] + + for i in range(kmesh[0]): + for j in range(kmesh[1]): + for k in range(kmesh[2]): + shift = i * meshPrim[0] * mesh[1] * mesh[2] + j * meshPrim[1] * mesh[2] + k * meshPrim[2] + superlist.extend(primlist + shift) + + return np.array(superlist, dtype=np.int32) + +def _get_grid_ordering_k(input, kmesh, mesh): + + if isinstance(input, list): + prim_ordering = [] + for data in input: + prim_ordering.extend(data) + return _expand_primlist_2_superlist(prim_ordering, kmesh, mesh) + else: + raise NotImplementedError + +def select_IP_local_ls_k_drive(mydf, c, m, + IP_possible_atm, + group, + build_aoR_FFT = True, + use_mpi = False): + + # assert use_mpi == False + + IP_group = [] + aoRg_possible = mydf.aoRg_possible + + assert len(IP_possible_atm) == mydf.first_natm + + #### do the work #### + + first_natm = mydf.first_natm + + for i in range(len(group)): + IP_group.append(None) + + if len(group) < first_natm: + if use_mpi == False: + for i in range(len(group)): + IP_group[i] = ISDF_Local.select_IP_group_ls( + mydf, aoRg_possible, c, m, + group = group[i], + atm_2_IP_possible=IP_possible_atm + ) + else: + group_begin, group_end = ISDF_Local_Utils._range_partition(len(group), rank, comm_size, use_mpi) + for i in range(group_begin, group_end): + IP_group[i] = ISDF_Local.select_IP_local_ls( + mydf, aoRg_possible, c, m, + group = group[i], + atm_2_IP_possible=IP_possible_atm + ) + IP_group = ISDF_Local_Utils._sync_list(IP_group, len(group)) + else: + IP_group = IP_possible_atm + + mydf.IP_group = IP_group + mydf.IP_flat_prim = [] + mydf.IP_segment_prim = [] + + nIP_now = 0 + + for x in IP_group: + mydf.IP_flat_prim.extend(x) + mydf.IP_segment_prim.append(nIP_now) + nIP_now += len(x) + + mydf.IP_flat = _expand_primlist_2_superlist(mydf.IP_flat_prim, mydf.kmesh, mydf.mesh) + mydf.naux = mydf.IP_flat.shape[0] + mydf.nIP_Prim = len(mydf.IP_flat_prim) + mydf.nGridPrim = len(mydf.grid_ID_ordered_prim) + gridID_2_atmID = mydf.gridID_2_atmID + + partition_IP = [] + for i in range(mydf.cell.natm): + partition_IP.append([]) + + for _ip_id_ in mydf.IP_flat: + atm_id = gridID_2_atmID[_ip_id_] + partition_IP[atm_id].append(_ip_id_) + + for i in range(mydf.cell.natm): + partition_IP[i] = np.array(partition_IP[i], 
dtype=np.int32) + + mydf.IP_segment = [0] + for atm_id in mydf.atm_ordering: + mydf.IP_segment.append(mydf.IP_segment[-1] + len(partition_IP[atm_id])) + mydf.IP_segment = np.array(mydf.IP_segment, dtype=np.int32) + + ### build aoR_IP ### + + #### recalculate it anyway ! #### + + coords = mydf.coords + weight = np.sqrt(mydf.cell.vol / coords.shape[0]) + + del mydf.aoRg_possible + mydf.aoRg_possible = None + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + weight = np.sqrt(mydf.cell.vol / coords.shape[0]) + + mydf.aoRg = ISDF_Local_Utils.get_aoR( + mydf.cell, coords, partition_IP, + first_natm, + mydf.cell.natm, + mydf.group_global, + mydf.distance_matrix, + mydf.AtmConnectionInfo, + False, + mydf.use_mpi, + True) + + assert len(mydf.aoRg) == first_natm + + mydf.aoRg1 = ISDF_Local_Utils.get_aoR( + mydf.cell, coords, partition_IP, + mydf.cell.natm, + first_natm, + mydf.group_global, + mydf.distance_matrix, + mydf.AtmConnectionInfo, + False, + mydf.use_mpi, + True) + + aoRg_activated = [] + for _id_, aoR_holder in enumerate(mydf.aoRg): + if aoR_holder.ao_involved.size == 0: + aoRg_activated.append(False) + else: + aoRg_activated.append(True) + aoRg_activated = np.array(aoRg_activated, dtype=bool) + mydf.aoRg_activated = aoRg_activated + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + #################### build aoRg_FFT #################### + + kmesh = mydf.kmesh + ncell_complex = kmesh[0] * kmesh[1] * (kmesh[2]//2+1) + nao_prim = mydf.nao // np.prod(kmesh) + nbas_prim = mydf.cell.nbas // np.prod(mydf.kmesh) + weight = np.sqrt(mydf.cell.vol / coords.shape[0]) + nIP_Prim = mydf.nIP_Prim + + ### todo make it a list ! ### + + ################# construct aoRg_FFT ################# + + if build_aoR_FFT: + + aoRg_Tmp = ISDF_eval_gto(mydf.cell, coords=coords[mydf.IP_flat], shls_slice=(0, nbas_prim)) * weight + + mydf.aoRg_FFT = np.zeros((nao_prim, ncell_complex*mydf.nIP_Prim), dtype=np.complex128) + mydf.aoRg_FFT_real = np.ndarray((nao_prim, np.prod(kmesh)*mydf.nIP_Prim), dtype=np.double, buffer=mydf.aoRg_FFT, offset=0) + mydf.aoRg_FFT_real.ravel()[:] = aoRg_Tmp.ravel() + + del aoRg_Tmp + + nthread = lib.num_threads() + buffer = np.zeros((nao_prim, ncell_complex*mydf.nIP_Prim), dtype=np.complex128) + + fn = getattr(libisdf, "_FFT_Matrix_Col_InPlace", None) + assert fn is not None + + ''' + fn = _FFT_Matrix_Col_InPlace transform + + (A0 | A1 | A2) --> (A0+A1+A2 | A0+wA1 + w^2 A2 | A0 + w^2 A1+ w A2) + + ''' + + # print("aoRg_FFT.shape = ", mydf.aoRg_FFT.shape) + + kmesh = np.array(kmesh, dtype=np.int32) + + fn( + mydf.aoRg_FFT_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nIP_Prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buffer.ctypes.data_as(ctypes.c_void_p) + ) # no normalization factor ! 
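+ # [editorial sketch] Conceptually, the kernel above performs a DFT over the
+ # kmesh cell index, block-column-wise, on real input (w a primitive root of
+ # unity; only the kmesh[2]//2+1 non-redundant complex slices along the last
+ # k-axis are kept, hence ncell_complex). A NumPy sketch of the same
+ # transform, assuming this reading of the C kernel:
+ #
+ #   blocks = aoRg_real.reshape(nao_prim, n_cell, nIP_Prim)   # (A0 | A1 | ...)
+ #   A_k    = np.fft.fft(blocks, axis=1)    # A_k = sum_R w^{k*R} A_R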
+ + aoRg_packed = [] + for i in range(ncell_complex): + aoRg_packed.append(mydf.aoRg_FFT[:, i*nIP_Prim:(i+1)*nIP_Prim].copy()) + del mydf.aoRg_FFT + mydf.aoRg_FFT = aoRg_packed + else: + mydf.aoRg_FFT = None + + ################# End aoRg_FFT ################# + + #################### build aoR_FFT #################### + + if mydf.with_robust_fitting and build_aoR_FFT: + + ngrids = coords.shape[0] + ngrids_prim = ngrids // np.prod(kmesh) + aoR_tmp = ISDF_eval_gto(mydf.cell, coords=coords[mydf.grid_ID_ordered], shls_slice=(0, nbas_prim)) * weight + mydf.aoR_FFT = np.zeros((nao_prim, ncell_complex*ngrids_prim), dtype=np.complex128) + mydf.aoR_FFT_real = np.ndarray((nao_prim, np.prod(kmesh)*ngrids_prim), dtype=np.double, buffer=mydf.aoR_FFT, offset=0) + mydf.aoR_FFT_real.ravel()[:] = aoR_tmp.ravel() + + del aoR_tmp + + buffer = np.zeros((nao_prim, ncell_complex*ngrids_prim), dtype=np.complex128) + + fn( + mydf.aoR_FFT_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(ngrids_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buffer.ctypes.data_as(ctypes.c_void_p) + ) + + aoR_packed = [] + for i in range(ncell_complex): + aoR_packed.append(mydf.aoR_FFT[:, i*ngrids_prim:(i+1)*ngrids_prim].copy()) + del mydf.aoR_FFT + mydf.aoR_FFT = aoR_packed + # mydf.aoR = None + del buffer + else: + mydf.aoR_FFT = None + # build aoR # + +def build_auxiliary_Coulomb_local_bas_k(mydf, debug=True, use_mpi=False): + + if use_mpi: + raise NotImplementedError + + t0 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cell = mydf.cell + mesh = mydf.mesh + + naux = mydf.naux + + ncomplex = mesh[0] * mesh[1] * (mesh[2] // 2 + 1) * 2 + + grid_ordering = mydf.grid_ID_ordered + + assert mydf.omega is None or mydf.omega == 0.0 + coulG = tools.get_coulG(cell, mesh=mesh) + mydf.coulG = coulG.copy() + coulG_real = coulG.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1).copy() + + nThread = lib.num_threads() + bufsize_per_thread = int((coulG_real.shape[0] * 2 + mesh[0] * mesh[1] * mesh[2]) * 1.1) + buf = np.empty((nThread, bufsize_per_thread), dtype=np.double) + + def construct_V_CCode(aux_basis:list[np.ndarray], + # buf:np.ndarray, + V=None, shift_row=None): + + nThread = buf.shape[0] + bufsize_per_thread = buf.shape[1] + + nAux = 0 + for x in aux_basis: + nAux += x.shape[0] + + ngrids = mesh[0] * mesh[1] * mesh[2] + mesh_int32 = np.array(mesh, dtype=np.int32) + + if V is None: + assert shift_row is None + V = np.zeros((nAux, ngrids), dtype=np.double) + + fn = getattr(libisdf, "_construct_V_local_bas", None) + assert(fn is not None) + + if shift_row is None: + shift_row = 0 + # ngrid_now = 0 + + for i in range(len(aux_basis)): + + aux_basis_now = aux_basis[i] + grid_ID = mydf.partition_group_to_gridID[i] + # ngrid_now += grid_ID.size + # print("i = ", i) + # print("shift_row = ", shift_row) + # print("aux_bas_now = ", aux_basis_now.shape) + # print("ngrid_now = ", grid_ID.size) + # print("buf = ", buf.shape) + # print("ngrid_ordering = ", grid_ordering.size) + # sys.stdout.flush() + assert aux_basis_now.shape[1] == grid_ID.size + + fn(mesh_int32.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(aux_basis_now.shape[0]), + ctypes.c_int(aux_basis_now.shape[1]), + grid_ID.ctypes.data_as(ctypes.c_void_p), + aux_basis_now.ctypes.data_as(ctypes.c_void_p), + coulG_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(shift_row), + V.ctypes.data_as(ctypes.c_void_p), + grid_ordering.ctypes.data_as(ctypes.c_void_p), + buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bufsize_per_thread)) + + shift_row 
+= aux_basis_now.shape[0] + + return V + + + V = construct_V_CCode(mydf.aux_basis, V=None, shift_row=None) + + if mydf.with_robust_fitting: + mydf.V_R = V + + ########### construct W ########### + + naux_bra = 0 + for x in mydf.aux_basis: + naux_bra += x.shape[0] + + naux = mydf.naux + + assert naux % naux_bra == 0 + assert naux // naux_bra == np.prod(mydf.kmesh) + + mydf.W = np.zeros((naux_bra, naux), dtype=np.double) + + ngroup = len(mydf.aux_basis) + aux_bra_shift = 0 + kmesh = mydf.kmesh + + for i in range(ngroup): + + aux_ket_shift = 0 + grid_shift = 0 + naux_bra = mydf.aux_basis[i].shape[0] + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + for j in range(ngroup): + aux_basis_ket = mydf.aux_basis[j] + ngrid_now = aux_basis_ket.shape[1] + naux_ket = aux_basis_ket.shape[0] + mydf.W[aux_bra_shift:aux_bra_shift+naux_bra, aux_ket_shift:aux_ket_shift+naux_ket] = lib.ddot( + V[aux_bra_shift:aux_bra_shift+naux_bra, grid_shift:grid_shift+ngrid_now], + aux_basis_ket.T + ) + aux_ket_shift += naux_ket + grid_shift += ngrid_now + + aux_bra_shift += naux_bra + + assert grid_shift == np.prod(mesh) + + del buf + buf = None + + assert V.shape[0] == mydf.naux // np.prod(mydf.kmesh) + assert V.shape[1] == np.prod(mesh) + assert mydf.W.shape[0] == mydf.naux // np.prod(mydf.kmesh) + assert mydf.W.shape[1] == mydf.naux + + if mydf.with_robust_fitting == False: + del V + +##### get_jk ##### + +class PBC_ISDF_Info_Quad_K(ISDF_Local.PBC_ISDF_Info_Quad): + + # Quad stands for quadratic scaling + + def __init__(self, + mol:Cell, # means the primitive cell + with_robust_fitting=True, + kmesh =None, + verbose =None, + rela_cutoff_QRCP =None, + aoR_cutoff =1e-8, + direct =False, + limited_memory =False, + build_K_bunchsize =None, + ): + + ### extract the info from the primitive cell ### + + atm = [] + + #### TODO: remove the following restriction on the structure of lattice #### + + assert mol.a[0][1] == 0.0 + assert mol.a[0][2] == 0.0 + assert mol.a[1][0] == 0.0 + assert mol.a[1][2] == 0.0 + assert mol.a[2][0] == 0.0 + assert mol.a[2][1] == 0.0 + + from pyscf.lib.parameters import BOHR + + for i in range(mol.natm): + coords = mol.atom_coord(i) + coords = np.array(coords) * BOHR + atm.append([mol.atom_symbol(i), tuple(coords)]) + + prim_mesh = mol.mesh + mesh = np.array(prim_mesh) * np.array(kmesh) + + nelectron = np.sum(mol.nelectron) + + from pyscf.isdf.isdf_tools_cell import build_supercell + supercell = build_supercell( + atm, + mol.a, + spin = nelectron*np.prod(kmesh) % 2, + mesh = mesh, + Ls = kmesh, + basis = mol.basis, + pseudo = mol.pseudo, + ke_cutoff = mol.ke_cutoff, + max_memory = mol.max_memory, + verbose = mol.verbose + ) + + self.prim_cell = mol + + # print("supercell.mesh = ", supercell.mesh) + + super().__init__(supercell, with_robust_fitting, None, verbose, rela_cutoff_QRCP, aoR_cutoff, direct, use_occ_RI_K=False, + limited_memory=limited_memory, build_K_bunchsize=build_K_bunchsize) + + self.kmesh = kmesh + + self.kpts = self.prim_cell.make_kpts(kmesh) + + assert self.mesh[0] % kmesh[0] == 0 + assert self.mesh[1] % kmesh[1] == 0 + assert self.mesh[2] % kmesh[2] == 0 + + # print("self.mesh = ", self.mesh) + # exit(1) + + #### information relating primitive cell and supercell + + self.meshPrim = np.array(self.mesh) // np.array(self.kmesh) + self.natm = self.cell.natm + self.natmPrim = self.cell.natm // np.prod(self.kmesh) + + self.with_translation_symmetry = True + + from pyscf.isdf.isdf_tools_cell import build_primitive_cell + self.primCell = 
build_primitive_cell(self.cell, self.kmesh) + self.nao_prim = self.nao // np.prod(self.kmesh) + assert self.nao_prim == self.primCell.nao_nr() + + ##### rename everything with prefix _supercell #### + + def build_partition_aoR(self, Ls=None): + ''' + + build the partition of grid points and the AO values on the grids + + the partition assigns each grid point to an atom; + + it is hence a list of lists of grid indices + + ''' + + if self.aoR is not None and self.partition is not None: + return + + log = lib.logger.Logger(self.stdout, self.verbose) + + ##### build cutoff info ##### + + self.distance_matrix = ISDF_Local_Utils.get_cell_distance_matrix(self.cell) + weight = np.sqrt(self.cell.vol / self.coords.shape[0]) + precision = self.aoR_cutoff + rcut = ISDF_Local_Utils._estimate_rcut(self.cell, self.coords.shape[0], precision) + rcut_max = np.max(rcut) + atm2_bas = ISDF_Local_Utils._atm_to_bas(self.cell) + self.AtmConnectionInfo = [] + + for i in range(self.cell.natm): + tmp = ISDF_Local_Utils.AtmConnectionInfo(self.cell, i, self.distance_matrix, precision, rcut, rcut_max, atm2_bas) + self.AtmConnectionInfo.append(tmp) + + #### information dealing with grids, build partition #### + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if Ls is None: + Ls = [ + int(self.cell.lattice_vectors()[0][0]/2)+1, + int(self.cell.lattice_vectors()[1][1]/2)+1, + int(self.cell.lattice_vectors()[2][2]/2)+1 + ] + + self.partition_prim = ISDF_Local_Utils.get_partition( + self.cell, self.coords, + self.AtmConnectionInfo, + Ls, + self.with_translation_symmetry, + self.kmesh, + self.use_mpi + ) ## the grid ids in self.partition_prim are w.r.t. the supercell ## + + for i in range(len(self.partition_prim)): + self.partition_prim[i] = np.array(self.partition_prim[i], dtype=np.int32) + + assert len(self.partition_prim) == self.natmPrim ## the grid id is the global grid id + + self.partition = _expand_partition_prim(self.partition_prim, self.kmesh, self.mesh) + + assert len(self.partition) == self.natm + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + #### + + if not self.use_mpi: + rank = 0 + _benchmark_time(t1, t2, "build_partition", self) + else: + from pyscf.isdf.isdf_tools_mpi import rank, bcast + if rank == 0: + _benchmark_time(t1, t2, "build_partition", self) + + #### build aoR #### + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + sync_aoR = False + if self.direct: + sync_aoR = True + + ## deal with translation symmetry ## + + first_natm = self.first_natm + natm = self.cell.natm + + ### we need three types of aoR ### + + # this type of aoR is used in get J and select IP + + weight = np.sqrt(self.cell.vol / self.coords.shape[0]) + + self.aoR = ISDF_Local_Utils.get_aoR(self.cell, self.coords, self.partition, + first_natm, + natm, + self.group_global, + self.distance_matrix, + self.AtmConnectionInfo, + self.use_mpi, self.use_mpi, sync_aoR) ### full col, store aoR[:, :ngrid_prim] + + + memory = ISDF_Local_Utils._get_aoR_holders_memory(self.aoR) ### full col + assert len(self.aoR) == first_natm + + if rank == 0: + log.info("In ISDF-K build_partition_aoR aoR memory: %d " % (memory)) + + + weight = np.sqrt(self.cell.vol / self.coords.shape[0]) + self.aoR1 = ISDF_Local_Utils.get_aoR(self.cell, self.coords, self.partition, + None, + first_natm, + self.group_global, + self.distance_matrix, + self.AtmConnectionInfo, + self.use_mpi, self.use_mpi, sync_aoR) ### full row, store aoR[:nao_prim, :] + + memory = ISDF_Local_Utils._get_aoR_holders_memory(self.aoR1) ### 
full row + assert len(self.aoR1) == natm + + if rank == 0: + log.info("In ISDF-K build_partition_aoR aoR1 memory: %s", memory) + + partition_activated = None + + ##### the following info is used in get_J ##### + + if rank == 0: + partition_activated = [] + for _id_, aoR_holder in enumerate(self.aoR1): + if aoR_holder.ao_involved.size == 0: + partition_activated.append(False) + else: + partition_activated.append(True) + partition_activated = np.array(partition_activated, dtype=bool) + + if self.use_mpi: + partition_activated = bcast(partition_activated) + + self.partition_activated = partition_activated + self.partition_activated_id = [] + for i in range(len(partition_activated)): + if partition_activated[i]: + self.partition_activated_id.append(i) + self.partition_activated_id = np.array(self.partition_activated_id, dtype=np.int32) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if rank == 0: + _benchmark_time(t1, t2, "build_aoR", self) + + def set_group(self, group=None): + + first_natm = self.first_natm + if group is None: + group = [] + for i in range(first_natm): + group.append([i]) + + ## check the group ## + + natm_involved = 0 + for data in group: + for atm_id in data: + assert atm_id < first_natm + natm_involved += len(data) + assert natm_involved == first_natm + + for i in range(len(group)): + group[i] = np.array(group[i], dtype=np.int32) + + assert len(group) <= first_natm + + self.group = group + + self.group_global = [] + shift = 0 + self.atm_ordering = [] + for ix in range(self.kmesh[0]): + for iy in range(self.kmesh[1]): + for iz in range(self.kmesh[2]): + for data in self.group: + self.group_global.append(data + shift) + self.atm_ordering.extend(data + shift) + shift += self.natmPrim + self.atm_ordering = np.array(self.atm_ordering, dtype=np.int32) + + self.atm_id_2_group = np.zeros((self.cell.natm), dtype=np.int32) + for i in range(len(self.group_global)): + for atm_id in self.group_global[i]: + self.atm_id_2_group[atm_id] = i + + def build_IP_local(self, c=5, m=5, group=None, Ls = None, debug=True): + + assert self.use_aft_ao == False + + self.set_group(group) + first_natm = self.first_natm + + # build partition and aoR # + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + self.build_partition_aoR(None) + + self.grid_segment = [0] + for atm_id in self.atm_ordering: + loc_now = self.grid_segment[-1] + len(self.partition[atm_id]) + self.grid_segment.append(loc_now) + self.grid_segment = np.array(self.grid_segment, dtype=np.int32) + + ao2atomID = self.ao2atomID + partition = self.partition + aoR = self.aoR + natm = self.natm + nao = self.nao + + self.partition_atmID_to_gridID = partition + + self.partition_group_to_gridID = [] + for i in range(len(group)): + self.partition_group_to_gridID.append([]) + for atm_id in group[i]: + self.partition_group_to_gridID[i].extend(partition[atm_id]) + self.partition_group_to_gridID[i] = np.array(self.partition_group_to_gridID[i], dtype=np.int32) + + ngrids = self.coords.shape[0] + + gridID_2_atmID = np.zeros((ngrids), dtype=np.int32) + + for atm_id in range(self.cell.natm): + gridID_2_atmID[partition[atm_id]] = atm_id + + self.gridID_2_atmID = gridID_2_atmID + self.grid_ID_ordered = _get_grid_ordering_k(self.partition_group_to_gridID, self.kmesh, self.mesh) + self.grid_ID_ordered_prim = self.grid_ID_ordered[:ngrids//np.prod(self.kmesh)].copy() + self.partition_group_to_gridID = _expand_partition_prim(self.partition_group_to_gridID, self.kmesh, self.mesh) + + for i in 
range(len(self.grid_ID_ordered_prim)): + grid_ID = self.grid_ID_ordered_prim[i] + + ix = grid_ID // (self.mesh[1] * self.mesh[2]) + iy = (grid_ID % (self.mesh[1] * self.mesh[2])) // self.mesh[2] + iz = grid_ID % self.mesh[2] + + # assert ix < self.meshPrim[0] + # assert iy < self.meshPrim[1] + # assert iz < self.meshPrim[2] + + self.grid_ID_ordered_prim[i] = ix * self.meshPrim[1] * self.meshPrim[2] + iy * self.meshPrim[2] + iz + + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + #if self.verbose and debug: + if not self.use_mpi: + rank = 0 + else: + from pyscf.isdf.isdf_tools_mpi import rank + + if rank == 0: + _benchmark_time(t1, t2, "build_partition_aoR", self) + + t1 = t2 + + if len(group) < first_natm: + IP_Atm = ISDF_Local.select_IP_atm_ls( + self, + c+1, m, + first_natm, + rela_cutoff = self.rela_cutoff_QRCP, + no_retriction_on_nIP = self.no_restriction_on_nIP, + use_mpi = self.use_mpi + ) + else: + IP_Atm = ISDF_Local.select_IP_atm_ls( + self, + c, m, + first_natm, + rela_cutoff = self.rela_cutoff_QRCP, + no_retriction_on_nIP = self.no_restriction_on_nIP, + use_mpi = self.use_mpi + ) + + self.IP_Atm = IP_Atm + + t3 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + weight = np.sqrt(self.cell.vol / self.coords.shape[0]) + + self.aoRg_possible = ISDF_Local_Utils.get_aoR( + self.cell, self.coords, + IP_Atm, + first_natm, + natm, + self.group, + self.distance_matrix, + self.AtmConnectionInfo, + self.use_mpi, self.use_mpi, True + ) + + assert len(self.aoRg_possible) == first_natm + + t4 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + #if self.verbose and debug: + if rank == 0: + _benchmark_time(t3, t4, "build_aoRg_possible", self) + + build_aoR_FFT = (self.direct == False) + + select_IP_local_ls_k_drive( + self, c, m, + self.IP_Atm, self.group, + build_aoR_FFT = build_aoR_FFT, + use_mpi = self.use_mpi + ) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + #if self.verbose and debug: + if rank == 0: + _benchmark_time(t1, t2, "select_IP", self) + + t1 = t2 + + ISDF_Local.build_aux_basis_ls( + self, group, self.IP_group, debug=debug, use_mpi=self.use_mpi) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + # if self.verbose and debug: + if rank == 0: + _benchmark_time(t1, t2, "build_aux_basis", self) + + t1 = t2 + sys.stdout.flush() + + def build_auxiliary_Coulomb(self, debug=True): + + if self.direct == False: + build_auxiliary_Coulomb_local_bas_k(self, debug=debug, use_mpi=self.use_mpi) + + ################ testing code ################ + + # def test_ + + ################ allocate buffer ################ + + def _get_bufsize_get_j(self): + + # if self.with_robust_fitting == False: + if True: + + naux = self.naux + nao = self.nao + nIP_Prim = self.nIP_Prim + nao_prim = self.nao // np.prod(self.kmesh) + + size_buf3 = nao * naux + naux + naux + nao * nao + size_buf4 = nao * nIP_Prim + size_buf4 += nIP_Prim + size_buf4 += nao_prim * nao + size_buf4 += nIP_Prim + size_buf4 += nao_prim * nao_prim + size_buf4 += nao_prim * nIP_Prim * 3 + + return max(size_buf3, size_buf4) + + # else: + # raise NotImplementedError + + def _get_bufsize_get_k(self): + + # if self.with_robust_fitting == False: + if self.with_robust_fitting == False: + + naux = self.naux + nao = self.nao + nIP_Prim = self.nIP_Prim + nao_prim = self.nao // np.prod(self.kmesh) + ncell_complex = self.kmesh[0] * self.kmesh[1] * (self.kmesh[2]//2+1) + + #### size of density matrix #### + + size_dm = nao_prim * nao_prim * ncell_complex * 2 + size_dm += nIP_Prim 
* nIP_Prim * ncell_complex * 2 + + #### size of buf to construct dm #### + + size_buf5 = nao_prim * nao_prim * 2 * 2 + size_buf5 += nao_prim * nIP_Prim * 2 * 2 + + size_fft_buf = nIP_Prim * nIP_Prim * ncell_complex * 2 + + #### size of buf to construct K #### + + size_buf6 = nao_prim * nao_prim * ncell_complex * 2 # k-buf + size_buf6 += nIP_Prim * nIP_Prim * 2 # buf_A + size_buf6 += nao_prim * nIP_Prim * 2 *2 # buf_B/C + size_buf6 += nao_prim * nao_prim * 2 # buf_D + + return size_dm + max(size_buf5, size_buf6, size_fft_buf) + + else: + + naux = self.naux + nao = self.nao + nIP_Prim = self.nIP_Prim + nGrid_Prim = self.nGridPrim + nao_prim = self.nao // np.prod(self.kmesh) + ncell_complex = self.kmesh[0] * self.kmesh[1] * (self.kmesh[2]//2+1) + + #### size of density matrix #### + + size_dm = nao_prim * nao_prim * ncell_complex * 2 + size_dm += nIP_Prim * nGrid_Prim * ncell_complex * 2 + + #### size of buf to construct dm #### + + size_buf5 = nao_prim * nao_prim * 2 + size_buf5 += nao_prim * nIP_Prim * 2 + size_buf5 += nao_prim * nGrid_Prim * 2 * 2 + size_buf5 += nIP_Prim * nGrid_Prim * 2 + + size_fft_buf = nIP_Prim * nGrid_Prim * ncell_complex * 2 + + #### size of buf to construct K #### + + size_buf6 = nao_prim * nao_prim * ncell_complex * 2 # k-buf + size_buf6 += nIP_Prim * nGrid_Prim * 2 # buf_A + size_buf6 += nao_prim * nGrid_Prim * 2 # buf_B + size_buf6 += nao_prim * nIP_Prim * 2 * 2 # buf_B2/C + size_buf6 += nao_prim * nao_prim * 2 # buf_D + + return size_dm + max(size_buf5, size_buf6, size_fft_buf) + + def _allocate_jk_buffer(self, dtype=np.float64): + + if self.jk_buffer is not None: + return + + num_threads = lib.num_threads() + + nIP_Prim = self.nIP_Prim + nGridPrim = self.nGridPrim + ncell_complex = self.kmesh[0] * self.kmesh[1] * (self.kmesh[2]//2+1) + nao_prim = self.nao // np.prod(self.kmesh) + naux = self.naux + nao = self.nao + ngrids = nGridPrim * self.kmesh[0] * self.kmesh[1] * self.kmesh[2] + ncell = np.prod(self.kmesh) + + self.outcore = False + + if self.outcore is False: + + ### in build aux basis ### + + size_buf1 = nIP_Prim * ncell_complex*nIP_Prim * 2 + size_buf1+= nIP_Prim * ncell_complex*nGridPrim * 2 * 2 + size_buf1+= num_threads * nGridPrim * 2 + size_buf1+= nIP_Prim * nIP_Prim * 2 + size_buf1+= nIP_Prim * nGridPrim * 2 * 2 + size_buf1 = 0 + + ### in construct W ### + + size_buf2 = nIP_Prim * nIP_Prim * 2 + size_buf2 += nIP_Prim * nGridPrim * 2 * 2 + size_buf2 += nIP_Prim * nIP_Prim * ncell_complex * 2 * 2 + size_buf2 = 0 + + ### in get_j ### + + buf_J = self._get_bufsize_get_j() + buf_J = 0 + + ### in get_k ### + + buf_K = self._get_bufsize_get_k() + + ### ddot_buf ### + + size_ddot_buf = (nIP_Prim*nIP_Prim+2)*num_threads + size_buf = max(size_buf1,size_buf2,buf_J,buf_K) + + if hasattr(self, "IO_buf"): + if self.IO_buf.size < (size_buf+size_ddot_buf): + self.IO_buf = np.zeros((size_buf+size_ddot_buf), dtype=np.float64) + self.jk_buffer = np.ndarray((size_buf), dtype=np.float64, buffer=self.IO_buf, offset=0) + self.ddot_buf = np.ndarray((size_ddot_buf), dtype=np.float64, buffer=self.IO_buf, offset=size_buf) + + else: + + self.jk_buffer = np.ndarray((size_buf), dtype=np.float64) + self.ddot_buf = np.zeros((size_ddot_buf), dtype=np.float64) + + ##### all the following functions are used to deal with translation symmetry when getting j and getting k ##### + + def _get_permutation_column_aoR(self, box_x, box_y, box_z, loc_internal=None): + + assert box_x < self.kmesh[0] + assert box_y < self.kmesh[1] + assert box_z < self.kmesh[2] + + if hasattr(self, 
"aoR_col_permutation") is False: + self.aoR_col_permutation = [] + for i in range(np.prod(self.kmesh)): + self.aoR_col_permutation.append(None) + + loc = box_x * self.kmesh[1] * self.kmesh[2] + box_y * self.kmesh[2] + box_z + + if self.aoR_col_permutation[loc] is None: + ### construct the permutation matrix ### + permutation = [] + for aoR_holder in self.aoR: + ao_involved = aoR_holder.ao_involved + ao_permutated = [] + for ao_id in ao_involved: + box_id = ao_id // self.nao_prim + nao_id = ao_id % self.nao_prim + box_x_ = box_id // (self.kmesh[1] * self.kmesh[2]) + box_y_ = (box_id % (self.kmesh[1] * self.kmesh[2])) // self.kmesh[2] + box_z_ = box_id % self.kmesh[2] + box_x_new = (box_x + box_x_) % self.kmesh[0] + box_y_new = (box_y + box_y_) % self.kmesh[1] + box_z_new = (box_z + box_z_) % self.kmesh[2] + nao_id_new = box_x_new * self.kmesh[1] * self.kmesh[2] * self.nao_prim + box_y_new * self.kmesh[2] * self.nao_prim + box_z_new * self.nao_prim + nao_id + ao_permutated.append(nao_id_new) + # print("ao_permutated = ", ao_permutated) + permutation.append(np.array(ao_permutated, dtype=np.int32)) + self.aoR_col_permutation[loc] = permutation + + if loc_internal is not None: + return self.aoR_col_permutation[loc][loc_internal] + else: + return self.aoR_col_permutation[loc] + + def _get_permutation_column_aoRg(self, box_x, box_y, box_z, loc_internal=None): + + assert box_x < self.kmesh[0] + assert box_y < self.kmesh[1] + assert box_z < self.kmesh[2] + + if hasattr(self, "aoRg_col_permutation") is False: + self.aoRg_col_permutation = [] + for i in range(np.prod(self.kmesh)): + self.aoRg_col_permutation.append(None) + + loc = box_x * self.kmesh[1] * self.kmesh[2] + box_y * self.kmesh[2] + box_z + + if self.aoRg_col_permutation[loc] is None: + ### construct the permutation matrix ### + permutation = [] + for aoRg_holder in self.aoRg: + ao_involved = aoRg_holder.ao_involved + ao_permutated = [] + for ao_id in ao_involved: + box_id = ao_id // self.nao_prim + nao_id = ao_id % self.nao_prim + box_x_ = box_id // (self.kmesh[1] * self.kmesh[2]) + box_y_ = (box_id % (self.kmesh[1] * self.kmesh[2])) // self.kmesh[2] + box_z_ = box_id % self.kmesh[2] + box_x_new = (box_x + box_x_) % self.kmesh[0] + box_y_new = (box_y + box_y_) % self.kmesh[1] + box_z_new = (box_z + box_z_) % self.kmesh[2] + nao_id_new = box_x_new * self.kmesh[1] * self.kmesh[2] * self.nao_prim + box_y_new * self.kmesh[2] * self.nao_prim + box_z_new * self.nao_prim + nao_id + ao_permutated.append(nao_id_new) + permutation.append(np.array(ao_permutated, dtype=np.int32)) + self.aoRg_col_permutation[loc] = permutation + + if loc_internal is not None: + return self.aoRg_col_permutation[loc][loc_internal] + else: + return self.aoRg_col_permutation[loc] + + def _get_aoRg_Row(self, box_x, box_y, box_z): + + assert box_x < self.kmesh[0] + assert box_y < self.kmesh[1] + assert box_z < self.kmesh[2] + + if box_x == 0 and box_y == 0 and box_z == 0: + return self.aoRg1 + else: + Res = [] + for ix in range(self.kmesh[0]): + for iy in range(self.kmesh[1]): + for iz in range(self.kmesh[2]): + ix_ = (ix - box_x + self.kmesh[0]) % self.kmesh[0] + iy_ = (iy - box_y + self.kmesh[1]) % self.kmesh[1] + iz_ = (iz - box_z + self.kmesh[2]) % self.kmesh[2] + loc_ = ix_ * self.kmesh[1] * self.kmesh[2] + iy_ * self.kmesh[2] + iz_ + for i in range(loc_*self.natmPrim, (loc_+1)*self.natmPrim): + Res.append(self.aoRg1[i]) + return Res + + #### subroutine to deal with _ewald_exxdiv_for_G0 + + def get_jk(self, _dm, hermi=1, kpts=None, kpts_band=None, + with_j=True, 
with_k=True, omega=None, exxdiv=None): + + dm = deepcopy(_dm) + + if self.use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, bcast, comm + dm = bcast(dm, root=0) + + if omega is not None: # J/K for RSH functionals + raise NotImplementedError + # with self.range_coulomb(omega) as rsh_df: + # return rsh_df.get_jk(dm, hermi, kpts, kpts_band, with_j, with_k, + # omega=None, exxdiv=exxdiv) + + from pyscf.pbc.df.aft import _check_kpts + + kpts, is_single_kpt = _check_kpts(self, kpts) + + if is_single_kpt: + assert np.allclose(kpts[0], np.zeros(3)) + assert not self.use_mpi + vj, vk = get_jk_dm_translation_symmetry(self, dm, hermi, kpts[0], kpts_band, + with_j, with_k, exxdiv=exxdiv) + else: + + ### first construct J and K ### + + from pyscf.isdf.isdf_local_k_jk import _contract_j_dm_k_ls, _get_k_kSym_robust_fitting_fast, _get_k_kSym, _get_k_kSym_direct, _get_k_kSym_direct_mimic_MPI + from pyscf.pbc.df.df_jk import _ewald_exxdiv_for_G0, _format_dms, _format_kpts_band, _format_jks + + ### preprocess dm ### + + if dm.ndim == 3: + dm = dm.reshape(1, *dm.shape) + nset = dm.shape[0] + vj = np.zeros_like(dm, dtype=np.complex128) + vk = np.zeros_like(dm, dtype=np.complex128) + + for iset in range(nset): + if iset<=1: + vj[iset] = _contract_j_dm_k_ls(self, dm[iset], self.use_mpi) + if self.with_robust_fitting: + if self.direct: + # vk[iset] = _get_k_kSym_direct(self, dm[iset]) + if iset == 0: + # if self.use_mpi: + vk = _get_k_kSym_direct(self, dm, self.use_mpi) + #else: + #vk = _get_k_kSym_direct_mimic_MPI(self, dm, self.use_mpi) + else: + vk[iset] = _get_k_kSym_robust_fitting_fast(self, dm[iset]) + else: + vk[iset] = _get_k_kSym(self, dm[iset]) + + # if self.use_mpi: + # from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size + # for i in range(comm_size): + # if i == rank: + # print("rank == ", rank) + # print("vk = ", vk[0][0][0,:32]) + # print("vk = ", vk[0][0][:32,0]) + # comm.Barrier() + # else: + # print("vk = ", vk[0][0][0,:32]) + # print("vk = ", vk[0][0][:32,0]) + + ### post process J and K ### + + if not self.use_mpi or (self.use_mpi and rank == 0): + + kpts = np.asarray(kpts) + dm_kpts = lib.asarray(dm, order='C') + assert dm_kpts.ndim == 4 + assert dm_kpts.shape[1] == len(kpts) + assert dm_kpts.shape[2] == dm_kpts.shape[3] + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + assert nset <= 4 + + kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + nband = len(kpts_band) + assert nband == nkpts + + vk_kpts = vk.reshape(nset, nband, nao, nao) + + cell = self.prim_cell + + if exxdiv == 'ewald': + _ewald_exxdiv_for_G0(cell, kpts, dms, vk_kpts, kpts_band=kpts_band) + + vk = _format_jks(vk_kpts, dm_kpts, input_band, kpts) + vj_kpts = vj.reshape(nset, nband, nao, nao) + vj = _format_jks(vj_kpts, dm_kpts, input_band, kpts) + + #print("vk = ", vk[0][0][0,:32]) + #print("vk = ", vk[0][0][:32,0]) + + if nset == 1: + + vj = vj[0] + vk = vk[0] + + + if self.use_mpi: + + vj = bcast(vj, root = 0) + vk = bcast(vk, root = 0) + + comm.Barrier() + + # for i in range(comm_size): + # if i == rank: + # print("rank == ", rank) + # print("vk = ", vk[0][0,:32]) + # print("vk = ", vk[0][:32,0]) + # else: + # print("vk = ", vk[0][0,:32]) + # print("vk = ", vk[0][:32,0]) + + return vj, vk + +if __name__ == "__main__": + + from isdf_tools_cell import build_supercell, build_supercell_with_partition + C = 25 + + verbose = 10 + import pyscf.pbc.gto as pbcgto + + cell = pbcgto.Cell() + boxlen = 3.5668 + cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + 
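# NOTE: the self-test below builds a diamond primitive cell, expands it over + # a 1x1x8 k-mesh, unpacks the translation-symmetry-packed aoR via + # _get_permutation_column_aoR and compares it against a direct evaluation of + # the AOs on the ordered grid, and finally runs KRHF with the ISDF object + # attached as with_df; a supercell RHF run serves as the reference. +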
prim_a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + atm = [ + ['C', (0. , 0. , 0. )], + ['C', (0.8917 , 0.8917 , 0.8917)], + ['C', (1.7834 , 1.7834 , 0. )], + ['C', (2.6751 , 2.6751 , 0.8917)], + ['C', (1.7834 , 0. , 1.7834)], + ['C', (2.6751 , 0.8917 , 2.6751)], + ['C', (0. , 1.7834 , 1.7834)], + ['C', (0.8917 , 2.6751 , 2.6751)], + ] + + KE_CUTOFF = 70 + + prim_cell = build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=KE_CUTOFF) + prim_mesh = prim_cell.mesh + # prim_partition = [[0], [1], [2], [3], [4], [5], [6], [7]] + # prim_partition = [[0,1,2,3,4,5,6,7]] + prim_partition = [[0,1],[2,3],[4,5],[6,7]] + + Ls = [1, 1, 8] + kpts = prim_cell.make_kpts(Ls) + Ls = np.array(Ls, dtype=np.int32) + mesh = [Ls[0] * prim_mesh[0], Ls[1] * prim_mesh[1], Ls[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell, group_partition = build_supercell_with_partition(atm, prim_a, mesh=mesh, + Ls=Ls, + #basis=basis, pseudo=pseudo, + partition=prim_partition, ke_cutoff=KE_CUTOFF, verbose=verbose) + + # pbc_isdf_info = PBC_ISDF_Info_Quad_K(cell, kmesh=Ls, with_robust_fitting=True, aoR_cutoff=1e-8, direct=False, rela_cutoff_QRCP=3e-3) + pbc_isdf_info = PBC_ISDF_Info_Quad_K(prim_cell, kmesh=Ls, with_robust_fitting=True, aoR_cutoff=1e-8, + direct=True, + # direct=False, + rela_cutoff_QRCP=3e-3, + limited_memory=True, + build_K_bunchsize=32) + pbc_isdf_info.build_IP_local(c=C, m=5, group=prim_partition, Ls=[Ls[0]*10, Ls[1]*10, Ls[2]*10]) + pbc_isdf_info.verbose = 10 + + weight = np.sqrt(cell.vol / pbc_isdf_info.coords.shape[0]) + aoR_benchmark = ISDF_eval_gto(cell, coords=pbc_isdf_info.coords[pbc_isdf_info.grid_ID_ordered]) * weight + + naux_prim = 0 + for data in pbc_isdf_info.aoRg: + naux_prim += data.aoR.shape[1] + print("naux_prim = ", naux_prim) + print("naux = ", pbc_isdf_info.naux) + + aoR_unpacked = np.zeros_like(aoR_benchmark) + ngrid = 0 + for ix in range(Ls[0]): + for iy in range(Ls[1]): + for iz in range(Ls[2]): + perm_col = pbc_isdf_info._get_permutation_column_aoR(ix, iy, iz) + for _loc_, data in enumerate(pbc_isdf_info.aoR): + aoR_unpacked[perm_col[_loc_], ngrid:ngrid+data.aoR.shape[1]] = data.aoR + ngrid += data.aoR.shape[1] + assert ngrid == np.prod(mesh) + diff = aoR_benchmark - aoR_unpacked + where = np.where(np.abs(diff) > 1e-4) + print("where = ", where) + print("diff = ", np.linalg.norm(diff)/np.sqrt(aoR_unpacked.size)) + + ngrid_prim = np.prod(prim_mesh) + diff = aoR_benchmark[:, :ngrid_prim] - aoR_unpacked[:,:ngrid_prim] + print("diff.shape = ", diff.shape) + print("diff = ", np.linalg.norm(diff)/np.sqrt(diff.size)) + where = np.where(np.abs(diff) > 1e-4) + print("where = ", where) + + grid_ID_prim = pbc_isdf_info.grid_ID_ordered[:ngrid_prim] + grid_ID_prim2 = [] + for i in range(pbc_isdf_info.natmPrim): + grid_ID_prim2.extend(pbc_isdf_info.partition[i]) + grid_ID_prim2 = np.array(grid_ID_prim2, dtype=np.int32) + assert np.allclose(grid_ID_prim, grid_ID_prim2) + + # pbc_isdf_info.build_auxiliary_Coulomb(debug=True) + + from pyscf.pbc import scf + + mf = scf.KRHF(prim_cell, kpts) + # mf = scf.KUHF(prim_cell, kpts) + # pbc_isdf_info.kpts = np.array([[0,0,0]]) + # mf = scf.addons.smearing_(mf, sigma=0.2, method='fermi') + pbc_isdf_info.set_build_K_distance_cutoff(30.0) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 16 + mf.conv_tol = 1e-7 + + mf.kernel() + + # exit(1) + + ######### benchmark ######### + + pbc_isdf_info = ISDF_Local.PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, aoR_cutoff=1e-8, direct=True, 
rela_cutoff_QRCP=1e-3, use_occ_RI_K=False) + pbc_isdf_info.build_IP_local(c=C, m=5, group=group_partition, Ls=[Ls[0]*10, Ls[1]*10, Ls[2]*10]) + # pbc_isdf_info.build_IP_local(c=C, m=5, group=group_partition, Ls=[Ls[0]*3, Ls[1]*3, Ls[2]*3]) + pbc_isdf_info.Ls = Ls + pbc_isdf_info.build_auxiliary_Coulomb(debug=True) + + aoR_unpacked = [] + for aoR_holder in pbc_isdf_info.aoR: + aoR_unpacked.append(aoR_holder.todense(cell.nao_nr())) + aoR_unpacked = np.concatenate(aoR_unpacked, axis=1) + grid_ordered = pbc_isdf_info.grid_ID_ordered + aoR_benchmark = ISDF_eval_gto(cell, coords=pbc_isdf_info.coords[grid_ordered]) * weight + diff = aoR_benchmark - aoR_unpacked + print("diff = ", np.linalg.norm(diff)/np.sqrt(aoR_unpacked.size)) + # exit(1) + + mf = scf.RHF(cell) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 16 + mf.conv_tol = 1e-7 + mf.kernel() \ No newline at end of file diff --git a/pyscf/isdf/isdf_local_k_jk.py b/pyscf/isdf/isdf_local_k_jk.py new file mode 100644 index 000000000..f23206a93 --- /dev/null +++ b/pyscf/isdf/isdf_local_k_jk.py @@ -0,0 +1,2083 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# + +############ sys module ############ + +import copy +import numpy as np +import ctypes + +############ pyscf module ############ + +from pyscf import lib +from pyscf.pbc import tools +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point +from pyscf.gto.mole import * +from pyscf.pbc.df.df_jk import _ewald_exxdiv_for_G0, _format_dms, _format_kpts_band, _format_jks + +from pyscf.isdf.isdf_jk import _benchmark_time +from pyscf.isdf.isdf_tools_densitymatrix import pack_JK, pack_JK_in_FFT_space +from pyscf.isdf.isdf_local_jk import J_MAX_GRID_BUNCHSIZE, __get_DensityMatrixonRgAO_qradratic +from pyscf.isdf.isdf_tools_kSampling import _RowCol_FFT_bench +from pyscf.isdf._isdf_local_K_direct import _isdf_get_K_direct_kernel_1 +libisdf = lib.load_library('libisdf') +import pyscf.isdf.isdf_tools_linearop as lib_isdf + +############ subroutines ############ + +def _preprocess_dm(mydf, dm): + + log = lib.logger.Logger(mydf.cell.stdout, mydf.cell.verbose) + + in_real_space = True + + kmesh = np.asarray(mydf.kmesh, dtype=np.int32) + ncell_complex = kmesh[0] * kmesh[1] * (kmesh[2]//2+1) + + if len(dm.shape) == 3: + if dm.shape[0] == 1: + if dm.dtype == np.float64: + dm = dm[0].real + else: + in_real_space = False + dm = dm[0].real + else: + + #print("dm.shape = ", dm.shape) + #print("dm = ", dm) + #print("dtype = ", dm.dtype) + + in_real_space = False + + if dm.dtype == np.float64: + #assert kmesh[0] in [1, 2] + #assert kmesh[1] in [1, 2] + #assert kmesh[2] in [1, 2] + dm = np.asarray(dm, dtype=np.complex128) + + assert dm.dtype == np.complex128 + assert dm.shape[1] == dm.shape[2] + assert dm.shape[0] == np.prod(kmesh) + + nao_prim = dm.shape[1] + nkpts = dm.shape[0] + + #dm_complex = np.transpose(dm, axes=(1, 0, 2)).copy() + #dm_complex = 
dm_complex.reshape(nao_prim, -1) + + ### check the symmetry ### + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + loc1 = ix * kmesh[1] * kmesh[2] + iy * kmesh[2] + iz + loc2 = (kmesh[0] - ix) % kmesh[0] * kmesh[1] * kmesh[2] + (kmesh[1] - iy) % kmesh[1] * kmesh[2] + (kmesh[2] - iz) % kmesh[2] + #print("loc1 = ", loc1, "loc2 = ", loc2) + #print("dm[loc1] = ", dm[loc1]) + #print("dm[loc2] = ", dm[loc2]) + diff = np.linalg.norm(dm[loc1] - dm[loc2].conj()) / np.sqrt(dm.size) + # print("diff = ", diff) ## NOTE: should be very small + # assert diff < 1e-7 + if diff > 1e-7: + log.debug4("warning, the input density matrix is not symmetric.") + log.debug4("k1 = (%d, %d, %d) " % (ix, iy, iz)) + log.debug4("k2 = (%d, %d, %d) " % ((kmesh[0] - ix) % kmesh[0], (kmesh[1] - iy) % kmesh[1], (kmesh[2] - iz) % kmesh[2])) + # log.debug4("kmesh = ", kmesh) + log.debug4("diff = %15.6f" % (diff)) + dm_complex = np.zeros((ncell_complex, nao_prim, nao_prim), dtype=np.complex128) + loc = 0 + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]//2+1): + loc1 = ix * kmesh[1] * kmesh[2] + iy * kmesh[2] + iz + loc2 = (kmesh[0] - ix) % kmesh[0] * kmesh[1] * kmesh[2] + (kmesh[1] - iy) % kmesh[1] * kmesh[2] + (kmesh[2] - iz) % kmesh[2] + # dm_complex[loc].ravel()[:] = dm[loc1].ravel()[:] + dm_input = ((dm[loc1] + dm[loc2].conj()) / 2.0).copy() + dm_complex[loc].ravel()[:] = dm_input.ravel()[:] + loc += 1 + + dm_complex = np.transpose(dm_complex, axes=(1, 0, 2)).copy() + dm_complex = dm_complex.conj().copy() + + #print("dm_complex.shape = ", dm_complex.shape) + #print("dm_complex = ", dm_complex[:, 0, :]) + #print("dm_complex = ", dm_complex[:, 1, :]) + + ### do the FFT ### + + dm_real = np.ndarray((nao_prim, nkpts * nao_prim), dtype=np.float64, buffer=dm_complex) + buf_fft = np.zeros((nao_prim, ncell_complex, nao_prim), dtype=np.complex128) + + fn2 = getattr(libisdf, "_iFFT_Matrix_Col_InPlace", None) + assert fn2 is not None + + fn2( + dm_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + #print("dm_real = ", dm_real) + #print("dm_complex = ", dm_complex) + + dm = pack_JK(dm_real, kmesh, nao_prim) + + #print("dm.shape = ", dm.shape) + + return dm, in_real_space + +def _contract_j_dm_k_ls(mydf, _dm, use_mpi=False): + + dm, in_real_space = _preprocess_dm(mydf, _dm) + + if use_mpi: + assert mydf.direct == True + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce + size = comm_size + # raise NotImplementedError("MPI is not supported yet.") + dm = bcast(dm, root=0) + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(mesh) + ngrid_prim = ngrid // np.prod(mydf.kmesh) + + aoR = mydf.aoR + assert isinstance(aoR, list) + naux = mydf.naux + aoR1 = mydf.aoR1 + assert isinstance(aoR1, list) + + kmesh = np.array(mydf.kmesh, dtype=np.int32) + ncell = np.prod(kmesh) + ncell_complex = kmesh[0] * kmesh[1] * (kmesh[2]//2+1) + + #### step 0. 
allocate buffer + + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoR if aoR_holder is not None]) + max_nao_involved1 = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoR1 if aoR_holder is not None]) + max_nao_involved = max(max_nao_involved, max_nao_involved1) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + max_ngrid_involved1 = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoR1 if aoR_holder is not None]) + max_ngrid_involved = max(max_ngrid_involved, max_ngrid_involved1) + + density_R_prim = np.zeros((ngrid_prim,), dtype=np.float64) + + dm_buf = np.zeros((max_nao_involved, max_nao_involved), dtype=np.float64) + max_dim_buf = max_nao_involved + max_col_buf = min(max_ngrid_involved, J_MAX_GRID_BUNCHSIZE) + aoR_buf1 = np.zeros((max_nao_involved, max_ngrid_involved), dtype=np.float64) + + ##### get the involved C function ##### + + fn_extract_dm = getattr(libisdf, "_extract_dm_involved_ao", None) + assert fn_extract_dm is not None + + fn_packadd_dm = getattr(libisdf, "_packadd_local_dm", None) + assert fn_packadd_dm is not None + + fn_multiplysum = getattr(libisdf, "_fn_J_dmultiplysum", None) + assert fn_multiplysum is not None + + #### step 1. get density value on real space grid and IPs + + density_R_tmp = None + ddot_buf = np.zeros((max_nao_involved, max_col_buf), dtype=np.float64) + + for atm_id, aoR_holder in enumerate(aoR): + + if aoR_holder is None: + continue + + if use_mpi: + if atm_id % comm_size != rank: + continue + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + if nao_involved < nao: + fn_extract_dm( + dm.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao), + dm_buf.ctypes.data_as(ctypes.c_void_p), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_involved), + ) + else: + dm_buf.ravel()[:] = dm.ravel() + + dm_now = np.ndarray((nao_involved, nao_involved), buffer=dm_buf) + global_gridID_begin = aoR_holder.global_gridID_begin + + for p0, p1 in lib.prange(0, ngrids_now, J_MAX_GRID_BUNCHSIZE): + ddot_res = np.ndarray((nao_involved, p1-p0), buffer=ddot_buf) + lib.ddot(dm_now, aoR_holder.aoR[:,p0:p1], c=ddot_res) + # density_R_tmp = lib.multiply_sum_isdf(aoR_holder.aoR[:,p0:p1], ddot_res) + _res_tmp = np.ndarray((p1-p0,), + dtype =density_R_prim.dtype, + buffer=density_R_prim, + offset=(global_gridID_begin+p0)*density_R_prim.dtype.itemsize) + # density_R_prim[global_gridID_begin+p0:global_gridID_begin+p1] = density_R_tmp + fn_multiplysum( + _res_tmp.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_involved), + ctypes.c_int(p1-p0), + aoR_holder.aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(aoR_holder.aoR.shape[0]), + ctypes.c_int(aoR_holder.aoR.shape[1]), + ctypes.c_int(0), + ctypes.c_int(p0), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_involved), + ctypes.c_int(p1-p0), + ctypes.c_int(0), + ctypes.c_int(0)) + # ddot_res = np.ndarray((nao_involved, ngrids_now), buffer=ddot_buf) + # lib.ddot(dm_now, aoR_holder.aoR, c=ddot_res) + # density_R_tmp = lib.multiply_sum_isdf(aoR_holder.aoR, ddot_res) + # density_R_prim[global_gridID_begin:global_gridID_begin+ngrids_now] = density_R_tmp + + if use_mpi: + density_R_prim = reduce(density_R_prim, root=0) + + grid_ID_ordered = mydf.grid_ID_ordered_prim + + if (use_mpi and rank == 0) or (use_mpi == False): + + density_R_original = np.zeros_like(density_R_prim) + + fn_order = getattr(libisdf, "_Reorder_Grid_to_Original_Grid", None) + assert fn_order is not None + + 
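# Scatter the density from the ISDF (atom-partition) grid ordering back to + # the original lexicographic ordering of the primitive FFT mesh, so that the + # Coulomb kernel below can be applied with FFTs on that mesh. A presumably + # equivalent numpy sketch (illustrative only, not the code path used): + # density_R_original[mydf.grid_ID_ordered_prim] = density_R_prim +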
fn_order( + ctypes.c_int(density_R_prim.size), + mydf.grid_ID_ordered_prim.ctypes.data_as(ctypes.c_void_p), + density_R_prim.ctypes.data_as(ctypes.c_void_p), + density_R_original.ctypes.data_as(ctypes.c_void_p), + ) + + density_R_prim = density_R_original.copy() + + J = None + + ddot_buf = np.zeros((max_nao_involved, max_nao_involved), dtype=np.float64) + + if (use_mpi and rank == 0) or (use_mpi == False): + + fn_J = getattr(libisdf, "_construct_J", None) + assert(fn_J is not None) + + if hasattr(mydf, "coulG_prim") == False: + assert mydf.omega is None or mydf.omega == 0.0 + mydf.coulG_prim = tools.get_coulG(mydf.primCell, mesh=mydf.primCell.mesh) + + J = np.zeros_like(density_R_prim) + + mesh_prim = np.array(mydf.primCell.mesh, dtype=np.int32) + + fn_J( + mesh_prim.ctypes.data_as(ctypes.c_void_p), + density_R_prim.ctypes.data_as(ctypes.c_void_p), + mydf.coulG_prim.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + ) + + J_ordered = np.zeros_like(J) + + fn_order = getattr(libisdf, "_Original_Grid_to_Reorder_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(J.size), + grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + J_ordered.ctypes.data_as(ctypes.c_void_p), + ) + + J = J_ordered.copy() + + if use_mpi: + J = bcast(J, root=0) + + #### step 3. get J , using translation symmetry ### + + nao_prim = mydf.nao_prim + J_Res = np.zeros((nao_prim, nao), dtype=np.float64) + + partition_activated_ID = mydf.partition_activated_id + + kmesh = np.asarray(mydf.kmesh, dtype=np.int32) + natm_prim = mydf.natmPrim + + grid_segment = mydf.grid_segment + + fn_packadd_J = getattr(libisdf, "_buildJ_k_packaddrow", None) + assert fn_packadd_J is not None + + for task_id, box_id in enumerate(partition_activated_ID): + + if use_mpi: + if task_id % comm_size != rank: + continue + + box_loc1 = box_id // natm_prim + box_loc2 = box_id % natm_prim + + box_x = box_loc1 // (kmesh[1] * kmesh[2]) + box_y = box_loc1 % (kmesh[1] * kmesh[2]) // kmesh[2] + box_z = box_loc1 % kmesh[2] + + aoR_holder_bra = aoR1[box_id] + + permutation = mydf._get_permutation_column_aoR(box_x, box_y, box_z, box_loc2) + + aoR_holder_ket = aoR[box_loc2] + + J_tmp = J[grid_segment[box_loc2]:grid_segment[box_loc2+1]] + + assert aoR_holder_ket.aoR.shape[1] == J_tmp.size + + aoR_J_res = np.ndarray(aoR_holder_bra.aoR.shape, buffer=aoR_buf1) + lib_isdf.d_ij_j_ij(aoR_holder_bra.aoR, J_tmp, out=aoR_J_res) + + nao_bra = aoR_holder_bra.aoR.shape[0] + nao_ket = aoR_holder_ket.aoR.shape[0] + + ddot_res = np.ndarray((nao_bra, nao_ket), buffer=ddot_buf) + lib.ddot(aoR_J_res, aoR_holder_ket.aoR.T, c=ddot_res) + + #### pack and add the result to J_Res + + fn_packadd_J( + J_Res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_bra), + ctypes.c_int(nao_ket), + aoR_holder_bra.ao_involved.ctypes.data_as(ctypes.c_void_p), + permutation.ctypes.data_as(ctypes.c_void_p), + ) + + J = J_Res + if use_mpi: + J = reduce(J, root=0) + + ######### delete the buffer ######### + + del dm_buf, ddot_buf, density_R_prim + del density_R_tmp + del aoR_buf1 + + if not use_mpi or (use_mpi and rank == 0): + + J *= ngrid / vol + + if in_real_space: + J = pack_JK(J, mydf.kmesh, nao_prim) + else: + ## transform J back to FFT space ## + fn1 = getattr(libisdf, "_FFT_Matrix_Col_InPlace", None) + assert fn1 is not None + J_complex = np.ndarray((nao_prim,nao_prim*ncell_complex), dtype=np.complex128) + fft_buf = 
np.ndarray((nao_prim,nao_prim*ncell_complex), dtype=np.complex128) + J_real = np.ndarray((nao_prim,nao_prim*ncell), dtype=np.float64, buffer=J_complex) + J_real.ravel()[:] = J.ravel()[:] + fn1( + J_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + fft_buf.ctypes.data_as(ctypes.c_void_p) + ) + del fft_buf + ## pack J in FFT space ## + J_complex = J_complex.conj().copy() + J = pack_JK_in_FFT_space(J_complex, mydf.kmesh, nao_prim) + + if use_mpi: + J = bcast(J, root=0) + + t2 = (logger.process_clock(), logger.perf_counter()) + + if not use_mpi or (use_mpi and rank == 0): + _benchmark_time(t1, t2, "_contract_j_dm_k_ls", mydf) + + return J + +def _get_k_kSym_robust_fitting_fast(mydf, _dm): + + ''' + NOTE: this is a slow version; to be abandoned. + ''' + + #### preprocess #### + + dm, in_real_space = _preprocess_dm(mydf, _dm) + + mydf._allocate_jk_buffer(dm.dtype) + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + ngrid = np.prod(cell.mesh) + vol = cell.vol + + W = mydf.W + naux = mydf.naux + + kmesh = np.array(mydf.kmesh, dtype=np.int32) + mesh = mydf.mesh + meshPrim = np.array(mesh) // np.array(kmesh) + nGridPrim = mydf.nGridPrim + ncell = np.prod(kmesh) + ncell_complex = kmesh[0] * kmesh[1] * (kmesh[2]//2+1) + nIP_prim = mydf.nIP_Prim + nao_prim = nao // ncell + + #### allocate buffer #### + + + offset = 0 + + DM_complex = np.ndarray((nao_prim,nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + # DM_complex = np.ndarray((nao_prim,nao_prim*ncell_complex), dtype=np.complex128) + DM_real = np.ndarray((nao_prim,nao), dtype=np.float64, buffer=DM_complex) + DM_real.ravel()[:] = dm[:nao_prim, :].ravel()[:] + offset += DM_complex.size * DM_complex.itemsize + + offset_after_dm = offset + + DM_RgRg_complex = np.ndarray((nIP_prim,nIP_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + DM_RgRg_real = np.ndarray((nIP_prim,nIP_prim*ncell), dtype=np.float64, buffer=DM_RgRg_complex) + offset += DM_RgRg_complex.size * DM_RgRg_complex.itemsize + + offset_after_DM = offset + + #### get D #### + + #_get_DM_RgRg_real(mydf, DM_real, DM_complex, DM_RgRg_real, DM_RgRg_complex, offset) + + fn1 = getattr(libisdf, "_FFT_Matrix_Col_InPlace", None) + assert fn1 is not None + + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol2 is not None + fn_packcol3 = getattr(libisdf, "_buildK_packcol3", None) + assert fn_packcol3 is not None + + fn_copy = getattr(libisdf, "_buildK_copy", None) + assert fn_copy is not None + + buf_fft = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + t3 = (logger.process_clock(), logger.perf_counter()) + + fn1( + DM_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "_fft1", mydf) + + buf_A = np.ndarray((nao_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + offset2 = offset + (nao_prim * nao_prim) * buf_A.itemsize + buf_B = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset2) + + offset3 = offset2 + (nao_prim * nIP_prim) * buf_B.itemsize + buf_C = np.ndarray((nao_prim, 
nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset3) + + offset4 = offset3 + (nao_prim * nIP_prim) * buf_C.itemsize + buf_D = np.ndarray((nIP_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset4) + + aoRg_FFT = mydf.aoRg_FFT + + t3 = (logger.process_clock(), logger.perf_counter()) + + if isinstance(aoRg_FFT, list): + + for i in range(ncell_complex): + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + # buf_A[:] = DM_complex[:, k_begin:k_end] + fn_packcol2( + buf_A.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(2*nao_prim), + DM_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(DM_complex.shape[0]), + ctypes.c_int(2*DM_complex.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end) # 2 due to complex number + ) + + # buf_B[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim] + # buf_B.ravel()[:] = aoRg_FFT[i].ravel()[:] + fn_copy( + buf_B.ctypes.data_as(ctypes.c_void_p), + aoRg_FFT[i].ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(2*buf_B.size) # 2 due to complex number + ) + + lib.dot(buf_A, buf_B, c=buf_C) + lib.dot(buf_B.T.conj(), buf_C, c=buf_D) + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + # DM_RgRg_complex[:, k_begin:k_end] = buf_D + fn_packcol3( + DM_RgRg_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(DM_RgRg_complex.shape[0]), + ctypes.c_int(2*DM_RgRg_complex.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end), + buf_D.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(buf_D.shape[0]), + ctypes.c_int(2*buf_D.shape[1]), + ) + + else: + + raise NotImplementedError("not implemented yet.") + + for i in range(ncell_complex): + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + buf_A[:] = DM_complex[:, k_begin:k_end] + buf_B[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim] + + lib.dot(buf_A, buf_B, c=buf_C) + lib.dot(buf_B.T.conj(), buf_C, c=buf_D) + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + DM_RgRg_complex[:, k_begin:k_end] = buf_D + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "DM_RgRg_complex", mydf) + + t3 = t4 + + buf_fft = np.ndarray((nIP_prim, nIP_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + fn2 = getattr(libisdf, "_iFFT_Matrix_Col_InPlace", None) + assert fn2 is not None + + fn2( + DM_RgRg_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(nIP_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "DM_RgRg_complex 2", mydf) + t3 = t4 + + # inplace multiplication + + lib_isdf.cwise_mul(mydf.W, DM_RgRg_real, out=DM_RgRg_real) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "lib.cwise_mul 2", mydf) + t3 = t4 + + offset = offset_after_DM + + buf_fft = np.ndarray((nIP_prim, nIP_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + fn1( + DM_RgRg_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(nIP_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "DM_RgRg_real", mydf) + t3 = t4 + + K_complex_buf = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + K_real_buf = np.ndarray((nao_prim, nao_prim*ncell), dtype=np.float64, buffer=mydf.jk_buffer, offset=offset) + 
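# Second pass: contract the element-wise product W*D_RgRg (already FFTed + # above) back to the AO basis one complex k-point at a time, + # K(k) = C(k) @ (W*D)(k) @ C(k).conj().T, with C(k) the (nao_prim, nIP_prim) + # block of aoRg_FFT for this k; the buffers below hold per-k intermediates. +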
offset += (nao_prim * nao_prim * ncell_complex) * K_complex_buf.itemsize + offset_now = offset + + buf_A = np.ndarray((nIP_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + offset_now += (nIP_prim * nIP_prim) * buf_A.itemsize + buf_B = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + offset_now += (nao_prim * nIP_prim) * buf_B.itemsize + buf_C = np.ndarray((nIP_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + offset_now += (nIP_prim * nao_prim) * buf_C.itemsize + buf_D = np.ndarray((nao_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + + if isinstance(aoRg_FFT, list): + for i in range(ncell_complex): + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + # buf_A.ravel()[:] = DM_RgRg_complex[:, k_begin:k_end].ravel()[:] + fn_packcol2( + buf_A.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(2*nIP_prim), + DM_RgRg_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(DM_RgRg_complex.shape[0]), + ctypes.c_int(2*DM_RgRg_complex.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end) + ) + + # buf_B.ravel()[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim].ravel()[:] + # buf_B.ravel()[:] = aoRg_FFT[i].ravel()[:] + fn_copy( + buf_B.ctypes.data_as(ctypes.c_void_p), + aoRg_FFT[i].ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(2*buf_B.size) # 2 due to complex number + ) + + + lib.dot(buf_A, buf_B.T.conj(), c=buf_C) + lib.dot(buf_B, buf_C, c=buf_D) + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + # K_complex_buf[:, k_begin:k_end] = buf_D + + fn_packcol3( + K_complex_buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K_complex_buf.shape[0]), + ctypes.c_int(2*K_complex_buf.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end), + buf_D.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(buf_D.shape[0]), + ctypes.c_int(2*buf_D.shape[1]), + ) + + else: + + raise NotImplementedError("not implemented yet.") + + for i in range(ncell_complex): + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + buf_A.ravel()[:] = DM_RgRg_complex[:, k_begin:k_end].ravel()[:] + buf_B.ravel()[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim].ravel()[:] + + lib.dot(buf_A, buf_B.T.conj(), c=buf_C) + lib.dot(buf_B, buf_C, c=buf_D) + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + K_complex_buf[:, k_begin:k_end] = buf_D + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "K_complex_buf", mydf) + t3 = t4 + + #if in_real_space: + + buf_fft = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + fn2( + K_complex_buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "K_real_buf", mydf) + t3 = t4 + + K_real_buf *= (ngrid / vol) + + K = -pack_JK(K_real_buf, kmesh, nao_prim, output=None) # "-" due to robust fitting + + #else: + # K = -pack_JK_in_FFT_space(K_complex_buf, kmesh, nao_prim) / np.prod(kmesh) + + ########### do the same thing on V ########### + + DM_RgR_complex = np.ndarray((nIP_prim,nGridPrim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_after_dm) + DM_RgR_real = np.ndarray((nIP_prim,nGridPrim*ncell), dtype=np.float64, buffer=DM_RgR_complex) + + offset_now = offset_after_dm + DM_RgR_complex.size * 
DM_RgR_complex.itemsize + + aoR_FFT = mydf.aoR_FFT + + offset_A = offset_now + buf_A = np.ndarray((nao_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_A) + offset_B = offset_A + buf_A.size * buf_A.itemsize + buf_B = np.ndarray((nao_prim, nGridPrim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_B) + offset_B2 = offset_B + buf_B.size * buf_B.itemsize + buf_B2 = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_B2) + offset_C = offset_B2 + buf_B2.size * buf_B2.itemsize + buf_C = np.ndarray((nao_prim, nGridPrim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_C) + offset_D = offset_C + buf_C.size * buf_C.itemsize + buf_D = np.ndarray((nIP_prim, nGridPrim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_D) + + if isinstance(aoRg_FFT, list): + assert isinstance(aoR_FFT, list) + + for i in range(ncell_complex): + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + # buf_A[:] = DM_complex[:, k_begin:k_end] + fn_packcol2( + buf_A.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(2*nao_prim), + DM_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(DM_complex.shape[0]), + ctypes.c_int(2*DM_complex.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end) + ) + + # buf_B[:] = aoR_FFT[:, i*nGridPrim:(i+1)*nGridPrim] + # buf_B2[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim] + # buf_B.ravel()[:] = aoR_FFT[i].ravel()[:] + # buf_B2.ravel()[:] = aoRg_FFT[i].ravel()[:] + fn_copy( + buf_B.ctypes.data_as(ctypes.c_void_p), + aoR_FFT[i].ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(2*buf_B.size) # 2 due to complex number + ) + fn_copy( + buf_B2.ctypes.data_as(ctypes.c_void_p), + aoRg_FFT[i].ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(2*buf_B2.size) # 2 due to complex number + ) + + + lib.dot(buf_A, buf_B, c=buf_C) + lib.dot(buf_B2.T.conj(), buf_C, c=buf_D) + + k_begin = i * nGridPrim + k_end = (i + 1) * nGridPrim + + # DM_RgR_complex[:, k_begin:k_end] = buf_D + fn_packcol3( + DM_RgR_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(DM_RgR_complex.shape[0]), + ctypes.c_int(2*DM_RgR_complex.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end), + buf_D.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(buf_D.shape[0]), + ctypes.c_int(2*buf_D.shape[1]), + ) + + else: + + raise NotImplementedError("not implemented yet.") + + for i in range(ncell_complex): + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + buf_A[:] = DM_complex[:, k_begin:k_end] + buf_B[:] = aoR_FFT[:, i*nGridPrim:(i+1)*nGridPrim] + buf_B2[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim] + + lib.dot(buf_A, buf_B, c=buf_C) + lib.dot(buf_B2.T.conj(), buf_C, c=buf_D) + + k_begin = i * nGridPrim + k_end = (i + 1) * nGridPrim + + DM_RgR_complex[:, k_begin:k_end] = buf_D + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "DM_RgR_complex", mydf) + t3 = t4 + + buf_A = None + buf_B = None + buf_B2 = None + buf_C = None + buf_D = None + + offset_now_fft = offset_now + + buf_fft = np.ndarray((nIP_prim, nGridPrim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now_fft) + + fn2( + DM_RgR_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(nGridPrim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "DM_RgR_real", mydf) + t3 = t4 + + # inplace multiplication + + lib_isdf.cwise_mul(mydf.V_R, 
DM_RgR_real, out=DM_RgR_real) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "cwise_mul", mydf) + t3 = t4 + + fn1( + DM_RgR_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(nGridPrim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "DM_RgR_complex 2", mydf) + t3 = t4 + + buf_fft = None + + offset_K = offset_now + + K_complex_buf = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_K) + K_real_buf = np.ndarray((nao_prim, nao_prim*ncell), dtype=np.float64, buffer=K_complex_buf) + + offset_after_K = offset_K + K_complex_buf.size * K_complex_buf.itemsize + + offset_A = offset_K + K_complex_buf.size * K_complex_buf.itemsize + buf_A = np.ndarray((nIP_prim, nGridPrim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_A) + offset_B = offset_A + buf_A.size * buf_A.itemsize + buf_B = np.ndarray((nao_prim, nGridPrim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_B) + offset_B2 = offset_B + buf_B.size * buf_B.itemsize + buf_B2 = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_B2) + offset_C = offset_B2 + buf_B2.size * buf_B2.itemsize + buf_C = np.ndarray((nIP_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_C) + offset_D = offset_C + buf_C.size * buf_C.itemsize + buf_D = np.ndarray((nao_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_D) + + if isinstance(aoRg_FFT, list): + + for i in range(ncell_complex): + + k_begin = i * nGridPrim + k_end = (i + 1) * nGridPrim + + # buf_A.ravel()[:] = DM_RgR_complex[:, k_begin:k_end].ravel()[:] + fn_packcol2( + buf_A.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(2*nGridPrim), + DM_RgR_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(DM_RgR_complex.shape[0]), + ctypes.c_int(2*DM_RgR_complex.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end) + ) + + # buf_B.ravel()[:] = aoR_FFT[i].ravel()[:] + # buf_B2.ravel()[:] = aoRg_FFT[i].ravel()[:] + fn_copy( + buf_B.ctypes.data_as(ctypes.c_void_p), + aoR_FFT[i].ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(2*buf_B.size) # 2 due to complex number + ) + fn_copy( + buf_B2.ctypes.data_as(ctypes.c_void_p), + aoRg_FFT[i].ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(2*buf_B2.size) # 2 due to complex number + ) + + + lib.dot(buf_A, buf_B.T.conj(), c=buf_C) + lib.dot(buf_B2, buf_C, c=buf_D) + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + # K_complex_buf[:, k_begin:k_end] = buf_D + fn_packcol3( + K_complex_buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K_complex_buf.shape[0]), + ctypes.c_int(2*K_complex_buf.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end), + buf_D.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(buf_D.shape[0]), + ctypes.c_int(2*buf_D.shape[1]), + ) + + else: + + raise NotImplementedError("not implemented yet.") + + for i in range(ncell_complex): + + k_begin = i * nGridPrim + k_end = (i + 1) * nGridPrim + + buf_A.ravel()[:] = DM_RgR_complex[:, k_begin:k_end].ravel()[:] + # print("buf_A = ", buf_A[:5,:5]) + buf_B.ravel()[:] = aoR_FFT[:, i*nGridPrim:(i+1)*nGridPrim].ravel()[:] + # print("buf_B = ", buf_B[:5,:5]) + buf_B2.ravel()[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim].ravel()[:] + # print("buf_B2 = ", buf_B2[:5,:5]) + + lib.dot(buf_A, buf_B.T.conj(), c=buf_C) + lib.dot(buf_B2, 
t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "K_complex_buf 1", mydf) + t3 = t4 + + buf_A = None + buf_B = None + buf_B2 = None + buf_C = None + buf_D = None + + offset_now = offset_after_K + + buf_fft = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + + fn2( + K_complex_buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "K_complex_buf 2", mydf) + t3 = t4 + + buf_fft = None + + K_real_buf *= (ngrid / vol) + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t1, t2, "_contract_k_dm", mydf) + + t1 = t2 + + K2 = pack_JK(K_real_buf, kmesh, nao_prim, output=None) + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t1, t2, "_pack_JK", mydf) + + K += K2 + K2.T + + if not in_real_space: + + K = K[:nao_prim,:].copy() + + K_complex = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128) + K_real = np.ndarray((nao_prim, nao_prim*ncell), dtype=np.float64, buffer=K_complex) + K_real.ravel()[:] = K.ravel()[:] + buf_fft = np.zeros_like(K_complex) + + fn1( + K_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + K_complex = K_complex.conj().copy() + K_complex = pack_JK_in_FFT_space(K_complex, kmesh, nao_prim) + K = K_complex + + DM_RgR_complex = None + DM_RgR_real = None + + return K + +def _get_k_kSym(mydf, _dm): + + #### preprocess #### + + dm, in_real_space = _preprocess_dm(mydf, _dm) + + mydf._allocate_jk_buffer(dm.dtype) + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + ngrid = np.prod(cell.mesh) + vol = cell.vol + + W = mydf.W + naux = mydf.naux + + kmesh = np.array(mydf.kmesh, dtype=np.int32) + mesh = mydf.mesh + meshPrim = np.array(mesh) // np.array(kmesh) + nGridPrim = mydf.nGridPrim + ncell = np.prod(kmesh) + ncell_complex = kmesh[0] * kmesh[1] * (kmesh[2]//2+1) + nIP_prim = mydf.nIP_Prim + nao_prim = nao // ncell + + #### allocate buffer #### + + offset = 0 + DM_RgRg_complex = np.ndarray((nIP_prim,nIP_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + DM_RgRg_real = np.ndarray((nIP_prim,nIP_prim*ncell), dtype=np.float64, buffer=mydf.jk_buffer, offset=offset) + + offset += (nIP_prim * nIP_prim * ncell_complex) * DM_RgRg_complex.itemsize + DM_complex = np.ndarray((nao_prim,nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + DM_real = np.ndarray((nao_prim,nao), dtype=np.float64, buffer=mydf.jk_buffer, offset=offset) + DM_real.ravel()[:] = dm[:nao_prim, :].ravel()[:] + offset += (nao_prim * nao_prim * ncell_complex) * DM_complex.itemsize + + #### get D #### + + fn1 = getattr(libisdf, "_FFT_Matrix_Col_InPlace", None) + assert fn1 is not None + + buf_fft = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, 
buffer=mydf.jk_buffer, offset=offset) + + fn1( + DM_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + buf_A = np.ndarray((nao_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + offset2 = offset + (nao_prim * nao_prim) * buf_A.itemsize + buf_B = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset2) + + offset3 = offset2 + (nao_prim * nIP_prim) * buf_B.itemsize + buf_C = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset3) + + offset4 = offset3 + (nao_prim * nIP_prim) * buf_C.itemsize + buf_D = np.ndarray((nIP_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset4) + + aoRg_FFT = mydf.aoRg_FFT + + if isinstance(aoRg_FFT, list): + for i in range(ncell_complex): + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + buf_A[:] = DM_complex[:, k_begin:k_end] + # buf_B[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim] + buf_B = aoRg_FFT[i] + + lib.dot(buf_A, buf_B, c=buf_C) + lib.dot(buf_B.T.conj(), buf_C, c=buf_D) + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + DM_RgRg_complex[:, k_begin:k_end] = buf_D + else: + for i in range(ncell_complex): + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + buf_A[:] = DM_complex[:, k_begin:k_end] + buf_B[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim] + + lib.dot(buf_A, buf_B, c=buf_C) + lib.dot(buf_B.T.conj(), buf_C, c=buf_D) + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + DM_RgRg_complex[:, k_begin:k_end] = buf_D + + buf_fft = np.ndarray((nIP_prim, nIP_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + fn2 = getattr(libisdf, "_iFFT_Matrix_Col_InPlace", None) + assert fn2 is not None + + fn2( + DM_RgRg_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(nIP_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + # inplace multiplication + + lib_isdf.cwise_mul(mydf.W, DM_RgRg_real, out=DM_RgRg_real) + + offset = nIP_prim * nIP_prim * ncell_complex * DM_RgRg_complex.itemsize + + buf_fft = np.ndarray((nIP_prim, nIP_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + fn1( + DM_RgRg_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(nIP_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + K_complex_buf = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + K_real_buf = np.ndarray((nao_prim, nao_prim*ncell), dtype=np.float64, buffer=mydf.jk_buffer, offset=offset) + offset += (nao_prim * nao_prim * ncell_complex) * K_complex_buf.itemsize + offset_now = offset + + buf_A = np.ndarray((nIP_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + offset_now += (nIP_prim * nIP_prim) * buf_A.itemsize + buf_B = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + offset_now += (nao_prim * nIP_prim) * buf_B.itemsize + buf_C = np.ndarray((nIP_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + offset_now += (nIP_prim * nao_prim) * buf_C.itemsize + buf_D = np.ndarray((nao_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + + if isinstance(aoRg_FFT, list): + + for i in range(ncell_complex): + + k_begin = i * 
nIP_prim + k_end = (i + 1) * nIP_prim + + buf_A.ravel()[:] = DM_RgRg_complex[:, k_begin:k_end].ravel()[:] + # buf_B.ravel()[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim].ravel()[:] + buf_B = aoRg_FFT[i] + + lib.dot(buf_A, buf_B.T.conj(), c=buf_C) + lib.dot(buf_B, buf_C, c=buf_D) + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + K_complex_buf[:, k_begin:k_end] = buf_D + else: + + for i in range(ncell_complex): + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + buf_A.ravel()[:] = DM_RgRg_complex[:, k_begin:k_end].ravel()[:] + buf_B.ravel()[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim].ravel()[:] + + lib.dot(buf_A, buf_B.T.conj(), c=buf_C) + lib.dot(buf_B, buf_C, c=buf_D) + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + K_complex_buf[:, k_begin:k_end] = buf_D + + buf_fft = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + K_complex_buf *= (ngrid / vol) + + #print("K_complex_buf = ", K_complex_buf) + + if in_real_space: + + fn2( + K_complex_buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + K = pack_JK(K_real_buf, kmesh, nao_prim, output=None) + + else: + + K_complex_buf = K_complex_buf.conj().copy() ### NOTE: convention problem + K = pack_JK_in_FFT_space(K_complex_buf, kmesh, nao_prim, output=None) + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t1, t2, "_contract_k_dm", mydf) + + return K + +def _get_k_kSym_direct(mydf, _dm, use_mpi=False): + + if use_mpi: + assert mydf.direct == True + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce + size = comm.Get_size() + + t1 = (logger.process_clock(), logger.perf_counter()) + t0 = (logger.process_clock(), logger.perf_counter()) + + ############# preprocess ############# + + dm = None + + if (use_mpi and rank == 0) or not use_mpi: + + dm = [] + nset = _dm.shape[0] + + for iset in range(nset): + _dm_tmp, in_real_space = _preprocess_dm(mydf, _dm[iset]) + dm.append(_dm_tmp) + if in_real_space: + if np.prod(mydf.kmesh) == 1: + in_real_space = False + assert not in_real_space + + dm = np.asarray(dm) + + if use_mpi: + dm = bcast(dm, root=0) + + if len(dm.shape) == 3: + assert dm.shape[0] <= 4 + else: + dm = dm.reshape(1, *dm.shape) + + aoR = mydf.aoR + aoRg = mydf.aoRg + + max_nao_involved = mydf.max_nao_involved + max_ngrid_involved = mydf.max_ngrid_involved + max_nIP_involved = mydf.max_nIP_involved + maxsize_group_naux = mydf.maxsize_group_naux + + ####### preparing the data ####### + + nset, nao = dm.shape[0], dm.shape[1] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + mesh_int32 = mesh + ngrid = np.prod(mesh) + + aoRg = mydf.aoRg + assert isinstance(aoRg, list) + aoR = mydf.aoR + assert isinstance(aoR, list) + + naux = mydf.naux + nao = cell.nao + nao_prim = mydf.nao_prim + aux_basis = mydf.aux_basis + kmesh = np.array(mydf.kmesh, dtype=np.int32) + nkpts = np.prod(kmesh) + + grid_ordering = mydf.grid_ID_ordered + + if hasattr(mydf, "coulG") == False: + if mydf.omega is not None: + assert mydf.omega >= 0.0 + # mydf.coulG = tools.get_coulG(cell, mesh=mesh, omega=mydf.omega) + raise NotImplementedError("coulG is not implemented yet.") + + coulG = mydf.coulG + coulG_real = coulG.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1).copy() + + mydf.allocate_k_buffer(nset) + build_k_buf = mydf.build_k_buf + build_VW_buf = 
mydf.build_VW_in_k_buf + + group = mydf.group + assert len(group) == len(aux_basis) + + ######### allocate buffer ######### + + Density_RgAO_buf = mydf.Density_RgAO_buf + + nThread = lib.num_threads() + bufsize_per_thread = (coulG_real.shape[0] * 2 + np.prod(mesh)) + buf_build_V = np.ndarray((nThread, bufsize_per_thread), dtype=np.float64, buffer=build_VW_buf) + + offset_now = buf_build_V.size * buf_build_V.dtype.itemsize + + build_K_bunchsize = min(maxsize_group_naux, mydf._build_K_bunchsize) + + offset_build_now = 0 + offset_Density_RgR_buf = 0 + Density_RgR_buf = np.ndarray((build_K_bunchsize, ngrid), buffer=build_k_buf, offset=offset_build_now) + + offset_build_now += Density_RgR_buf.size * Density_RgR_buf.dtype.itemsize + offset_ddot_res_RgR_buf = offset_build_now + ddot_res_RgR_buf = np.ndarray((build_K_bunchsize, max_ngrid_involved), buffer=build_k_buf, offset=offset_ddot_res_RgR_buf) + + offset_build_now += ddot_res_RgR_buf.size * ddot_res_RgR_buf.dtype.itemsize + offset_K1_tmp1_buf = offset_build_now + K1_tmp1_buf = np.ndarray((maxsize_group_naux, nao), buffer=build_k_buf, offset=offset_K1_tmp1_buf) + + offset_build_now += K1_tmp1_buf.size * K1_tmp1_buf.dtype.itemsize + offset_K1_tmp1_ddot_res_buf = offset_build_now + K1_tmp1_ddot_res_buf = np.ndarray((maxsize_group_naux, nao), buffer=build_k_buf, offset=offset_K1_tmp1_ddot_res_buf) + + offset_build_now += K1_tmp1_ddot_res_buf.size * K1_tmp1_ddot_res_buf.dtype.itemsize + + offset_K1_final_ddot_buf = offset_build_now + K1_final_ddot_buf = np.ndarray((nao, nao), buffer=build_k_buf, offset=offset_K1_final_ddot_buf) + + ########### get involved C function ########### + + fn_packcol1 = getattr(libisdf, "_buildK_packcol", None) + assert fn_packcol1 is not None + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol2 is not None + fn_packadd_col = getattr(libisdf, "_buildK_packaddcol", None) + assert fn_packadd_col is not None + fn_packadd_row = getattr(libisdf, "_buildK_packaddrow", None) + assert fn_packadd_row is not None + + ordered_ao_ind = np.arange(nao) + + ######### begin work ######### + + K1 = np.zeros((nset, nao_prim, nao), dtype=np.float64) # contribution from V matrix + K2 = np.zeros((nset, nao_prim, nao), dtype=np.float64) # contribution from W matrix + + from pyscf.isdf._isdf_local_K_direct import reset_profile_buildK_time, add_cputime_RgAO, add_walltime_RgAO, log_profile_buildK_time + + reset_profile_buildK_time() + + ######## distribution task among different process ######## + + task_info = [] + + nIP_prim = mydf.nIP_Prim + + if use_mpi: + nIP_bunchsize = (nIP_prim + comm_size) // comm_size + bunch_begin = rank * nIP_bunchsize + bunch_end = min(nIP_prim, (rank + 1) * nIP_bunchsize) + + else: + bunch_begin = 0 + bunch_end = nIP_prim + + iIP = 0 + + for group_id, atm_ids in enumerate(group): + + naux_tmp = 0 + for atm_id in atm_ids: + naux_tmp += aoRg[atm_id].aoR.shape[1] + assert naux_tmp == aux_basis[group_id].shape[0] + assert iIP + naux_tmp <= nIP_prim + + ### judge whether [iIP, iIP+naux_tmp) intersects with [bunch_begin, bunch_end) ### + + if iIP >= bunch_end or iIP + naux_tmp <= bunch_begin: + task_info.append((None, None)) + else: + if bunch_begin <= iIP: + group_begin = 0 + else: + group_begin = bunch_begin - iIP + if bunch_end >= iIP + naux_tmp: + group_end = naux_tmp + else: + group_end = bunch_end - iIP + task_info.append((group_begin, group_end)) + + iIP += naux_tmp + + #if use_mpi: + # print("rank = ", rank, "task_info = ", task_info) + + 
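+ # A small worked example of the interval logic above (hypothetical numbers): + # with nIP_prim = 10 and comm_size = 2, nIP_bunchsize = (10 + 2) // 2 = 6, so + # rank 0 owns IP rows [0, 6) and rank 1 owns [6, 10); a group whose IPs span + # [4, 8) then receives task_info = (0, 2) on rank 0 and (2, 4) on rank 1, + # i.e. each rank builds only its own slice of that group's auxiliary rows. + 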
########################################################### + + for group_id, atm_ids in enumerate(group): + + if task_info[group_id][0] is None: + continue + + #if use_mpi: + # if group_id % comm_size != rank: + # continue + + naux_tmp = 0 + aoRg_holders = [] + for atm_id in atm_ids: + naux_tmp += aoRg[atm_id].aoR.shape[1] + aoRg_holders.append(aoRg[atm_id]) + assert naux_tmp == aux_basis[group_id].shape[0] + + aux_basis_tmp = aux_basis[group_id] + + #### 1. build the involved DM_RgR #### + + t1 = (logger.process_clock(), logger.perf_counter()) + + Density_RgAO_tmp = np.ndarray((nset, naux_tmp, nao), buffer=Density_RgAO_buf) + offset_density_RgAO_buf = Density_RgAO_tmp.size * Density_RgAO_buf.dtype.itemsize + Density_RgAO_tmp.ravel()[:] = 0.0 + Density_RgAO_tmp = __get_DensityMatrixonRgAO_qradratic(mydf, dm, aoRg_holders, "all", Density_RgAO_tmp, verbose=mydf.verbose) + + #build_k_buf.ravel()[:] = 0.0 + #build_VW_buf.ravel()[:] = 0.0 + + t2 = (logger.process_clock(), logger.perf_counter()) + + add_cputime_RgAO(t2[0] - t1[0]) + add_walltime_RgAO(t2[1] - t1[1]) + + #### 2. build the V matrix #### + + W_tmp = None + + for iset in range(nset): + + calculate_W_tmp = (iset == 0) + + #build_k_buf.ravel()[:] = 0.0 + #build_VW_buf.ravel()[:] = 0.0 + + _W_tmp = _isdf_get_K_direct_kernel_1( + mydf, coulG_real, + group_id, Density_RgAO_tmp[iset], + None, True, calculate_W_tmp, + ##### buffer ##### + buf_build_V, + build_VW_buf, + offset_now, + Density_RgR_buf, + Density_RgAO_buf, + offset_density_RgAO_buf, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + #maxsize_group_naux, + build_K_bunchsize, + ##### other info ##### + use_mpi=use_mpi, + begin_id=task_info[group_id][0], + end_id =task_info[group_id][1], + ##### out ##### + K1_or_2=K1[iset]) + + if calculate_W_tmp: + W_tmp = _W_tmp.copy() + + #build_k_buf.ravel()[:] = 0.0 + #build_VW_buf.ravel()[:] = 0.0 + + _isdf_get_K_direct_kernel_1( + mydf, coulG_real, + group_id, Density_RgAO_tmp[iset], + W_tmp, False, False, + ##### buffer ##### + buf_build_V, + build_VW_buf, + offset_now, + Density_RgR_buf, + Density_RgAO_buf, + offset_density_RgAO_buf, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + #maxsize_group_naux, + build_K_bunchsize, + ##### other info ##### + use_mpi=use_mpi, + begin_id=task_info[group_id][0], + end_id =task_info[group_id][1], + ##### out ##### + K1_or_2=K2[iset]) + + if (use_mpi and rank == 0) or not use_mpi: + log_profile_buildK_time(mydf) + + ######### finally delete the buffer ######### + + if use_mpi: + comm.Barrier() + + if use_mpi: + K1 = reduce(K1, root = 0) + K2 = reduce(K2, root = 0) + if rank == 0: + # K = K1 + K1.T - K2 + K1_packed = [] + K2_packed = [] + for iset in range(nset): + #K1 = pack_JK(K1, kmesh, nao_prim) + #K2 = pack_JK(K2, kmesh, nao_prim) + K1_packed.append(pack_JK(K1[iset], kmesh, nao_prim)) + K2_packed.append(pack_JK(K2[iset], kmesh, nao_prim)) + K1 = np.array(K1_packed) + K2 = np.array(K2_packed) + K = np.zeros_like(K1) + # K = K1 + K1.T - K2 + for iset in range(nset): + K[iset] = K1[iset] + K1[iset].T - (K2[iset] + K2[iset].T)/2.0 + else: + K = None + K = bcast(K, root = 0) + else: + # K = K1 + K1.T - K2 + K1_packed = [] + K2_packed = [] + for iset in range(nset): + #K1 = pack_JK(K1, kmesh, nao_prim) + #K2 = pack_JK(K2, kmesh, nao_prim) + K1_packed.append(pack_JK(K1[iset], kmesh, nao_prim)) + K2_packed.append(pack_JK(K2[iset], kmesh, nao_prim)) + K1 = np.array(K1_packed) + K2 = 
np.array(K2_packed) + K = np.zeros_like(K1) + # K = K1 + K1.T - K2 + for iset in range(nset): + K[iset] = K1[iset] + K1[iset].T - (K2[iset] + K2[iset].T)/2.0 + + del K1 + del K2 + + ############ transform back to K ############ + + if (use_mpi and rank == 0) or not use_mpi: + + K_res = [] + + for iset in range(nset): + Ktmp = _RowCol_FFT_bench(K[iset, :nao_prim, :], kmesh, inv=True, TransBra=False, TransKet=True) + K_res.append(Ktmp) + + K = np.asarray(K_res) + K *= nkpts + K *= ngrid / vol + + Res = [] + for iset in range(nset): + Res.append([]) + for i in range(np.prod(kmesh)): + for iset in range(nset): + Res[iset].append(K[iset, :, i*nao_prim:(i+1)*nao_prim]) + + K = np.array(Res) + + if use_mpi: + K = bcast(K, root=0) + + t2 = (logger.process_clock(), logger.perf_counter()) + + if (use_mpi and rank == 0) or not use_mpi: + _benchmark_time(t0, t2, "_contract_k_dm_quadratic_direct", mydf) + + return K + +def get_jk_dm_translation_symmetry(mydf, dm, hermi=1, kpt=np.zeros(3), + kpts_band=None, with_j=True, with_k=True, omega=None, + **kwargs): + + '''JK for given k-point''' + + direct = mydf.direct + use_mpi = mydf.use_mpi + + if use_mpi: + raise NotImplementedError("ISDF does not support use_mpi") + + if len(dm.shape) == 3: + assert dm.shape[0] <= 4 + else: + assert dm.ndim == 2 + dm = dm.reshape(1, dm.shape[0], dm.shape[1]) + + if hasattr(mydf, 'kmesh') and mydf.kmesh is not None: + from pyscf.isdf.isdf_tools_densitymatrix import symmetrize_dm + dm = symmetrize_dm(dm, mydf.kmesh) + + if use_mpi: + dm = bcast(dm, root=0) + + nset = dm.shape[0] + + #### perform the calculation #### + + exxdiv = kwargs.get("exxdiv", None) + + vj = np.zeros_like(dm) + vk = np.zeros_like(dm) + + if kpts_band is not None and abs(kpt-kpts_band).sum() > 1e-9: + raise NotImplementedError("ISDF does not support kpts_band != kpt") + + log = logger.Logger(mydf.stdout, mydf.verbose) + t1 = (logger.process_clock(), logger.perf_counter()) + + j_real = gamma_point(kpt) + k_real = gamma_point(kpt) and not np.iscomplexobj(dm) + + assert j_real + assert k_real + + mem_now = lib.current_memory()[0] + max_memory = max(2000, (mydf.max_memory - mem_now)) + + log.debug1('max_memory = %d MB (%d in use)', max_memory, mem_now) + + for iset in range(nset): + if with_j: + vj[iset] = _contract_j_dm_k_ls(mydf, dm[iset], use_mpi) + if with_k: + if mydf.direct: + raise NotImplementedError + else: + if mydf.with_robust_fitting: + vk[iset] = _get_k_kSym_robust_fitting_fast(mydf, dm[iset]) + else: + vk[iset] = _get_k_kSym(mydf, dm[iset]) + if exxdiv == 'ewald': + print("WARNING: ISDF does not support ewald") + + if exxdiv == 'ewald': + if np.allclose(kpt, np.zeros(3)): + # from pyscf.pbc.df.df_jk import _ewald_exxdiv_for_G0, _format_dms, _format_kpts_band, _format_jks + kpts = kpt.reshape(1,3) + kpts = np.asarray(kpts) + dm_kpts = dm.copy() + dm_kpts = lib.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + assert nset <= 4 + kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + nband = len(kpts_band) + assert nband == 1 + if is_zero(kpts_band) and is_zero(kpts): + vk = vk.reshape(nset,nband,nao,nao) + else: + raise NotImplementedError("ISDF does not support kpts_band != 0") + 
_ewald_exxdiv_for_G0(mydf.cell, kpts, dms, vk, kpts_band=kpts_band) + #vk = vk[0,0] + vk = vk.reshape(nset,nao,nao) + else: + logger.warn(mydf, 'get_jk_dm_k_quadratic: Exxdiv for k-point is not supported') + + t1 = log.timer('sr jk', *t1) + + return vj, vk + +def _get_k_kSym_direct_mimic_MPI(mydf, _dm, use_mpi=False): + + if use_mpi: + raise NotImplementedError + assert mydf.direct == True + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce + size = comm.Get_size() + + t1 = (logger.process_clock(), logger.perf_counter()) + t0 = (logger.process_clock(), logger.perf_counter()) + + ############# preprocess ############# + + dm = [] + nset = _dm.shape[0] + for iset in range(nset): + _dm_tmp, in_real_space = _preprocess_dm(mydf, _dm[iset]) + dm.append(_dm_tmp) + if in_real_space: + if np.prod(mydf.kmesh) == 1: + in_real_space = False + assert not in_real_space + dm = np.asarray(dm) + + if len(dm.shape) == 3: + assert dm.shape[0] <= 4 + else: + dm = dm.reshape(1, *dm.shape) + + aoR = mydf.aoR + aoRg = mydf.aoRg + + max_nao_involved = mydf.max_nao_involved + max_ngrid_involved = mydf.max_ngrid_involved + max_nIP_involved = mydf.max_nIP_involved + maxsize_group_naux = mydf.maxsize_group_naux + + ####### preparing the data ####### + + nset, nao = dm.shape[0], dm.shape[1] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + mesh_int32 = mesh + ngrid = np.prod(mesh) + + aoRg = mydf.aoRg + assert isinstance(aoRg, list) + aoR = mydf.aoR + assert isinstance(aoR, list) + + naux = mydf.naux + nao = cell.nao + nao_prim = mydf.nao_prim + aux_basis = mydf.aux_basis + kmesh = np.array(mydf.kmesh, dtype=np.int32) + nkpts = np.prod(kmesh) + + grid_ordering = mydf.grid_ID_ordered + + if hasattr(mydf, "coulG") == False: + if mydf.omega is not None: + assert mydf.omega >= 0.0 + raise NotImplementedError("coulG is not implemented yet.") + + coulG = mydf.coulG + coulG_real = coulG.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1).copy() + + mydf.allocate_k_buffer(nset) + build_k_buf = mydf.build_k_buf + build_VW_buf = mydf.build_VW_in_k_buf + + group = mydf.group + assert len(group) == len(aux_basis) + + ######### allocate buffer ######### + + Density_RgAO_buf = mydf.Density_RgAO_buf + + nThread = lib.num_threads() + bufsize_per_thread = (coulG_real.shape[0] * 2 + np.prod(mesh)) + # buf_build_V = np.ndarray((nThread, bufsize_per_thread), dtype=np.float64, buffer=build_VW_buf) + buf_build_V = np.ndarray((nThread, bufsize_per_thread), dtype=np.float64) + + offset_now = buf_build_V.size * buf_build_V.dtype.itemsize + + build_K_bunchsize = min(maxsize_group_naux, mydf._build_K_bunchsize) + + offset_build_now = 0 + offset_Density_RgR_buf = 0 + Density_RgR_buf = np.ndarray((build_K_bunchsize, ngrid), buffer=build_k_buf, offset=offset_build_now) + + offset_build_now += Density_RgR_buf.size * Density_RgR_buf.dtype.itemsize + offset_ddot_res_RgR_buf = offset_build_now + ddot_res_RgR_buf = np.ndarray((build_K_bunchsize, max_ngrid_involved), buffer=build_k_buf, offset=offset_ddot_res_RgR_buf) + + offset_build_now += ddot_res_RgR_buf.size * ddot_res_RgR_buf.dtype.itemsize + offset_K1_tmp1_buf = offset_build_now + K1_tmp1_buf = np.ndarray((maxsize_group_naux, nao), buffer=build_k_buf, offset=offset_K1_tmp1_buf) + + offset_build_now += K1_tmp1_buf.size * K1_tmp1_buf.dtype.itemsize + offset_K1_tmp1_ddot_res_buf = offset_build_now + K1_tmp1_ddot_res_buf = np.ndarray((maxsize_group_naux, nao), buffer=build_k_buf, offset=offset_K1_tmp1_ddot_res_buf) + + 
offset_build_now += K1_tmp1_ddot_res_buf.size * K1_tmp1_ddot_res_buf.dtype.itemsize + + offset_K1_final_ddot_buf = offset_build_now + K1_final_ddot_buf = np.ndarray((nao, nao), buffer=build_k_buf, offset=offset_K1_final_ddot_buf) + + ########### get involved C function ########### + + fn_packcol1 = getattr(libisdf, "_buildK_packcol", None) + assert fn_packcol1 is not None + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol2 is not None + fn_packadd_col = getattr(libisdf, "_buildK_packaddcol", None) + assert fn_packadd_col is not None + fn_packadd_row = getattr(libisdf, "_buildK_packaddrow", None) + assert fn_packadd_row is not None + + ordered_ao_ind = np.arange(nao) + + ######### begin work ######### + + K1 = np.zeros((nset, nao_prim, nao), dtype=np.float64) # contribution from V matrix + K2 = np.zeros((nset, nao_prim, nao), dtype=np.float64) # contribution from W matrix + + from pyscf.isdf._isdf_local_K_direct import reset_profile_buildK_time, add_cputime_RgAO, add_walltime_RgAO, log_profile_buildK_time + + reset_profile_buildK_time() + + ######## distribution task among different process ######## + + if hasattr(mydf, "fake_comm_size"): + COMM_SIZE = mydf.fake_comm_size + else: + COMM_SIZE = 2 + + print("COMM_SIZE = ", COMM_SIZE) + + for rank in range(COMM_SIZE): + + K1_tmp = np.zeros((nset, nao_prim, nao), dtype=np.float64) + K2_tmp = np.zeros((nset, nao_prim, nao), dtype=np.float64) + + task_info = [] + + nIP_prim = mydf.nIP_Prim + nIP_bunchsize = (nIP_prim + COMM_SIZE) // COMM_SIZE + bunch_begin = rank * nIP_bunchsize + bunch_end = min(nIP_prim, (rank + 1) * nIP_bunchsize) + + iIP = 0 + for group_id, atm_ids in enumerate(group): + + naux_tmp = 0 + for atm_id in atm_ids: + naux_tmp += aoRg[atm_id].aoR.shape[1] + assert naux_tmp == aux_basis[group_id].shape[0] + assert iIP + naux_tmp <= nIP_prim + + ### judge whether [iIP, iIP+naux_tmp) intersects with [bunch_begin, bunch_end) ### + + if iIP >= bunch_end or iIP + naux_tmp <= bunch_begin: + task_info.append((None, None)) + else: + if bunch_begin <= iIP: + group_begin = 0 + else: + group_begin = bunch_begin - iIP + if bunch_end >= iIP + naux_tmp: + group_end = naux_tmp + else: + group_end = bunch_end - iIP + task_info.append((group_begin, group_end)) + + iIP += naux_tmp + + if use_mpi: + print("rank = ", rank, "task_info = ", task_info) + + ########################################################### + + for group_id, atm_ids in enumerate(group): + + if task_info[group_id][0] is None: + continue + + #if use_mpi: + # if group_id % comm_size != rank: + # continue + + naux_tmp = 0 + aoRg_holders = [] + for atm_id in atm_ids: + naux_tmp += aoRg[atm_id].aoR.shape[1] + aoRg_holders.append(aoRg[atm_id]) + assert naux_tmp == aux_basis[group_id].shape[0] + + aux_basis_tmp = aux_basis[group_id] + + #### 1. build the involved DM_RgR #### + + t1 = (logger.process_clock(), logger.perf_counter()) + + Density_RgAO_tmp = np.ndarray((nset, naux_tmp, nao), buffer=Density_RgAO_buf) + offset_density_RgAO_buf = Density_RgAO_tmp.size * Density_RgAO_buf.dtype.itemsize + Density_RgAO_buf.ravel()[:] = 0.0 + # Density_RgAO_tmp.ravel()[:] = 0.0 + Density_RgAO_tmp = __get_DensityMatrixonRgAO_qradratic(mydf, dm, aoRg_holders, "all", Density_RgAO_tmp, verbose=mydf.verbose) + + #build_k_buf.ravel()[:] = 0.0 + #build_VW_buf.ravel()[:] = 0.0 + + t2 = (logger.process_clock(), logger.perf_counter()) + + add_cputime_RgAO(t2[0] - t1[0]) + add_walltime_RgAO(t2[1] - t1[1]) + + #### 2. 
build the V matrix #### + + W_tmp = None + + for iset in range(nset): + + calculate_W_tmp = (iset == 0) + + build_k_buf.ravel()[:] = 0.0 + build_VW_buf.ravel()[:] = 0.0 + + _W_tmp = _isdf_get_K_direct_kernel_1( + mydf, coulG_real, + group_id, Density_RgAO_tmp[iset], + None, True, calculate_W_tmp, + ##### buffer ##### + buf_build_V, + build_VW_buf, + offset_now, + Density_RgR_buf, + Density_RgAO_buf, + offset_density_RgAO_buf, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + #maxsize_group_naux, + build_K_bunchsize, + ##### other info ##### + use_mpi=use_mpi, + begin_id=task_info[group_id][0], + end_id =task_info[group_id][1], + ##### out ##### + K1_or_2=K1_tmp[iset]) + + if calculate_W_tmp: + W_tmp = _W_tmp.copy() + + build_k_buf.ravel()[:] = 0.0 + build_VW_buf.ravel()[:] = 0.0 + + _isdf_get_K_direct_kernel_1( + mydf, coulG_real, + group_id, Density_RgAO_tmp[iset], + W_tmp, False, False, + ##### buffer ##### + buf_build_V, + build_VW_buf, + offset_now, + Density_RgR_buf, + Density_RgAO_buf, + offset_density_RgAO_buf, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + #maxsize_group_naux, + build_K_bunchsize, + ##### other info ##### + use_mpi=use_mpi, + begin_id=task_info[group_id][0], + end_id =task_info[group_id][1], + ##### out ##### + K1_or_2=K2_tmp[iset]) + + log_profile_buildK_time(mydf) + + ### reduce ### + + K1 += K1_tmp + K2 += K2_tmp + + ######### finally delete the buffer ######### + + # K = K1 + K1.T - K2 + K1_packed = [] + K2_packed = [] + for iset in range(nset): + K1_packed.append(pack_JK(K1[iset], kmesh, nao_prim)) + K2_packed.append(pack_JK(K2[iset], kmesh, nao_prim)) + K1 = np.array(K1_packed) + K2 = np.array(K2_packed) + K = np.zeros_like(K1) + # K = K1 + K1.T - K2 + for iset in range(nset): + K[iset] = K1[iset] + K1[iset].T - (K2[iset] + K2[iset].T)/2.0 + + del K1 + del K2 + + ############ transform back to K ############ + + K_res = [] + for iset in range(nset): + Ktmp = _RowCol_FFT_bench(K[iset, :nao_prim, :], kmesh, inv=True, TransBra=False, TransKet=True) + K_res.append(Ktmp) + K = np.asarray(K_res) + K *= nkpts + K *= ngrid / vol + Res = [] + for iset in range(nset): + Res.append([]) + for i in range(np.prod(kmesh)): + for iset in range(nset): + Res[iset].append(K[iset, :, i*nao_prim:(i+1)*nao_prim]) + K = np.array(Res) + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t0, t2, "_contract_k_dm_quadratic_direct", mydf) + + return K \ No newline at end of file diff --git a/pyscf/isdf/isdf_posthf.py b/pyscf/isdf/isdf_posthf.py new file mode 100644 index 000000000..85fd519a2 --- /dev/null +++ b/pyscf/isdf/isdf_posthf.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +############ sys module ############ + +import numpy +import numpy as np +import ctypes + +############ pyscf module ############ + +import pyscf +from pyscf import lib +from pyscf import ao2mo +from pyscf.ao2mo.incore import iden_coeffs +from pyscf.pbc import tools +from pyscf.pbc.lib import kpts_helper +from pyscf.lib import logger +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, unique +from pyscf import __config__ +from pyscf.pbc.df.fft_ao2mo import _format_kpts, _iskconserv, _contract_compact +import pyscf.pbc.gto as pbcgto +from pyscf.cc.rccsd import _ChemistsERIs, RCCSD +libpbc = lib.load_library('libpbc') + +############ isdf utils ############ + +from pyscf.isdf.isdf_jk import _benchmark_time +from pyscf.isdf import isdf_local as ISDF +from pyscf.isdf.isdf_tools_cell import build_supercell, build_supercell_with_partition +from pyscf.isdf.isdf_ao2mo import LS_THC, LS_THC_eri + +#################################### + +### post-HF with ISDF ERIs (NOT THC-POSTHF!) + +#################################### + +############ subroutines ---- deal with CC ############ + +def _make_isdf_eris_incore(mycc, my_isdf:ISDF.PBC_ISDF_Info_Quad, mo_coeff=None): + + cput0 = (logger.process_clock(), logger.perf_counter()) + eris = _ChemistsERIs() + eris._common_init_(mycc, mo_coeff) + nocc = eris.nocc + nmo = eris.fock.shape[0] + + eri1 = my_isdf.ao2mo(mo_coeff, compact=False).reshape(nmo,nmo,nmo,nmo) + eris.oooo = eri1[:nocc,:nocc,:nocc,:nocc].copy() + eris.ovoo = eri1[:nocc,nocc:,:nocc,:nocc].copy() + eris.ovov = eri1[:nocc,nocc:,:nocc,nocc:].copy() + eris.oovv = eri1[:nocc,:nocc,nocc:,nocc:].copy() + eris.ovvo = eri1[:nocc,nocc:,nocc:,:nocc].copy() + eris.ovvv = eri1[:nocc,nocc:,nocc:,nocc:].copy() + eris.vvvv = eri1[nocc:,nocc:,nocc:,nocc:].copy() + logger.timer(mycc, 'CCSD integral transformation', *cput0) + + cput1 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(cput0, cput1, "CCSD integral transformation", my_isdf) + + return eris + +def RCCSD_isdf(mf, frozen=0, mo_coeff=None, mo_occ=None, run=True, cc2=False): + mycc = RCCSD(mf, frozen=frozen, mo_coeff=mo_coeff, mo_occ=mo_occ) + mycc.cc2 = cc2 + if mo_coeff is None: + mo_coeff = mf.mo_coeff + eris_ccsd = _make_isdf_eris_incore(mycc, mf.with_df, mo_coeff=mo_coeff) + if run: + mycc.kernel(eris=eris_ccsd) + return mycc, eris_ccsd + +if __name__ == '__main__': + + for c in [15]: + for N in [1]: + + print("Testing c = ", c, "N = ", N, "...") + + cell = pbcgto.Cell() + boxlen = 3.5668 + cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + + cell.atom = [ + ['C', (0. , 0. , 0. )], + ['C', (0.8917 , 0.8917 , 0.8917)], + ['C', (1.7834 , 1.7834 , 0. )], + ['C', (2.6751 , 2.6751 , 0.8917)], + ['C', (1.7834 , 0. , 1.7834)], + ['C', (2.6751 , 0.8917 , 2.6751)], + ['C', (0. 
, 1.7834 , 1.7834)], + ['C', (0.8917 , 2.6751 , 2.6751)], + ] + + cell.basis = 'gth-szv' + cell.pseudo = 'gth-pade' + cell.verbose = 10 + cell.ke_cutoff = 128 + cell.max_memory = 800 # 800 Mb + cell.precision = 1e-8 # integral precision + cell.use_particle_mesh_ewald = True + + verbose = 10 + + prim_cell = build_supercell(cell.atom, cell.a, Ls = [1,1,1], ke_cutoff=cell.ke_cutoff, basis=cell.basis, pseudo=cell.pseudo, verbose=10) + prim_partition = [[0,1,2,3], [4,5,6,7]] + prim_mesh = prim_cell.mesh + + Ls = [1, 1, N] + Ls = np.array(Ls, dtype=np.int32) + mesh = [Ls[0] * prim_mesh[0], Ls[1] * prim_mesh[1], Ls[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell, group_partition = build_supercell_with_partition( + cell.atom, cell.a, mesh=mesh, + Ls=Ls, + basis=cell.basis, + pseudo=cell.pseudo, + partition=prim_partition, ke_cutoff=cell.ke_cutoff, verbose=verbose) + + ####### bench mark MP2 ####### + + import numpy + from pyscf.pbc import gto, scf, mp + + mf = scf.RHF(cell) + # mf.kernel() + mypt = mp.RMP2(mf) + # mypt.kernel() + + ####### isdf MP2 can perform directly! ####### + + myisdf = ISDF.PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, aoR_cutoff=1e-8, direct=False) + myisdf.verbose = 10 + myisdf.build_IP_local(c=c, m=5, group=group_partition, Ls=[Ls[0]*10, Ls[1]*10, Ls[2]*10]) + myisdf.build_auxiliary_Coulomb(debug=True) + + mf_isdf = scf.RHF(cell) + myisdf.direct_scf = mf_isdf.direct_scf + mf_isdf.with_df = myisdf + mf_isdf.max_cycle = 8 + mf_isdf.conv_tol = 1e-8 + mf_isdf.kernel() + + isdf_pt = mp.RMP2(mf_isdf) + isdf_pt.kernel() + + mf_isdf.with_df.LS_THC_recompression(mf_isdf.with_df.aoRg_full()[0], force_LS_THC=False) + isdf_pt = mp.RMP2(mf_isdf) + isdf_pt.kernel() + + ######################## CCSD ######################## + + ## benchmark ## + + mycc = pyscf.cc.CCSD(mf) + # mycc.kernel() + + mycc_isdf, eris_ccsd = RCCSD_isdf(mf_isdf, run=False, cc2=False) + mycc_isdf.kernel(eris=eris_ccsd) + + eip,cip = mycc_isdf.ipccsd(nroots=2, eris=eris_ccsd) + eea,cea = mycc_isdf.eaccsd(nroots=2, eris=eris_ccsd) + + print("eip = ", eip) + print("eea = ", eea) + + ####### THC-DF ####### + + _myisdf = ISDF.PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, aoR_cutoff=1e-8, direct=False, use_occ_RI_K=False) + _myisdf.build_IP_local(c=15, m=5, group=group_partition, Ls=[Ls[0]*10, Ls[1]*10, Ls[2]*10]) + R,_ = _myisdf.aoRg_full() + Z = LS_THC(myisdf, R) + eri_LS_THC = LS_THC_eri(Z, R) + print("eri_LS_THC = ", eri_LS_THC[0,0,0,0]) + eri_benchmark = myisdf.get_eri(compact=False) + print("eri_benchmark = ", eri_benchmark[0,0,0,0]) + diff = np.linalg.norm(eri_LS_THC - eri_benchmark) + print("diff = ", diff/np.sqrt(eri_benchmark.size)) \ No newline at end of file diff --git a/pyscf/isdf/isdf_tools_cell.py b/pyscf/isdf/isdf_tools_cell.py new file mode 100644 index 000000000..5bd1da04d --- /dev/null +++ b/pyscf/isdf/isdf_tools_cell.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +import sys + +import numpy +import numpy as np +import copy + +from pyscf.pbc.gto import Cell +import pyscf.pbc.gto as pbcgto + + +def build_supercell(prim_atm, + prim_a, + spin=0, + charge=0, + mesh=None, + Ls = [1,1,1], + basis='gth-dzvp', + pseudo='gth-pade', + ke_cutoff=70, + max_memory=2000, + precision=1e-8, + use_particle_mesh_ewald=True, + verbose=4): + + Cell = pbcgto.Cell() + + assert prim_a[0, 1] == 0.0 + assert prim_a[0, 2] == 0.0 + assert prim_a[1, 0] == 0.0 + assert prim_a[1, 2] == 0.0 + assert prim_a[2, 0] == 0.0 + assert prim_a[2, 1] == 0.0 + + Supercell_a = prim_a * np.array(Ls) + Cell.a = Supercell_a + + atm = [] + + for ix in range(Ls[0]): + for iy in range(Ls[1]): + for iz in range(Ls[2]): + shift = [ix * prim_a[0, 0], iy * prim_a[1, 1], iz * prim_a[2, 2]] + for atom in prim_atm: + atm.append([atom[0], (atom[1][0] + shift[0], atom[1][1] + shift[1], atom[1][2] + shift[2])]) + + Cell.atom = atm + Cell.basis = basis + Cell.pseudo = pseudo + Cell.ke_cutoff = ke_cutoff + Cell.max_memory = max_memory + Cell.precision = precision + Cell.use_particle_mesh_ewald = use_particle_mesh_ewald + Cell.verbose = verbose + Cell.unit = 'angstorm' + Cell.spin = spin + Cell.charge = charge + + Cell.build(mesh=mesh) + + return Cell + +def build_primitive_cell(supercell:Cell, kmesh): + + Cell = pbcgto.Cell() + + # assert prim_a[0, 1] == 0.0 + # assert prim_a[0, 2] == 0.0 + # assert prim_a[1, 0] == 0.0 + # assert prim_a[1, 2] == 0.0 + # assert prim_a[2, 0] == 0.0 + # assert prim_a[2, 1] == 0.0 + + prim_a = np.array( [supercell.a[0]/kmesh[0], supercell.a[1]/kmesh[1], supercell.a[2]/kmesh[2]], dtype=np.float64 ) + + #print("supercell.a = ", supercell.a) + #print("prim_a = ", prim_a) + + Cell.a = prim_a + + atm = supercell.atom[:supercell.natm//np.prod(kmesh)] + + Cell.atom = atm + Cell.basis = supercell.basis + Cell.pseudo = supercell.pseudo + Cell.ke_cutoff = supercell.ke_cutoff + Cell.max_memory = supercell.max_memory + Cell.precision = supercell.precision + Cell.use_particle_mesh_ewald = supercell.use_particle_mesh_ewald + Cell.verbose = supercell.verbose + Cell.unit = supercell.unit + + mesh = np.array(supercell.mesh) // np.array(kmesh) + + Cell.build(mesh=mesh) + + return Cell + +def build_supercell_with_partition(prim_atm, + prim_a, + mesh=None, + Ls = [1,1,1], + partition = None, + basis='gth-dzvp', + pseudo='gth-pade', + ke_cutoff=70, + max_memory=2000, + precision=1e-8, + use_particle_mesh_ewald=True, + verbose=4): + + cell = build_supercell(prim_atm, prim_a, mesh=mesh, Ls=Ls, basis=basis, pseudo=pseudo, ke_cutoff=ke_cutoff, max_memory=max_memory, precision=precision, use_particle_mesh_ewald=use_particle_mesh_ewald, verbose=verbose) + + natm_prim = len(prim_atm) + + if partition is None: + partition = [] + for i in range(natm_prim): + partition.append([i]) + + partition_supercell = [] + + for ix in range(Ls[0]): + for iy in range(Ls[1]): + for iz in range(Ls[2]): + cell_id = ix * Ls[1] * Ls[2] + iy * Ls[2] + iz + for sub_partition in partition: + partition_supercell.append([x + cell_id * natm_prim for x in sub_partition]) + + return cell, partition_supercell diff --git a/pyscf/isdf/isdf_tools_densitymatrix.py b/pyscf/isdf/isdf_tools_densitymatrix.py new file mode 100644 index 000000000..a5bfd0cf4 --- /dev/null +++ b/pyscf/isdf/isdf_tools_densitymatrix.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# + +import sys + +import numpy +import numpy as np + +from pyscf.pbc.gto import Cell +import pyscf.pbc.gto as pbcgto + +def symmetrize_dm(dm:np.ndarray, Ls): + ''' + + generate translation symmetrized density matrix (by average) + + Args : + dm : np.ndarray, density matrix, shape = (nao, nao) + Ls : list, supercell dimension, shape = (3,), or kmesh in k-sampling + + Returns : + dm_symm : np.ndarray, symmetrized density matrix, shape = (nao, nao) + ''' + + is_single_dm = False + + if dm.ndim == 2: + is_single_dm = True + dm = dm.reshape(1, dm.shape[0], dm.shape[1]) + + ncell = np.prod(Ls) + nao = dm.shape[1] + nset = dm.shape[0] + nao_prim = nao // ncell + dm_symm = np.zeros((nset,nao,nao), dtype=dm.dtype) + + for i in range(Ls[0]): + for j in range(Ls[1]): + for k in range(Ls[2]): + + dm_symmized_buf = np.zeros((nset,nao_prim,nao_prim), dtype=dm.dtype) + + for i_row in range(Ls[0]): + for j_row in range(Ls[1]): + for k_row in range(Ls[2]): + + loc_row = i_row * Ls[1] * Ls[2] + j_row * Ls[2] + k_row + loc_col = ((i + i_row) % Ls[0]) * Ls[1] * Ls[2] + ((j + j_row) % Ls[1]) * Ls[2] + (k + k_row) % Ls[2] + + b_begin = loc_row * nao_prim + b_end = (loc_row + 1) * nao_prim + + k_begin = loc_col * nao_prim + k_end = (loc_col + 1) * nao_prim + + dm_symmized_buf += dm[:,b_begin:b_end, k_begin:k_end] + + dm_symmized_buf /= ncell + + for i_row in range(Ls[0]): + for j_row in range(Ls[1]): + for k_row in range(Ls[2]): + + loc_row = i_row * Ls[1] * Ls[2] + j_row * Ls[2] + k_row + loc_col = ((i + i_row) % Ls[0]) * Ls[1] * Ls[2] + ((j + j_row) % Ls[1]) * Ls[2] + (k + k_row) % Ls[2] + + b_begin = loc_row * nao_prim + b_end = (loc_row + 1) * nao_prim + + k_begin = loc_col * nao_prim + k_end = (loc_col + 1) * nao_prim + + dm_symm[:,b_begin:b_end, k_begin:k_end] = dm_symmized_buf + + if is_single_dm: + return dm_symm[0] + else: + return dm_symm + +def pack_JK(input_mat:np.ndarray, Ls, nao_prim, output=None): + + ''' + pack matrix in real space + ''' + + assert input_mat.dtype == np.float64 + ncell = np.prod(Ls) + # print("ncell = ", ncell) + # print("Ls = ", Ls) + # print("nao_prim = ", nao_prim) + # print("input_mat.shape = ", input_mat.shape) + assert input_mat.shape[0] == nao_prim + assert input_mat.shape[1] == nao_prim * ncell + + if output is None: + output = np.zeros((ncell*nao_prim, ncell*nao_prim), dtype=np.float64) + else: + assert output.shape == (ncell*nao_prim, ncell*nao_prim) + + for ix_row in range(Ls[0]): + for iy_row in range(Ls[1]): + for iz_row in range(Ls[2]): + + loc_row = ix_row * Ls[1] * Ls[2] + iy_row * Ls[2] + iz_row + + b_begin = loc_row * nao_prim + b_end = (loc_row + 1) * nao_prim + + for ix_col in range(Ls[0]): + for iy_col in range(Ls[1]): + for iz_col in range(Ls[2]): + + loc_col = ix_col * Ls[1] * Ls[2] + iy_col * Ls[2] + iz_col + + k_begin = loc_col * nao_prim + k_end = (loc_col + 1) * nao_prim + + ix = (ix_col - ix_row) % Ls[0] + iy = (iy_col - iy_row) % Ls[1] + iz = (iz_col - iz_row) % Ls[2] + + loc_col2 
= ix * Ls[1] * Ls[2] + iy * Ls[2] + iz + + k_begin2 = loc_col2 * nao_prim + k_end2 = (loc_col2 + 1) * nao_prim + + output[b_begin:b_end, k_begin:k_end] = input_mat[:, k_begin2:k_end2] + + return output + +def pack_JK_in_FFT_space(input_mat:np.ndarray, kmesh, nao_prim, output=None): + + ''' + pack matrix in k-space + ''' + + ncomplex = kmesh[0] * kmesh[1] * (kmesh[2] // 2 + 1) + assert input_mat.dtype == np.complex128 + assert input_mat.shape[0] == nao_prim + #print("input_mat.shape = ", input_mat.shape) + #print("nao_prim = ", nao_prim) + #print("ncomplex = ", ncomplex) + assert input_mat.shape[1] == nao_prim * ncomplex + + nkpts = np.prod(kmesh) + + if output is None: + output = np.zeros((nao_prim, nao_prim*nkpts), dtype=np.complex128) + else: + assert output.shape == (nao_prim, nao_prim*nkpts) or output.shape == (nkpts, nao_prim, nao_prim) + + output = output.reshape(nkpts, nao_prim, nao_prim) + + loc = 0 + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2] // 2 + 1): + loc1 = ix * kmesh[1] * kmesh[2] + iy * kmesh[2] + iz + #loc2 = ix * kmesh[1] * kmesh[2] + iy * kmesh[2] + (kmesh[2] - iz) % kmesh[2] + loc2 = (kmesh[0] - ix) % kmesh[0] * kmesh[1] * kmesh[2] + (kmesh[1] - iy) % kmesh[1] * kmesh[2] + (kmesh[2] - iz) % kmesh[2] + if loc1 == loc2: + output[loc1] = input_mat[:, loc*nao_prim:(loc+1)*nao_prim] + imag_part = np.imag(output[loc1]) + if np.max(np.abs(imag_part)) > 1e-8: + print("Warning: max abs of imag_part = ", np.max(np.abs(imag_part))) + else: + output[loc1] = input_mat[:, loc*nao_prim:(loc+1)*nao_prim] + output[loc2] = input_mat[:, loc*nao_prim:(loc+1)*nao_prim].conj() + loc += 1 + + return output + diff --git a/pyscf/isdf/isdf_tools_kSampling.py b/pyscf/isdf/isdf_tools_kSampling.py new file mode 100644 index 000000000..26901c96b --- /dev/null +++ b/pyscf/isdf/isdf_tools_kSampling.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +import numpy as np +from pyscf import lib +from pyscf.pbc.lib.kpts import KPoints +from pyscf.gto.mole import * + +def _extract_grid_primitive_cell(cell_a, mesh, Ls, coords): + """ + Extract the primitive-cell grid information from the supercell grid information. + """ + + assert cell_a[0, 1] == 0.0 + assert cell_a[0, 2] == 0.0 + assert cell_a[1, 0] == 0.0 + assert cell_a[1, 2] == 0.0 + assert cell_a[2, 0] == 0.0 + assert cell_a[2, 1] == 0.0 + + ngrids = np.prod(mesh) + assert ngrids == coords.shape[0] + + Lx = Ls[0] + Ly = Ls[1] + Lz = Ls[2] + + nx, ny, nz = mesh + + coords = coords.reshape(nx, ny, nz, 3) + + assert nx % Lx == 0 + assert ny % Ly == 0 + assert nz % Lz == 0 + + nx_prim = nx // Lx + ny_prim = ny // Ly + nz_prim = nz // Lz + + ngrids_prim = nx_prim * ny_prim * nz_prim + + res_dict = {} + + res = [] + + prim_grid = coords[:nx_prim, :ny_prim, :nz_prim].reshape(-1, 3) + + for ix in range(Lx): + for iy in range(Ly): + for iz in range(Lz): + x_0 = ix * nx_prim + x_1 = (ix + 1) * nx_prim + y_0 = iy * ny_prim + y_1 = (iy + 1) * ny_prim + z_0 = iz * nz_prim + z_1 = (iz + 1) * nz_prim + + grid_tmp = coords[x_0:x_1, y_0:y_1, z_0:z_1].reshape(-1, 3) + + shift_bench = np.zeros((3), dtype=np.float64) + shift_bench[0] = ix * cell_a[0, 0] / Lx + shift_bench[1] = iy * cell_a[1, 1] / Ly + shift_bench[2] = iz * cell_a[2, 2] / Lz + + shifts = grid_tmp - prim_grid + + # sanity check: every point of this box is the primitive grid shifted by + # shift_bench, possibly up to whole lattice vectors; do not shift grid_tmp + # itself, to avoid numerical error + for ID in range(shifts.shape[0]): + shift = shifts[ID] + if not np.allclose(shift, shift_bench): + tmp = shift - shift_bench + mx = round(tmp[0] / cell_a[0, 0]) + my = round(tmp[1] / cell_a[1, 1]) + mz = round(tmp[2] / cell_a[2, 2]) + assert np.allclose(tmp[0], mx * cell_a[0, 0]) + assert np.allclose(tmp[1], my * cell_a[1, 1]) + assert np.allclose(tmp[2], mz * cell_a[2, 2]) + + res.append(grid_tmp) + res_dict[(ix, iy, iz)] = grid_tmp + res = np.array(res).reshape(-1, 3) + return res, res_dict
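+ # Illustrative sketch (assumed shapes only) of the extraction above: for a + # supercell mesh (nx, ny, nz) tiled by Ls = [Lx, Ly, Lz] primitive cells, the + # grid block of box (ix, iy, iz) is simply the sub-cube + # coords.reshape(nx, ny, nz, 3)[ix*nx_prim:(ix+1)*nx_prim, + # iy*ny_prim:(iy+1)*ny_prim, + # iz*nz_prim:(iz+1)*nz_prim].reshape(-1, 3), + # i.e. the primitive grid shifted by (ix*a_x/Lx, iy*a_y/Ly, iz*a_z/Lz). + 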
+def _split_partition(Voroini_partition, mesh, Ls): + """ + Split a Voronoi partition of the supercell grid into primitive-cell blocks. + """ + ngrids = np.prod(mesh) + assert ngrids == Voroini_partition.size + + Lx = Ls[0] + Ly = Ls[1] + Lz = Ls[2] + + nx, ny, nz = mesh + + Voroini_partition_reshaped = Voroini_partition.reshape(nx, ny, nz) + + assert nx % Lx == 0 + assert ny % Ly == 0 + assert nz % Lz == 0 + + nx_prim = nx // Lx + ny_prim = ny // Ly + nz_prim = nz // Lz + + ngrids_prim = nx_prim * ny_prim * nz_prim + + res_dict = {} + + for ix in range(Lx): + for iy in range(Ly): + for iz in range(Lz): + x_0 = ix * nx_prim + x_1 = (ix + 1) * nx_prim + y_0 = iy * ny_prim + y_1 = (iy + 1) * ny_prim + z_0 = iz * nz_prim + z_1 = (iz + 1) * nz_prim + + grid_tmp = Voroini_partition_reshaped[x_0:x_1, y_0:y_1, z_0:z_1].reshape(-1) + res_dict[(ix, iy, iz)] = grid_tmp + + return res_dict + +def _RowCol_FFT_bench(input, Ls, inv=False, TransBra = True, TransKet = True): + """ + Block FFT of a 2D array whose rows (bra) and/or columns (ket) are grouped + into ncell = prod(Ls) translation blocks. + """ + + A = input + ncell = np.prod(Ls) + + if TransKet: + assert A.shape[1] % ncell == 0 + if TransBra: + assert A.shape[0] % ncell == 0 + + NPOINT_KET = A.shape[1] // ncell + + if TransKet: + A = A.reshape(A.shape[0], -1, NPOINT_KET) # nbra, nBox, NPOINT + A = A.transpose(0, 2, 1) # nbra, NPOINT, nBox + shape_tmp = A.shape + A = A.reshape(A.shape[0] * NPOINT_KET, *Ls) + # perform 3d fft + if inv: + A = np.fft.ifftn(A, axes=(1, 2, 3)) + else: + A = np.fft.fftn(A, axes=(1, 2, 3)) + A = A.reshape(shape_tmp) + A = A.transpose(0, 2, 1) + A = A.reshape(A.shape[0], -1) + print("finish transform ket") + # transform bra + NPOINT_BRA = A.shape[0] // ncell + if TransBra: + A = A.reshape(-1, NPOINT_BRA, A.shape[1]) + A = A.transpose(1, 2, 0) + shape_tmp = A.shape + A = A.reshape(-1, *Ls) + if inv: + A = np.fft.fftn(A, axes=(1, 2, 3)) + else: + A = np.fft.ifftn(A, axes=(1, 2, 3)) + A = A.reshape(shape_tmp) + A = A.transpose(2, 0, 1) + A = A.reshape(-1, A.shape[2]) + print("finish transform bra") + return A + +def _RowCol_FFT_ColFull_bench(input, Ls, mesh): + """ + Same as _RowCol_FFT_bench, but the columns run over the full supercell grid. + """ + A = input + ncell = np.prod(Ls) + nGrids = np.prod(mesh) + assert A.shape[1] == nGrids + assert A.shape[0] % ncell == 0 + A = A.reshape(A.shape[0], *mesh) + # perform 3d fft + A = np.fft.fftn(A, axes=(1, 2, 3)) + A = A.reshape(A.shape[0], -1) + print("finish transform ket") + # transform bra + NPOINT_BRA = A.shape[0] // ncell + A = A.reshape(-1, NPOINT_BRA, A.shape[1]) + A = A.transpose(1, 2, 0) + shape_tmp = A.shape + A = A.reshape(-1, *Ls) + A = np.fft.ifftn(A, axes=(1, 2, 3)) + A = A.reshape(shape_tmp) + A = A.transpose(2, 0, 1) + A = A.reshape(-1, A.shape[2]) + print("finish transform bra") + return A + +def _kmesh_to_Kpoints(cell, mesh): + + from pyscf.pbc.lib.kpts import KPoints + + kpts = [] + + for i in range(mesh[0]): + for j in range(mesh[1]): + for k in range(mesh[2]): + kpts.append([1.0/float(mesh[0]) * float(i), + 1.0/float(mesh[1]) * float(j), + 1.0/float(mesh[2]) * float(k)]) + + kpts = np.array(kpts) + 
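+ # Usage sketch (hypothetical 2x2x2 mesh): _kmesh_to_Kpoints(cell, [2, 2, 2]) + # collects the eight scaled points (i/2, j/2, k/2) for i, j, k in {0, 1}; note + # that pyscf's KPoints takes absolute k-points, so scaled coordinates would + # normally be converted via cell.get_abs_kpts first. + 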
+ return KPoints(cell, kpts) \ No newline at end of file diff --git a/pyscf/isdf/isdf_tools_linearop.py b/pyscf/isdf/isdf_tools_linearop.py new file mode 100644 index 000000000..c09a95aa9 --- /dev/null +++ b/pyscf/isdf/isdf_tools_linearop.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# + +############ sys module ############ + +import copy +import numpy as np +import numpy +import scipy +import ctypes, sys +from pyscf import lib +libisdf = lib.load_library('libisdf') + +def square_inPlace(a): + + assert(a.dtype == numpy.double) + fn = getattr(libisdf, "NPdsquare_inPlace", None) + assert(fn is not None) + + fn(a.ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(a.size)) + + return a + +def d_i_ij_ij(a, b, out=None): + assert(a.dtype == b.dtype) + assert(a.shape[0] == b.shape[0]) + assert(a.ndim == 1) + assert(b.ndim == 2) + + if a.dtype != numpy.double: + raise NotImplementedError + else: + fn = getattr(libisdf, "NPd_i_ij_ij", None) + assert(fn is not None) + + if out is None: + out = numpy.empty_like(b) # the result of i,ij->ij has the shape of b + + fn(out.ctypes.data_as(ctypes.c_void_p), + a.ctypes.data_as(ctypes.c_void_p), + b.ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(b.shape[0]), + ctypes.c_size_t(b.shape[1])) + + return out + +def d_ij_j_ij(a, b, out=None): + assert(a.dtype == b.dtype) + assert(a.shape[1] == b.shape[0]) + assert(a.ndim == 2) + assert(b.ndim == 1) + + if a.dtype != numpy.double: + raise NotImplementedError + else: + fn = getattr(libisdf, "NPd_ij_j_ij", None) + assert(fn is not None) + + if out is None: + out = numpy.empty_like(a) + + fn(out.ctypes.data_as(ctypes.c_void_p), + a.ctypes.data_as(ctypes.c_void_p), + b.ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(a.shape[0]), + ctypes.c_size_t(a.shape[1])) + + return out + +def cwise_mul(a, b, out=None): + assert(a.size == b.size) + assert(a.dtype == b.dtype) + + if a.dtype != numpy.double: + raise NotImplementedError + else: + fn = getattr(libisdf, "NPdcwisemul", None) + assert(fn is not None) + + if out is None: + out = numpy.empty_like(a) + + fn(out.ctypes.data_as(ctypes.c_void_p), + a.ctypes.data_as(ctypes.c_void_p), + b.ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(a.size)) + + return out \ No newline at end of file diff --git a/pyscf/isdf/isdf_tools_local.py b/pyscf/isdf/isdf_tools_local.py new file mode 100644 index 000000000..30ed5ecc0 --- /dev/null +++ b/pyscf/isdf/isdf_tools_local.py @@ -0,0 +1,1139 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +########## pyscf module ########## + +import copy +from functools import reduce +import numpy as np +import pyscf +from pyscf import lib +import pyscf.pbc.gto as pbcgto +from pyscf.pbc.gto import Cell +from pyscf.pbc import tools +from pyscf.pbc.lib.kpts import KPoints +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, member +from pyscf.gto.mole import * +import pyscf.pbc.df.ft_ao as ft_ao +from pyscf.pbc.df import aft, rsdf_builder, aft_jk + +########## isdf module ########## + +from pyscf.isdf.isdf_jk import _benchmark_time +import pyscf.isdf.isdf_ao2mo as isdf_ao2mo +import pyscf.isdf.isdf_jk as isdf_jk +from pyscf.isdf.isdf_eval_gto import ISDF_eval_gto + +########## sys module ########## + +import ctypes, sys +from multiprocessing import Pool +libisdf = lib.load_library('libisdf') + +########## global parameter ########## + +DISTANCE_CUTOFF = 16 # suitable for cuprates ! + +############ build atm connection graph ############ + +class AtmConnectionInfo: + def __init__(self, cell:Cell, atmID, distance_matrix, precision, rcut, rcut_max, atm_to_bas): + ''' + rcut: the cutoff radius of each bas + ''' + + self.precision = precision + self.atmID = atmID + self.atmID_connection = np.where(distance_matrix[atmID] < rcut_max)[0] + self.distance = distance_matrix[atmID][self.atmID_connection] + self.atm_connected_info = list(zip(self.atmID_connection, self.distance)) + # sort by distance + self.atm_connected_info.sort(key=lambda x: x[1]) + self.bas_range = np.arange(atm_to_bas[atmID][0], atm_to_bas[atmID][1]) + self.bas_cut = rcut[atm_to_bas[atmID][0]:atm_to_bas[atmID][1]] + + def __repr__(self): + return "atmID = %d, atm_connected_info = %s, bas_range = %s, bas_cut = %s" % (self.atmID, self.atm_connected_info, self.bas_range, self.bas_cut) + +class aoR_Holder: + def __init__(self, aoR, ao_involved, local_gridID_begin, local_gridID_end, global_gridID_begin, global_gridID_end): + ''' + currently local_gridID_begin, local_gridID_end is not useful + ''' + + assert aoR.shape[0] == len(ao_involved) + assert (local_gridID_end - local_gridID_begin) == (global_gridID_end - global_gridID_begin) + assert aoR.shape[1] <= (global_gridID_end - global_gridID_begin) + # assert aoR.shape[1] == local_gridID_end - local_gridID_begin + # assert aoR.shape[1] == global_gridID_end - global_gridID_begin + # if aoR.shape[1] != (global_gridID_end - global_gridID_begin): + self.ngrid_tot = global_gridID_end - global_gridID_begin + self.ngrid_kept = aoR.shape[1] + + self.aoR = aoR + self.ao_involved = np.array(ao_involved, dtype=np.int32) + self.nao_involved = len(ao_involved) + self.local_gridID_begin = local_gridID_begin + self.local_gridID_end = local_gridID_end + self.global_gridID_begin = global_gridID_begin + self.global_gridID_end = global_gridID_end + self.nCompact = self.nao_involved ## by default all orbitals are compact + + ## build ao_involved segment ## + + self.ao_involved_sorted = np.sort(self.ao_involved) + self.aoR = self.aoR[np.argsort(self.ao_involved)] + self.ao_involved = self.ao_involved_sorted + + # diff = np.diff(self.ao_involved) + # segment_indices = np.where(diff > 1)[0] + 1 + # segments = np.split(self.ao_involved, segment_indices) + # self.segments = [] + # if len(segments) == 1 and len(segments[0]) == 0: + # self.segments.append(0) + # else: + # loc_begin = 0 + # for segment in segments: + # self.segments.append(loc_begin) + # self.segments.append(segment[0]) + # self.segments.append(segment[-1]+1) + # loc_begin += len(segment) + # 
self.segments.append(loc_begin) + # self.segments = np.array(self.segments, dtype=np.int32) + # segments = None + + def RangeSeparation(self, IsCompact:np.ndarray): + ordering_C = [] + ordering_D = [] + nao_involved = len(self.ao_involved) + for i in range(nao_involved): + if IsCompact[self.ao_involved[i]]: + ordering_C.append(i) + else: + ordering_D.append(i) + self.nCompact = len(ordering_C) + ordering = ordering_C + ordering.extend(ordering_D) + ordering = np.array(ordering, dtype=np.int32) + self.aoR = self.aoR[ordering].copy() + self.ao_involved = self.ao_involved[ordering].copy() + # print("ordering = ", ordering) + # print("nCompact = ", self.nCompact) + for i in range(self.nCompact): + assert IsCompact[self.ao_involved[i]] + + def size(self): + return self.aoR.nbytes + self.ao_involved.nbytes + # + self.segments.nbytes + + def todense(self, nao): + aoR = np.zeros((nao, self.aoR.shape[1])) + aoR[self.ao_involved] = self.aoR + return aoR + +def _get_aoR_holders_memory(aoR_holders:list[aoR_Holder]): + + return sum([_aoR_holder.size() for _aoR_holder in aoR_holders if _aoR_holder is not None]) + +def flatten_aoR_holder(aoR_holders:list[aoR_Holder]): + res_int = [] + res_float = [] + for _aoR_holder in aoR_holders: + res_int.extend(_aoR_holder.ao_involved) + res_int.extend([_aoR_holder.local_gridID_begin, _aoR_holder.local_gridID_end, _aoR_holder.global_gridID_begin, _aoR_holder.global_gridID_end]) + res_float.extend(_aoR_holder.aoR.ravel()) + res_int = np.array(res_int, dtype=np.int32) + res_float = np.array(res_float, dtype=np.float64) + return res_int, res_float + +def _pack_aoR_holder(aoR_holders:list[aoR_Holder], nao): + + has_involved = [False] * nao + + nGrid = 0 + for _aoR_holder in aoR_holders: + if _aoR_holder is None: + continue + for i in _aoR_holder.ao_involved: + has_involved[i] = True + # nGrid += _aoR_holder.aoR.shape[1] + nGrid += _aoR_holder.ngrid_tot + + ao2loc = [-1] * nao + loc_now = 0 + for ao_id, involved in enumerate(has_involved): + if involved: + ao2loc[ao_id] = loc_now + loc_now += 1 + nao_involved = loc_now + + aoR_packed = np.zeros((nao_involved, nGrid)) + + fn_pack = getattr(libisdf, "_Pack_Matrix_SparseRow_DenseCol", None) + assert fn_pack is not None + + + grid_begin_id = 0 + for _aoR_holder in aoR_holders: + if _aoR_holder is None: + continue + loc_packed = np.zeros((_aoR_holder.aoR.shape[0]), dtype=np.int32) + # grid_end_id = grid_begin_id + _aoR_holder.aoR.shape[1] + grid_end_id = grid_begin_id + _aoR_holder.ngrid_tot + for loc, ao_id in enumerate(_aoR_holder.ao_involved): + loc_packed[loc] = ao2loc[ao_id] + # aoR_packed[loc_packed, grid_begin_id:grid_end_id] = _aoR_holder.aoR + fn_pack( + aoR_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(aoR_packed.shape[0]), + ctypes.c_int(aoR_packed.shape[1]), + _aoR_holder.aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(_aoR_holder.aoR.shape[0]), + ctypes.c_int(_aoR_holder.aoR.shape[1]), + loc_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(grid_begin_id), + ctypes.c_int(grid_end_id) + ) + grid_begin_id = grid_end_id + ao_packed_invovled = np.array([i for i in range(nao) if has_involved[i]], dtype=np.int32) + + assert nGrid == grid_begin_id + local_gridID_begin = 0 + local_gridID_end = nGrid + global_gridID_begin = 0 + global_gridID_end = nGrid + + return aoR_Holder(aoR_packed, ao_packed_invovled, local_gridID_begin, local_gridID_end, global_gridID_begin, global_gridID_end) + +# get the rcut # + +def _atm_to_bas(cell:Cell): + + shl_atm = [] + + natm = cell.natm + + for i in range(natm): + 
shl_atm.append([None, None]) + + for i in range(cell.nbas): + atm_id = cell.bas_atom(i) + if shl_atm[atm_id][0] is None: + shl_atm[atm_id][0] = i + shl_atm[atm_id][1] = i+1 + + return shl_atm + +def _estimate_rcut(cell, ngrids, precision): + + ''' + Cutoff radius, above which each shell decays to a value less than the + required precision + ''' + + weight = numpy.sqrt(cell.vol/ngrids) # note the weight ! + log_prec = numpy.log(precision/weight) + rcut = [] + for ib in range(cell.nbas): + l = cell.bas_angular(ib) + es = cell.bas_exp(ib) + cs = abs(cell.bas_ctr_coeff(ib)).max(axis=1) + r = 5. + r = (((l+2)*numpy.log(r)+numpy.log(cs) - log_prec) / es)**.5 + r[r < 1.] = 1. + r = (((l+2)*numpy.log(r)+numpy.log(cs) - log_prec) / es)**.5 + rcut.append(r.max()) + return numpy.array(rcut) + +# the distance graph # + +def _distance_translation(pa:np.ndarray, pb:np.ndarray, a): + ''' + calculate the distance between pa and pb, taking the periodic boundary condition into account (minimum image, assuming an orthogonal lattice) + ''' + + dx = pa[0] - pb[0] + dx1 = dx - a[0][0] + dx2 = dx + a[0][0] + dx = abs(dx) + dx1 = abs(dx1) + dx2 = abs(dx2) + dx = min(dx, dx1, dx2) + + dy = pa[1] - pb[1] + dy1 = dy - a[1][1] + dy2 = dy + a[1][1] + dy = abs(dy) + dy1 = abs(dy1) + dy2 = abs(dy2) + dy = min(dy, dy1, dy2) + + dz = pa[2] - pb[2] + dz1 = dz - a[2][2] + dz2 = dz + a[2][2] + dz = abs(dz) + dz1 = abs(dz1) + dz2 = abs(dz2) + dz = min(dz, dz1, dz2) + + return np.sqrt(dx**2 + dy**2 + dz**2) + +def get_cell_distance_matrix(cell:Cell): + ''' + get the distance matrix of the cell + ''' + a = cell.lattice_vectors() + n = cell.natm + distance_matrix = np.zeros((n, n)) + for i in range(n): + for j in range(i+1, n): + distance_matrix[i][j] = _distance_translation(cell.atom_coord(i), cell.atom_coord(j), a) + distance_matrix[j][i] = distance_matrix[i][j] + return distance_matrix + +############ algorithm based on the distance graph and AtmConnectionInfo ############ + +def get_partition(cell:Cell, coords, AtmConnectionInfoList:list[AtmConnectionInfo], + Ls=[3,3,3], + with_translation_symmetry=False, + kmesh=None, + use_mpi=False): # by default split the cell into a 3x3x3 grid of boxes + + ''' get the partition of grid points; each group of grid points is associated with one atom. 
+ ''' + + ##### this step is super fast ##### + + ##### we simply perform it on root and broadcast it to all other processes ##### + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, allgather, bcast, reduce, gather, alltoall, _comm_bunch, allgather_list, bcast_pickel + + if with_translation_symmetry and kmesh is None: + raise ValueError("kmesh must be provided if with_translation_symmetry is True") + + log = lib.logger.Logger(cell.stdout, cell.verbose) + + if use_mpi == False or (use_mpi and rank == 0): + #print("************* get_partition *************") + log.debug4("************* get_partition *************") + + ##### construct the box info ##### + + mesh = cell.mesh + lattice_vector = cell.lattice_vectors() + lattice_vector = np.array(lattice_vector) + + meshPrim = None + if with_translation_symmetry: + meshPrim = np.array(mesh) // np.array(kmesh) + + mesh_box = np.array([0,0,0]) + nbox = np.array([0,0,0]) + if mesh[0] % Ls[0] != 0: + mesh_box[0] = mesh[0] // Ls[0] + 1 + nbox[0] = mesh[0] // mesh_box[0] + 1 + else: + mesh_box[0] = mesh[0] // Ls[0] + nbox[0] = mesh[0] // mesh_box[0] + if mesh[1] % Ls[1] != 0: + mesh_box[1] = mesh[1] // Ls[1] + 1 + nbox[1] = mesh[1] // mesh_box[1] + 1 + else: + mesh_box[1] = mesh[1] // Ls[1] + nbox[1] = mesh[1] // mesh_box[1] + if mesh[2] % Ls[2] != 0: + mesh_box[2] = mesh[2] // Ls[2] + 1 + nbox[2] = mesh[2] // mesh_box[2] + 1 + else: + mesh_box[2] = mesh[2] // Ls[2] + nbox[2] = mesh[2] // mesh_box[2] + + Ls_box = [lattice_vector[0] / mesh[0] * mesh_box[0], lattice_vector[1] / mesh[1] * mesh_box[1], lattice_vector[2] / mesh[2] * mesh_box[2]] + + # print("Ls = ", Ls) + # print("mesh = ", mesh) + # print("mesh_box = ", mesh_box) + # print("Ls_box = ", Ls_box) + + assert Ls_box[0][0] < 3.0 + assert Ls_box[1][1] < 3.0 + assert Ls_box[2][2] < 3.0 # the box cannot be too large + + ##### helper functions ##### + + def get_box_id(x, y, z): + ix = int(x // Ls_box[0][0]) + iy = int(y // Ls_box[1][1]) + iz = int(z // Ls_box[2][2]) + return (ix, iy, iz) + + def get_box_id_from_coord(coord): + return get_box_id(coord[0], coord[1], coord[2]) + + def get_mesh_id(ix, iy, iz): + return ix * mesh[1] * mesh[2] + iy * mesh[2] + iz + + ##### build info between atm and box id ##### + + atm_box_id = [] + box_2_atm = {} + + atm_coords = [] + + for i in range(cell.natm): + box_id = get_box_id_from_coord(cell.atom_coord(i)) + atm_box_id.append(box_id) + if box_id not in box_2_atm: + box_2_atm[box_id] = [i] + else: + box_2_atm[box_id].append(i) + atm_coords.append(cell.atom_coord(i)) + + atm_coords = np.array(atm_coords) + distance = np.zeros((cell.natm,), dtype=np.float64) + + fn_calculate_distance = getattr(libisdf, "distance_between_point_atms", None) + assert fn_calculate_distance is not None + + fn_calculate_distance2 = getattr(libisdf, "distance_between_points_atms", None) + assert fn_calculate_distance2 is not None + + ######## a rough partition of the cell based on distance only ######## + + natm_tmp = cell.natm + if with_translation_symmetry: + natm_tmp = cell.natm // np.prod(kmesh) + partition_rough = [] + for i in range(natm_tmp): + partition_rough.append([]) + + grid_id_global = np.arange(mesh[0] * mesh[1] * mesh[2], dtype=np.int32).reshape(mesh[0], mesh[1], mesh[2]) + + for ix in range(nbox[0]): + for iy in range(nbox[1]): + for iz in range(nbox[2]): + + if use_mpi and rank != 0: + continue + + box_id = (ix, iy, iz) + + #### construct the grid ID #### + + mesh_x_begin = min(ix * mesh_box[0], mesh[0]) + mesh_x_end = min((ix+1) * 
mesh_box[0], mesh[0]) + + if mesh_x_begin == mesh_x_end: + continue + + mesh_y_begin = min(iy * mesh_box[1], mesh[1]) + mesh_y_end = min((iy+1) * mesh_box[1], mesh[1]) + + if mesh_y_begin == mesh_y_end: + continue + + mesh_z_begin = min(iz * mesh_box[2], mesh[2]) + mesh_z_end = min((iz+1) * mesh_box[2], mesh[2]) + + if mesh_z_begin == mesh_z_end: + continue + + IsValidBox=True + if with_translation_symmetry: + if mesh_x_begin >= meshPrim[0]: + IsValidBox=False + if mesh_y_begin >= meshPrim[1]: + IsValidBox=False + if mesh_z_begin >= meshPrim[2]: + IsValidBox=False + if not IsValidBox: + continue + + if with_translation_symmetry: + mesh_x_end = min(mesh_x_end, meshPrim[0]) + mesh_y_end = min(mesh_y_end, meshPrim[1]) + mesh_z_end = min(mesh_z_end, meshPrim[2]) + + grid_ID = grid_id_global[mesh_x_begin:mesh_x_end, mesh_y_begin:mesh_y_end, mesh_z_begin:mesh_z_end].flatten() + + grid_ID.sort() + grid_ID = np.array(grid_ID, dtype=np.int32) + + # print("grid_ID = ", grid_ID) + + if box_id in box_2_atm: + partition_rough[box_2_atm[box_id][0]%natm_tmp].extend(grid_ID) + else: + # random pickup one coord in the box # + + grid_ID_random_pick = grid_ID[np.random.randint(0, len(grid_ID))] + grid_coord = coords[grid_ID_random_pick] + grid_coord = np.array(grid_coord) + + fn_calculate_distance( + distance.ctypes.data_as(ctypes.c_void_p), + grid_coord.ctypes.data_as(ctypes.c_void_p), + atm_coords.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(cell.natm), + lattice_vector.ctypes.data_as(ctypes.c_void_p) + ) + + atm_id = np.argmin(distance) + partition_rough[atm_id%natm_tmp].extend(grid_ID) + + if use_mpi: + comm.Barrier() + + if use_mpi == False or (use_mpi == True and rank == 0): + len_grid_involved = 0 + for atm_id, x in enumerate(partition_rough): + # print("atm %d involved %d grids" % (atm_id, len(x))) + len_grid_involved += len(x) + if with_translation_symmetry: + assert len_grid_involved == np.prod(mesh) // np.prod(kmesh) + else: + assert len_grid_involved == mesh[0] * mesh[1] * mesh[2] + + ######## refine the partition based on the AtmConnectionInfo ######## + + partition = [] + natm_tmp = cell.natm + if with_translation_symmetry: + natm_tmp = cell.natm // np.prod(kmesh) + assert cell.natm % np.prod(kmesh) == 0 + for i in range(natm_tmp): + partition.append([]) + + ao_loc = cell.ao_loc_nr() + # print("nao_intot = ", ao_loc[-1]) + + from copy import deepcopy + lattice_vector = deepcopy(cell.lattice_vectors()) + + # print("lattice_vector = ", lattice_vector) + + if with_translation_symmetry: + # print("lattice_vector = ", lattice_vector) + lattice_vector = np.array(lattice_vector) / np.array(kmesh) + # print("lattice_vector = ", lattice_vector) + + for atm_id in range(natm_tmp): + + atm_involved = [] + + if use_mpi and rank != 0: + continue + + ## pick up atms with distance < DISTANCE_CUTOFF ## + + for atm_id_other, distance in AtmConnectionInfoList[atm_id].atm_connected_info: + # print("atm %d distance = %f" % (atm_id_other, distance)) + if distance < DISTANCE_CUTOFF: + atm_involved.append(atm_id_other % natm_tmp) + if len(atm_involved) >= 16: ## up to 16 atms + break + atm_involved.sort() + atm_involved = list(set(atm_involved)) + atm_involved = np.array(atm_involved, dtype=np.int32) + # print("atm %d involved atm = %s" % (atm_id, atm_involved)) + + ## get the involved ao ## + + atm_coords_involved = [] + + nao_involved = 0 + for atm_id_other in atm_involved: + shl_begin = AtmConnectionInfoList[atm_id_other].bas_range[0] + shl_end = AtmConnectionInfoList[atm_id_other].bas_range[-1]+1 + nao_involved 
+= ao_loc[shl_end] - ao_loc[shl_begin] + atm_coords_involved.append(cell.atom_coord(atm_id_other)) + + atm_coords_involved = np.array(atm_coords_involved) + + grid_ID = partition_rough[atm_id] + + ## determine the partition by distance ## + + coords_now = coords[grid_ID].copy() + distance = np.zeros((len(grid_ID), len(atm_involved)), dtype=np.float64) + fn_calculate_distance2( + distance.ctypes.data_as(ctypes.c_void_p), + coords_now.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(len(grid_ID)), + atm_coords_involved.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(len(atm_involved)), + lattice_vector.ctypes.data_as(ctypes.c_void_p) + ) + argmin_distance = np.argmin(distance, axis=1) + for grid_id, _atm_id_ in zip(grid_ID, argmin_distance): + partition[atm_involved[_atm_id_]%natm_tmp].append(grid_id) + + if use_mpi == False or (use_mpi == True and rank == 0): + len_grid_involved = 0 + for atm_id, x in enumerate(partition): + len_grid_involved += len(x) + if with_translation_symmetry: + assert len_grid_involved == np.prod(mesh) // np.prod(kmesh) + else: + assert len_grid_involved == mesh[0] * mesh[1] * mesh[2] + + del partition_rough + + if use_mpi: + partition_sendbuf = [np.array(x, dtype=np.int32) for x in partition] + partition = [] + for x in partition_sendbuf: + partition.append(bcast(x)) + del partition_sendbuf + + if (use_mpi and rank == 0) or use_mpi == False: + #print("************* end get_partition *************") + log.debug4("************* end get_partition *************") + + return partition + +def _range_partition(ngroup, rank, comm_size, use_mpi=False): + + ''' given ngroup tasks, split them into comm_size parts, and return the range of tasks for the rank-th process + ''' + + if use_mpi == False: + return 0, ngroup + else: + from pyscf.isdf.isdf_tools_mpi import comm_size + if ngroup % comm_size == 0: + ngroup_local = ngroup // comm_size + return rank * ngroup_local, (rank+1) * ngroup_local + else: + ngroup_local = ngroup // comm_size + 1 + + ## solve equation a * ngroup_local + b * (ngroup_local - 1) = ngroup ## + ## a + b = comm_size ## + + b = (ngroup_local * comm_size - ngroup) + a = comm_size - b + + if rank < a: + return rank * ngroup_local, (rank+1) * ngroup_local + else: + return a * ngroup_local + (rank - a) * (ngroup_local - 1), a * ngroup_local + (rank - a + 1) * (ngroup_local - 1) + +def _range_partition_array(ngroup, comm_size, use_mpi=False): + + if use_mpi == False: + return np.array([0, ngroup], dtype=np.int32) + else: + from pyscf.isdf.isdf_tools_mpi import comm_size + if ngroup % comm_size == 0: + ngroup_local = ngroup // comm_size + for i in range(comm_size): + if i == 0: + res = np.array([0, ngroup_local], dtype=np.int32) + else: + res = np.vstack((res, np.array([i * ngroup_local, (i+1) * ngroup_local], dtype=np.int32))) + else: + ngroup_local = ngroup // comm_size + 1 + + ## solve equation a * ngroup_local + b * (ngroup_local - 1) = ngroup ## + ## a + b = comm_size ## + + b = (ngroup_local * comm_size - ngroup) + a = comm_size - b + + for i in range(comm_size): + if i < a: + if i == 0: + res = np.array([0, ngroup_local], dtype=np.int32) + else: + res = np.vstack((res, np.array([i * ngroup_local, (i+1) * ngroup_local], dtype=np.int32))) + else: + if i == a: + res = np.vstack((res, np.array([a * ngroup_local, a * ngroup_local + (ngroup_local - 1)], dtype=np.int32))) + else: + res = np.vstack((res, np.array([a * ngroup_local + (i - a) * (ngroup_local - 1), a * ngroup_local + (i - a + 1) * (ngroup_local - 1)], dtype=np.int32))) + + if comm_size == 1: + res = 
res.reshape(1, 2) + return res + +def _get_grid_ordering(atmid_to_gridID, group, use_mpi=False): + + ''' given the grid points associated to each atom, return the reordering of grid points according to the ID of atms. + ''' + + grid_ordering = [] + for i in range(len(group)): + for atmid in group[i]: + grid_ordering.extend(atmid_to_gridID[atmid]) + + return np.array(grid_ordering, dtype=np.int32) + +def _get_grid_partition(atmid_to_gridID, group, use_mpi=False): + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import comm_size + + ngrid = np.sum([len(x) for x in atmid_to_gridID]) + + if use_mpi == False: + return np.array([0, ngrid], dtype=np.int32) + else: + group_partition_array = _range_partition_array(len(group), comm_size, use_mpi) + + grid_partition = [0] + for i in range(comm_size): + group_begin = group_partition_array[i][0] + group_end = group_partition_array[i][1] + + ngrid_local = 0 + for j in range(group_begin, group_end): + for atmid in group[j]: + ngrid_local += len(atmid_to_gridID[atmid]) + + grid_partition.append(grid_partition[-1] + ngrid_local) + + return np.array(grid_partition, dtype=np.int32) + +def _get_atm_2_grid_segment(atmid_to_gridID, group): + + natm = len(atmid_to_gridID) + assert sum([len(x) for x in group]) == natm or (natm % sum([len(x) for x in group])) == 0 + + res = [] + for _ in range(natm): + res.append([None, None]) + + grid_loc_now = 0 + for j in range(len(group)): + for atmid in group[j]: + res[atmid][0] = grid_loc_now + res[atmid][1] = grid_loc_now + len(atmid_to_gridID[atmid]) + grid_loc_now += len(atmid_to_gridID[atmid]) + + return res + +def _sync_list(list_data, ngroup): + + # if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, comm_size, bcast + + ### check data ### + + if len(list_data) != ngroup: + raise ValueError("the length of list_data is not equal to ngroup") + + group_begin, group_end = _range_partition(ngroup, rank, comm_size, True) + + for i in range(group_begin): + assert list_data[i] is None + for i in range(group_end, ngroup): + assert list_data[i] is None + for i in range(group_begin, group_end): + assert list_data[i] is not None + + ### generate groupid_2_root ### + + groupid_2_root = [] + + range_partition_array = _range_partition_array(ngroup, comm_size, True) + + for j in range(comm_size): + group_begin = range_partition_array[j][0] + group_end = range_partition_array[j][1] + for i in range(group_begin, group_end): + groupid_2_root.append(j) + + ### sync ### + + for i in range(ngroup): + if rank == groupid_2_root[i]: + sys.stdout.flush() + list_data[i] = bcast(list_data[i], root=groupid_2_root[i]) + + return list_data + +def _sync_aoR(aoR_holders, natm): + + ''' used in MPI + ''' + + assert len(aoR_holders) == natm + + aoR = [] + bas_id = [] + grid_ID_begin = [] + for i in range(natm): + if aoR_holders[i] is not None: + aoR.append(aoR_holders[i].aoR) + bas_id.append(aoR_holders[i].ao_involved) + grid_ID_begin.append(np.asarray([aoR_holders[i].global_gridID_begin],dtype=np.int32)) + else: + aoR.append(None) + bas_id.append(None) + grid_ID_begin.append(None) + + aoR = _sync_list(aoR, natm) + bas_id = _sync_list(bas_id, natm) + grid_ID_begin = _sync_list(grid_ID_begin, natm) + + aoR_holders = [] + + for i in range(natm): + aoR_holders.append( + aoR_Holder(aoR[i], bas_id[i], grid_ID_begin[i][0], grid_ID_begin[i][0] + aoR[i].shape[1], grid_ID_begin[i][0], grid_ID_begin[i][0] + aoR[i].shape[1]) + ) + + return aoR_holders + +def _build_submol(cell:Cell, atm_invovled): + + import pyscf.pbc.gto as pbcgto + + subcell = 
pbcgto.Cell() + subcell.a = cell.a + + atm = [] + for atm_id in atm_invovled: + atm.append(cell.atom[atm_id]) + + subcell.atom = atm + subcell.basis = cell.basis + subcell.pseudo = cell.pseudo + subcell.verbose = 0 + subcell.ke_cutoff = cell.ke_cutoff + subcell.max_memory = cell.max_memory + subcell.precision = cell.precision + subcell.use_particle_mesh_ewald = cell.use_particle_mesh_ewald + subcell.mesh = cell.mesh + subcell.unit = cell.unit + subcell.build(mesh = cell.mesh) + + return subcell + +def get_aoR(cell:Cell, coords, partition, + first_npartition = None, + first_natm=None, group=None, + distance_matrix=None, AtmConnectionInfoList:list[AtmConnectionInfo]=None, + distributed = False, use_mpi=False, sync_res = False): + + if first_natm is None: + first_natm = cell.natm + if first_npartition is None: + first_npartition = len(partition) + + ## aoR is stored in a distributed fashion ## + + log = lib.logger.Logger(cell.stdout, cell.verbose) + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, allgather, bcast, reduce, gather, alltoall, _comm_bunch, allgather_list, bcast_pickel + if rank == 0: + log.debug4("************* get_aoR *************") + else: + rank = 0 + comm_size = 1 + log.debug4("************* get_aoR *************") + + weight = np.sqrt(cell.vol / coords.shape[0]) + + RcutMax = -1e10 + + for _info_ in AtmConnectionInfoList: + RcutMax = max(RcutMax, np.max(_info_.bas_cut)) + + precision = AtmConnectionInfoList[0].precision + + aoR_holder = [] + + if group is None: + group = [] + for i in range(cell.natm): + group.append([i]) + + for _ in range(first_npartition): + aoR_holder.append(None) + + grid_partition = _get_grid_partition(partition, group, use_mpi) + + atm_2_grid_segment = _get_atm_2_grid_segment(partition, group) + + local_gridID_begin = 0 + global_gridID_begin = grid_partition[rank] + ao_loc = cell.ao_loc_nr() + + atm_begin, atm_end = _range_partition(first_npartition, rank, comm_size, use_mpi) + + for atm_id in range(atm_begin, atm_end): + + grid_ID = partition[atm_id] + + if len(grid_ID) == 0: + aoR_holder[atm_id] = None + continue + + ##### find the involved atms within RcutMax ##### + + if first_natm != cell.natm: + atm_involved = np.arange(first_natm) # with kmesh ! + else: + if first_npartition == len(partition): + atm_involved = [] + for atm_id_other, distance in AtmConnectionInfoList[atm_id].atm_connected_info: + if distance < RcutMax and atm_id_other < first_natm: + atm_involved.append(atm_id_other) + atm_involved.sort() + else: + atm_involved = np.arange(cell.natm) # with kmesh ! 
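+
+        # (editorial sketch, illustrative only) the pruning applied below keeps an
+        # AO row only if it is non-negligible somewhere on this atom's grid patch,
+        # in the spirit of:
+        #
+        #     max_row = np.max(np.abs(aoR), axis=1)       # per-AO max on the patch
+        #     keep    = np.where(max_row > precision)[0]  # rows above threshold
+        #     aoR, bas_id = aoR[keep], bas_id[keep]
+        #
+        # so aoR stays a compact (nao_kept, ngrid_patch) block, with bas_id mapping
+        # its rows back to global AO indices.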
+ + ##### get the involved ao ##### + + nao_involved = 0 + for atm_id_other in atm_involved: + shl_begin = AtmConnectionInfoList[atm_id_other].bas_range[0] + shl_end = AtmConnectionInfoList[atm_id_other].bas_range[-1]+1 + nao_involved += ao_loc[shl_end] - ao_loc[shl_begin] + + bas_id = [] + + ao_loc_now = 0 + + shell_slice = [] + shl_end_test = 0 + for atm_id_other in atm_involved: + shl_begin = AtmConnectionInfoList[atm_id_other].bas_range[0] + shl_end = AtmConnectionInfoList[atm_id_other].bas_range[-1]+1 + bas_id.extend(np.arange(ao_loc[shl_begin], ao_loc[shl_end])) + + bas_id = np.array(bas_id) + + subcell = _build_submol(cell, atm_involved) + aoR = ISDF_eval_gto(subcell, coords=coords[grid_ID]) * weight + + assert aoR.shape[0] == len(bas_id) + + ##### screening the aoR, TODO: in C ##### + + max_row = np.max(np.abs(aoR), axis=1) + where = np.where(max_row > precision)[0] + if len(where) < aoR.shape[0] * 0.9: + aoR = aoR[where] + bas_id = np.array(bas_id)[where] + + global_gridID_begin = atm_2_grid_segment[atm_id][0] + aoR_holder[atm_id] = aoR_Holder(aoR, bas_id, local_gridID_begin, local_gridID_begin+len(grid_ID), global_gridID_begin, global_gridID_begin+len(grid_ID)) + + assert global_gridID_begin == atm_2_grid_segment[atm_id][0] + assert global_gridID_begin + len(grid_ID) == atm_2_grid_segment[atm_id][1] + + local_gridID_begin += len(grid_ID) + global_gridID_begin += len(grid_ID) + + del aoR + + if use_mpi and sync_res: + # aoR_holder = _sync_aoR(aoR_holder, cell.natm) + aoR_holder = _sync_aoR(aoR_holder, first_npartition) + + if use_mpi: + if rank == 0: + log.debug4("************* end get_aoR *************") + else: + log.debug4("************* end get_aoR *************") + + return aoR_holder + +def get_aoR_analytic(cell:Cell, coords, partition, + first_npartition = None, + first_natm=None, group=None, + distance_matrix=None, AtmConnectionInfoList:list[AtmConnectionInfo]=None, + distributed = False, use_mpi=False, sync_res = False): + + ''' AO values on grid points using FFT, evaluating analytic AO integrals + ''' + + assert use_mpi == False + assert first_natm is None or first_natm == cell.natm + + if group is None: + group = [] + for i in range(cell.natm): + group.append([i]) + + precision = AtmConnectionInfoList[0].precision + mesh = cell.mesh + ngrids = np.prod(mesh) + weight = cell.vol/ngrids + weight2 = np.sqrt(cell.vol / ngrids) + + blksize = 2e9//16 + nao_max_bunch = int(blksize // ngrids) + + Gv = cell.get_Gv() + + ######## pack info ######## + + aoR_unpacked = [] + ao_invovled_unpacked = [] + atm_ordering = [] + for group_idx in group: + group_idx.sort() + atm_ordering.extend(group_idx) + grid_begin_unpacked = [] + grid_end_unpacked = [] + grid_ID_now = 0 + for atm_id in atm_ordering: + grid_ID = partition[atm_id] + grid_begin_unpacked.append(grid_ID_now) + grid_end_unpacked.append(grid_ID_now + len(grid_ID)) + grid_ID_now += len(grid_ID) + aoR_unpacked.append([]) + ao_invovled_unpacked.append([]) + + ao_loc = cell.ao_loc_nr() + + task_sl_loc = [0] + ao_loc_now = 0 + for i in range(cell.nbas): + ao_loc_end = ao_loc[i+1] + if ao_loc_end - ao_loc_now > nao_max_bunch: + task_sl_loc.append(i) + ao_loc_now = ao_loc[i] + task_sl_loc.append(cell.nbas) + print("task_sl_loc = ", task_sl_loc) + nTask = len(task_sl_loc) - 1 + print("nTask = ", nTask) + + for task_id in range(nTask): + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + shloc = (task_sl_loc[task_id], task_sl_loc[task_id+1]) + aoG = ft_ao.ft_ao(cell, Gv, shls_slice=shloc).T + + ### implementation 1 ### + 
# aoR_test = numpy.fft.ifftn(aoG.reshape(-1, *mesh), axes=(1,2,3)).real / (weight) + # aoR = aoR_test.reshape(-1, ngrids) * weight2 + + ### implementation 2 ### + aoR_test = None + aoG = aoG.conj() * np.sqrt(1/cell.vol) + aoG = aoG.reshape(-1, *mesh) + aoR = numpy.fft.fftn(aoG, axes=(1,2,3)).real * np.sqrt(1/float(ngrids)) + aoR = aoR.reshape(-1, ngrids) + + bas_id = np.arange(ao_loc[shloc[0]], ao_loc[shloc[1]]) + + for atm_id, atm_partition in enumerate(partition): + aoR_tmp = aoR[:, atm_partition].copy() + ### prune the aoR ### + where = np.where(np.max(np.abs(aoR_tmp), axis=1) > precision)[0] + aoR_tmp = aoR_tmp[where].copy() + bas_id_tmp = bas_id[where].copy() + aoR_unpacked[atm_id].append(aoR_tmp) + ao_invovled_unpacked[atm_id].append(bas_id_tmp) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + # no rank guard here: this routine asserts use_mpi == False and `rank` is undefined in this scope + _benchmark_time(t1, t2, "get_aoR_analytic: task %d" % task_id) + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + aoR_holder = [] + + for atm_id in range(len(aoR_unpacked)): + aoR_holder_tmp = np.concatenate(aoR_unpacked[atm_id], axis=0) + bas_id = np.concatenate(ao_invovled_unpacked[atm_id], axis=0) + aoR_holder.append(aoR_Holder(aoR_holder_tmp, bas_id, grid_begin_unpacked[atm_id], grid_end_unpacked[atm_id], grid_begin_unpacked[atm_id], grid_end_unpacked[atm_id])) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + del aoR_unpacked + del ao_invovled_unpacked + del aoR_tmp + del aoR_holder_tmp + del bas_id + del aoR_test + del aoR + del aoG + + + _benchmark_time(t1, t2, "get_aoR_analytic: merge") + + return aoR_holder + +if __name__ == '__main__': + + from pyscf.lib.parameters import BOHR + + TARGET_PRECISION = 1e-9 + + prim_a = np.array( + [[14.572056092/2, 0.000000000, 0.000000000], + [0.000000000, 14.572056092/2, 0.000000000], + [0.000000000, 0.000000000, 6.010273939],]) * BOHR + atm = [ +['Cu1', (1.927800, 1.927800, 1.590250)], +['O1', (1.927800, 0.000000, 1.590250)], +['O1', (0.000000, 1.927800, 1.590250)], +['Ca', (0.000000, 0.000000, 0.000000)], + ] + + basis = { + 'Cu1':'gth-dzvp-molopt-sr', 'Cu2':'gth-dzvp-molopt-sr', 'O1': 'gth-dzvp-molopt-sr', 'Ca':'gth-dzvp-molopt-sr' + } + pseudo = {'Cu1': 'gth-pbe-q19', 'Cu2': 'gth-pbe-q19', 'O1': 'gth-pbe', 'Ca': 'gth-pbe'} + + + ke_cutoff = 128 + + from isdf_tools_cell import build_supercell + + prim_cell = build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=ke_cutoff, basis=basis, pseudo=pseudo, verbose=10) + prim_mesh = prim_cell.mesh + + supercell = [2, 2, 1] + + mesh = [supercell[0] * prim_mesh[0], supercell[1] * prim_mesh[1], supercell[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell = build_supercell(atm, prim_a, Ls = supercell, ke_cutoff=ke_cutoff, mesh=mesh, basis=basis, pseudo=pseudo, verbose=10) + + print(cell.atom) + print(cell.basis) + + from pyscf.pbc.dft.multigrid.multigrid_pair import MultiGridFFTDF2 + + df_tmp = MultiGridFFTDF2(cell) + grids = df_tmp.grids + coords = np.asarray(grids.coords).reshape(-1,3) + assert coords is not None + + distance_matrix = get_cell_distance_matrix(cell) + + weight = np.sqrt(cell.vol / coords.shape[0]) + + precision = TARGET_PRECISION + rcut = _estimate_rcut(cell, coords.shape[0], precision) + rcut_max = np.max(rcut) + + print("rcut = ", rcut) + print("precision = ", precision) + print("max_rcut = ", np.max(rcut)) \ No newline at end of file diff --git a/pyscf/isdf/isdf_tools_mpi.py b/pyscf/isdf/isdf_tools_mpi.py new file mode 100644 index 000000000..02ef01654 --- /dev/null +++ 
b/pyscf/isdf/isdf_tools_mpi.py @@ -0,0 +1,358 @@ +################### the MPI module ########################## + +import sys +from pyscf import lib +import mpi4py +from mpi4py import MPI +import numpy +import numpy as np + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +comm_size = comm.Get_size() + +## some tools copied from mpi4pyscf ## + +INT_MAX = 2147483647 +BLKSIZE = INT_MAX // 64 + 1 + +def _comm_bunch(size_of_comm, force_even=False): + if size_of_comm % comm_size == 0: + res = size_of_comm // comm_size + else: + res = (size_of_comm // comm_size) + 1 + if force_even: + if res % 2 == 1 : + res += 1 + return res + +def _assert(condition): + if not condition: + import traceback + sys.stderr.write(''.join(traceback.format_stack()[:-1])) + comm.Abort() + +def _segment_counts(counts, p0, p1): + counts_seg = counts - p0 + counts_seg[counts<=p0] = 0 + counts_seg[counts> p1] = p1 - p0 + return counts_seg + +def allgather(sendbuf, split_recvbuf=False): + sendbuf = numpy.asarray(sendbuf, order='C') + shape = sendbuf.shape + attr = comm.allgather((shape, sendbuf.dtype.char)) + rshape = [x[0] for x in attr] + counts = numpy.array([numpy.prod(x) for x in rshape]) + mpi_dtype = numpy.result_type(*[x[1] for x in attr]).char + _assert(sendbuf.dtype.char == mpi_dtype or sendbuf.size == 0) + + displs = numpy.append(0, numpy.cumsum(counts[:-1])) + recvbuf = numpy.empty(sum(counts), dtype=mpi_dtype) + + sendbuf = sendbuf.ravel() + + size_of_recvbuf = recvbuf.size + + print("rank %d size recvbuf %d" % (rank, size_of_recvbuf)) + + if size_of_recvbuf >= INT_MAX: + print("large data size go this branch") + blk_size_small = min((INT_MAX // comm_size),BLKSIZE) + recvbuf_small = numpy.empty(comm_size*blk_size_small, dtype=mpi_dtype) + rdispls_small = numpy.arange(comm_size)*blk_size_small + if rank == 0: + print("blk_size_small = ", blk_size_small) + print("rdispls_small = ", rdispls_small) + sys.stdout.flush() + for p0, p1 in prange(0, numpy.max(counts), blk_size_small): + counts_seg = _segment_counts(counts, p0, p1) + comm.Allgatherv([sendbuf[p0:p1], mpi_dtype], + [recvbuf_small, counts_seg, rdispls_small, mpi_dtype]) + # recvbuf[p0:p1] = recvbuf_small[:p1-p0] + + for i in range(comm_size): + begin = displs[i]+p0 + end = begin + counts_seg[i] + recvbuf[begin:end] = recvbuf_small[i*blk_size_small:i*blk_size_small+counts_seg[i]] + + del recvbuf_small + del rdispls_small + + if split_recvbuf: + return [recvbuf[p0:p0+c].reshape(shape) + for p0,c,shape in zip(displs,counts,rshape)] + else: + return recvbuf + else: + print("small data size go this branch") + print("maxcount = ", numpy.max(counts)) + end = numpy.max(counts) + for p0, p1 in lib.prange(0, end, BLKSIZE): + print("rank %d send p0 p1 %d %d"%(rank,p0,p1)) + counts_seg = _segment_counts(counts, p0, p1) + comm.Allgatherv([sendbuf[p0:p1], mpi_dtype], + [recvbuf, counts_seg, displs+p0, mpi_dtype]) + print("rank %d finish all gather" % (rank)) + if split_recvbuf: + return [recvbuf[p0:p0+c].reshape(shape) + for p0,c,shape in zip(displs,counts,rshape)] + else: + # try: + # return recvbuf.reshape((-1,) + shape[1:]) + # except ValueError: + return recvbuf + # raise ValueError("split_recvbuf is not supported") + +def allgather_list(sendbuf): + + assert isinstance(sendbuf, list) + for _data_ in sendbuf: + assert isinstance(_data_, numpy.ndarray) + + shape = [x.shape for x in sendbuf] + attr = comm.allgather(shape) + attr_flat = [] + for x in attr: + for y in x: + attr_flat.append(y) + + if rank == 0: + for x in attr_flat: + print("x = ", x) + + print("rank %d get here 1" % 
(rank)) + sys.stdout.flush() + + size_tot = np.sum([x.size for x in sendbuf]) + sendbuf_flat = np.empty(size_tot, dtype=sendbuf[0].dtype) + offset = 0 + for x in sendbuf: + sendbuf_flat[offset:offset+x.size] = x.ravel() + offset += x.size + + print("rank %d get here 2" % (rank)) + sys.stdout.flush() + + recvbuf_flat = allgather(sendbuf_flat) + + print("rank %d get here 3" % (rank)) + sys.stdout.flush() + res = [] + + offset = 0 + for x in attr_flat: + res.append(recvbuf_flat[offset:offset+np.prod(x)].reshape(x)) + offset += np.prod(x) + + return res + +def allgather_pickle(sendbuf): + sendbuf_serialized = MPI.pickle.dumps(sendbuf) + sendbuf_serialized = np.frombuffer(sendbuf_serialized, dtype=np.uint8) + received = allgather(sendbuf_serialized, split_recvbuf=True) + received = [MPI.pickle.loads(x.tobytes()) for x in received] + del sendbuf_serialized + return received + +def reduce(sendbuf, op=MPI.SUM, root=0): + sendbuf = numpy.asarray(sendbuf, order='C') + shape, mpi_dtype = comm.bcast((sendbuf.shape, sendbuf.dtype.char),root=root) + _assert(sendbuf.shape == shape and sendbuf.dtype.char == mpi_dtype) + + dtype = sendbuf.dtype.char + recvbuf = numpy.zeros_like(sendbuf) + send_seg = numpy.ndarray(sendbuf.size, dtype=sendbuf.dtype, buffer=sendbuf) + recv_seg = numpy.ndarray(recvbuf.size, dtype=recvbuf.dtype, buffer=recvbuf) + for p0, p1 in lib.prange(0, sendbuf.size, BLKSIZE): + comm.Reduce([send_seg[p0:p1], dtype], + [recv_seg[p0:p1], dtype], op, root) + + if rank == root: + return recvbuf + else: + return sendbuf + +def scatter(sendbuf, root=0): + if rank == root: + mpi_dtype = numpy.result_type(*sendbuf).char + shape = comm.scatter([x.shape for x in sendbuf]) + counts = numpy.asarray([x.size for x in sendbuf]) + comm.bcast((mpi_dtype, counts)) + sendbuf = [numpy.asarray(x, mpi_dtype).ravel() for x in sendbuf] + sendbuf = numpy.hstack(sendbuf) + else: + shape = comm.scatter(None) + mpi_dtype, counts = comm.bcast(None) + + displs = numpy.append(0, numpy.cumsum(counts[:-1])) + recvbuf = numpy.empty(numpy.prod(shape), dtype=mpi_dtype) + + #DONOT use lib.prange. 
lib.prange may terminate early in some processes + for p0, p1 in prange(0, numpy.max(counts), BLKSIZE): # prange takes (start, stop, step) + counts_seg = _segment_counts(counts, p0, p1) + comm.Scatterv([sendbuf, counts_seg, displs+p0, mpi_dtype], + [recvbuf[p0:p1], mpi_dtype], root) + return recvbuf.reshape(shape) + +def bcast(buf, root=0): + buf = numpy.asarray(buf, order='C') + shape, dtype = comm.bcast((buf.shape, buf.dtype.char), root=root) + if rank != root: + buf = numpy.empty(shape, dtype=dtype) + + dtype = buf.dtype.char + buf_seg = numpy.ndarray(buf.size, dtype=buf.dtype, buffer=buf) + for p0, p1 in lib.prange(0, buf.size, BLKSIZE): + comm.Bcast([buf_seg[p0:p1], dtype], root) + return buf + +def bcast_pickel(buf, root=0): + if rank == root: + buf_serialized = MPI.pickle.dumps(buf) + buf_serialized = np.frombuffer(buf_serialized, dtype=np.uint8) + else: + buf_serialized = None + res = bcast(buf_serialized, root) + res = MPI.pickle.loads(res.tobytes()) + return res + +def gather(sendbuf, root=0, split_recvbuf=False): + + sendbuf = numpy.asarray(sendbuf, order='C') + shape = sendbuf.shape + size_dtype = comm.allgather((shape, sendbuf.dtype.char)) + # print(size_dtype) + rshape = [x[0] for x in size_dtype] + counts = numpy.array([numpy.prod(x) for x in rshape]) + + mpi_dtype = numpy.result_type(*[x[1] for x in size_dtype]).char + _assert(sendbuf.dtype == mpi_dtype or sendbuf.size == 0) + + if rank == root: + displs = numpy.append(0, numpy.cumsum(counts[:-1])) + recvbuf = numpy.empty(sum(counts), dtype=mpi_dtype) + + sendbuf = sendbuf.ravel() + for p0, p1 in lib.prange(0, numpy.max(counts), BLKSIZE): + counts_seg = _segment_counts(counts, p0, p1) + comm.Gatherv([sendbuf[p0:p1], mpi_dtype], + [recvbuf, counts_seg, displs+p0, mpi_dtype], root) + if split_recvbuf: + return [recvbuf[p0:p0+c].reshape(shape) + for p0,c,shape in zip(displs,counts,rshape)] + else: + try: + return recvbuf.reshape((-1,) + shape[1:]) + except ValueError: + return recvbuf + else: + send_seg = sendbuf.ravel() + for p0, p1 in lib.prange(0, numpy.max(counts), BLKSIZE): + comm.Gatherv([send_seg[p0:p1], mpi_dtype], None, root) + return sendbuf + +def prange(start, stop, step): + '''Similar to lib.prange. This function ensures that all processes have the + same number of steps. It is required by alltoall communication. 
+ ''' + nsteps = (stop - start + step - 1) // step + nsteps = max(comm.allgather(nsteps)) + for i in range(nsteps): + i0 = min(stop, start + i * step) + i1 = min(stop, i0 + step) + yield i0, i1 + +def alltoall(sendbuf, split_recvbuf=False): + if isinstance(sendbuf, numpy.ndarray): + raise NotImplementedError + mpi_dtype = comm.bcast(sendbuf.dtype.char) + sendbuf = numpy.asarray(sendbuf, mpi_dtype, 'C') + nrow = sendbuf.shape[0] + ncol = sendbuf.size // nrow + segsize = (nrow+comm_size-1) // comm_size * ncol + sdispls = numpy.arange(0, comm_size*segsize, segsize) + sdispls[sdispls>sendbuf.size] = sendbuf.size + scounts = numpy.append(sdispls[1:]-sdispls[:-1], sendbuf.size-sdispls[-1]) + rshape = comm.alltoall(scounts) + else: + _assert(len(sendbuf) == comm_size) + mpi_dtype = comm.bcast(sendbuf[0].dtype.char) + sendbuf = [numpy.asarray(x, mpi_dtype) for x in sendbuf] + rshape = comm.alltoall([x.shape for x in sendbuf]) + scounts = numpy.asarray([x.size for x in sendbuf], dtype=np.int64) + sdispls = numpy.append(0, numpy.cumsum(scounts[:-1])) + sendbuf = numpy.hstack([x.ravel() for x in sendbuf]) + + rcounts = numpy.asarray([numpy.prod(x) for x in rshape], dtype=np.int64) + rdispls = numpy.append(0, numpy.cumsum(rcounts[:-1])) + recvbuf = numpy.empty(sum(rcounts), dtype=mpi_dtype) + + if rank == 0: + print("sdispls = ", sdispls) + print("rcounts = ", rcounts) + print("rdispls = ", rdispls) + + max_counts = max(numpy.max(scounts), numpy.max(rcounts)) + + if rank == 0: + print("max_counts = ", max_counts) + + sendbuf = sendbuf.ravel() + #DONOT use lib.prange. lib.prange may terminate early in some processes + + size_of_sendbuf = sendbuf.size + + # if sdispls[-1] >= INT_MAX: + if size_of_sendbuf >=INT_MAX: + blk_size_small = min((INT_MAX // comm_size),BLKSIZE) + sendbuf_small = numpy.empty(comm_size*blk_size_small, dtype=mpi_dtype) + recvbuf_small = numpy.empty(comm_size*blk_size_small, dtype=mpi_dtype) + sdispls_small = numpy.arange(comm_size)*blk_size_small + if rank == 0: + print("blk_size_small = ", blk_size_small) + print("sdispls_small = ", sdispls_small) + sys.stdout.flush() + for p0, p1 in prange(0, max_counts, blk_size_small): + scounts_seg = _segment_counts(scounts, p0, p1) + rcounts_seg = _segment_counts(rcounts, p0, p1) + + # if rank == 0: + # print("p0 p1 = ", p0, p1) + # print("scounts_seg = ", scounts_seg) + # print("rcounts_seg = ", rcounts_seg) + # sys.stdout.flush() + ### copy data to sendbuf_small + for i in range(comm_size): + begin = sdispls[i]+p0 + end = begin + scounts_seg[i] + sendbuf_small[i*blk_size_small:i*blk_size_small+scounts_seg[i]] = sendbuf[begin:end] + + comm.Alltoallv([sendbuf_small, scounts_seg, sdispls_small, mpi_dtype], + [recvbuf_small, rcounts_seg, sdispls_small, mpi_dtype]) + + for i in range(comm_size): + begin = rdispls[i]+p0 + end = begin + rcounts_seg[i] + recvbuf[begin:end] = recvbuf_small[i*blk_size_small:i*blk_size_small+rcounts_seg[i]] + + sendbuf_small = None + recvbuf_small = None + else: + for p0, p1 in prange(0, max_counts, BLKSIZE): + scounts_seg = _segment_counts(scounts, p0, p1) + rcounts_seg = _segment_counts(rcounts, p0, p1) + # if rank == 0: + # print("scounts_seg = ", scounts_seg) + # print("rcounts_seg = ", rcounts_seg) + comm.Alltoallv([sendbuf, scounts_seg, sdispls+p0, mpi_dtype], + [recvbuf, rcounts_seg, rdispls+p0, mpi_dtype]) + + # return None + + if split_recvbuf: + return [recvbuf[p0:p0+c].reshape(shape) + for p0,c,shape in zip(rdispls, rcounts, rshape)] + else: + return recvbuf + +################### end of the MPI module 
########################## \ No newline at end of file diff --git a/pyscf/isdf/pbc_isdf_V.c b/pyscf/isdf/pbc_isdf_V.c new file mode 100644 index 000000000..a3d166f3f --- /dev/null +++ b/pyscf/isdf/pbc_isdf_V.c @@ -0,0 +1,1567 @@ +#include "fft.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "vhf/fblas.h" +#include <math.h> + +int get_omp_threads(); +int omp_get_thread_num(); + +void _construct_J( + int *mesh, + double *DensityR, + double *CoulG, + double *J) +{ + const int nThread = get_omp_threads(); + // int mesh_complex[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + const int n_real = mesh[0] * mesh[1] * mesh[2]; + // const int n_complex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + const double fac = 1. / (double)n_real; + + fftw_complex *DensityR_complex = fftw_malloc(sizeof(double __complex__) * n_real); + fftw_complex *buf = fftw_malloc(sizeof(double __complex__) * n_real); + fftw_complex *J_complex = fftw_malloc(sizeof(double __complex__) * n_real); + + memset(buf, 0, sizeof(double __complex__) * n_real); + memset(J_complex, 0, sizeof(double __complex__) * n_real); + memset(DensityR_complex, 0, sizeof(double __complex__) * n_real); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (int i = 0; i < n_real; ++i) + { + DensityR_complex[i][0] = DensityR[i]; + } + + fftw_plan p_forward = fftw_plan_dft_3d(mesh[0], mesh[1], mesh[2], DensityR_complex, (fftw_complex *)buf, FFTW_BACKWARD, FFTW_ESTIMATE); + fftw_plan p_backward = fftw_plan_dft_3d(mesh[0], mesh[1], mesh[2], (fftw_complex *)buf, J_complex, FFTW_FORWARD, FFTW_ESTIMATE); + + fftw_execute(p_forward); + + double *ptr = (double *)buf; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (int i = 0; i < n_real; i++) + { + ptr[i * 2] *= CoulG[i] * fac; + ptr[i * 2 + 1] *= CoulG[i] * fac; + } + + fftw_execute(p_backward); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (int i = 0; i < n_real; i++) + { + J[i] = J_complex[i][0]; + } + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + + fftw_free(buf); + fftw_free(DensityR_complex); + fftw_free(J_complex); +} + +void _fn_J_dmultiplysum(double *out, + const int nrow, const int ncol, + const double *a, + const int nrow_a, const int ncol_a, + const int row_a_shift, + const int col_a_shift, + const double *b, + const int nrow_b, const int ncol_b, + const int row_b_shift, + const int col_b_shift) +{ + static const int BUNCHSIZE = 512; + + const double *pa = a + row_a_shift * ncol_a + col_a_shift; + const double *pb = b + row_b_shift * ncol_b + col_b_shift; + + memset(out, 0, sizeof(double) * ncol); + + const int nThread = get_omp_threads(); + const int nBunch = ncol / BUNCHSIZE + 1; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (int i = 0; i < nBunch; i++) + { + int bunch_start = i * BUNCHSIZE; + int bunch_end = (i + 1) * BUNCHSIZE; + if (bunch_end > ncol) + { + bunch_end = ncol; + } + + for (int j = 0; j < nrow; j++) + { + const double *ppa = pa + j * ncol_a; + const double *ppb = pb + j * ncol_b; + for (int k = bunch_start; k < bunch_end; k++) + { + out[k] += ppa[k] * ppb[k]; + } + } + } +} + +void _Pack_Matrix_SparseRow_DenseCol( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + int *RowLoc, + const int ColBegin, + const int ColEnd) +{ + if (ColEnd - ColBegin <= 0) + { + return; + } + + if (ColEnd < (ColBegin + ncol_source)) + { + printf("ColEnd < ColBegin + ncol_source\n"); + exit(1); + } + + if (ColEnd >
ncol_target) + { + printf("ColEnd>ncol_target\n"); + exit(1); + } + + int i; + + for (i = 0; i < nrow_source; i++) + { + int row_loc = RowLoc[i]; + memcpy(target + row_loc * ncol_target + ColBegin, source + i * ncol_source, sizeof(double) * ncol_source); + } +} + +void _Reorder_Grid_to_Original_Grid(int ngrid, int *gridID, double *Density_or_J, + double *out) +{ + int i; + for (i = 0; i < ngrid; i++) + { + out[gridID[i]] = Density_or_J[i]; + } +} + +void _Original_Grid_to_Reorder_Grid( + int ngrid, int *gridID, double *Density_or_J, double *out) +{ + int i; + for (i = 0; i < ngrid; i++) + { + out[i] = Density_or_J[gridID[i]]; + } +} + +void _construct_V_local_bas( + int *mesh, + int nrow, + int ncol, + int *gridID, + double *auxBasis, + double *CoulG, + int row_shift, + double *V, + int *grid_ordering, + double *buf, // use the ptr of the ptr to ensure that the memory for each thread is aligned + const int buffersize // must be a multiple of 16 to ensure memory alignment +) +{ + // printf("nrow: %d, ncol: %d\n", nrow, ncol); + // printf("row_shift: %d\n", row_shift); + + const int nThread = get_omp_threads(); + size_t mesh_complex[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + const size_t n_real = mesh[0] * mesh[1] * mesh[2]; + const size_t n_complex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + const double fac = 1. / (double)n_real; + + // create plan for fft + + fftw_plan p_forward = fftw_plan_dft_r2c(3, mesh, auxBasis, (fftw_complex *)buf, FFTW_ESTIMATE); + fftw_plan p_backward = fftw_plan_dft_c2r(3, mesh, (fftw_complex *)buf, V, FFTW_ESTIMATE); + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf_thread = buf + thread_id * buffersize; + fftw_complex *buf_fft = (fftw_complex *)(buf_thread + n_real); + +#pragma omp for schedule(static) + for (size_t i = 0; i < nrow; i++) + { + // pack + + memset(buf_thread, 0, sizeof(double) * n_real); + + for (size_t j = 0; j < ncol; j++) + { + buf_thread[gridID[j]] = auxBasis[i * ncol + j]; + } + + // forward transform + + fftw_execute_dft_r2c(p_forward, buf_thread, (fftw_complex *)buf_fft); + + // multiply CoulG + + double *ptr = (double *)buf_fft; + + for (size_t j = 0; j < n_complex; j++) + { + *ptr++ *= CoulG[j]; /// TODO: use ISPC to accelerate + *ptr++ *= CoulG[j]; /// TODO: use ISPC to accelerate + } + + // backward transform + + memset(buf_thread, 0, sizeof(double) * n_real); + + fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf_fft, buf_thread); + + // scale + + ptr = V + (i + row_shift) * n_real; + + for (size_t j = 0; j < n_real; j++) + { + // *ptr++ *= fac; /// TODO: use ISPC to accelerate + // ptr[grid_ordering[j]] = buf_thread[j] * fac; + ptr[j] = buf_thread[grid_ordering[j]] * fac; + } + } + } + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); +} + +void _construct_V_kernel(int *mesh_bra, + int *mesh_ket, + int *map_bra_2_ket, + int naux, + double *auxBasis, + double *CoulG, // bra + double *V, + const int BunchSize, + double *buf, // use the ptr of the ptr to ensure that the memory for each thread is aligned + const int buffersize // must be a multiple of 16 to ensure memory alignment +) +{ + // printf("naux = %d\n", naux); + // printf("BunchSize = %d\n", BunchSize); + + // print all the input info + + static const int INC1 = 1; + static const int SMALL_SIZE = 8; + + const int nThread = get_omp_threads(); + const int nBunch = ((naux / BunchSize) / nThread) * nThread; // dispatch evenly + const int nLeft = naux - nBunch * BunchSize; + + // printf("nBunch = 
%d\n", nBunch); + // printf("nLeft = %d\n", nLeft); + + // print the dispatch info + + int mesh_bra_complex[3] = {mesh_bra[0], mesh_bra[1], mesh_bra[2] / 2 + 1}; + int mesh_ket_complex[3] = {mesh_ket[0], mesh_ket[1], mesh_ket[2] / 2 + 1}; + + const int n_real_bra = mesh_bra[0] * mesh_bra[1] * mesh_bra[2]; + const int n_complex_bra = mesh_bra_complex[0] * mesh_bra_complex[1] * mesh_bra_complex[2]; + const int n_real_ket = mesh_ket[0] * mesh_ket[1] * mesh_ket[2]; + const int n_complex_ket = mesh_ket_complex[0] * mesh_ket_complex[1] * mesh_ket_complex[2]; + + if (n_real_bra > n_real_ket) + { + printf("n_real_bra > n_real_ket\n"); + exit(1); + } + + const double fac = 1. / sqrtl((double)n_real_bra * (double)n_real_ket); + + // create plan for fft + + fftw_plan p_forward = fftw_plan_many_dft_r2c( + 3, mesh_bra, BunchSize, auxBasis, mesh_bra, 1, n_real_bra, (fftw_complex *)buf, mesh_bra_complex, 1, n_complex_bra, FFTW_ESTIMATE); + + fftw_plan p_backward = fftw_plan_many_dft_c2r( + 3, mesh_ket, BunchSize, (fftw_complex *)buf, mesh_ket_complex, 1, n_complex_ket, V, mesh_ket, 1, n_real_ket, FFTW_ESTIMATE); + + // execute parallelly sharing the same plan + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf_thread = buf + thread_id * (size_t)buffersize; + size_t bunch_i, bunch_start, bunch_end, j, k; + double *ptr; + +#pragma omp for schedule(static) + for (bunch_i = 0; bunch_i < nBunch; ++bunch_i) + // for (bunch_i = 0; bunch_i < 0; ++bunch_i) + { + bunch_start = bunch_i * BunchSize; + bunch_end = bunch_start + BunchSize; + + // forward transform + + fftw_execute_dft_r2c(p_forward, auxBasis + (size_t)bunch_start * (size_t)n_real_bra, (fftw_complex *)buf_thread); + + // multiply CoulG + + ptr = buf_thread; + + for (j = bunch_start; j < bunch_end; ++j) + { + for (k = 0; k < n_complex_bra; ++k) + { + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + } + } + + if (map_bra_2_ket != NULL) + { + ptr = buf_thread + n_complex_bra * 2 * BunchSize; + memset(ptr, 0, sizeof(double) * n_complex_ket * 2 * BunchSize); + for (j = bunch_start; j < bunch_end; ++j) + { + size_t shift = (j - bunch_start) * n_complex_bra * 2; + for (k = 0; k < n_complex_bra; ++k) + { + ptr[2 * map_bra_2_ket[k]] = buf_thread[shift + 2 * k]; + ptr[2 * map_bra_2_ket[k] + 1] = buf_thread[shift + 2 * k + 1]; + } + ptr += n_complex_ket * 2; + } + ptr = buf_thread + n_complex_bra * 2 * BunchSize; + } + else + { + ptr = buf_thread; + } + + // backward transform + + // fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf_thread, V + (size_t)bunch_start * (size_t)n_real); + fftw_execute_dft_c2r(p_backward, (fftw_complex *)ptr, V + (size_t)bunch_start * (size_t)n_real_ket); + + // scale + + ptr = V + (size_t)bunch_start * (size_t)n_real_ket; + int _size_ = n_real_ket * BunchSize; + dscal_(&_size_, &fac, ptr, &INC1); + + // for (j = bunch_start; j < bunch_end; ++j) + // { + // for (k = 0; k < n_real_ket; ++k) + // { + // *ptr++ *= fac; /// TODO: use ISPC to accelerate + // } + // } + } + } + + // destroy plan + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + + // printf("finish bulk nLeft = %d\n", nLeft); + // fflush(stdout); + + if (nLeft > 0) + { + if ((nLeft <= SMALL_SIZE) && (nLeft <= BunchSize)) + { + // printf("nLeft <= SMALL_SIZE or nLeft <= BunchSize\n"); + // fflush(stdout); + + // use single thread to handle the left + + int bunch_start = nBunch * BunchSize; + int bunch_end = bunch_start + nLeft; + + // create 
plan + + fftw_plan p_forward = fftw_plan_many_dft_r2c( + // 3, mesh, nLeft, auxBasis + bunch_start * n_real, mesh, 1, n_real, (fftw_complex *)buf, mesh_complex, 1, n_complex, FFTW_ESTIMATE); + 3, mesh_bra, nLeft, auxBasis + bunch_start * n_real_bra, mesh_bra, 1, n_real_bra, (fftw_complex *)buf, mesh_bra_complex, 1, n_complex_bra, FFTW_ESTIMATE); + + fftw_plan p_backward = fftw_plan_many_dft_c2r( + // 3, mesh, nLeft, (fftw_complex *)buf, mesh_complex, 1, n_complex, V + bunch_start * n_real, mesh, 1, n_real, FFTW_ESTIMATE); + 3, mesh_ket, nLeft, (fftw_complex *)buf, mesh_ket_complex, 1, n_complex_ket, V + bunch_start * n_real_ket, mesh_ket, 1, n_real_ket, FFTW_ESTIMATE); + + // forward transform + + // fftw_execute_dft_r2c(p_forward, auxBasis + (size_t)bunch_start * (size_t)n_real, (fftw_complex *)buf); + fftw_execute_dft_r2c(p_forward, auxBasis + (size_t)bunch_start * (size_t)n_real_bra, (fftw_complex *)buf); + + // multiply CoulG + + double *ptr = buf; + + for (int j = bunch_start; j < bunch_end; ++j) + { + // for (int k = 0; k < n_complex; ++k) + for (int k = 0; k < n_complex_bra; ++k) + { + *ptr++ *= CoulG[k]; /// + *ptr++ *= CoulG[k]; /// + } + } + + if (map_bra_2_ket != NULL) + { + ptr = buf + n_complex_bra * 2 * nLeft; + memset(ptr, 0, sizeof(double) * n_complex_ket * 2 * nLeft); + for (int j = bunch_start; j < bunch_end; ++j) + { + size_t shift = (j - bunch_start) * n_complex_bra * 2; + for (int k = 0; k < n_complex_bra; ++k) + { + ptr[2 * map_bra_2_ket[k]] = buf[shift + 2 * k]; + ptr[2 * map_bra_2_ket[k] + 1] = buf[shift + 2 * k + 1]; + } + ptr += n_complex_ket * 2; + } + ptr = buf + n_complex_bra * 2 * nLeft; + } + else + { + ptr = buf; + } + + // backward transform + + // fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf, V + (size_t)bunch_start * (size_t)n_real); + fftw_execute_dft_c2r(p_backward, (fftw_complex *)ptr, V + (size_t)bunch_start * (size_t)n_real_ket); + + // scale + + // ptr = V + (size_t)bunch_start * (size_t)n_real; + ptr = V + (size_t)bunch_start * (size_t)n_real_ket; + int _size_ = n_real_ket * nLeft; + dscal_(&_size_, &fac, ptr, &INC1); + + // for (int j = bunch_start; j < bunch_end; ++j) + // { + // for (int k = 0; k < n_real; ++k) + // { + // *ptr++ *= fac; /// + // } + // } + + // destroy plan + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + } + else + { + // printf("nLeft > SMALL_SIZE or nLeft > BunchSize\n"); + + // use parallel thread to handle the left, assume the nTransform is 1 + + int bunch_start = nBunch * BunchSize; + int bunch_end = bunch_start + nLeft; + + // create plan + + // fftw_plan p_forward = fftw_plan_dft_r2c(3, mesh, auxBasis + bunch_start * n_real, (fftw_complex *)buf, FFTW_ESTIMATE); + fftw_plan p_forward = fftw_plan_dft_r2c(3, + // mesh, auxBasis + bunch_start * n_real, (fftw_complex *)buf, FFTW_ESTIMATE); + mesh_bra, auxBasis + bunch_start * n_real_bra, (fftw_complex *)buf, FFTW_ESTIMATE); + + fftw_plan p_backward = fftw_plan_dft_c2r(3, + // mesh, (fftw_complex *)buf, V + bunch_start * n_real, FFTW_ESTIMATE); + mesh_ket, (fftw_complex *)buf, V + bunch_start * n_real_ket, FFTW_ESTIMATE); + + // size_t nbuf_per_thread = ((n_complex * 2 + 15) / 16) * 16; // make sure the memory is aligned + size_t nbuf_per_thread = ((n_complex_bra * 2 + 15) / 16) * 16; // make sure the memory is aligned + if (map_bra_2_ket != NULL) + { + nbuf_per_thread += ((n_complex_ket * 2 + 15) / 16) * 16; + } + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf_thread = buf + thread_id * 
(size_t)nbuf_per_thread; + size_t k; + double *ptr; + +#pragma omp for schedule(static) + for (size_t j = bunch_start; j < bunch_end; ++j) + { + + // forward transform + + // fftw_execute_dft_r2c(p_forward, auxBasis + j * (size_t)n_real, (fftw_complex *)buf_thread); + fftw_execute_dft_r2c(p_forward, auxBasis + j * (size_t)n_real_bra, (fftw_complex *)buf_thread); + + // multiply CoulG + + ptr = buf_thread; + + // for (k = 0; k < n_complex; ++k) + for (k = 0; k < n_complex_bra; ++k) + { + *ptr++ *= CoulG[k]; + *ptr++ *= CoulG[k]; + } + + if (map_bra_2_ket != NULL) + { + ptr = buf_thread + n_complex_bra * 2; + memset(ptr, 0, sizeof(double) * n_complex_ket * 2); + for (k = 0; k < n_complex_bra; ++k) + { + ptr[2 * map_bra_2_ket[k]] = buf_thread[2 * k]; + ptr[2 * map_bra_2_ket[k] + 1] = buf_thread[2 * k + 1]; + } + ptr = buf_thread + n_complex_bra * 2; + } + else + { + ptr = buf_thread; + } + + // backward transform + + // fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf_thread, V + j * (size_t)n_real); + fftw_execute_dft_c2r(p_backward, (fftw_complex *)ptr, V + j * (size_t)n_real_ket); + + // scale + + // ptr = V + j * (size_t)n_real; + ptr = V + j * (size_t)n_real_ket; + int _size_ = n_real_ket; + dscal_(&_size_, &fac, ptr, &INC1); + + // for (k = 0; k < n_real; ++k) + // { + // *ptr++ *= fac; + // } + } + } + + // destroy plan + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + } + } +} + +void _construct_V(int *mesh, + int naux, + double *auxBasis, + double *CoulG, + double *V, + const int BunchSize, + double *buf, // use the ptr of the ptr to ensure that the memory for each thread is aligned + const int buffersize // must be a multiple of 16 to ensure memory alignment +) +{ + _construct_V_kernel(mesh, mesh, NULL, naux, auxBasis, CoulG, V, BunchSize, buf, buffersize); +} + +void _construct_V2(int *mesh, + int naux, + double *auxBasis, + double *CoulG, + double *V, + double *auxBasisFFT, + const int BunchSize, + double *buf, // use the ptr of the ptr to ensure that the memory for each thread is aligned + const int buffersize, // must be a multiple of 16 to ensure memory alignment + const int CONSTRUCT_V) +{ + // printf("CONSTRUCT_V: %d\n", CONSTRUCT_V); + + // print all the input info + + static const int SMALL_SIZE = 8; + + const int nThread = get_omp_threads(); + const int nBunch = ((naux / BunchSize) / nThread) * nThread; // dispatch evenly + const int nLeft = naux - nBunch * BunchSize; + + // print the dispatch info + + int mesh_complex[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + + const int n_real = mesh[0] * mesh[1] * mesh[2]; + const int n_complex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + const double fac = 1. 
/ (double)n_real; + + // create plan for fft + + fftw_plan p_forward = fftw_plan_many_dft_r2c( + 3, mesh, BunchSize, auxBasis, mesh, 1, n_real, (fftw_complex *)buf, mesh_complex, 1, n_complex, FFTW_ESTIMATE); + fftw_plan p_backward = fftw_plan_many_dft_c2r( + 3, mesh, BunchSize, (fftw_complex *)buf, mesh_complex, 1, n_complex, V, mesh, 1, n_real, FFTW_ESTIMATE); + + // execute parallelly sharing the same plan + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf_thread = buf + thread_id * (size_t)buffersize; + size_t bunch_i, bunch_start, bunch_end, j, k; + double *ptr; + +#pragma omp for schedule(static) + for (bunch_i = 0; bunch_i < nBunch; ++bunch_i) + // for (bunch_i = 0; bunch_i < 0; ++bunch_i) + { + bunch_start = bunch_i * BunchSize; + bunch_end = bunch_start + BunchSize; + + // forward transform + + fftw_execute_dft_r2c(p_forward, auxBasis + (size_t)bunch_start * (size_t)n_real, (fftw_complex *)buf_thread); + + // multiply CoulG + + ptr = buf_thread; + + // copy + + memcpy(auxBasisFFT + (size_t)bunch_start * (size_t)n_complex * 2, buf_thread, (size_t)BunchSize * (size_t)n_complex * sizeof(double) * 2); + + if (CONSTRUCT_V > 0) + { + for (j = bunch_start; j < bunch_end; ++j) + { + for (k = 0; k < n_complex; ++k) + { + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + } + } + + // backward transform + + fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf_thread, V + (size_t)bunch_start * (size_t)n_real); + + // scale + + ptr = V + (size_t)bunch_start * (size_t)n_real; + + for (j = bunch_start; j < bunch_end; ++j) + { + for (k = 0; k < n_real; ++k) + { + *ptr++ *= fac; /// TODO: use ISPC to accelerate + } + } + } + } + } + + // destroy plan + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + + if (nLeft > 0) + { + if ((nLeft <= SMALL_SIZE) && (nLeft <= BunchSize)) + // if (1) + { + // use single thread to handle the left + + int bunch_start = nBunch * BunchSize; + int bunch_end = bunch_start + nLeft; + + // create plan + + fftw_plan p_forward = fftw_plan_many_dft_r2c( + 3, mesh, nLeft, auxBasis + bunch_start * n_real, mesh, 1, n_real, (fftw_complex *)buf, mesh_complex, 1, n_complex, FFTW_ESTIMATE); + fftw_plan p_backward = fftw_plan_many_dft_c2r( + 3, mesh, nLeft, (fftw_complex *)buf, mesh_complex, 1, n_complex, V + bunch_start * n_real, mesh, 1, n_real, FFTW_ESTIMATE); + + // forward transform + + fftw_execute_dft_r2c(p_forward, auxBasis + (size_t)bunch_start * (size_t)n_real, (fftw_complex *)buf); + + // multiply CoulG + + double *ptr = buf; + + // copy + + memcpy(auxBasisFFT + (size_t)bunch_start * (size_t)n_complex * 2, buf, (size_t)nLeft * (size_t)n_complex * sizeof(double) * 2); + + if (CONSTRUCT_V > 0) + { + for (int j = bunch_start; j < bunch_end; ++j) + { + for (int k = 0; k < n_complex; ++k) + { + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + } + } + + // backward transform + + fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf, V + (size_t)bunch_start * (size_t)n_real); + + // scale + + ptr = V + (size_t)bunch_start * (size_t)n_real; + + for (int j = bunch_start; j < bunch_end; ++j) + { + for (int k = 0; k < n_real; ++k) + { + *ptr++ *= fac; /// TODO: use ISPC to accelerate + } + } + } + + // destroy plan + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + } + else + { + + // use parallel thread to handle the left, assume the nTransform is 1 + + int bunch_start = 
nBunch * BunchSize; + int bunch_end = bunch_start + nLeft; + + // create plan + + fftw_plan p_forward = fftw_plan_dft_r2c(3, mesh, auxBasis + bunch_start * n_real, (fftw_complex *)buf, FFTW_ESTIMATE); + fftw_plan p_backward = fftw_plan_dft_c2r(3, mesh, (fftw_complex *)buf, V + bunch_start * n_real, FFTW_ESTIMATE); + + size_t nbuf_per_thread = ((n_complex * 2 + 15) / 16) * 16; // make sure the memory is aligned + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf_thread = buf + thread_id * (size_t)nbuf_per_thread; + size_t k; + double *ptr; + +#pragma omp for schedule(static) + for (size_t j = bunch_start; j < bunch_end; ++j) + { + + // forward transform + + fftw_execute_dft_r2c(p_forward, auxBasis + j * (size_t)n_real, (fftw_complex *)buf_thread); + + // multiply CoulG + + ptr = buf_thread; + + // copy + + memcpy(auxBasisFFT + j * (size_t)n_complex * 2, buf_thread, (size_t)n_complex * sizeof(double) * 2); + + if (CONSTRUCT_V > 0) + { + for (k = 0; k < n_complex; ++k) + { + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + } + + // backward transform + + fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf_thread, V + j * (size_t)n_real); + + // scale + + ptr = V + j * (size_t)n_real; + + for (k = 0; k < n_real; ++k) + { + *ptr++ *= fac; /// TODO: use ISPC to accelerate + } + } + } + } + + // destroy plan + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + } + } +} + +void _construct_W_multiG( + int naux, + int p0, + int p1, + double *auxBasisFFT, + double *CoulG) +{ + int ngrid = p1 - p0; + int nThread = get_omp_threads(); + + size_t i; + + const double *ptr_G = CoulG + p0; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (i = 0; i < naux; i++) + { + size_t j; + double *ptr_basis = auxBasisFFT + i * ngrid * 2; + for (j = 0; j < ngrid; j++) + { + ptr_basis[j * 2] *= ptr_G[j]; + ptr_basis[j * 2 + 1] *= ptr_G[j]; + } + } +} + +///////////// get_jk linear scaling ///////////// + +void _extract_dm_involved_ao( + double *dm, + const int nao, + double *res_buf, + const int *ao_involved, + const int nao_involved) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nao_involved; ++i) + { + for (size_t j = 0; j < nao_involved; ++j) + { + res_buf[i * nao_involved + j] = dm[ao_involved[i] * nao + ao_involved[j]]; + } + } +} + +void _extract_dm_involved_ao_RS( + double *dm, + const int nao, + double *res_buf, + const int *bra_ao_involved, + const int bra_nao_involved, + const int *ket_ao_involved, + const int ket_nao_involved) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < bra_nao_involved; ++i) + { + for (size_t j = 0; j < ket_nao_involved; ++j) + { + res_buf[i * ket_nao_involved + j] = dm[bra_ao_involved[i] * nao + ket_ao_involved[j]]; + } + } +} + +void _packadd_local_dm( + double *local_dm, + const int nao_involved, + const int *ao_involved, + double *dm, + const int nao) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nao_involved; ++i) + { + for (size_t j = 0; j < nao_involved; ++j) + { + dm[ao_involved[i] * nao + ao_involved[j]] += local_dm[i * nao_involved + j]; + } + } +} + +void _packadd_local_dm2_add_transpose( + double *local_dm, + const int bra_nao_involved, + const int *bra_ao_involved, + const int 
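/* Self-contained toy example (illustrative, not part of the patch) of the
   gather performed by _extract_dm_involved_ao above: sub[i,j] picks
   dm[ao_involved[i], ao_involved[j]]; the _packadd_local_* routines nearby
   are the matching scatter-add in the other direction. */
#include <stdio.h>

int main(void)
{
    enum { NAO = 3, NSUB = 2 };
    const int ao_involved[NSUB] = {0, 2};
    const double dm[NAO * NAO] = {1, 2, 3,
                                  4, 5, 6,
                                  7, 8, 9};
    double sub[NSUB * NSUB];

    for (int i = 0; i < NSUB; ++i)
        for (int j = 0; j < NSUB; ++j)
            sub[i * NSUB + j] = dm[ao_involved[i] * NAO + ao_involved[j]];

    printf("%g %g %g %g\n", sub[0], sub[1], sub[2], sub[3]); /* 1 3 7 9 */
    return 0;
}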
ket_nao_involved, + const int *ket_ao_involved, + double *dm, + const int nao) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < bra_nao_involved; ++i) + { + for (size_t j = 0; j < ket_nao_involved; ++j) + { + dm[bra_ao_involved[i] * nao + ket_ao_involved[j]] += local_dm[i * ket_nao_involved + j]; + dm[ket_ao_involved[j] * nao + bra_ao_involved[i]] += local_dm[i * ket_nao_involved + j]; + } + } +} + +void _packadd_local_dm2( + double *local_dm, + const int bra_nao_involved, + const int *bra_ao_involved, + const int ket_nao_involved, + const int *ket_ao_involved, + double *dm, + const int nao) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < bra_nao_involved; ++i) + { + for (size_t j = 0; j < ket_nao_involved; ++j) + { + dm[bra_ao_involved[i] * nao + ket_ao_involved[j]] += local_dm[i * ket_nao_involved + j]; + } + } +} + +void _packadd_local_RS( + double *local_dm, + const int bra_nao_involved, + const int *bra_ao_involved, + const int ket_nao_involved, + const int *ket_ao_involved, + double *dm, + const int nao) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < bra_nao_involved; ++i) + { + for (size_t j = 0; j < ket_nao_involved; ++j) + { + dm[bra_ao_involved[i] * nao + ket_ao_involved[j]] += local_dm[i * ket_nao_involved + j]; + dm[ket_ao_involved[j] * nao + bra_ao_involved[i]] += local_dm[i * ket_nao_involved + j]; + } + } +} + +void _buildJ_k_packaddrow( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *rowloc, + const int *colloc) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_source; ++i) + { + size_t row_loc = rowloc[i]; + for (size_t j = 0; j < ncol_source; ++j) + { + target[row_loc * ncol_target + colloc[j]] += source[i * ncol_source + j]; + } + } +} + +void _buildK_packaddrow( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *ao_involved) +{ + int nThread = get_omp_threads(); + + static const int INC = 1; + static const double ONE = 1.0; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_source; ++i) + { + size_t row_loc = ao_involved[i]; + // memcpy(target + row_loc * ncol_target, source + i * ncol_source, sizeof(double) * ncol_source); + daxpy_(&ncol_source, &ONE, source + i * ncol_source, &INC, target + row_loc * ncol_target, &INC); + } +} + +void _buildK_packaddrow_shift_col( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *ao_involved, + const int kmesh, + const int nao_prim, + const int *box_permutation) +{ + int nThread = get_omp_threads(); + + static const int INC = 1; + static const double ONE = 1.0; + + if (ncol_target != (kmesh * nao_prim)) + { + printf("Error: ncol_target!=(kmesh *nao_prim)\n"); + exit(1); + } + + if (ncol_source != ncol_target) + { + printf("Error: ncol_source!=ncol_target\n"); + exit(1); + } + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_source; ++i) + { + size_t row_loc = ao_involved[i]; + // memcpy(target + row_loc * ncol_target, source + i * 
ncol_source, sizeof(double) * ncol_source); + // daxpy_(&ncol_source, &ONE, source + i * ncol_source, &INC, target + row_loc * ncol_target, &INC); + for (size_t j = 0; j < kmesh; ++j) + { + daxpy_(&nao_prim, &ONE, source + i * ncol_source + j * nao_prim, &INC, target + row_loc * ncol_target + box_permutation[j] * nao_prim, &INC); + } + } +} + +void _buildK_packaddcol( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *ao_involved) + +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_source; ++i) + { + for (size_t j = 0; j < ncol_source; ++j) + { + target[i * ncol_target + ao_involved[j]] += source[i * ncol_source + j]; + } + } +} + +void _buildK_packrow( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *ao_involved) +{ + int nThread = get_omp_threads(); + + // static const int INC = 1; + // static const double ONE = 1.0; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_target; ++i) + { + size_t row_loc = ao_involved[i]; + memcpy(target + i * ncol_target, source + row_loc * ncol_source, sizeof(double) * ncol_source); + } +} + +void _buildK_packcol( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *ao_involved) +{ + int nThread = get_omp_threads(); + + // static const int INC = 1; + // static const double ONE = 1.0; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_target; ++i) + { + for (size_t j = 0; j < ncol_target; ++j) + { + target[i * ncol_target + j] = source[i * ncol_source + ao_involved[j]]; + } + } +} + +void _buildK_unpackcol( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *source_ind) +{ + int nThread = get_omp_threads(); + + // static const int INC = 1; + // static const double ONE = 1.0; + + memset(target, 0, sizeof(double) * nrow_target * ncol_target); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_source; ++i) + { + for (size_t j = 0; j < ncol_source; ++j) + { + target[i * ncol_target + source_ind[j]] = source[i * ncol_source + j]; + } + } +} + +void _buildK_packcol2( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int col_indx_begin, + const int col_indx_end) +{ + int nThread = get_omp_threads(); + + // static const int INC = 1; + // static const double ONE = 1.0; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_target; ++i) + { + memcpy(target + i * ncol_target, source + i * ncol_source + col_indx_begin, sizeof(double) * (col_indx_end - col_indx_begin)); + } +} + +void _buildK_packcol3( + double *target, + const int nrow_target, + const int ncol_target, + const int col_indx_begin, + const int col_indx_end, + double *source, + const int nrow_source, + const int ncol_source) +{ + int nThread = get_omp_threads(); + + // static const int INC = 1; + // static const double ONE = 1.0; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_target; ++i) + { + memcpy(target + i * ncol_target + 
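/* Plain-loop reference (illustrative, not part of the patch) for the
   _buildK_pack/packadd family above: _buildK_packaddrow accumulates each
   compact row into the full-matrix row selected by the index list (the
   production routine does this with one BLAS daxpy_ per row), while the
   pack/unpack variants copy rows or columns through the same kind of map. */
static void packaddrow_ref(double *target, int ncol_target,
                           const double *source, int nrow_source,
                           int ncol_source, const int *row_map)
{
    /* target[row_map[i], 0:ncol_source] += source[i, :] */
    for (int i = 0; i < nrow_source; ++i)
        for (int j = 0; j < ncol_source; ++j)
            target[(size_t)row_map[i] * ncol_target + j] +=
                source[(size_t)i * ncol_source + j];
}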
col_indx_begin, source + i * ncol_source, sizeof(double) * (col_indx_end - col_indx_begin)); + } +} + +void _buildK_copy(double *target, double *source, const size_t size) +{ + memcpy(target, source, sizeof(double) * size); +} + +////////// used in moR to density ////////// + +void moR_to_Density( + const int ngrids, + const int nMO, + const double *moR, + double *rhoR) +{ + int nThread = get_omp_threads(); + + int ngrid_per_thread = (ngrids + nThread - 1) / nThread; + + memset(rhoR, 0, sizeof(double) * ngrids); + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + int grid_start = thread_id * ngrid_per_thread; + grid_start = grid_start < ngrids ? grid_start : ngrids; + int grid_end = (thread_id + 1) * ngrid_per_thread; + grid_end = grid_end < ngrids ? grid_end : ngrids; + + for (int i = 0; i < nMO; i++) + { + for (int j = grid_start; j < grid_end; j++) + { + rhoR[j] += moR[i * ngrids + j] * moR[i * ngrids + j]; + } + } + } +} + +////////// transpose 012 -> 021 ////////// + +void transpose_012_to_021( + double *target, + double *source, + const int n1, + const int n2, + const int n3) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < n1; i++) + { + size_t shift = i * n2 * n3; + double *ptr_target = target + shift; + double *ptr_source = source + shift; + for (size_t j = 0; j < n2; j++) + { + for (size_t k = 0; k < n3; k++) + { + ptr_target[k * n2 + j] = ptr_source[j * n3 + k]; + } + } + } +} + +void transpose_012_to_021_InPlace( + double *target, + const int n1, + const int n2, + const int n3, + double *buf) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < n1; i++) + { + size_t shift = i * n2 * n3; + double *ptr_buf = buf + shift; + double *ptr_source = target + shift; + for (size_t j = 0; j < n2; j++) + { + for (size_t k = 0; k < n3; k++) + { + ptr_buf[k * n2 + j] = ptr_source[j * n3 + k]; + } + } + } + + memcpy(target, buf, sizeof(double) * n1 * n2 * n3); +} + +void contract_ipk_pk_to_ik( + double *A, + double *B, + double *C, + const int n1, + const int n2, + const int n3) +{ + int nThread = get_omp_threads(); + + memset(C, 0, sizeof(double) * n1 * n3); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < n1; i++) + { + double *ptr_A = A + i * n2 * n3; + double *ptr_B = B; + for (size_t j = 0; j < n2; j++) + { + double *ptr_res = C + i * n3; + for (size_t k = 0; k < n3; k++) + { + *ptr_res++ += *ptr_A++ * *ptr_B++; + } + } + } +} + +////////// used in CCCC for LR part in RS-ISDF ////////// + +void _unpack_aoPairR( + double *target, + const int n1, + const int n2, + const int n3, + double *source, + const int m1, + const int m2, + const int m3, + const int m2_begin, + const int m2_end, + const int *grid_involved) +{ + int nThread = get_omp_threads(); + + int ntask = n1 * (m2_end - m2_begin); + + memset(target, 0, sizeof(double) * n1 * n2 * n3); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < ntask; i++) + { + size_t i1 = i / (m2_end - m2_begin); + size_t i2 = i % (m2_end - m2_begin); + + size_t shift_target = i1 * n2 * n3 + i2 * n3; + size_t shift_source = i1 * m2 * m3 + (i2 + m2_begin) * m3; + + for (size_t j = 0; j < m3; ++j) + { + target[shift_target + grid_involved[j]] = source[shift_source + j]; + } + } +} + +void _pack_aoPairR_index1( + double *target, + const int n1, + const int n2, + const int n3, + double 
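/* Tiny self-contained check (illustrative, not part of the patch) of the
   reduction implemented by moR_to_Density above: rho(g) = sum_i moR[i,g]^2,
   with threads partitioned over grid points so no two threads ever write
   the same rhoR entry. */
#include <stdio.h>

int main(void)
{
    enum { NMO = 2, NG = 3 };
    const double moR[NMO * NG] = {1.0, 2.0, 0.0,  /* MO 0 on 3 grid points */
                                  3.0, 0.0, 1.0}; /* MO 1 */
    double rho[NG] = {0};

    for (int i = 0; i < NMO; ++i)
        for (int g = 0; g < NG; ++g)
            rho[g] += moR[i * NG + g] * moR[i * NG + g];

    printf("%g %g %g\n", rho[0], rho[1], rho[2]); /* 10 4 1 */
    return 0;
}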
*source, + const int m1, + const int m2, + const int m3, + const int m2_begin, + const int m2_end) +{ + int nThread = get_omp_threads(); + + // memset(target, 0, sizeof(double) * n1 * n2 * n3); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < n1; i++) + { + size_t shift_target = i * n2 * n3; + size_t shift_source = i * m2 * m3 + m2_begin * m3; + + memcpy(target + shift_target, source + shift_source, sizeof(double) * n2 * m3); + } +} + +////////// in determing partition ////////// + +double _distance_translation(double *pa, double *pb, double *a) +{ + double dx, dx1, dx2; + double dy, dy1, dy2; + double dz, dz1, dz2; + + dx = pa[0] - pb[0]; + dx1 = dx - a[0]; + dx2 = dx + a[0]; + dx = fabs(dx); + dx1 = fabs(dx1); + dx2 = fabs(dx2); + dx = fmin(fmin(dx, dx1), dx2); + + dy = pa[1] - pb[1]; + dy1 = dy - a[1]; + dy2 = dy + a[1]; + dy = fabs(dy); + dy1 = fabs(dy1); + dy2 = fabs(dy2); + dy = fmin(fmin(dy, dy1), dy2); + + dz = pa[2] - pb[2]; + dz1 = dz - a[2]; + dz2 = dz + a[2]; + dz = fabs(dz); + dz1 = fabs(dz1); + dz2 = fabs(dz2); + dz = fmin(fmin(dz, dz1), dz2); + + return sqrt(dx * dx + dy * dy + dz * dz); +} + +void distance_between_point_atms( + double *distance, + double *pnt, + double *atm_coords, + const int natm, + const double *lattice_vector) +{ + double a[3]; + a[0] = lattice_vector[0 * 3 + 0]; + a[1] = lattice_vector[1 * 3 + 1]; + a[2] = lattice_vector[2 * 3 + 2]; + +#pragma omp parallel for schedule(static) num_threads(get_omp_threads()) + for (int i = 0; i < natm; i++) + { + distance[i] = _distance_translation(pnt, atm_coords + i * 3, a); + } +} + +void distance_between_points_atms( + double *distance, + double *pnt, + const int npnt, + double *atm_coords, + const int natm, + const double *lattice_vector) +{ + double a[3]; + a[0] = lattice_vector[0 * 3 + 0]; + a[1] = lattice_vector[1 * 3 + 1]; + a[2] = lattice_vector[2 * 3 + 2]; + +#pragma omp parallel for schedule(static) num_threads(get_omp_threads()) + for (size_t i = 0; i < npnt; i++) + { + for (size_t j = 0; j < natm; j++) + { + distance[i * natm + j] = _distance_translation(pnt + i * 3, atm_coords + j * 3, a); + } + } +} + +//////////// further linear algebra operations + +void NPdcwisemul(double *out, double *a, double *b, size_t n) +{ +#pragma omp parallel + { + size_t i; +#pragma omp for schedule(static) + for (i = 0; i < n; i++) + { + out[i] = a[i] * b[i]; + } + } +} + +void NPz2d_InPlace(double complex *in, const size_t n) +{ + // printf("n = %d\n", n); + // fflush(stdout); + + double *out = (double *)in; + + int nThread = get_omp_threads(); + + int BunchSize = n / nThread; + int nLeft = n - BunchSize * nThread; + +#pragma omp parallel num_threads(nThread) + { + size_t i; + + int tid = omp_get_thread_num(); + int start = tid * BunchSize; + int end = start + BunchSize; + + if (tid == nThread - 1) + { + end += nLeft; + } + + double *ptr_real = (double *)(in + start); + double complex *ptr_complex = in + start; + + for (i = 0; i < end - start; i++) + { + ptr_real[i] = creal(ptr_complex[i]); + } + } + + // copy back + + for (int i = 1; i < nThread; i++) + { + int start = i * BunchSize; + int end = start + BunchSize; + + if (i == nThread - 1) + { + end += nLeft; + } + + memcpy(out + start, in + start, sizeof(double) * (end - start)); + } +} + +void NPdsquare_inPlace(double *a, size_t n) +{ +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < n; i++) + { + a[i] = a[i] * a[i]; + } +} + +void NPd_ij_j_ij(double *out, double *a, double *b, size_t nrow, size_t ncol) +{ 
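/* One-axis version (illustrative, not part of the patch) of the
   minimum-image rule used by _distance_translation above for an
   orthorhombic cell: each Cartesian separation is compared against the
   separations shifted by +/- one lattice translation. */
#include <math.h>
#include <stdio.h>

static double min_image_1d(double xa, double xb, double box)
{
    double d = fabs(xa - xb);
    d = fmin(d, fabs(xa - xb - box));
    return fmin(d, fabs(xa - xb + box));
}

int main(void)
{
    /* points 0.5 and 9.8 in a box of length 10 are only 0.7 apart */
    printf("%g\n", min_image_1d(0.5, 9.8, 10.0));
    return 0;
}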
+#pragma omp parallel + { + size_t i, j; + double *pa, *pout; +#pragma omp for schedule(static) + for (i = 0; i < nrow; i++) + { + pa = a + i * ncol; + pout = out + i * ncol; + for (j = 0; j < ncol; j++) + { + pout[j] = pa[j] * b[j]; // out[i,j] = a[i,j] * b[j] + } + } + } +} + +void NPd_i_ij_ij(double *out, double *a, double *b, size_t nrow, size_t ncol) +{ +#pragma omp parallel + { + size_t i, j; + double *pb, *pout; +#pragma omp for schedule(static) + for (i = 0; i < nrow; i++) + { + pb = b + i * ncol; + pout = out + i * ncol; + for (j = 0; j < ncol; j++) + { + pout[j] = a[i] * pb[j]; // out[i,j] = a[i] * b[i,j] + } + } + } +} + diff --git a/pyscf/isdf/pbc_isdf_auxbasis.c b/pyscf/isdf/pbc_isdf_auxbasis.c new file mode 100644 index 000000000..415db61c4 --- /dev/null +++ b/pyscf/isdf/pbc_isdf_auxbasis.c @@ -0,0 +1,482 @@ +#include "vhf/fblas.h" +#include +#include +#include +#include +int get_omp_threads(); +int omp_get_thread_num(); + +void ColPivotQRRelaCut( + double *aoPaironGrid, // (nPair, nGrid) + const int nPair, + const int nGrid, + const int max_rank, + const double cutoff, // abs_cutoff + const double relacutoff, + int *pivot, + double *R, + int *npt_find, + double *thread_buffer, // (nThread, nGrid) + double *global_buffer) // nGrid +{ + static const int INC = 1; + + // printf("nPair: %d\n", nPair); + // printf("nGrid: %d\n", nGrid); + // printf("max_rank: %d\n", max_rank); + // printf("cutoff: %f\n", cutoff); + + double *Q = aoPaironGrid; + + for (int i = 0; i < nGrid; ++i) + { + pivot[i] = i; + } + + int nThread = get_omp_threads(); + *npt_find = 0; + + int *reduce_indx_buffer = (int *)(thread_buffer + nThread * nGrid); + + int i; + + int argmaxnorm = 0; + double maxnorm = 0.0; + + for (i = 0; i < max_rank; i++) + { + // printf("i: %d\n", i); + +#pragma omp parallel num_threads(nThread) + { + + int thread_id = omp_get_thread_num(); + double *buf = thread_buffer + thread_id * nGrid; + memset(buf, 0, sizeof(double) * nGrid); + + int j, k; + + double *dptr; + + //// 1. 
determine the arg of maxinaml norm + +#pragma omp for schedule(static) + for (j = 0; j < nPair; j++) + { + dptr = Q + j * nGrid; + for (k = i; k < nGrid; k++) + { + buf[k] += dptr[k] * dptr[k]; + } + } + + int bunchsize = (nGrid - i) / nThread + 1; + int begin_id = i + thread_id * bunchsize; + int end_id = i + (thread_id + 1) * bunchsize; + if (thread_id == nThread - 1) + { + end_id = nGrid; + } + + if (begin_id >= nGrid) + { + begin_id = nGrid; + } + + if (end_id > nGrid) + { + end_id = nGrid; + } + + memcpy(global_buffer + begin_id, thread_buffer + begin_id, sizeof(double) * (end_id - begin_id)); + + for (j = 1; j < nThread; j++) + { + dptr = thread_buffer + j * nGrid; + for (k = begin_id; k < end_id; ++k) + { + global_buffer[k] += dptr[k]; + } + } + + // get the local max + + if (begin_id < end_id) + { + double max_norm2 = global_buffer[begin_id]; + reduce_indx_buffer[thread_id] = begin_id; + for (j = begin_id + 1; j < end_id; j++) + { + if (global_buffer[j] > max_norm2) + { + max_norm2 = global_buffer[j]; + reduce_indx_buffer[thread_id] = j; + } + } + } + else + { + reduce_indx_buffer[thread_id] = begin_id - 1; + } + + // printf("max_norm2: %.3e\n", max_norm2); + +#pragma omp barrier + +#pragma omp single + { + // printf("--------------------------------\n"); + maxnorm = global_buffer[reduce_indx_buffer[0]]; + argmaxnorm = reduce_indx_buffer[0]; + // printf("maxnorm: %.3e\n", maxnorm); + // printf("argmaxnorm: %d\n", argmaxnorm); + for (j = 1; j < nThread; j++) + { + if (global_buffer[reduce_indx_buffer[j]] > maxnorm) + { + // printf("j = %d\n", j); + // printf("global_buffer[reduce_indx_buffer[j]]: %.3e\n", global_buffer[reduce_indx_buffer[j]]); + + maxnorm = global_buffer[reduce_indx_buffer[j]]; + argmaxnorm = reduce_indx_buffer[j]; + + // printf("maxnorm: %.3e\n", maxnorm); + // printf("argmaxnorm: %d\n", argmaxnorm); + } + } + + // printf("i = %d\n", i); + // printf("argmaxnorm = %d\n", argmaxnorm); + + int tmp; + tmp = pivot[i]; + pivot[i] = pivot[argmaxnorm]; + pivot[argmaxnorm] = tmp; + + // printf("argmaxnorm: %d\n", argmaxnorm); + // printf("tmp = %d\n", tmp); + // printf("pivot[i] = %d\n", pivot[i]); + // printf("pivot[argmaxnorm] = %d\n", pivot[argmaxnorm]); + // printf("--------------------------------\n"); + + maxnorm = sqrt(maxnorm); + R[i * nGrid + i] = maxnorm; + // printf("R[%3d,%3d] = maxnorm = %10.3e\n", i, i, maxnorm); + } + +#pragma omp barrier + + //// 2. switch + + ///// Q + +#pragma omp for schedule(static) nowait + for (j = 0; j < nPair; ++j) + { + dptr = Q + j * nGrid; + double tmp; + tmp = dptr[i]; + dptr[i] = dptr[argmaxnorm]; + dptr[argmaxnorm] = tmp; + dptr[i] /= maxnorm; + } + + ///// R + +#pragma omp for schedule(static) + for (j = 0; j < i; ++j) + { + dptr = R + i * nGrid; + double tmp; + tmp = dptr[i]; + dptr[i] = dptr[argmaxnorm]; + dptr[argmaxnorm] = tmp; + } + + //// 3. 
perform Schimidt decomposition + + ///// calculate the inner product + + memset(buf, 0, sizeof(double) * nGrid); + + int nleft = nGrid - i - 1; + +#pragma omp for schedule(static) + for (j = 0; j < nPair; ++j) + { + dptr = Q + j * nGrid; + daxpy_(&nleft, dptr + i, dptr + i + 1, &INC, buf + i + 1, &INC); + } + + bunchsize = nleft / nThread; + begin_id = i + 1 + thread_id * bunchsize; + end_id = i + 1 + (thread_id + 1) * bunchsize; + if (thread_id == nThread - 1) + { + end_id = nGrid; + } + + memcpy(global_buffer + begin_id, thread_buffer + begin_id, sizeof(double) * (end_id - begin_id)); + + for (j = 1; j < nThread; j++) + { + dptr = thread_buffer + j * nGrid; + for (k = begin_id; k < end_id; ++k) + { + global_buffer[k] += dptr[k]; + } + } + +#pragma omp barrier + + // project out + + double *inner_prod = global_buffer + i + 1; + +#pragma omp for schedule(static) nowait + for (j = 0; j < nPair; ++j) + { + dptr = Q + j * nGrid; + double alpha = -dptr[i]; + daxpy_(&nleft, &alpha, inner_prod, &INC, dptr + i + 1, &INC); + } + + // update R + +#pragma omp single + { + memcpy(R + i * nGrid + i + 1, inner_prod, sizeof(double) * nleft); + } + } + + if ((maxnorm < cutoff) || (maxnorm < R[0] * relacutoff)) + { + break; + } + else + { + (*npt_find)++; + } + } +} + +void ColPivotQR( + double *aoPaironGrid, // (nPair, nGrid) + const int nPair, + const int nGrid, + const int max_rank, + const double cutoff, + int *pivot, + double *R, + int *npt_find, + double *thread_buffer, // (nThread, nGrid) + double *global_buffer) // nGrid +{ + ColPivotQRRelaCut( + aoPaironGrid, nPair, nGrid, max_rank, cutoff, 0.0, pivot, R, npt_find, thread_buffer, global_buffer); +} + +void NP_d_ik_jk_ijk( + const double *A, + const double *B, + double *out, + const int nA, + const int nB, + const int nC) +{ + // printf("nA: %d\n", nA); + // printf("nB: %d\n", nB); + // printf("nC: %d\n", nC); + + int i, j; +#pragma omp parallel for private(i, j) + for (i = 0; i < nA * nB; ++i) + { + int i1 = i / nB; + int i2 = i % nB; + for (j = 0; j < nC; ++j) + { + out[i * nC + j] = A[i1 * nC + j] * B[i2 * nC + j]; + } + } +} + +void NPdsliceFirstCol(double *out, const double *a, size_t ncol_left, size_t nrow, size_t ncol) +{ +#pragma omp parallel + { + size_t i; +#pragma omp for schedule(static) + for (i = 0; i < nrow; i++) + { + memcpy(out + i * ncol_left, a + i * ncol, sizeof(double) * ncol_left); + } + } +} + +void CalculateNormRemained( + const double *InnerProd, // (nIP, nPntPotential) + const int nIP, + const int nPntPotential, + const double *aoPaironGrid, // (nPair, nPntPotential) + const int nPair, + double *thread_buffer, + double *global_buffer) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf = thread_buffer + thread_id * nPntPotential; + memset(buf, 0, sizeof(double) * nPntPotential); + + int i, j; + + double *dptr; + const double *cdptr; + +#pragma omp for schedule(static) + for (i = 0; i < nPair; i++) + { + cdptr = aoPaironGrid + i * nPntPotential; + for (j = 0; j < nPntPotential; j++) + { + buf[j] += cdptr[j] * cdptr[j]; + } + } + + int bunchsize = nPntPotential / nThread; + int begin_id = thread_id * bunchsize; + int end_id = (thread_id + 1) * bunchsize; + if (thread_id == nThread - 1) + { + end_id = nPntPotential; + } + + memcpy(global_buffer + begin_id, thread_buffer + begin_id, sizeof(double) * (end_id - begin_id)); + + for (i = 1; i < nThread; i++) + { + dptr = thread_buffer + i * nPntPotential; + for (j = begin_id; j < end_id; j++) 
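/* Compact serial reference (illustrative sketch; no threading, R factor
   omitted) for the pivoted QR above: at step i, move the remaining column
   of largest 2-norm into position i, normalize it, and project it out of
   the trailing columns; stop at max_rank or once the pivot norm falls below
   the absolute cutoff or below rel_cut times the first pivot norm. */
#include <math.h>

static int qrcp_serial(double *A /* (m, n), row-major */, int m, int n,
                       int max_rank, double abs_cut, double rel_cut,
                       int *pivot)
{
    double first_norm = 0.0;
    for (int j = 0; j < n; ++j) pivot[j] = j;

    for (int i = 0; i < max_rank; ++i)
    {
        int arg = i; double best = 0.0;
        for (int k = i; k < n; ++k) /* column of largest remaining norm */
        {
            double s = 0.0;
            for (int r = 0; r < m; ++r) s += A[r * n + k] * A[r * n + k];
            if (s > best) { best = s; arg = k; }
        }
        double norm = sqrt(best);
        if (i == 0) first_norm = norm;
        if (norm < abs_cut || norm < rel_cut * first_norm)
            return i; /* number of pivots found so far */

        for (int r = 0; r < m; ++r) /* swap columns i <-> arg, normalize i */
        {
            double t = A[r * n + arg];
            A[r * n + arg] = A[r * n + i];
            A[r * n + i] = t / norm;
        }
        int t = pivot[i]; pivot[i] = pivot[arg]; pivot[arg] = t;

        for (int k = i + 1; k < n; ++k) /* project column i out of the rest */
        {
            double dot = 0.0;
            for (int r = 0; r < m; ++r) dot += A[r * n + i] * A[r * n + k];
            for (int r = 0; r < m; ++r) A[r * n + k] -= dot * A[r * n + i];
        }
    }
    return max_rank;
}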
+ { + global_buffer[j] += dptr[j]; + } + } + + // if (begin_id == 0) + // { + // printf("global_buffer[0]: %f\n", sqrt(global_buffer[0])); + // } + + for (i = 0; i < nIP; i++) + { + const double *dptr = InnerProd + i * nPntPotential; + for (j = begin_id; j < end_id; j++) + { + global_buffer[j] -= dptr[j] * dptr[j]; + } + } + + for (j = begin_id; j < end_id; j++) + { + global_buffer[j] = sqrt(global_buffer[j]); + } + } +} + +void PackAFirstCol( + const double *A, // + double *out, // + const int nRow, + const int nACol, + const int nFirst) +{ +} + +void PackABwithSlice( + const double *A, // + const double *B, // + double *out, // + const int nRow, + const int nACol, + const int nBCol, + const int *SliceB, + const int nSliceB, + double *Packbuf) +{ + int i, j; + int nThread = get_omp_threads(); + + const int nOutCol = nACol + nSliceB; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (i = 0; i < nRow; ++i) + { + memcpy(Packbuf + i * nOutCol, A + i * nACol, sizeof(double) * nACol); + for (j = 0; j < nSliceB; ++j) + { + Packbuf[i * nOutCol + nACol + j] = B[i * nBCol + SliceB[j]]; + } + } + + memcpy(out, Packbuf, sizeof(double) * nRow * nOutCol); +} + +void PackABwithABSlice( + const double *A, // + const double *B, // + double *out, // + const int nRow, + const int nACol, + const int nBCol, + const int *Slice, + const int nSlice, + double *Packbuf, + double *thread_buffer) +{ + int i, j; + int nThread = get_omp_threads(); + + const int nOutCol = nSlice; + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf = thread_buffer + thread_id * (nACol + nBCol); + +#pragma omp for schedule(static) + for (i = 0; i < nRow; ++i) + { + memcpy(buf, A + i * nACol, sizeof(double) * nACol); + memcpy(buf + nACol, B + i * nBCol, sizeof(double) * nBCol); + for (j = 0; j < nSlice; ++j) + { + Packbuf[i * nSlice + j] = buf[Slice[j]]; + } + } + } + memcpy(out, Packbuf, sizeof(double) * nRow * nOutCol); +} + +void PackAB( + const double *A, // + const double *B, // + double *out, // + const int nRow, + const int nACol, + const int nBCol) +{ + int i, j; + int nThread = get_omp_threads(); + + const int nOutCol = nACol + nBCol; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (i = 0; i < nRow; ++i) + { + memcpy(out + i * nOutCol, A + i * nACol, sizeof(double) * nACol); + memcpy(out + i * nOutCol + nACol, B + i * nBCol, sizeof(double) * nBCol); + } +} diff --git a/pyscf/isdf/pbc_isdf_eri.c b/pyscf/isdf/pbc_isdf_eri.c new file mode 100644 index 000000000..d63437b0e --- /dev/null +++ b/pyscf/isdf/pbc_isdf_eri.c @@ -0,0 +1,438 @@ +#include "fft.h" +#include +#include +#include +#include "vhf/fblas.h" +#include +#include "np_helper/np_helper.h" +#include + +int get_omp_threads(); +int omp_get_thread_num(); + +void _pack_aoR_to_aoPairR_diff( + double *aoR_i, + double *aoR_j, + double *aoPairR, + int nao_i, + int nao_j, + int ngrid) +{ + int nPair = nao_i * nao_j; +#pragma omp parallel for schedule(static) + for (int i = 0; i < nPair; i++) + { + int i1 = i / nao_j; + int j1 = i % nao_j; + for (int k = 0; k < ngrid; k++) + { + aoPairR[i * ngrid + k] = aoR_i[i1 * ngrid + k] * aoR_j[j1 * ngrid + k]; + } + } +} + +void _pack_aoR_to_aoPairR_same( + double *aoR, + double *aoPairR, + int nao, + int ngrid) +{ + // int nPair = nao * (nao + 1) / 2; + +#pragma omp parallel for schedule(static) + for (int i1 = 0; i1 < nao; ++i1) + { + for (int j1 = 0; j1 <= i1; ++j1) + { + int i = i1 * (i1 + 1) / 2 + j1; + for (int k = 0; k < ngrid; ++k) + { + 
aoPairR[i * ngrid + k] = aoR[i1 * ngrid + k] * aoR[j1 * ngrid + k]; + } + } + } +} + +#define COMBINE2(i, j) ((i) < (j) ? (j) * (j + 1) / 2 + i : i * (i + 1) / 2 + j) + +void _unpack_suberi_to_eri( + double *eri, + const int nao, + double *suberi, + const int nao_bra, + const int *ao_loc_bra, + const int nao_ket, + const int *ao_loc_ket, + const int add_transpose) +{ + int nPair = nao * (nao + 1) / 2; + + int nPair_ket = nao_ket * (nao_ket + 1) / 2; + // int nPair_bra = nao_bra * (nao_bra + 1) / 2; + +#pragma omp parallel for schedule(static) + for (int i1 = 0; i1 < nao_bra; ++i1) + { + for (int j1 = 0; j1 <= i1; ++j1) + { + int i = ao_loc_bra[i1]; + int j = ao_loc_bra[j1]; + int ij = COMBINE2(i, j); + int i1j1 = COMBINE2(i1, j1); + // printf("i1: %d, j1: %d, i: %d, j: %d, ij: %d, i1j1: %d\n", i1, j1, i, j, ij, i1j1); + for (int k1 = 0; k1 < nao_ket; ++k1) + { + for (int l1 = 0; l1 <= k1; ++l1) + { + int k = ao_loc_ket[k1]; + int l = ao_loc_ket[l1]; + int kl = COMBINE2(k, l); + int k1l1 = COMBINE2(k1, l1); + eri[ij * nPair + kl] += suberi[i1j1 * nPair_ket + k1l1]; + } + } + } + } + + if (add_transpose) + { +#pragma omp parallel for schedule(static) + for (int i1 = 0; i1 < nao_bra; ++i1) + { + for (int j1 = 0; j1 <= i1; ++j1) + { + int i = ao_loc_bra[i1]; + int j = ao_loc_bra[j1]; + int ij = COMBINE2(i, j); + int i1j1 = COMBINE2(i1, j1); + for (int k1 = 0; k1 < nao_ket; ++k1) + { + for (int l1 = 0; l1 <= k1; ++l1) + { + int k = ao_loc_ket[k1]; + int l = ao_loc_ket[l1]; + int kl = COMBINE2(k, l); + int k1l1 = COMBINE2(k1, l1); + eri[kl * nPair + ij] += suberi[i1j1 * nPair_ket + k1l1]; + } + } + } + } + } +} + +void _unpack_suberi_to_eri_ovov( + double *eri, + double *suberi, + const int nPair, + const int add_transpose) +{ + static const double ALPHA = 1.0; + static const int INCX = 1; + +#pragma omp parallel for schedule(static) + for (int i = 0; i < nPair; i++) + { + daxpy_(&nPair, &ALPHA, suberi + i * nPair, &INCX, eri + i * nPair, &INCX); + } + + if (add_transpose) + { +#pragma omp parallel for schedule(static) + for (int i = 0; i < nPair; i++) + { + daxpy_(&nPair, &ALPHA, suberi + i * nPair, &INCX, eri + i, &nPair); + } + } +} + +#undef COMBINE2 + +/// sliced operation /// + +void fn_slice_2_0( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int slice_0_0, + const int slice_0_1) +{ + int dim0 = slice_0_1 - slice_0_0; + +#pragma omp parallel for + for (size_t i = slice_0_0; i < slice_0_1; i++) + { + memcpy(tensor_B + (i - slice_0_0) * n1, tensor_A + i * n1, sizeof(double) * n1); + } +} + +void fn_slice_2_1( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int slice_1_0, + const int slice_1_1) +{ + int dim1 = slice_1_1 - slice_1_0; +#pragma omp parallel for + for (size_t i = 0; i < n0; i++) + { + memcpy(tensor_B + i * dim1, tensor_A + i * n1 + slice_1_0, sizeof(double) * dim1); + } +} + +void fn_slice_3_2( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int n2, + const int slice_2_0, + const int slice_2_1) +{ + int dim2 = slice_2_1 - slice_2_0; + +#pragma omp parallel for schedule(static) + for (size_t ij = 0; ij < n0 * n1; ij++) + { + int i = ij / n1; + int j = ij % n1; + memcpy(tensor_B + ij * dim2, tensor_A + i * n1 * n2 + j * n2 + slice_2_0, sizeof(double) * dim2); + } +} + +void fn_slice_3_0_2( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int n2, + const int slice_0_0, + const int slice_0_1, + const int slice_2_0, + const int 
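/* Illustrative check (not part of the patch) of the packed pair index used
   above: COMBINE2(i, j) maps an unordered pair to its offset in the
   lower-triangular ordering (0,0)(1,0)(1,1)(2,0)(2,1)(2,2) -> 0..5, the
   same layout as the nao*(nao+1)/2 compound indices of the ERI blocks
   (macro fully parenthesized here). */
#include <stdio.h>

#define COMBINE2(i, j) ((i) < (j) ? (j) * ((j) + 1) / 2 + (i) : (i) * ((i) + 1) / 2 + (j))

int main(void)
{
    for (int i = 0; i < 3; ++i)
        for (int j = 0; j <= i; ++j)
            printf("(%d,%d)->%d ", i, j, COMBINE2(i, j)); /* 0 1 2 3 4 5 */
    printf("\n");
    return 0;
}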
slice_2_1) +{ + int dim0 = slice_0_1 - slice_0_0; + int dim2 = slice_2_1 - slice_2_0; + +#pragma omp parallel for + for (size_t i = slice_0_0; i < slice_0_1; i++) + { + for (size_t j = 0; j < n1; j++) + { + memcpy(tensor_B + (i - slice_0_0) * n1 * dim2 + j * dim2, + tensor_A + i * n1 * n2 + j * n2 + slice_2_0, sizeof(double) * dim2); + } + } +} + +void fn_slice_4_0_1_2( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int n2, + const int n3, + const int slice_0_0, + const int slice_0_1, + const int slice_1_0, + const int slice_1_1, + const int slice_2_0, + const int slice_2_1) +{ + int dim1 = slice_1_1 - slice_1_0; + int dim2 = slice_2_1 - slice_2_0; + +#pragma omp parallel for + for (size_t i = slice_0_0; i < slice_0_1; i++) + { + for (size_t j = slice_1_0; j < slice_1_1; j++) + { + memcpy(tensor_B + (i - slice_0_0) * dim1 * dim2 * n3 + (j - slice_1_0) * dim2 * n3, + tensor_A + i * n1 * n2 * n3 + j * n2 * n3 + slice_2_0 * n3, sizeof(double) * dim2 * n3); + } + } +} + +void fn_slice_3_1_2( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int n2, + const int slice_1_0, + const int slice_1_1, + const int slice_2_0, + const int slice_2_1) +{ + int dim1 = slice_1_1 - slice_1_0; + int dim2 = slice_2_1 - slice_2_0; + +#pragma omp parallel for + for (size_t i = 0; i < n0; i++) + { + for (size_t j = slice_1_0; j < slice_1_1; j++) + { + memcpy(tensor_B + i * dim1 * dim2 + (j - slice_1_0) * dim2, + tensor_A + i * n1 * n2 + j * n2 + slice_2_0, sizeof(double) * dim2); + } + } +} + +void fn_slice_4_1_2( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int n2, + const int n3, + const int slice_1_0, + const int slice_1_1, + const int slice_2_0, + const int slice_2_1) +{ + int dim1 = slice_1_1 - slice_1_0; + int dim2 = slice_2_1 - slice_2_0; + +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < n0; i++) + { + for (size_t j = slice_1_0; j < slice_1_1; j++) + { + memcpy(tensor_B + i * dim1 * dim2 * n3 + (j - slice_1_0) * dim2 * n3, + tensor_A + i * n1 * n2 * n3 + j * n2 * n3 + slice_2_0 * n3, sizeof(double) * dim2 * n3); + } + } +} + +void fn_slice_3_0_1( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int n2, + const int slice_0_0, + const int slice_0_1, + const int slice_1_0, + const int slice_1_1) +{ + int dim0 = slice_0_1 - slice_0_0; + int dim1 = slice_1_1 - slice_1_0; + +#pragma omp parallel for schedule(static) + for (size_t i = slice_0_0; i < slice_0_1; i++) + { + for (size_t j = slice_1_0; j < slice_1_1; j++) + { + memcpy(tensor_B + (i - slice_0_0) * dim1 * n2 + (j - slice_1_0) * n2, + tensor_A + i * n1 * n2 + j * n2, sizeof(double) * n2); + } + } +} + +/// packadd /// + +void fn_packadd_3_1_2( + double *tensor_A, + const double *tensor_B, + const int n0, + const int n1, + const int n2, + const int slice_1_0, + const int slice_1_1, + const int slice_2_0, + const int slice_2_1) +{ + int dim1 = slice_1_1 - slice_1_0; + int dim2 = slice_2_1 - slice_2_0; + +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < n0; i++) + { + for (size_t j = slice_1_0; j < slice_1_1; j++) + { + for (size_t k = slice_2_0; k < slice_2_1; k++) + { + tensor_A[i * n1 * n2 + j * n2 + k] += tensor_B[i * dim1 * dim2 + (j - slice_1_0) * dim2 + (k - slice_2_0)]; + // printf("tensor_A[%d,%d,%d] = %f\n", i, j, k, tensor_A[i * n1 * n2 + j * n2 + k]); + } + } + } +} + +void fn_packadd_3_1( + double *tensor_A, + const double *tensor_B, + const int n0, + const 
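/* Index-level reference (illustrative, not part of the patch) for the
   fn_slice_* family defined here: fn_slice_3_1_2, for example, realizes
   B = A[:, s1:e1, s2:e2] with one memcpy per (i, j) pair; the loops below
   spell out the same copy element by element. */
static void slice_3_1_2_ref(const double *A, double *B,
                            int n0, int n1, int n2,
                            int s1, int e1, int s2, int e2)
{
    const int d1 = e1 - s1, d2 = e2 - s2;
    for (int i = 0; i < n0; ++i)
        for (int j = s1; j < e1; ++j)
            for (int k = s2; k < e2; ++k)
                B[((size_t)i * d1 + (j - s1)) * d2 + (k - s2)] =
                    A[((size_t)i * n1 + j) * n2 + k];
}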
int n1, + const int n2, + const int slice_1_0, + const int slice_1_1) +{ + int dim1 = slice_1_1 - slice_1_0; + + static const int INCX = 1; + static const double ALPHA = 1; + +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < n0; i++) + { + for (size_t j = slice_1_0; j < slice_1_1; j++) + { + daxpy_(&n2, &ALPHA, tensor_B + i * dim1 * n2 + (j - slice_1_0) * n2, &INCX, tensor_A + i * n1 * n2 + j * n2, &INCX); + } + } +} + +void fn_copy( + const double *tensor_A, + double *tensor_B, + const int size) +{ + if (tensor_A != tensor_B) + { + memcpy(tensor_B, tensor_A, sizeof(double) * size); + } +} + +void fn_add( + const double *tensor_A, + double *tensor_B, + const int size) +{ + static const int INCX = 1; + static const double ALPHA = 1; + + const int nthread = get_omp_threads(); + const int bunch_size = size / nthread + 1; + + if (size < 1024) + { + daxpy_(&size, &ALPHA, tensor_A, &INCX, tensor_B, &INCX); + return; + } + +#pragma omp parallel + { + const int ithread = omp_get_thread_num(); + int start = ithread * bunch_size; + int end = start + bunch_size; + start = start > size ? size : start; + end = end > size ? size : end; + const int n = end - start; + + if (n > 0) + { + daxpy_(&n, &ALPHA, tensor_A + start, &INCX, tensor_B + start, &INCX); + } + } +} + +void fn_clean( + double *tensor_A, + const int size) +{ + memset(tensor_A, 0, sizeof(double) * size); +} \ No newline at end of file diff --git a/pyscf/isdf/pbc_isdf_samplek.c b/pyscf/isdf/pbc_isdf_samplek.c new file mode 100644 index 000000000..b144fc157 --- /dev/null +++ b/pyscf/isdf/pbc_isdf_samplek.c @@ -0,0 +1,632 @@ +#include "vhf/fblas.h" +#include +#include +#include +#include +#include +#include "fft.h" +#include + +int get_omp_threads(); +int omp_get_thread_num(); + +void _FFT_Matrix_Col_InPlace(double *matrix, // the size of matrix should be (nRow, nCol* *mesh) + int nRow, int nCol, int *mesh, + double *buf) +{ + int mesh_complex[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + int64_t nComplex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + int64_t nReal = mesh[0] * mesh[1] * mesh[2]; + const int nThread = get_omp_threads(); + + // printf("nThread: %d\n", nThread); + // printf("nRow: %d\n", nRow); + // printf("nCol: %d\n", nCol); + // printf("mesh: %d %d %d\n", mesh[0], mesh[1], mesh[2]); + // printf("nComplex: %d\n", nComplex); + + const int64_t m = nRow; + const int64_t n = nCol * mesh[0] * mesh[1] * mesh[2]; + const int64_t n_complex = nCol * mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + const int64_t nMesh = mesh[0] * mesh[1] * mesh[2]; + const int64_t nMeshComplex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + + // printf("m: %d\n", m); + // printf("n: %d\n", n); + // printf("nMesh: %d\n", nMesh); + // printf("nMeshComplex: %d\n", nMeshComplex); + + // (1) transform (Row, Block, Col) -> (Row, Col, Block) + +#pragma omp parallel for num_threads(nThread) + for (int64_t i = 0; i < m; i++) + { + int64_t iCol = 0; + + for (int64_t iBlock = 0; iBlock < nMesh; iBlock++) + { + for (int64_t j = 0; j < nCol; j++, iCol++) + { + buf[i * n + j * nMesh + iBlock] = matrix[i * n + iCol]; + } + } + } + + // printf("finish (1) \n"); + + // (2) perform FFT on the last dimension + + int64_t nFFT = nRow * nCol; + + double __complex__ *mat_complex = (double __complex__ *)buf; + double __complex__ *buf_complex = (double __complex__ *)matrix; + + // create plan + + const int BunchSize = nFFT / nThread + 1; + +#pragma omp parallel num_threads(nThread) + { + int tid = omp_get_thread_num(); + int64_t start = 
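/* Single-row sketch (illustrative, not part of the patch) of step (1) in
   _FFT_Matrix_Col_InPlace above: permuting (Block, Col) -> (Col, Block)
   makes each of the nCol signals contiguous, matching the unit-stride
   layout handed to fftw_plan_many_dft_r2c in step (2). */
static void interleave_to_blocks(const double *in, double *out,
                                 int nBlock, int nCol)
{
    /* out[c * nBlock + b] = in[b * nCol + c] */
    for (int b = 0; b < nBlock; ++b)
        for (int c = 0; c < nCol; ++c)
            out[(size_t)c * nBlock + b] = in[(size_t)b * nCol + c];
}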
tid * BunchSize; + int64_t end = (tid + 1) * BunchSize; + if (end > nFFT) + { + end = nFFT; + } + + fftw_plan plan = fftw_plan_many_dft_r2c(3, mesh, end - start, buf + start * nReal, mesh, 1, nReal, (fftw_complex *)buf_complex + start * nComplex, mesh_complex, 1, nComplex, FFTW_ESTIMATE); + fftw_execute(plan); + fftw_destroy_plan(plan); + } + + // printf("finish (2) \n"); + + // (3) transform (Row, Col, Block) -> (Row, Block, Col) + + mat_complex = (double __complex__ *)matrix; + buf_complex = (double __complex__ *)buf; + +#pragma omp parallel for num_threads(nThread) + for (int64_t i = 0; i < m; i++) + { + int64_t iCol = 0; + + for (int64_t j = 0; j < nCol; j++) + { + for (int64_t iBlock = 0; iBlock < nMeshComplex; iBlock++, iCol++) + { + buf_complex[i * n_complex + iBlock * nCol + j] = mat_complex[i * n_complex + iCol]; + } + } + } + + // printf("finish (3) \n"); + + memcpy(matrix, buf, sizeof(double __complex__) * m * nCol * mesh_complex[0] * mesh_complex[1] * mesh_complex[2]); + + // printf("finish memcpy \n"); +} + +void _iFFT_Matrix_Col_InPlace(double __complex__ *matrix, // the size of matrix should be (nRow, nCol* *mesh) + int nRow, int nCol, int *mesh, + double __complex__ *buf) +{ + int mesh_complex[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + int64_t nComplex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + int64_t nReal = mesh[0] * mesh[1] * mesh[2]; + const int64_t nThread = get_omp_threads(); + + const int64_t m = nRow; + const int64_t n = nCol * mesh[0] * mesh[1] * mesh[2]; + const int64_t n_Complex = nCol * mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + const int64_t nMesh = mesh[0] * mesh[1] * mesh[2]; + const int64_t nMeshComplex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + const double factor = 1.0 / (double)(nMesh); + + // printf("m: %d\n", m); + // printf("n: %d\n", n); + // printf("n_Complex: %d\n", n_Complex); + // printf("nMesh: %d\n", nMesh); + // printf("nMeshComplex: %d\n", nMeshComplex); + // printf("nThread: %d\n", nThread); + // printf("nRow: %d\n", nRow); + // printf("nCol: %d\n", nCol); + // printf("mesh: %d %d %d\n", mesh[0], mesh[1], mesh[2]); + // printf("nComplex: %d\n", nComplex); + // printf("nReal: %d\n", nReal); + + // (1) transform (Row, Block, Col) -> (Row, Col, Block) + +#pragma omp parallel for num_threads(nThread) + for (int64_t i = 0; i < m; i++) + { + int64_t iCol = 0; + + for (int64_t iBlock = 0; iBlock < nMeshComplex; iBlock++) + { + for (int64_t j = 0; j < nCol; j++, iCol++) + { + buf[i * n_Complex + j * nMeshComplex + iBlock] = matrix[i * n_Complex + iCol]; + } + } + } + + // (2) perform iFFT on the last dimension + + int64_t nFFT = nRow * nCol; + + double *mat_real = (double *)buf; + double *buf_real = (double *)matrix; + + // create plan + + const int64_t BunchSize = nFFT / nThread + 1; + +#pragma omp parallel num_threads(nThread) + { + int64_t tid = omp_get_thread_num(); + int64_t start = tid * BunchSize; + int64_t end = (tid + 1) * BunchSize; + if (end > nFFT) + { + end = nFFT; + } + + fftw_plan plan = fftw_plan_many_dft_c2r(3, mesh, end - start, (fftw_complex *)buf + start * nComplex, mesh_complex, 1, nComplex, buf_real + start * nReal, mesh, 1, nReal, FFTW_ESTIMATE); + fftw_execute(plan); + fftw_destroy_plan(plan); + } + + // (3) transform (Row, Col, Block) -> (Row, Block, Col) + + mat_real = (double *)matrix; + buf_real = (double *)buf; + +#pragma omp parallel for num_threads(nThread) + for (int64_t i = 0; i < m; i++) + { + int64_t iCol = 0; + + for (int64_t j = 0; j < nCol; j++) + { + for (int64_t iBlock 
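/* A note on FFTW and threads (illustrative sketch with hypothetical sizes):
   only fftw_execute and the new-array execute functions are thread-safe;
   planner calls are not, so creating plans inside a parallel region, as
   done here, needs serialization or fftw_make_planner_thread_safe()
   (FFTW >= 3.3.5). The plan-once / execute-many pattern below sidesteps
   the issue, provided every array slice has the same layout and alignment
   as the arrays the plan was created with. */
#include <fftw3.h>

static void batched_r2c(const int mesh[3], int nvec,
                        double *in, fftw_complex *out)
{
    const size_t n_real = (size_t)mesh[0] * mesh[1] * mesh[2];
    const size_t n_cplx = (size_t)mesh[0] * mesh[1] * (mesh[2] / 2 + 1);

    /* plan once, in a single thread */
    fftw_plan plan = fftw_plan_dft_r2c_3d(mesh[0], mesh[1], mesh[2],
                                          in, out, FFTW_ESTIMATE);

#pragma omp parallel for schedule(static)
    for (int v = 0; v < nvec; ++v)
        fftw_execute_dft_r2c(plan, in + (size_t)v * n_real,
                             out + (size_t)v * n_cplx);

    fftw_destroy_plan(plan);
}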
= 0; iBlock < nMesh; iBlock++, iCol++) + { + // printf("i: %d, j: %d, iBlock: %d, iCol: %d %15.8f\n", i, j, iBlock, iCol, mat_real[i * n + iCol]); + buf_real[i * n + iBlock * nCol + j] = mat_real[i * n + iCol] * factor; + } + } + } + + memcpy(mat_real, buf_real, sizeof(double) * m * nCol * mesh[0] * mesh[1] * mesh[2]); +} + +void _FinalFFT( + double __complex__ *a, + const double __complex__ *freq, + int m, int n, int *mesh, + double __complex__ *buf) +{ + const int nThread = get_omp_threads(); + + if (n != mesh[0] * mesh[1] * mesh[2]) + { + fprintf(stderr, "The size of a is not compatible with mesh\n"); + exit(1); + } + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + + double __complex__ *buf_thread = buf + thread_id * n; + + fftw_plan plan = fftw_plan_dft_3d(mesh[0], mesh[1], mesh[2], (fftw_complex *)buf_thread, (fftw_complex *)a, FFTW_FORWARD, FFTW_ESTIMATE); + +#pragma omp for schedule(static, 1) nowait + for (size_t i = 0; i < m; i++) + { + double __complex__ *in = a + i * n; + for (int j = 0; j < n; j++) + { + buf_thread[j] = in[j] * freq[j]; + } + fftw_execute_dft(plan, (fftw_complex *)buf_thread, (fftw_complex *)in); + } + + fftw_destroy_plan(plan); + } +} + +void _FinaliFFT( + double __complex__ *a, + const double __complex__ *freq, + int m, int n, int *mesh, + double __complex__ *buf) +{ + const int nThread = get_omp_threads(); + + double factor = 1.0 / (double)n; + + if (n != mesh[0] * mesh[1] * mesh[2]) + { + printf("n: %d\n", n); + printf("mesh: %d %d %d\n", mesh[0], mesh[1], mesh[2]); + fprintf(stderr, "The size of a is not compatible with mesh\n"); + exit(1); + } + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + + double __complex__ *buf_thread = buf + thread_id * n; + + fftw_plan plan = fftw_plan_dft_3d(mesh[0], mesh[1], mesh[2], (fftw_complex *)buf_thread, (fftw_complex *)a, FFTW_BACKWARD, FFTW_ESTIMATE); + +#pragma omp for schedule(static, 1) nowait + for (size_t i = 0; i < m; i++) + { + double __complex__ *in = a + i * n; + fftw_execute_dft(plan, (fftw_complex *)in, (fftw_complex *)buf_thread); + for (int j = 0; j < n; j++) + { + // buf_thread[j] = in[j] * conj(freq[j]) * factor; + in[j] = buf_thread[j] * conj(freq[j]) * factor; + } + } + + fftw_destroy_plan(plan); + } +} + +void _PermutationConj( + double __complex__ *a, + int m, int n, int *permutation, + double __complex__ *buf) +{ + const int nThread = get_omp_threads(); + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + + double __complex__ *buf_thread = buf + thread_id * n; + +#pragma omp for schedule(static, 1) nowait + for (size_t i = 0; i < m; i++) + { + double __complex__ *in = a + i * n; + for (int j = 0; j < n; j++) + { + buf_thread[j] = conj(in[permutation[j]]); + // buf_thread[permutation[j]] = conj(in[j]); + } + memcpy(in, buf_thread, sizeof(double __complex__) * n); + } + } +} + +#define PI 3.14159265358979323846 + +void meshgrid(int *range1, int size1, int *range2, int size2, int *range3, int size3, int *output) +{ +#pragma omp parallel for collapse(3) + for (int i = 0; i < size1; i++) + { + for (int j = 0; j < size2; j++) + { + for (int k = 0; k < size3; k++) + { + output[(i * size2 * size3 + j * size3 + k) * 3 + 0] = range1[i]; + output[(i * size2 * size3 + j * size3 + k) * 3 + 1] = range2[j]; + output[(i * size2 * size3 + j * size3 + k) * 3 + 2] = range3[k]; + } + } + } +} + +void _FREQ( + double __complex__ *FREQ, + const int *meshPrim, + const int *Ls) +{ + int *freq1_q = (int 
*)malloc(meshPrim[0] * sizeof(int)); + int *freq2_q = (int *)malloc(meshPrim[1] * sizeof(int)); + int *freq3_q = (int *)malloc(meshPrim[2] * sizeof(int)); + + for (int i = 0; i < meshPrim[0]; i++) + { + freq1_q[i] = i; + } + for (int i = 0; i < meshPrim[1]; i++) + { + freq2_q[i] = i; + } + for (int i = 0; i < meshPrim[2]; i++) + { + freq3_q[i] = i; + } + + int *freq_q = (int *)malloc(meshPrim[0] * meshPrim[1] * meshPrim[2] * 3 * sizeof(int)); + meshgrid(freq1_q, meshPrim[0], freq2_q, meshPrim[1], freq3_q, meshPrim[2], freq_q); + + int *freq1_Q = (int *)malloc(Ls[0] * sizeof(int)); + int *freq2_Q = (int *)malloc(Ls[1] * sizeof(int)); + int *freq3_Q = (int *)malloc((Ls[2] / 2 + 1) * sizeof(int)); + + for (int i = 0; i < Ls[0]; i++) + { + freq1_Q[i] = i; + } + for (int i = 0; i < Ls[1]; i++) + { + freq2_Q[i] = i; + } + for (int i = 0; i < Ls[2] / 2 + 1; i++) + { + freq3_Q[i] = i; + } + + int *freq_Q = (int *)malloc(Ls[0] * Ls[1] * (Ls[2] / 2 + 1) * 3 * sizeof(int)); + meshgrid(freq1_Q, Ls[0], freq2_Q, Ls[1], freq3_Q, Ls[2] / 2 + 1, freq_Q); + +#pragma omp parallel for collapse(6) + for (int i = 0; i < Ls[0]; i++) + { + for (int j = 0; j < Ls[1]; j++) + { + for (int k = 0; k < Ls[2] / 2 + 1; k++) + { + for (int p = 0; p < meshPrim[0]; p++) + { + for (int q = 0; q < meshPrim[1]; q++) + { + for (int s = 0; s < meshPrim[2]; s++) + { + FREQ[(i * Ls[1] * (Ls[2] / 2 + 1) * meshPrim[0] * meshPrim[1] * meshPrim[2] + + j * (Ls[2] / 2 + 1) * meshPrim[0] * meshPrim[1] * meshPrim[2] + + k * meshPrim[0] * meshPrim[1] * meshPrim[2] + + p * meshPrim[1] * meshPrim[2] + + q * meshPrim[2] + + s)] = freq_Q[(i * Ls[1] * (Ls[2] / 2 + 1) + j * (Ls[2] / 2 + 1) + k) * 3 + 0] * freq_q[(p * meshPrim[1] * meshPrim[2] + q * meshPrim[2] + s) * 3 + 0] / (double)(Ls[0] * meshPrim[0]) + + freq_Q[(i * Ls[1] * (Ls[2] / 2 + 1) + j * (Ls[2] / 2 + 1) + k) * 3 + 1] * freq_q[(p * meshPrim[1] * meshPrim[2] + q * meshPrim[2] + s) * 3 + 1] / (double)(Ls[1] * meshPrim[1]) + + freq_Q[(i * Ls[1] * (Ls[2] / 2 + 1) + j * (Ls[2] / 2 + 1) + k) * 3 + 2] * freq_q[(p * meshPrim[1] * meshPrim[2] + q * meshPrim[2] + s) * 3 + 2] / (double)(Ls[2] * meshPrim[2]); + } + } + } + } + } + } + +#pragma omp parallel for + for (int i = 0; i < Ls[0] * Ls[1] * (Ls[2] / 2 + 1) * meshPrim[0] * meshPrim[1] * meshPrim[2]; i++) + { + FREQ[i] = cexp(-2.0 * PI * I * FREQ[i]); + } + + free(freq1_q); + free(freq2_q); + free(freq3_q); + free(freq_q); + free(freq1_Q); + free(freq2_Q); + free(freq3_Q); + free(freq_Q); +} + +#undef PI + +void _permutation(int nx, int ny, int nz, int shift_x, int shift_y, int shift_z, int *res) +{ + +#pragma omp parallel for collapse(3) + for (int ix = 0; ix < nx; ix++) + { + for (int iy = 0; iy < ny; iy++) + { + for (int iz = 0; iz < nz; iz++) + { + int ix2 = (nx - ix - shift_x) % nx; + int iy2 = (ny - iy - shift_y) % ny; + int iz2 = (nz - iz - shift_z) % nz; + int loc = ix2 * ny * nz + iy2 * nz + iz2; + int loc_now = ix * ny * nz + iy * nz + iz; + res[loc] = loc_now; + } + } + } +} + +void _get_permutation( + const int *meshPrim, + int *res) +{ + int nGridPrim = meshPrim[0] * meshPrim[1] * meshPrim[2]; + +#pragma omp parallel sections + { +#pragma omp section + _permutation(meshPrim[0], meshPrim[1], meshPrim[2], 0, 0, 0, &res[0 * nGridPrim]); + +#pragma omp section + _permutation(meshPrim[0], meshPrim[1], meshPrim[2], 0, 0, 1, &res[1 * nGridPrim]); + +#pragma omp section + _permutation(meshPrim[0], meshPrim[1], meshPrim[2], 0, 1, 0, &res[2 * nGridPrim]); + +#pragma omp section + _permutation(meshPrim[0], meshPrim[1], meshPrim[2], 0, 1, 
+// Given a signed FFT frequency, return its location in the standard FFT
+// output ordering of a mesh of the given size, or -1 if the frequency is
+// not representable on that mesh.
+int _get_loc(
+    const int freq,
+    const int mesh)
+{
+    int max_freq = mesh / 2;
+    int min_freq = -mesh / 2;
+
+    if (mesh % 2 == 0)
+    {
+        max_freq = mesh / 2 - 1;
+        min_freq = -mesh / 2;
+    }
+
+    if (freq > max_freq || freq < min_freq)
+    {
+        return -1;
+    }
+
+    if (freq >= 0)
+    {
+        return freq;
+    }
+    else
+    {
+        int shift = mesh / 2;
+        if (mesh % 2 == 1)
+        {
+            shift += 1;
+        }
+        return (freq - min_freq) + shift;
+    }
+}
+
+// rfft variant: for a real signal both the frequency and the location must
+// be non-negative.
+int _get_loc2(
+    const int freq,
+    const int mesh)
+{
+    int max_freq = (mesh / 2) + 1;
+
+    if (freq >= 0 && freq < max_freq)
+    {
+        return freq;
+    }
+    else
+    {
+        return -1;
+    }
+}
+
+// Inverse of _get_loc: map a location in FFT output ordering back to the
+// signed frequency stored there.
+int _get_freq(
+    const int loc,
+    const int mesh)
+{
+    int mid_loc = mesh / 2;
+    if (mesh % 2 == 1)
+    {
+        mid_loc += 1;
+    }
+
+    if ((loc < 0) || (loc >= mesh))
+    {
+        printf("loc: %d, mesh: %d\n", loc, mesh);
+        exit(1);
+    }
+
+    if (loc < mid_loc)
+    {
+        return loc;
+    }
+    else
+    {
+        return loc - mesh;
+    }
+}
+
+// rfft variant of _get_freq.
+int _get_freq2(
+    const int loc,
+    const int mesh)
+{
+    int loc_max = mesh / 2 + 1;
+
+    if (loc >= 0 && loc < loc_max)
+    {
+        return loc;
+    }
+    else
+    {
+        return -1;
+    }
+}
+
+// For every point of the source mesh, find the location holding the same
+// FFT frequency on the target mesh; res[i] = -1 marks frequencies the
+// target mesh cannot represent.
+void map_fftfreq(int *mesh_source, int *mesh_target, int *res)
+{
+    int nGrid = mesh_source[0] * mesh_source[1] * mesh_source[2];
+
+#pragma omp parallel for
+    for (int i = 0; i < nGrid; i++)
+    {
+        int ix_loc = i / (mesh_source[1] * mesh_source[2]);
+        int iy_loc = (i % (mesh_source[1] * mesh_source[2])) / mesh_source[2];
+        int iz_loc = i % mesh_source[2];
+
+        int ix_freq = _get_freq(ix_loc, mesh_source[0]);
+        int iy_freq = _get_freq(iy_loc, mesh_source[1]);
+        int iz_freq = _get_freq(iz_loc, mesh_source[2]);
+
+        int ix_target = _get_loc(ix_freq, mesh_target[0]);
+        int iy_target = _get_loc(iy_freq, mesh_target[1]);
+        int iz_target = _get_loc(iz_freq, mesh_target[2]);
+
+        if (ix_target == -1 || iy_target == -1 || iz_target == -1)
+        {
+            res[i] = -1;
+        }
+        else
+        {
+            res[i] = ix_target * mesh_target[1] * mesh_target[2] + iy_target * mesh_target[2] + iz_target;
+        }
+    }
+}
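+/*
+ * Example of the mapping above, one axis at a time: take a 4-point source
+ * axis onto a 6-point target axis.  The 4-point FFT ordering stores the
+ * frequencies [0, 1, -2, -1]; on the 6-point mesh (ordering
+ * [0, 1, 2, -3, -2, -1]) those frequencies sit at locations [0, 1, 4, 5].
+ * In the opposite direction the 6-point frequencies 2 and -3 have no
+ * 4-point counterpart, so _get_loc returns -1 and res[i] is set to -1.
+ */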
+// rfft analogue of map_fftfreq: the source grid is the half-spectrum of a
+// real FFT, so the z axis carries the non-negative frequencies only.
+void map_rfftfreq(int *mesh_source, int *mesh_target, int *res)
+{
+    int nGrid = mesh_source[0] * mesh_source[1] * (mesh_source[2] / 2 + 1);
+
+#pragma omp parallel for
+    for (int i = 0; i < nGrid; i++)
+    {
+        int ix_loc = i / (mesh_source[1] * (mesh_source[2] / 2 + 1));
+        int iy_loc = (i % (mesh_source[1] * (mesh_source[2] / 2 + 1))) / (mesh_source[2] / 2 + 1);
+        int iz_loc = i % (mesh_source[2] / 2 + 1);
+
+        int ix_freq = _get_freq(ix_loc, mesh_source[0]);
+        int iy_freq = _get_freq(iy_loc, mesh_source[1]);
+        int iz_freq = _get_freq2(iz_loc, mesh_source[2]);
+
+        if (iz_freq == -1)
+        {
+            printf("iz_loc: %d, mesh_source[2]: %d\n", iz_loc, mesh_source[2]);
+            exit(1);
+        }
+
+        int ix_target = _get_loc(ix_freq, mesh_target[0]);
+        int iy_target = _get_loc(iy_freq, mesh_target[1]);
+        int iz_target = _get_loc2(iz_freq, mesh_target[2]);
+
+        if (ix_target == -1 || iy_target == -1 || iz_target == -1)
+        {
+            res[i] = -1;
+        }
+        else
+        {
+            res[i] = ix_target * mesh_target[1] * (mesh_target[2] / 2 + 1) + iy_target * (mesh_target[2] / 2 + 1) + iz_target;
+        }
+    }
+}
\ No newline at end of file
diff --git a/pyscf/isdf/pbc_isdf_sparse.c b/pyscf/isdf/pbc_isdf_sparse.c
new file mode 100644
index 000000000..8f542ce25
--- /dev/null
+++ b/pyscf/isdf/pbc_isdf_sparse.c
@@ -0,0 +1,419 @@
+#include "fft.h"
+#include <stdlib.h>
+#include "vhf/fblas.h"
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include <stdint.h>
+
+int get_omp_threads();
+int omp_get_thread_num();
+
+// Count, per row, the entries of dm whose magnitude exceeds cutoff.
+// nElmtRow must hold at least nao entries; nNonZeroElmt receives the total.
+void _process_dm(
+    const double *dm,
+    const int nao,
+    const double cutoff,
+    int *nElmtRow,
+    int *nNonZeroElmt)
+{
+    *nNonZeroElmt = 0;
+    int nThread = get_omp_threads();
+
+#pragma omp parallel num_threads(nThread)
+    {
+        int i = 0;
+
+        int NonZeroFound = 0;
+
+#pragma omp for schedule(dynamic)
+        for (i = 0; i < nao; i++)
+        {
+            int nNonZero = 0;
+            for (int j = 0; j < nao; j++)
+            {
+                if (fabs(dm[i * nao + j]) > cutoff)
+                {
+                    nNonZero++;
+                }
+            }
+            nElmtRow[i] = nNonZero;
+            NonZeroFound += nNonZero;
+        }
+
+#pragma omp critical
+        {
+            *nNonZeroElmt += NonZeroFound;
+        }
+    }
+}
+
+// Pack dm into CSR form: RowLoc (nao+1 row offsets), ColIndx (column
+// indices) and dm_sparse (values), using the row counts from _process_dm.
+void _compress_dm(
+    const double *dm,
+    const int nao,
+    const double cutoff,
+    const int *nElmtRow,
+    int *RowLoc,
+    int *ColIndx,
+    double *dm_sparse)
+{
+    RowLoc[0] = 0;
+    for (int i = 0; i < nao; i++)
+    {
+        RowLoc[i + 1] = RowLoc[i] + nElmtRow[i];
+    }
+
+    int nThread = get_omp_threads();
+
+#pragma omp parallel num_threads(nThread)
+    {
+        int i = 0;
+
+        double *dm_ptr;
+        int *indx_ptr;
+
+#pragma omp for schedule(dynamic)
+        for (i = 0; i < nao; i++)
+        {
+            dm_ptr = dm_sparse + RowLoc[i];
+            indx_ptr = ColIndx + RowLoc[i];
+            for (int j = 0; j < nao; j++)
+            {
+                if (fabs(dm[i * nao + j]) > cutoff)
+                {
+                    *dm_ptr++ = dm[i * nao + j];
+                    *indx_ptr++ = j;
+                }
+            }
+        }
+    }
+}
+
+// out = dm_sparse * aoR, parallelized over the rows of the sparse dm.
+void _dm_aoR_spMM(
+    const double *dm_sparse,
+    const int *RowLoc,
+    const int *ColIndx,
+    const double *aoR,
+    const int nao,
+    const int ngrids,
+    double *out)
+{
+    static const int ONE = 1;
+
+    int nThread = get_omp_threads();
+
+#pragma omp parallel num_threads(nThread)
+    {
+        int i = 0;
+
+        double *out_ptr;
+        const double *aoR_ptr;
+        const double *dm_ptr;
+        const int *indx_ptr;
+
+#pragma omp for schedule(dynamic)
+        for (i = 0; i < nao; i++)
+        {
+            out_ptr = out + i * ngrids;
+            dm_ptr = dm_sparse + RowLoc[i];
+            indx_ptr = ColIndx + RowLoc[i];
+            memset(out_ptr, 0, sizeof(double) * ngrids);
+            for (int j = 0; j < RowLoc[i + 1] - RowLoc[i]; j++)
+            {
+                aoR_ptr = aoR + indx_ptr[j] * ngrids;
+                daxpy_(&ngrids, dm_ptr + j, aoR_ptr, &ONE, out_ptr, &ONE);
+            }
+        }
+    }
+}
+
+void NPdcwisemul(double *out, double *a, double *b, size_t n);
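+/*
+ * Usage sketch for the three routines above (kept out of the build with
+ * #if 0; the helper name _example_dm_aoR is illustrative only and not part
+ * of the library).  Error handling is omitted: count the significant
+ * entries, compress the density matrix to CSR, then contract with aoR so
+ * that out[i, :] = sum_j dm[i, j] * aoR[j, :].
+ */
+#if 0
+static void _example_dm_aoR(const double *dm, const double *aoR,
+                            int nao, int ngrids, double cutoff, double *out)
+{
+    int nNonZero = 0;
+    int *nElmtRow = (int *)malloc(sizeof(int) * (nao + 1));
+    _process_dm(dm, nao, cutoff, nElmtRow, &nNonZero);
+
+    int *RowLoc = (int *)malloc(sizeof(int) * (nao + 1));
+    int *ColIndx = (int *)malloc(sizeof(int) * nNonZero);
+    double *dm_sparse = (double *)malloc(sizeof(double) * nNonZero);
+    _compress_dm(dm, nao, cutoff, nElmtRow, RowLoc, ColIndx, dm_sparse);
+
+    _dm_aoR_spMM(dm_sparse, RowLoc, ColIndx, aoR, nao, ngrids, out);
+
+    free(nElmtRow);
+    free(RowLoc);
+    free(ColIndx);
+    free(dm_sparse);
+}
+#endif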
+// Form out = V .* dmRgR elementwise, zero every entry below cutoff, and
+// decide row by row whether the product is sparse enough to store packed.
+void _cwise_product_check_Sparsity(
+    const double *V,
+    const double *dmRgR,
+    double *out,
+    const int naux,
+    const int ngrids,
+    const double cutoff,
+    double *buf,
+    int *UseSparsity,
+    int *IsSparsity)
+{
+    static const double COMPRESS_CRITERION = 0.15;
+
+    srand(time(NULL)); // seed the RNG from the current time
+
+    *UseSparsity = 1;
+    int nThread = get_omp_threads();
+
+    int nNonZeroElmt = 0;
+
+    NPdcwisemul(out, V, dmRgR, naux * ngrids);
+
+#pragma omp parallel num_threads(nThread)
+    {
+        int i = 0;
+        int nNonZero = 0;
+
+#pragma omp for schedule(static) nowait
+        for (i = 0; i < naux * ngrids; i++)
+        {
+            if (fabs(out[i]) > cutoff)
+            {
+                nNonZero++;
+            }
+            else
+            {
+                out[i] = 0.0;
+            }
+        }
+
+#pragma omp critical
+        {
+            nNonZeroElmt += nNonZero;
+        }
+    }
+
+    double sparsity = (double)nNonZeroElmt / (naux * ngrids);
+    printf("sparsity: %8.2f %%\n", sparsity * 100);
+
+    if (sparsity < COMPRESS_CRITERION)
+    {
+        *UseSparsity = 1;
+    }
+    else
+    {
+        *UseSparsity = 0;
+    }
+
+    if (*UseSparsity == 1)
+    {
+        const int nMaxElmt = (int)(ngrids * COMPRESS_CRITERION * 2);
+
+        int nDense = 0;
+
+#pragma omp parallel num_threads(nThread)
+        {
+            int32_t *nElmt_ptr, *indx_ptr;
+            double *Elmt_ptr, *out_ptr;
+
+            int thread_id = omp_get_thread_num();
+            double *buf_thread = buf + thread_id * ngrids;
+
+#pragma omp for schedule(static) nowait
+            for (int i = 0; i < naux; i++)
+            {
+                // pack the row into buf_thread: the element count and the
+                // column indices grow from the front, the values from the back
+                nElmt_ptr = (int32_t *)buf_thread;
+                indx_ptr = (int32_t *)((char *)buf_thread + sizeof(int32_t));
+                Elmt_ptr = buf_thread + ngrids - 1;
+                out_ptr = out + i * ngrids;
+
+                *nElmt_ptr = 0;
+                for (int j = 0; j < ngrids; j++)
+                {
+                    if (fabs(out_ptr[j]) > cutoff)
+                    {
+                        *Elmt_ptr-- = out_ptr[j];
+                        *indx_ptr++ = j;
+                        *nElmt_ptr += 1;
+                    }
+                }
+                if (*nElmt_ptr > nMaxElmt)
+                {
+                    // too many entries: keep this row dense
+                    IsSparsity[i] = 0;
+
+#pragma omp atomic
+                    nDense++;
+                }
+                else
+                {
+                    IsSparsity[i] = 1;
+                    memcpy(out_ptr, buf_thread, sizeof(double) * ngrids);
+                }
+            }
+        }
+        printf("nDense: %d \n", nDense);
+    }
+}
+
+// out = (V .* dm) * aoR^T with aoR stored as (nao, ngrids); rows flagged
+// sparse use the packed layout written by _cwise_product_check_Sparsity.
+void _V_Dm_product_SpMM(
+    const double *V_Dm_Product,
+    const int *IsSparsity,
+    const double *aoR,
+    const int nao,
+    const int naux,
+    const int ngrids,
+    double *out)
+{
+    static const int ONE = 1;
+
+    int nThread = get_omp_threads();
+
+#pragma omp parallel num_threads(nThread)
+    {
+        int i = 0, j = 0, k = 0;
+
+        double *out_ptr;
+
+        const int32_t *nElmt_ptr, *indx_ptr;
+        const double *Elmt_ptr;
+        const double *aoR_ptr;
+
+        int32_t nElmt;
+
+#pragma omp for schedule(static) nowait
+        for (i = 0; i < naux; i++)
+        {
+            if (IsSparsity[i] == 0)
+            {
+                out_ptr = out + i * nao;
+                memset(out_ptr, 0, sizeof(double) * nao);
+
+                for (j = 0; j < nao; j++)
+                {
+                    out_ptr[j] = ddot_(&ngrids, aoR + j * ngrids, &ONE, V_Dm_Product + i * ngrids, &ONE);
+                }
+            }
+            else
+            {
+                // NOTE: this scalar fallback is extremely slow
+                nElmt_ptr = (const int32_t *)(V_Dm_Product + i * ngrids);
+                indx_ptr = nElmt_ptr + 1;
+                Elmt_ptr = V_Dm_Product + (i + 1) * ngrids - 1;
+                nElmt = *nElmt_ptr;
+
+                out_ptr = out + i * nao;
+                memset(out_ptr, 0, sizeof(double) * nao);
+
+                if (nElmt == 0)
+                {
+                    continue;
+                }
+
+                for (j = 0; j < nao; j++)
+                {
+                    aoR_ptr = aoR + j * ngrids;
+                    for (k = 0; k < nElmt; k++)
+                    {
+                        out_ptr[j] += aoR_ptr[indx_ptr[k]] * Elmt_ptr[-k];
+                    }
+                }
+            }
+        }
+    }
+}
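+/*
+ * Layout of a packed sparse row as read by the two SpMM kernels (written
+ * by _cwise_product_check_Sparsity into a row buffer of ngrids doubles):
+ *
+ *   [int32 nElmt][int32 idx[0] .. idx[nElmt-1]] .... [val[nElmt-1] .. val[0]]
+ *    front: count and column indices                 back: values in reverse
+ *
+ * so value k of row i is read as Elmt_ptr[-k] with
+ * Elmt_ptr = V_Dm_Product + (i + 1) * ngrids - 1.
+ */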
+// Variant of _V_Dm_product_SpMM taking the transposed AO values aoRT,
+// stored as (ngrids, nao), so each inner update is a daxpy over nao.
+void _V_Dm_product_SpMM2(
+    const double *V_Dm_Product,
+    const int *IsSparsity,
+    const double *aoRT,
+    const int nao,
+    const int naux,
+    const int ngrids,
+    double *out)
+{
+    static const int ONE = 1;
+
+    int nThread = get_omp_threads();
+
+#pragma omp parallel num_threads(nThread)
+    {
+        int i = 0, j = 0;
+
+        double *out_ptr;
+
+        const int32_t *nElmt_ptr, *indx_ptr;
+        const double *Elmt_ptr;
+        const double *aoR_ptr;
+
+        int32_t nElmt;
+
+#pragma omp for schedule(static) nowait
+        for (i = 0; i < naux; i++)
+        {
+            if (IsSparsity[i] == 0)
+            {
+                out_ptr = out + i * nao;
+                memset(out_ptr, 0, sizeof(double) * nao);
+
+                // accumulate the dense row: summation over all grid points
+                for (j = 0; j < ngrids; j++)
+                {
+                    daxpy_(&nao, V_Dm_Product + i * ngrids + j, aoRT + j * nao, &ONE, out_ptr, &ONE);
+                }
+            }
+            else
+            {
+                nElmt_ptr = (const int32_t *)(V_Dm_Product + i * ngrids);
+                indx_ptr = nElmt_ptr + 1;
+                Elmt_ptr = V_Dm_Product + (i + 1) * ngrids - 1;
+                nElmt = *nElmt_ptr;
+
+                out_ptr = out + i * nao;
+                memset(out_ptr, 0, sizeof(double) * nao);
+
+                if (nElmt == 0)
+                {
+                    continue;
+                }
+
+                // only the stored grid points contribute
+                for (j = 0; j < nElmt; j++)
+                {
+                    aoR_ptr = aoRT + indx_ptr[j] * nao;
+                    daxpy_(&nao, Elmt_ptr - j, aoR_ptr, &ONE, out_ptr, &ONE);
+                }
+            }
+        }
+    }
+}
+
+//////// BASIC OPERATION USED IN GET_JK for k_ISDF ////////
+
+// Elementwise product of a window of a dense global matrix with a local
+// block whose rows are the subset of AOs listed in ao_invovled.
+void dcwisemul_dense_sparse_kernel(
+    double *out,
+    const double *global,
+    const int nrow_global,
+    const int ncol_global,
+    const int row_shift,
+    const int col_shift,
+    const double *local,
+    const int *ao_invovled,
+    const int row_local,
+    const int col_local,
+    const int row_begin,
+    const int row_end,
+    const int col_begin,
+    const int col_end)
+{
+    const int nrow = row_end - row_begin;
+    const int ncol = col_end - col_begin;
+    memset(out, 0, sizeof(double) * nrow * ncol);
+
+    int nthread = get_omp_threads();
+
+    const double *global_head = global + row_shift * ncol_global + col_shift;
+
+#pragma omp parallel for num_threads(nthread) schedule(static)
+    for (int i = 0; i < row_local; i++)
+    {
+        const int irow = ao_invovled[i];
+        if (irow < row_begin || irow >= row_end)
+        {
+            continue; // this AO row lies outside the requested window
+        }
+        const double *p_global = global_head + (irow - row_begin) * ncol_global;
+        const double *p_local = local + i * col_local;
+        double *p_out = out + i * ncol;
+        for (int j = 0; j < ncol; j++)
+        {
+            p_out[j] = p_global[j] * p_local[j];
+        }
+    }
+}
\ No newline at end of file
diff --git a/pyscf/lib/CMakeLists.txt b/pyscf/lib/CMakeLists.txt
index 1e65e1a31..dcd9c5e42 100644
--- a/pyscf/lib/CMakeLists.txt
+++ b/pyscf/lib/CMakeLists.txt
@@ -120,3 +120,16 @@ set_target_properties (clib_pdft PROPERTIES
 CLEAN_DIRECT_OUTPUT 1
 LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}
 OUTPUT_NAME "pdft")
+
+# Build the ISDF library
+add_library(isdf SHARED
+../isdf/pbc_isdf_samplek.c
+../isdf/pbc_isdf_V.c
+../isdf/pbc_isdf_auxbasis.c
+../isdf/pbc_isdf_sparse.c
+../isdf/pbc_isdf_eri.c
+../isdf/fft.c)
+
+target_link_libraries(isdf cgto cint cvhf np_helper fftw3_threads fftw3 ${BLAS_LIBRARIES} ${OPENMP_C_PROPERTIES})
+set_target_properties(isdf PROPERTIES
+LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR})
\ No newline at end of file