diff --git a/examples/isdf/00-RHF_diamond.py b/examples/isdf/00-RHF_diamond.py new file mode 100644 index 000000000..411e1bdf8 --- /dev/null +++ b/examples/isdf/00-RHF_diamond.py @@ -0,0 +1,121 @@ +import numpy as np +from pyscf import lib +from pyscf.gto.mole import * + +from pyscf.isdf import isdf_tools_cell +from pyscf.isdf import isdf_local_k +from pyscf.isdf import isdf_jk +from pyscf.isdf import isdf_local + +from pyscf.lib.parameters import BOHR + +MOL_STRUCTURE = ''' + C 0. 0. 0. + C 0.8917 0.8917 0.8917 + C 1.7834 1.7834 0. + C 2.6751 2.6751 0.8917 + C 1.7834 0. 1.7834 + C 2.6751 0.8917 2.6751 + C 0. 1.7834 1.7834 + C 0.8917 2.6751 2.6751 + ''' + +#### NOTE: a full tests on combinations of parameters #### + +C_ARRAY = [15, 15, 20, 25, 30, 30] +RELA_CUTOFF = [3e-2, 1e-2, 3e-3, 1e-3, 3e-4, 1e-4] +SuperCell_ARRAY = [ + # [1, 1, 1], + [1, 1, 2], + # [1, 2, 2], + # [2, 2, 2], + # [3, 3, 3], + # [2, 4, 4], + # [3, 4, 4], + # [5, 5, 5], + # [6, 6, 6], + # [1, 1, 4], + # [1, 1, 8], + # [1, 1, 16], + # [1, 1, 32], +] + + +Ke_CUTOFF = [70] +boxlen = 3.5668 +Basis = ['gth-dzvp'] + +PARTITION = [ + [[0,1],[2,3],[4,5],[6,7]], + [[0,1,2,3],[4,5,6,7]], + [[0,1,2,3,4,5,6,7]], + [[0],[1],[2],[3],[4],[5],[6],[7]], +] + +if __name__ == '__main__': + + boxlen = 3.57371000 + prim_a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + atm = [ + ['C', (0. , 0. , 0. )], + ['C', (0.8934275 , 0.8934275 , 0.8934275)], + ['C', (1.786855 , 1.786855 , 0. )], + ['C', (2.6802825 , 2.6802825 , 0.8934275)], + ['C', (1.786855 , 0. , 1.786855)], + ['C', (2.6802825 , 0.8934275 , 2.6802825)], + ['C', (0. , 1.786855 , 1.786855)], + ['C', (0.8934275 , 2.6802825 , 2.6802825)], + ] + + for supercell in SuperCell_ARRAY: + ke_cutoff = Ke_CUTOFF[0] + for partition in PARTITION: ## test different partition of atoms + for basis in Basis: + for c, rela_cutoff in zip(C_ARRAY, RELA_CUTOFF): + # for c in C_ARRAY: + print('--------------------------------------------') + print('C = %.2e, supercell = %s, kc_cutoff = %d, basis = %s, partition = %s' % ( + c, str(supercell), ke_cutoff, basis, partition)) + + prim_cell = isdf_tools_cell.build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=ke_cutoff, basis=basis, pseudo="gth-pade", verbose=4) + prim_mesh = prim_cell.mesh + print("prim_mesh = ", prim_mesh) + + mesh = [supercell[0] * prim_mesh[0], supercell[1] * prim_mesh[1], supercell[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell, supercell_group = isdf_tools_cell.build_supercell_with_partition(atm, prim_a, partition=partition, Ls = supercell, ke_cutoff=ke_cutoff, mesh=mesh, basis=basis, pseudo="gth-pade", verbose=4) + + cell.incore_anyway = False + cell.max_memory = 200 # force to call with_df.get_jk + + t1 = (lib.logger.process_clock(),lib.logger.perf_counter()) + + pbc_isdf_info = isdf_local.PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, direct=False, rela_cutoff_QRCP=rela_cutoff) + pbc_isdf_info.build_IP_local(c=c, group=supercell_group, Ls=[supercell[0]*4, supercell[1]*4, supercell[2]*4]) + print("pbc_isdf_info.naux = ", pbc_isdf_info.naux) + print("effective c = ", float(pbc_isdf_info.naux) / pbc_isdf_info.nao) + pbc_isdf_info.build_auxiliary_Coulomb() + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + print(isdf_jk._benchmark_time(t1, t2, 'build_isdf', pbc_isdf_info)) + + # for bunch_size in BUNCHSIZE_IO: + ### perform scf ### + + from pyscf.pbc import scf + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + mf = scf.RHF(cell) + mf.with_df = pbc_isdf_info + 
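# NOTE: attaching the ISDF object as with_df replaces the default FFTDF integral engine, + # so the J/K builds in the SCF below are delegated to the ISDF object's get_jk (the + # standard PySCF with_df mechanism); any compatible DF object could be swapped in the same way. +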
mf.max_cycle = 32 + mf.conv_tol = 1e-7 + pbc_isdf_info.direct_scf = mf.direct_scf + mf.kernel() + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + print(isdf_jk._benchmark_time(t1, t2, 'scf_isdf', pbc_isdf_info)) + + del mf + del pbc_isdf_info + exit(1) \ No newline at end of file diff --git a/examples/isdf/01-KRHF_TiO2.py b/examples/isdf/01-KRHF_TiO2.py new file mode 100644 index 000000000..b9f612495 --- /dev/null +++ b/examples/isdf/01-KRHF_TiO2.py @@ -0,0 +1,115 @@ +from functools import reduce +import numpy as np +from pyscf import lib +import pyscf.pbc.gto as pbcgto +from pyscf.pbc.gto import Cell +from pyscf.pbc import tools +from pyscf.pbc.lib.kpts import KPoints +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, member +from pyscf.gto.mole import * + +from pyscf.isdf import isdf_tools_cell +from pyscf.isdf import isdf_local_k +from pyscf.isdf import isdf_jk + +MOL_STRUCTURE = ''' +Ti 2.3246330643 2.3246330643 1.4853414945 +Ti 0.0000000000 0.0000000000 -0.0000000000 +O 0.9065353261 3.7427308025 1.4853414945 +O 3.7427308025 0.9065353261 1.4853414945 +O 1.4180977382 1.4180977382 0.0000000000 +O 3.2311683903 3.2311683903 0.0000000000 +''' + +atm = [ +['Ti',(2.3246330643,2.3246330643, 1.4853414945)], +['Ti',(0.0000000000,0.0000000000, 0.0000000000)], +['O ',(0.9065353261,3.7427308025, 1.4853414945)], +['O ',(3.7427308025,0.9065353261, 1.4853414945)], +['O ',(1.4180977382,1.4180977382, 0.0000000000)], +['O ',(3.2311683903,3.2311683903, 0.0000000000)], +] +boxlen = [4.6492659759,4.6492659759,2.9706828877] + +C_ARRAY = [15,20,25,30] ## if rela_cutoff_QRCP is set, then c is used to when performing random projection, which can be relative large. +RELA_QR = [1e-2,1e-3,2e-4,1e-4] +SuperCell_ARRAY = [ + # [1,1,1], + [2,2,2], + [3,3,3], + [4,4,4], + [5,5,5], + [6,6,6], +] +Ke_CUTOFF = [128, 192] + +Basis = ['gth-cc-tzvp-Ye'] + +prim_partition = [[0],[1],[2],[3],[4],[5]] + +if __name__ == '__main__': + + prim_a = np.array([[boxlen[0],0.0,0.0],[0.0,boxlen[1],0.0],[0.0,0.0,boxlen[2]]]) + pseudo = 'gth-hf-rev' + + for supercell in SuperCell_ARRAY: + for basis in Basis: + for ke_cutoff in Ke_CUTOFF: + + DM_CACHED = None + + from pyscf.gto.basis import parse_nwchem + fbas="basis.dat" + atms = ['O', 'Ti'] + basis = {atm:parse_nwchem.load(fbas, atm) for atm in atms} + print("basis = ", basis) + + + prim_cell = isdf_tools_cell.build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=ke_cutoff, basis=basis, pseudo=pseudo, spin=0, verbose=10) + cell = prim_cell + + ### perform scf ### + + from pyscf.pbc import scf, dft + from pyscf.pbc.dft import multigrid + + nk = supercell + kpts = cell.make_kpts(nk) + + for c,rela_qr in list(zip(C_ARRAY,RELA_QR)): + + print('--------------------------------------------') + print('C = %d, QR=%f, supercell = %s, kc_cutoff = %d, basis = %s' % (c, rela_qr, str(supercell), ke_cutoff, basis)) + + ### create the isdf object ### + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + pbc_isdf_info = isdf_local_k.PBC_ISDF_Info_Quad_K(cell, + kmesh=nk, + with_robust_fitting=True, + rela_cutoff_QRCP=rela_qr, + direct=True, + limited_memory=True, + build_K_bunchsize=128, ## NOTE:control the memory cost in building K + # use_occ_RI_K=False + ) + pbc_isdf_info.verbose = 10 + pbc_isdf_info.build_IP_local(c=c, m=5, group=prim_partition) + print("effective c = ", float(pbc_isdf_info.naux) / pbc_isdf_info.nao) + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + print(isdf_jk._benchmark_time(t1, t2, 'build ISDF', pbc_isdf_info)) + + t1 = 
(lib.logger.process_clock(), lib.logger.perf_counter()) + mf = scf.KRHF(cell, kpts) + mf.with_df = pbc_isdf_info + mf.max_cycle = 100 + mf.conv_tol = 1e-8 + mf.conv_tol_grad = 1e-3 + if DM_CACHED is not None: + mf.kernel(DM_CACHED) + else: + mf.kernel() + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + print(isdf_jk._benchmark_time(t1, t2, 'RHF_bench', mf)) + DM_CACHED = mf.make_rdm1() \ No newline at end of file diff --git a/examples/isdf/02-UHF_CCO.py b/examples/isdf/02-UHF_CCO.py new file mode 100644 index 000000000..aa9dc62b5 --- /dev/null +++ b/examples/isdf/02-UHF_CCO.py @@ -0,0 +1,139 @@ +import numpy as np +from pyscf import lib +from pyscf.gto.mole import * + +from pyscf.isdf import isdf_tools_cell +from pyscf.isdf import isdf_local_k +from pyscf.isdf import isdf_jk +from pyscf.isdf import isdf_local + +from pyscf.lib.parameters import BOHR + +#### NOTE: a full tests on combinations of parameters #### + +prim_a = np.array( + [[14.572056092, 0.000000000, 0.000000000], + [0.000000000, 14.572056092, 0.000000000], + [0.000000000, 0.000000000, 6.010273939],]) * BOHR +atm = [ +['Cu', (1.927800, 1.927800, 1.590250)], +['Cu', (5.783400, 5.783400, 1.590250)], +['Cu', (1.927800, 5.783400, 1.590250)], +['Cu', (5.783400, 1.927800, 1.590250)], +['O', (1.927800, 3.855600, 1.590250)], +['O', (3.855600, 5.783400, 1.590250)], +['O', (5.783400, 3.855600, 1.590250)], +['O', (3.855600, 1.927800, 1.590250)], +['O', (0.000000, 1.927800, 1.590250)], +['O', (1.927800, 7.711200, 1.590250)], +['O', (7.711200, 5.783400, 1.590250)], +['O', (5.783400, 0.000000, 1.590250)], +['Ca', (0.000000, 0.000000, 0.000000)], +['Ca', (3.855600, 3.855600, 0.000000)], +['Ca', (7.711200, 3.855600, 0.000000)], +['Ca', (3.855600, 7.711200, 0.000000)], +] + +C_ARRAY = [25, 30, 35] +RELA_CUTOFF = [1e-3, 3e-4, 1e-4] +SuperCell_ARRAY = [ + [1, 1, 1], +] +Ke_CUTOFF = [256] +Basis = ['gth-dzvp'] + +PARTITION = [ + [[0], [1], [2], [3], + [4], [5], [6], [7], + [8], [9], [10], [11], + [12], [13], [14], [15]] +] + +if __name__ == '__main__': + + for supercell in SuperCell_ARRAY: + ke_cutoff = Ke_CUTOFF[0] + for partition in PARTITION: ## test different partition of atoms + for _basis_ in Basis: + + DM_CACHED = None + + from pyscf.gto.basis import parse_nwchem + fbas="basis2.dat" + atms = ['O', 'Cu', "Ca"] + basis = {atm:parse_nwchem.load(fbas, atm) for atm in atms} + # print("basis = ", basis) + + pseudo = {'Cu': 'gth-pbe-q19', 'O': 'gth-pbe', 'Ca': 'gth-pbe'} + + prim_cell = isdf_tools_cell.build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=ke_cutoff, basis=basis, pseudo=pseudo, verbose=4) + prim_mesh = prim_cell.mesh + # print("prim_mesh = ", prim_mesh) + + mesh = [supercell[0] * prim_mesh[0], supercell[1] * prim_mesh[1], supercell[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell, supercell_group = isdf_tools_cell.build_supercell_with_partition(atm, prim_a, + partition = partition, + Ls = supercell, + ke_cutoff = ke_cutoff, + mesh = mesh, + basis = basis, + pseudo = pseudo, + verbose = 4) + + cell.incore_anyway = False + cell.max_memory = 200 # force to call with_df.get_jk + + for c, rela_cutoff in zip(C_ARRAY, RELA_CUTOFF): + + print('--------------------------------------------') + print('C = %.2e, supercell = %s, kc_cutoff = %d, basis = %s, partition = %s' % ( + c, str(supercell), ke_cutoff, basis, partition)) + + t1 = (lib.logger.process_clock(),lib.logger.perf_counter()) + + pbc_isdf_info = isdf_local.PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, + direct=True, + 
rela_cutoff_QRCP=rela_cutoff, + limited_memory=True, build_K_bunchsize=56 + ) + pbc_isdf_info.build_IP_local(c=c, group=supercell_group) + print("pbc_isdf_info.naux = ", pbc_isdf_info.naux) + print("effective c = ", float(pbc_isdf_info.naux) / pbc_isdf_info.nao) + pbc_isdf_info.build_auxiliary_Coulomb() + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + print(isdf_jk._benchmark_time(t1, t2, 'build_isdf', pbc_isdf_info)) + + # for bunch_size in BUNCHSIZE_IO: + ### perform scf ### + + from pyscf.pbc import scf + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + mf = scf.UHF(cell) + mf.with_df = pbc_isdf_info + mf.max_cycle = 64 + mf.conv_tol = 1e-7 + pbc_isdf_info.direct_scf = mf.direct_scf + if DM_CACHED is not None: + mf.kernel(DM_CACHED) + else: + mf.kernel() + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + print(isdf_jk._benchmark_time(t1, t2, 'scf_isdf', pbc_isdf_info)) + + del mf + del pbc_isdf_info + + ### GDF benchmark ### + + mf = scf.UHF(cell).density_fit() + mf.max_cycle = 64 + mf.conv_tol = 1e-7 + # pbc_isdf_info.direct_scf = mf.direct_scf + mf.kernel(DM_CACHED) + + exit(1) \ No newline at end of file diff --git a/examples/isdf/03-GDF_CCO.py b/examples/isdf/03-GDF_CCO.py new file mode 100644 index 000000000..2a58105a8 --- /dev/null +++ b/examples/isdf/03-GDF_CCO.py @@ -0,0 +1,136 @@ +import numpy as np +from pyscf import lib +from pyscf.gto.mole import * + +from pyscf.isdf import isdf_tools_cell +from pyscf.isdf import isdf_local_k +from pyscf.isdf import isdf_jk +from pyscf.isdf import isdf_local + +from pyscf.lib.parameters import BOHR + +#### NOTE: a full tests on combinations of parameters #### + +prim_a = np.array( + [[14.572056092, 0.000000000, 0.000000000], + [0.000000000, 14.572056092, 0.000000000], + [0.000000000, 0.000000000, 6.010273939],]) * BOHR +atm = [ +['Cu', (1.927800, 1.927800, 1.590250)], +['Cu', (5.783400, 5.783400, 1.590250)], +['Cu', (1.927800, 5.783400, 1.590250)], +['Cu', (5.783400, 1.927800, 1.590250)], +['O', (1.927800, 3.855600, 1.590250)], +['O', (3.855600, 5.783400, 1.590250)], +['O', (5.783400, 3.855600, 1.590250)], +['O', (3.855600, 1.927800, 1.590250)], +['O', (0.000000, 1.927800, 1.590250)], +['O', (1.927800, 7.711200, 1.590250)], +['O', (7.711200, 5.783400, 1.590250)], +['O', (5.783400, 0.000000, 1.590250)], +['Ca', (0.000000, 0.000000, 0.000000)], +['Ca', (3.855600, 3.855600, 0.000000)], +['Ca', (7.711200, 3.855600, 0.000000)], +['Ca', (3.855600, 7.711200, 0.000000)], +] + +C_ARRAY = [25, 30, 35] +RELA_CUTOFF = [1e-3, 3e-4, 1e-4] +SuperCell_ARRAY = [ + [1, 1, 1], +] +Ke_CUTOFF = [256] +Basis = ['gth-dzvp'] + +PARTITION = [ + [[0], [1], [2], [3], + [4], [5], [6], [7], + [8], [9], [10], [11], + [12], [13], [14], [15]] +] + +if __name__ == '__main__': + + for supercell in SuperCell_ARRAY: + ke_cutoff = Ke_CUTOFF[0] + for partition in PARTITION: ## test different partition of atoms + for _basis_ in Basis: + + DM_CACHED = None + + from pyscf.gto.basis import parse_nwchem + fbas="basis2.dat" + atms = ['O', 'Cu', "Ca"] + basis = {atm:parse_nwchem.load(fbas, atm) for atm in atms} + # print("basis = ", basis) + + pseudo = {'Cu': 'gth-pbe-q19', 'O': 'gth-pbe', 'Ca': 'gth-pbe'} + + prim_cell = isdf_tools_cell.build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=ke_cutoff, basis=basis, pseudo=pseudo, verbose=4) + prim_mesh = prim_cell.mesh + # print("prim_mesh = ", prim_mesh) + + mesh = [supercell[0] * prim_mesh[0], supercell[1] * prim_mesh[1], supercell[2] * prim_mesh[2]] + mesh = np.array(mesh, 
dtype=np.int32) + + cell, supercell_group = isdf_tools_cell.build_supercell_with_partition(atm, prim_a, + partition = partition, + Ls = supercell, + ke_cutoff = ke_cutoff, + mesh = mesh, + basis = basis, + pseudo = pseudo, + verbose = 4) + + cell.incore_anyway = False + cell.max_memory = 200 # force to call with_df.get_jk + + for c, rela_cutoff in zip(C_ARRAY, RELA_CUTOFF): + + print('--------------------------------------------') + print('C = %.2e, supercell = %s, kc_cutoff = %d, basis = %s, partition = %s' % ( + c, str(supercell), ke_cutoff, basis, partition)) + + t1 = (lib.logger.process_clock(),lib.logger.perf_counter()) + + pbc_isdf_info = isdf_local.PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, direct=True, rela_cutoff_QRCP=rela_cutoff, + limited_memory=True, build_K_bunchsize=128) + pbc_isdf_info.build_IP_local(c=c, group=supercell_group) + print("pbc_isdf_info.naux = ", pbc_isdf_info.naux) + print("effective c = ", float(pbc_isdf_info.naux) / pbc_isdf_info.nao) + pbc_isdf_info.build_auxiliary_Coulomb() + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + print(isdf_jk._benchmark_time(t1, t2, 'build_isdf', pbc_isdf_info)) + + # for bunch_size in BUNCHSIZE_IO: + ### perform scf ### + + from pyscf.pbc import scf + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + mf = scf.GHF(cell) + mf.with_df = pbc_isdf_info + mf.max_cycle = 64 + mf.conv_tol = 1e-7 + pbc_isdf_info.direct_scf = mf.direct_scf + if DM_CACHED is not None: + mf.kernel(DM_CACHED) + else: + mf.kernel() + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + print(isdf_jk._benchmark_time(t1, t2, 'scf_isdf', pbc_isdf_info)) + + del mf + del pbc_isdf_info + + ### GDF benchmark ### + + mf = scf.GHF(cell).density_fit() + mf.max_cycle = 64 + mf.conv_tol = 1e-7 + # pbc_isdf_info.direct_scf = mf.direct_scf + mf.kernel(DM_CACHED) + + exit(1) \ No newline at end of file diff --git a/examples/isdf/basis.dat b/examples/isdf/basis.dat new file mode 100644 index 000000000..55c2727a0 --- /dev/null +++ b/examples/isdf/basis.dat @@ -0,0 +1,49 @@ +#BASIS SET: +O S +14.482841 -3.658934e-02 +6.284704 -1.303224e-01 +1.164884 3.769821e-01 +0.468441 5.431582e-01 +0.184961 2.084140e-01 +O S +0.221262 1.000000e+00 +O P +10.213949 6.086918e-02 +3.622324 1.870524e-01 +1.299051 3.714779e-01 +0.463791 4.256889e-01 +0.157848 2.088730e-01 +O P +0.274670 1.000000e+00 +O D +1.200187 1.000000e+00 +#BASIS SET: +Ti S +4.314400 1.000000e+00 +Ti S +1.211440 1.000000e+00 +Ti S +0.507273 1.000000e+00 +Ti S +0.083635 1.000000e+00 +Ti S +0.032238 1.000000e+00 +Ti P +6.628548 -1.876134e-01 -1.202160e-02 +2.469901 3.648950e-01 2.614988e-02 +1.068373 8.015420e-01 -1.647797e-02 +0.438086 4.330265e-01 7.362989e-02 +Ti P +0.157971 1.000000e+00 +Ti P +0.068125 1.000000e+00 +Ti D +5.692516 -2.291856e-01 +1.923332 -5.221114e-01 +Ti D +0.647040 1.000000e+00 +Ti D +0.199065 1.000000e+00 +Ti F +1.121189 -9.777480e-01 +0.284205 -2.097828e-01 \ No newline at end of file diff --git a/examples/isdf/basis2.dat b/examples/isdf/basis2.dat new file mode 100644 index 000000000..455dbef6e --- /dev/null +++ b/examples/isdf/basis2.dat @@ -0,0 +1,118 @@ +#BASIS SET: (5s,5p,5d) -> [3s,3p,2d] Ca +Ca S + 7.213557 -4.811677e-02 6.509227e-02 + 3.953199 2.795449e-01 -3.375291e-01 + 0.887945 -5.055939e-01 5.085903e-01 + 0.381928 -4.403097e-01 3.189670e-01 +Ca S + 0.044801 1.000000e+00 +Ca P + 5.522531 6.327636e-02 4.842917e-03 + 1.446307 -5.437431e-01 -3.188125e-03 + 0.605733 -7.882950e-01 -6.033881e-02 + 0.239083 -2.800260e-01 9.655471e-02 +Ca P + 
0.062765 1.000000e+00 +Ca D + 3.010924 4.928622e-02 + 1.064269 1.087188e-01 + 0.316680 2.101310e-01 + 0.175804 -3.737671e-02 +Ca D + 0.073715 1.000000e+00 +#BASIS SET: (6s,6p,1d) -> [2s,2p,1d] O +O S + 14.482841 -3.658934e-02 + 6.284704 -1.303224e-01 + 1.164884 3.769821e-01 + 0.468441 5.431582e-01 + 0.184961 2.084140e-01 +O S + 0.221262 1.000000e+00 +O P + 10.213949 6.086918e-02 + 3.622324 1.870524e-01 + 1.299051 3.714779e-01 + 0.463791 4.256889e-01 + 0.157848 2.088730e-01 +O P + 0.274670 1.000000e+00 +O D + 1.200187 1.000000e+00 +#BASIS SET: (5s,6p,4d,2f) -> [3s,3p,2d,1f] Cu +Cu S + 9.083669 -3.622183e-01 1.093857e-01 + 2.375895 7.613242e-01 -1.645715e-01 + 0.936687 5.169596e-01 3.951353e-02 + 0.116029 -8.984813e-02 -6.615456e-01 +Cu S + 0.041075 1.000000e+00 +Cu P + 11.566615 -1.354626e-01 2.324231e-02 + 4.918638 4.749338e-01 -7.531200e-02 + 2.290556 7.098130e-01 -2.250031e-02 + 1.043427 4.923407e-01 -1.498897e-01 + 0.429044 9.926123e-02 1.645767e-01 +Cu P + 0.139040 1.000000e+00 +Cu D + 8.082843 -3.663286e-01 + 3.149999 -5.868172e-01 + 1.067441 -6.124219e-01 +Cu D + 0.308911 1.000000e+00 +Cu F + 4.078302 -4.962922e-01 + 1.072255 -8.681555e-01 +#BASIS SET: (5s,5p,1d) -> [4s,3p,2d] Ba +Ba S + 4.860079 -2.877241e-02 2.157408e-02 + 1.399001 3.828487e-01 -2.577906e-01 + 0.402710 -7.842840e-01 3.891714e-01 +Ba S + 0.115922 1.000000e+00 +Ba S + 0.033369 1.000000e+00 +Ba P + 3.592628 4.674179e-02 + 1.573935 -3.519620e-01 + 0.689543 5.167405e-01 + 0.302090 7.478454e-01 +Ba P + 0.132346 1.000000e+00 +Ba P + 0.042656 1.000000e+00 +Ba D + 1.680976 5.054502e-02 + 0.883691 -7.506956e-02 + 0.464558 -5.025319e-02 + 0.244219 -4.749195e-01 + 0.128386 1.435246e-01 +Ba D + 0.067493 1.000000e+00 +#BASIS SET: (4s,4p,5d,2f) -> [3s,3p,3d,1f] Hg +Hg S + 1.915700 1.357890e-01 + 0.979308 -4.520937e-01 +Hg S + 0.150255 1.000000e+00 +Hg S + 0.054019 1.000000e+00 +Hg P + 2.173806 -8.329602e-03 + 0.867935 4.694467e-02 +Hg P + 0.346540 1.000000e+00 +Hg P + 0.138363 1.000000e+00 +Hg D + 3.674806 2.455544e-02 + 1.622342 -6.047205e-01 + 0.768788 -6.547334e-01 +Hg D + 0.339247 1.000000e+00 +Hg D + 0.136379 1.000000e+00 +Hg F + 1.330141 5.428473e-01 + 0.507359 8.398314e-01 diff --git a/pyscf/isdf/__init__.py b/pyscf/isdf/__init__.py new file mode 100644 index 000000000..392dd9021 --- /dev/null +++ b/pyscf/isdf/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2014-2018 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .isdf import ISDF +from .isdf_fast import PBC_ISDF_Info +from .isdf_local import PBC_ISDF_Info_Quad +from .isdf_local_k import PBC_ISDF_Info_Quad_K +from .isdf_tools_cell import build_supercell, build_supercell_with_partition \ No newline at end of file diff --git a/pyscf/isdf/_isdf_local_K_direct.py b/pyscf/isdf/_isdf_local_K_direct.py new file mode 100644 index 000000000..6aede0d47 --- /dev/null +++ b/pyscf/isdf/_isdf_local_K_direct.py @@ -0,0 +1,723 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# + +######## a unified driver for getting K directly for both ISDF with/without k-points + +############ sys module ############ + +import copy, sys +import ctypes +import numpy as np + +############ pyscf module ############ + +from pyscf import lib +from pyscf.lib import logger +from pyscf.pbc import tools +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point +from pyscf.pbc.df.df_jk import _ewald_exxdiv_for_G0 +from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks +libisdf = lib.load_library('libisdf') + +############ isdf utils ############ + +from pyscf.isdf.isdf_jk import _benchmark_time +from pyscf.isdf.isdf_tools_local import _pack_aoR_holder, _get_aoR_holders_memory +import pyscf.isdf.isdf_tools_linearop as lib_isdf + +############ profile ############ + +cputime_RgAO = 0.0 +cputime_V = 0.0 +cputime_W = 0.0 +cputime_RgR = 0.0 +cputime_Ktmp1 = 0.0 +cputime_Ktmp2 = 0.0 + +walltime_RgAO = 0.0 +walltime_V = 0.0 +walltime_W = 0.0 +walltime_RgR = 0.0 +walltime_Ktmp1 = 0.0 +walltime_Ktmp2 = 0.0 + +def add_cputime_RgAO(t1): + global cputime_RgAO + cputime_RgAO += t1 + +def add_walltime_RgAO(t1): + global walltime_RgAO + walltime_RgAO += t1 + +def reset_profile_buildK_time(): + + global cputime_RgAO, cputime_V, cputime_W, cputime_RgR, cputime_Ktmp1, cputime_Ktmp2 + global walltime_RgAO, walltime_V, walltime_W, walltime_RgR, walltime_Ktmp1, walltime_Ktmp2 + + cputime_RgAO = 0.0 + cputime_V = 0.0 + cputime_W = 0.0 + cputime_RgR = 0.0 + cputime_Ktmp1 = 0.0 + cputime_Ktmp2 = 0.0 + + walltime_RgAO = 0.0 + walltime_V = 0.0 + walltime_W = 0.0 + walltime_RgR = 0.0 + walltime_Ktmp1 = 0.0 + walltime_Ktmp2 = 0.0 + +def log_profile_buildK_time(mydf, use_mpi=False): + + global cputime_RgAO, cputime_V, cputime_W, cputime_RgR, cputime_Ktmp1, cputime_Ktmp2 + global walltime_RgAO, walltime_V, walltime_W, walltime_RgR, walltime_Ktmp1, walltime_Ktmp2 + + log = logger.Logger(mydf.stdout, mydf.verbose) + + if not use_mpi: + log.info('In _isdf_get_K_direct_kernel RgAO cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_RgAO, walltime_RgAO, cputime_RgAO/walltime_RgAO)) + log.info('In _isdf_get_K_direct_kernel RgR cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_RgR, walltime_RgR, cputime_RgR/walltime_RgR)) + log.info('In _isdf_get_K_direct_kernel V cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_V, walltime_V, cputime_V/walltime_V)) + log.info('In _isdf_get_K_direct_kernel W cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_W, walltime_W, cputime_W/walltime_W)) + log.info('In _isdf_get_K_direct_kernel Ktmp1 cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_Ktmp1, walltime_Ktmp1, cputime_Ktmp1/walltime_Ktmp1)) + log.info('In _isdf_get_K_direct_kernel Ktmp2 cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_Ktmp2, walltime_Ktmp2, cputime_Ktmp2/walltime_Ktmp2)) + else: + if rank == 0: + log.info('In _isdf_get_K_direct_kernel RgAO 
cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_RgAO, walltime_RgAO, cputime_RgAO/walltime_RgAO)) + log.info('In _isdf_get_K_direct_kernel RgR cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_RgR, walltime_RgR, cputime_RgR/walltime_RgR)) + log.info('In _isdf_get_K_direct_kernel V cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_V, walltime_V, cputime_V/walltime_V)) + log.info('In _isdf_get_K_direct_kernel W cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_W, walltime_W, cputime_W/walltime_W)) + log.info('In _isdf_get_K_direct_kernel Ktmp1 cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_Ktmp1, walltime_Ktmp1, cputime_Ktmp1/walltime_Ktmp1)) + log.info('In _isdf_get_K_direct_kernel Ktmp2 cputime = %16.3f walltime = %16.3f paralell = %4.2f' % (cputime_Ktmp2, walltime_Ktmp2, cputime_Ktmp2/walltime_Ktmp2)) + comm.Barrier() + +############ GLOBAL PARAMETER ############ + +K_DIRECT_NAUX_BUNCHSIZE = 256 + +############ subroutines to keep ISDF w./w.o k-points consistent ############ + +def _add_kpnt_info(mydf): + if hasattr(mydf, "kmesh"): + assert mydf.kmesh is None or (mydf.kmesh[0] == 1 and mydf.kmesh[1] == 1 and mydf.kmesh[2] == 1) + + mydf.meshPrim = np.array(mydf.mesh) + mydf.natmPrim = mydf.cell.natm + mydf.primCell = mydf.cell + mydf.nao_prim = mydf.nao + mydf.nIP_Prim = mydf.naux + +def _permutation_box(mydf, kmesh): + permutation = [] + for kx in range(kmesh[0]): + for ky in range(kmesh[1]): + for kz in range(kmesh[2]): + + tmp = [] + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + ix_ = (ix + kx) % kmesh[0] + iy_ = (iy + ky) % kmesh[1] + iz_ = (iz + kz) % kmesh[2] + tmp.append(ix_*kmesh[1]*kmesh[2] + iy_*kmesh[2] + iz_) + + tmp = np.array(tmp, dtype=np.int32) + permutation.append(tmp) + mydf._permutation_box = permutation + return permutation + + +def construct_V(aux_basis:np.ndarray, + buf, + V, + ### some helper info ### + grid_ID, grid_ordering, + mesh, coulG_real): + fn = getattr(libisdf, "_construct_V_local_bas", None) + assert(fn is not None) + + nThread = buf.shape[0] + bufsize_per_thread = buf.shape[1] + nrow = aux_basis.shape[0] + ncol = aux_basis.shape[1] + shift_row = 0 + + fn(mesh.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nrow), + ctypes.c_int(ncol), + grid_ID.ctypes.data_as(ctypes.c_void_p), + aux_basis.ctypes.data_as(ctypes.c_void_p), + coulG_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(shift_row), + V.ctypes.data_as(ctypes.c_void_p), + grid_ordering.ctypes.data_as(ctypes.c_void_p), + buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bufsize_per_thread)) + +def _isdf_get_K_direct_kernel_1( + mydf, + coulG_real, + ##### input #### + group_id, ## the contribution of K from which group + dm_RgAO, + V_or_W_tmp, + construct_K1, + calculate_W, + ##### buffer #####, + buf_build_V_thread, + build_VW_buf, + offset_V_tmp, + Density_RgR_buf, + dm_RgAO_buf, + dm_RgAO_packed_offset, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + naux_bunchsize = K_DIRECT_NAUX_BUNCHSIZE, + ##### other info ##### + use_mpi =False, + begin_id=None, + end_id =None, + ##### out ##### + K1_or_2 = None +): + + log = logger.Logger(mydf.stdout, mydf.verbose) + + + ######### profile ######### + + global cputime_RgAO, cputime_V, cputime_W, cputime_RgR, cputime_Ktmp1, cputime_Ktmp2 + global walltime_RgAO, walltime_V, walltime_W, walltime_RgR, walltime_Ktmp1, walltime_Ktmp2 + + ######### cutoff ######### + + use_cutoff = False 
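+ # NOTE: optional screening for the direct K build; these attributes are read off mydf + # if present: a relative cutoff, an absolute cutoff (defaulting to 1e-9 when only the + # relative cutoff is given), and a distance-based cutoff using mydf.distance_matrix, + # which is mutually exclusive with the other two (enforced by the assert below).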
+ rela_cutoff = None + abs_cutoff = None + distance_cutoff = None + + if hasattr(mydf, "_build_K_rela_cutoff"): + rela_cutoff = mydf._build_K_rela_cutoff + if rela_cutoff is not None: + use_cutoff = True + if hasattr(mydf, "_build_K_abs_cutoff"): + abs_cutoff = mydf._build_K_abs_cutoff + if abs_cutoff is not None: + use_cutoff = True + if hasattr(mydf, "_build_K_distance_cutoff"): + distance_cutoff = mydf._build_K_distance_cutoff + if distance_cutoff is not None: + assert not use_cutoff + use_cutoff = True + + if use_cutoff and abs_cutoff is None: + if distance_cutoff is None: + abs_cutoff = 1.0e-9 + + distance_matrix = mydf.distance_matrix + + ######### info ######### + + assert K1_or_2 is not None + + if not construct_K1: + assert V_or_W_tmp is not None + + # if use_mpi: + # from pyscf.isdf.isdf_tools_mpi import rank, comm_size, comm, allgather, bcast + # size = comm.Get_size() + # if group_id % comm_size != rank: + # raise ValueError + + nao = mydf.nao + mesh = np.array(mydf.cell.mesh, dtype=np.int32) + ngrid = np.prod(mesh) + naux = mydf.naux + + ######### to be compatible with kmesh ######### + + if mydf.kmesh is None: + kmesh = [1,1,1] + else: + kmesh = mydf.kmesh + + nkpts = np.prod(kmesh) + + if not hasattr(mydf, "nao_prim"): + _add_kpnt_info(mydf) + natm_prim = mydf.natmPrim + nao_prim = mydf.nao_prim + + ngrid_prim = np.prod(mesh) // np.prod(kmesh) + nIP_prim = mydf.nIP_Prim + + assert np.prod(mesh) % np.prod(kmesh) == 0 + assert mesh[0] % kmesh[0] == 0 + assert mesh[1] % kmesh[1] == 0 + assert mesh[2] % kmesh[2] == 0 + + if hasattr(mydf, "_permutation_box"): + permutation = mydf._permutation_box + else: + permutation = _permutation_box(mydf, kmesh) + + ######### fetch ao values on grids or IPs ######### + + aoRg = mydf.aoRg + assert isinstance(aoRg, list) + aoR = mydf.aoR + assert isinstance(aoR, list) + + if hasattr(mydf, "aoR1"): + aoR1 = mydf.aoR1 + else: + aoR1 = aoR + + if hasattr(mydf, "aoRg1"): + aoRg1 = mydf.aoRg1 + else: + aoRg1 = aoRg + + ######### fetch the atm_ordering ######### + + group = mydf.group + + ngroup_prim = len(group) + + if hasattr(mydf, "atm_ordering"): + atm_ordering = mydf.atm_ordering + else: + atm_ordering = [] + for group_idx, atm_idx in enumerate(group): + atm_idx.sort() + atm_ordering.extend(atm_idx) + atm_ordering = np.array(atm_ordering, dtype=np.int32) + mydf.atm_ordering = atm_ordering + + aux_basis = mydf.aux_basis + assert len(group) == len(aux_basis) + + ### the number of aux basis involved ### + + naux_tmp = 0 + aoRg_packed = [] + IP_2_atm_id = [] + ILOC = 0 + for kx in range(kmesh[0]): + for ky in range(kmesh[1]): + for kz in range(kmesh[2]): + aoRg_holders = [] + naux_tmp = 0 + for atm_id in group[group_id]: + # print("atm_id = ", atm_id, "ILOC = ", ILOC, "shape = ", aoRg1[atm_id+ILOC*natm_prim].aoR.shape) + naux_tmp += aoRg1[atm_id+ILOC*natm_prim].aoR.shape[1] + IP_2_atm_id.extend([atm_id+ILOC*natm_prim] * aoRg1[atm_id+ILOC*natm_prim].aoR.shape[1]) + aoRg_holders.append(aoRg1[atm_id+ILOC*natm_prim]) + aoRg_packed.append(_pack_aoR_holder(aoRg_holders, nao)) + # print("naux_tmp = ", naux_tmp) + # print("aux_basis[group_id].shape[0] = ", aux_basis[group_id].shape[0]) + assert naux_tmp == aux_basis[group_id].shape[0] + ILOC += 1 + IP_2_atm_id = np.array(IP_2_atm_id, dtype=np.int32) + # print("IP_2_atm_id = ", IP_2_atm_id) + + # grid ID involved for the given group + + aux_basis_grip_ID = mydf.partition_group_to_gridID[group_id] + + # pack aoRg for loop over Rg # + + # aoRg_packed = _pack_aoR_holder(aoRg_holders, nao) + # memory =
_get_aoR_holders_memory(aoRg_holders) + + memory = _get_aoR_holders_memory(aoRg_packed) + + # log.info('In _isdf_get_K_direct_kernel1 aoRg_packed Memory = %d Bytes' % (memory)) + # log.info('In _isdf_get_K_direct_kernel1 group_id = %d, naux = %d' % (group_id, naux_tmp)) + # log.info('In _isdf_get_K_direct_kernel1 aoRg_holders Memory = %d Bytes' % (memory)) + # log.info('In _isdf_get_K_direct_kernel1 naux_bunchsize = %d' % (naux_bunchsize)) + + # assert aoRg_packed.ngrid_tot == naux_tmp + + ######### get involved C function ######### + + fn_packcol1 = getattr(libisdf, "_buildK_packcol", None) + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + fn_packadd_col = getattr(libisdf, "_buildK_packaddcol", None) + fn_packadd_row_k = getattr(libisdf, "_buildK_packaddrow_shift_col", None) + + assert fn_packcol1 is not None + assert fn_packcol2 is not None + assert fn_packadd_col is not None + assert fn_packadd_row_k is not None + + # determine bunchsize # + + bunchsize = min(naux_bunchsize, naux_tmp) + + if construct_K1: + + ### allocate buf ### + + V_tmp = np.ndarray((bunchsize, ngrid), + buffer=build_VW_buf, + offset=offset_V_tmp, + dtype =np.float64) + offset_after_V_tmp = offset_V_tmp + V_tmp.size * V_tmp.dtype.itemsize + + # buffer for W_tmp # + + W_tmp = np.ndarray((naux_tmp, naux), + buffer=build_VW_buf, + offset=offset_after_V_tmp, + dtype =np.float64) + W_tmp.ravel()[:] = 0.0 # clean + + else: + offset_after_V_tmp = offset_V_tmp + W_tmp = None + + ###### CUTOFF ###### + + # if use_cutoff: + # dm_RgAO_max = np.max(np.abs(dm_RgAO[:, :nao_prim])) + # log.info('In _isdf_get_K_direct_kernel1 dm_RgAO_max = %16.8e' % (dm_RgAO_max)) + + #################### + + if begin_id is None: + begin_id = 0 + if end_id is None: + end_id = naux_tmp + + #### loop over Rg #### + + for p0, p1 in lib.prange(begin_id, end_id, bunchsize): + + unique_elements = np.unique(IP_2_atm_id[p0:p1]) + + #### 2. build the V matrix if constructK1 #### + + if construct_K1: + + V_tmp = np.ndarray((p1 - p0, ngrid), + buffer=build_VW_buf, + offset=offset_V_tmp, + dtype =np.float64) + V_tmp.ravel()[:] = 0.0 # clean + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + construct_V(aux_basis[group_id][p0:p1, :], + buf_build_V_thread, + V_tmp, + aux_basis_grip_ID, + mydf.grid_ID_ordered, + mesh, + coulG_real) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cputime_V += t2[0] - t1[0] + walltime_V += t2[1] - t1[1] + + else: + + V_tmp = V_or_W_tmp[p0:p1, :] # W_tmp in fact + + #### 3. 
build the K1_or_2 matrix #### + + ###### 3.1 build density RgR + + if construct_K1: + Density_RgR_tmp = np.ndarray((p1 - p0, ngrid), + buffer=Density_RgR_buf, + offset=0, + dtype =np.float64) + else: + Density_RgR_tmp = np.ndarray((p1 - p0, naux), + buffer=Density_RgR_buf, + offset=0, + dtype =np.float64) + Density_RgR_tmp.ravel()[:] = 0.0 # clean + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + ILOC = 0 + for kx in range(kmesh[0]): + for ky in range(kmesh[1]): + for kz in range(kmesh[2]): + + if kx!=0 or ky!=0 or kz!=0: + if construct_K1: + col_permutation = mydf._get_permutation_column_aoR(kx, ky, kz) + else: + col_permutation = mydf._get_permutation_column_aoRg(kx, ky, kz) + + for atm_id in atm_ordering[:natm_prim]: + + if construct_K1: + aoR_holder = aoR[atm_id] + else: + aoR_holder = aoRg[atm_id] + + if aoR_holder is None: + raise ValueError("aoR_holder is None") + + ngrid_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + ###### CUTOFF ###### + + if use_cutoff: + if distance_cutoff is not None: + distance = np.min(distance_matrix[unique_elements, ILOC*natm_prim+atm_id]) + if distance > distance_cutoff: + continue + + #################### + + ##### packed involved DgAO ##### + + if kx == 0 and ky == 0 and kz == 0: + ao_permutation = aoR_holder.ao_involved + else: + ao_permutation = col_permutation[atm_id] + + if (nao_involved == nao) and (kx == 0 and ky == 0 and kz == 0): + Density_RgAO_packed = dm_RgAO[p0:p1, :] + else: + Density_RgAO_packed = np.ndarray((p1-p0, nao_involved), + buffer=dm_RgAO_buf, + offset=dm_RgAO_packed_offset, + dtype =np.float64) + + fn_packcol1( + Density_RgAO_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(p1-p0), + ctypes.c_int(nao_involved), + dm_RgAO[p0:p1, :].ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(p1-p0), + ctypes.c_int(nao), + ao_permutation.ctypes.data_as(ctypes.c_void_p) + ) + + ###### CUTOFF ###### + + if use_cutoff: + if distance_cutoff is None: + dm_RgAO_packed_max = np.max(np.abs(Density_RgAO_packed)) + if dm_RgAO_packed_max < abs_cutoff: + continue + + #################### + + if construct_K1: + grid_begin = aoR_holder.global_gridID_begin + ILOC*ngrid_prim + else: + grid_begin = aoR_holder.global_gridID_begin + ILOC*nIP_prim + + ddot_res_RgR = np.ndarray((p1-p0, ngrid_now), buffer=ddot_res_RgR_buf) + lib.ddot(Density_RgAO_packed, aoR_holder.aoR, c=ddot_res_RgR) + Density_RgR_tmp[:, grid_begin:grid_begin+ngrid_now] = ddot_res_RgR + + ILOC += 1 + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cputime_RgR += t2[0] - t1[0] + walltime_RgR += t2[1] - t1[1] + + Density_RgR = Density_RgR_tmp + + #### 3.2 V_tmp = Density_RgR * V + + lib_isdf.cwise_mul(V_tmp, Density_RgR, out=Density_RgR) + V2_tmp = Density_RgR + + + ###### CUTOFF ###### + + # if use_cutoff: + # if construct_K1: + # V2_tmp_max = np.max(np.abs(V2_tmp[:, :ngrid_prim])) + # else: + # V2_tmp_max = np.max(np.abs(V2_tmp[:, :nIP_prim])) + # log.info('In _isdf_get_K_direct_kernel1 V2_tmp_max = %16.8e' % (V2_tmp_max)) + + #################### + + #### 3.3 K1_tmp1 = V2_tmp * aoR.T + + K1_tmp1 = np.ndarray((p1-p0, nao), buffer=K1_tmp1_buf) + K1_tmp1.ravel()[:] = 0.0 + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + ILOC = 0 + for kx in range(kmesh[0]): + for ky in range(kmesh[1]): + for kz in range(kmesh[2]): + + if kx!=0 or ky!=0 or kz!=0: + if construct_K1: + col_permutation = mydf._get_permutation_column_aoR(kx, ky, kz) + else: + col_permutation = mydf._get_permutation_column_aoRg(kx, ky, kz) + + for 
atm_id in atm_ordering[:natm_prim]: + + if construct_K1: + aoR_holder = aoR[atm_id] + else: + aoR_holder = aoRg[atm_id] + + ngrid_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + ddot_res = np.ndarray((p1-p0, nao_involved), buffer=K1_tmp1_ddot_res_buf) + + if construct_K1: + grid_loc_begin = aoR_holder.global_gridID_begin + ILOC*ngrid_prim + else: + grid_loc_begin = aoR_holder.global_gridID_begin + ILOC*nIP_prim + + ###### CUTOFF ###### + + if use_cutoff: + if distance_cutoff is None: + V2_tmp_max2 = np.max(np.abs(V2_tmp[:, grid_loc_begin:grid_loc_begin+ngrid_now])) + if V2_tmp_max2 < abs_cutoff: + continue + else: + distance = np.min(distance_matrix[unique_elements, ILOC*natm_prim+atm_id]) + if distance > distance_cutoff: + continue + + #################### + + lib.ddot(V2_tmp[:, grid_loc_begin:grid_loc_begin+ngrid_now], + aoR_holder.aoR.T, + c=ddot_res) + + if kx == 0 and ky == 0 and kz == 0: + ao_permutation = aoR_holder.ao_involved + else: + ao_permutation = col_permutation[atm_id] + assert col_permutation[atm_id].shape[0] == nao_involved + + if (nao_involved == nao) and (kx == 0 and ky == 0 and kz == 0): + K1_tmp1 += ddot_res + else: + fn_packadd_col( + K1_tmp1.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K1_tmp1.shape[0]), + ctypes.c_int(K1_tmp1.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + ao_permutation.ctypes.data_as(ctypes.c_void_p) + ) + + ILOC += 1 + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cputime_Ktmp1 += t2[0] - t1[0] + walltime_Ktmp1 += t2[1] - t1[1] + + #### 3.4 K1_or_2 += aoRg * K1_tmp1 + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + ILOC = 0 + for kx in range(kmesh[0]): + for ky in range(kmesh[1]): + for kz in range(kmesh[2]): + + box_permutation = permutation[ILOC] + + nao_involved = aoRg_packed[ILOC].nao_involved + ddot_res = np.ndarray((nao_involved, nao), buffer=K1_final_ddot_buf) + lib.ddot(aoRg_packed[ILOC].aoR[:,p0:p1], K1_tmp1, c=ddot_res) + fn_packadd_row_k( + K1_or_2.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K1_or_2.shape[0]), + ctypes.c_int(K1_or_2.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoRg_packed[ILOC].ao_involved.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nkpts), + ctypes.c_int(nao_prim), + box_permutation.ctypes.data_as(ctypes.c_void_p) + ) + + ILOC += 1 + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cputime_Ktmp2 += t2[0] - t1[0] + walltime_Ktmp2 += t2[1] - t1[1] + + #### 4. 
build the W matrix #### + + if calculate_W: + + aux_ket_shift = 0 + grid_shift = 0 + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + ILOC = 0 + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + + skip = False + if use_cutoff: + if distance_cutoff is not None: + distance = np.min(distance_matrix[unique_elements, ILOC*natm_prim:(ILOC+1)*natm_prim]) + if distance > distance_cutoff: + skip = True + + for j in range(len(group)): + aux_basis_ket = mydf.aux_basis[j] + ngrid_now = aux_basis_ket.shape[1] + naux_ket = aux_basis_ket.shape[0] + if not skip: + W_tmp[p0:p1, aux_ket_shift:aux_ket_shift+naux_ket] = lib.ddot( + V_tmp[:, grid_shift:grid_shift+ngrid_now], aux_basis_ket.T) + aux_ket_shift += naux_ket + grid_shift += ngrid_now + ILOC += 1 + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cputime_W += t2[0] - t1[0] + walltime_W += t2[1] - t1[1] + + assert grid_shift == ngrid + + return W_tmp + \ No newline at end of file diff --git a/pyscf/isdf/fft.c b/pyscf/isdf/fft.c new file mode 100644 index 000000000..d7f0f6cfb --- /dev/null +++ b/pyscf/isdf/fft.c @@ -0,0 +1,261 @@ +/* Copyright 2021- The PySCF Developers. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + * + * Author: Xing Zhang + */ + +#include +#include +#include "fft.h" +#include "config.h" + +#define BLKSIZE 128 +#define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) + +fftw_plan fft_create_r2c_plan(double *in, complex double *out, int rank, int *mesh) +{ + fftw_plan p; + p = fftw_plan_dft_r2c(rank, mesh, in, out, FFTW_ESTIMATE); + return p; +} + +fftw_plan fft_create_c2r_plan(complex double *in, double *out, int rank, int *mesh) +{ + fftw_plan p; + p = fftw_plan_dft_c2r(rank, mesh, in, out, FFTW_ESTIMATE); + return p; +} + +void fft_execute(fftw_plan p) +{ + fftw_execute(p); +} + +void fft_destroy_plan(fftw_plan p) +{ + fftw_destroy_plan(p); +} + +void _complex_fft(complex double *in, complex double *out, int *mesh, int rank, int sign) +{ + int i; + int nx = mesh[0]; + int nyz = 1; + for (i = 1; i < rank; i++) + { + nyz *= mesh[i]; + } + int nmax = nyz / BLKSIZE * BLKSIZE; + fftw_plan p_2d = fftw_plan_dft(rank - 1, mesh + 1, in, out, sign, FFTW_ESTIMATE); + int nn[BLKSIZE] = {nx}; + fftw_plan p_3d_x = fftw_plan_many_dft(1, nn, BLKSIZE, + out, NULL, nyz, 1, + out, NULL, nyz, 1, + sign, FFTW_ESTIMATE); + +#pragma omp parallel private(i) + { + int off; +#pragma omp for schedule(dynamic) + for (i = 0; i < nx; i++) + { + off = i * nyz; + fftw_execute_dft(p_2d, in + off, out + off); + } + +#pragma omp for schedule(dynamic) + for (i = 0; i < nmax; i += BLKSIZE) + { + fftw_execute_dft(p_3d_x, out + i, out + i); + } + } + fftw_destroy_plan(p_2d); + fftw_destroy_plan(p_3d_x); + + int nres = nyz - nmax; + if (nres > 0) + { + fftw_plan p_3d_x = fftw_plan_many_dft(1, nn, nres, + out + nmax, NULL, nyz, 1, + out + nmax, NULL, nyz, 1, + sign, FFTW_ESTIMATE); + fftw_execute(p_3d_x); + fftw_destroy_plan(p_3d_x); + } +} + +void fft(complex double *in, complex double *out, int *mesh, int rank) +{ + _complex_fft(in, out, mesh, rank, FFTW_FORWARD); +} + +void ifft(complex double *in, complex double *out, int *mesh, int rank) +{ + _complex_fft(in, out, mesh, rank, FFTW_BACKWARD); + size_t i, n = 1; + for (i = 0; i < rank; i++) + { + n *= mesh[i]; + } + double fac = 1. / (double)n; +#pragma omp parallel for schedule(static) + for (i = 0; i < n; i++) + { + out[i] *= fac; + } +} + +void rfft(double *in, complex double *out, int *mesh, int rank) +{ + fftw_plan p = fftw_plan_dft_r2c(rank, mesh, in, out, FFTW_ESTIMATE); + fftw_execute(p); + fftw_destroy_plan(p); +} + +void rfft_3d(double *in, complex double *out, int *mesh, int rank) +{ + fftw_plan p = fftw_plan_dft_r2c_3d(mesh[0], mesh[1], mesh[2], in, out, FFTW_ESTIMATE); + fftw_execute(p); + fftw_destroy_plan(p); +} + +void irfft(complex double *in, double *out, int *mesh, int rank) +{ + fftw_plan p = fftw_plan_dft_c2r(rank, mesh, in, out, FFTW_ESTIMATE); + fftw_execute(p); + fftw_destroy_plan(p); + size_t i, n = 1; + for (i = 0; i < rank; i++) + { + n *= mesh[i]; + } + double fac = 1. / (double)n; +#pragma omp parallel for schedule(static) + for (i = 0; i < n; i++) + { + out[i] *= fac; + } +} + +void irfft_3d(complex double *in, double *out, int *mesh, int rank) +{ + fftw_plan p = fftw_plan_dft_c2r_3d(mesh[0], mesh[1], mesh[2], in, out, FFTW_ESTIMATE); + fftw_execute(p); + fftw_destroy_plan(p); + size_t i, n = 1; + for (i = 0; i < rank; i++) + { + n *= mesh[i]; + } + double fac = 1. 
/ (double)n; +#pragma omp parallel for schedule(static) + for (i = 0; i < n; i++) + { + out[i] *= fac; + } +} + +//// the following subroutines are designed for the 3D FFT for ISDF //// + +void _rfft_3d_ISDF(double *in, complex double *out, int *mesh, int nTransform) /// single thread mode +{ + fftw_plan p = fftw_plan_dft_r2c_3d(mesh[0], mesh[1], mesh[2], in, out, FFTW_ESTIMATE); + int n_in = mesh[0] * mesh[1] * mesh[2]; + int n_out = mesh[0] * mesh[1] * (mesh[2] / 2 + 1); + for (int i = 0; i < nTransform; i++) + { + fftw_execute_dft_r2c(p, in + i * n_in, out + i * n_out); + } + fftw_destroy_plan(p); +} + +void _rfft_3d_ISDF_manydft(double *in, complex double *out, int *mesh, int nTransform) /// not to be very efficient +{ + int n_in = mesh[0] * mesh[1] * mesh[2]; + int n_out = mesh[0] * mesh[1] * (mesh[2] / 2 + 1); + int mesh_out[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + fftw_plan p = fftw_plan_many_dft_r2c( + 3, mesh, nTransform, in, mesh, 1, n_in, out, mesh_out, 1, n_out, FFTW_ESTIMATE); + fftw_execute(p); + fftw_destroy_plan(p); +} + +void _rfft_3d_ISDF_parallel(double *in, complex double *out, int *mesh, int nTransform) /// parallel thread mode +{ + fftw_plan p = fftw_plan_dft_r2c_3d(mesh[0], mesh[1], mesh[2], in, out, FFTW_ESTIMATE); + int n_in = mesh[0] * mesh[1] * mesh[2]; + int n_out = mesh[0] * mesh[1] * (mesh[2] / 2 + 1); +#pragma omp parallel for schedule(static) + for (int i = 0; i < nTransform; i++) + { + fftw_execute_dft_r2c(p, in + i * n_in, out + i * n_out); + } + fftw_destroy_plan(p); +} + +void _irfft_3d_ISDF(complex double *in, double *out, int *mesh, int nTransform) /// single thread mode +{ + fftw_plan p = fftw_plan_dft_c2r_3d(mesh[0], mesh[1], mesh[2], in, out, FFTW_ESTIMATE); + int n_in = mesh[0] * mesh[1] * (mesh[2] / 2 + 1); + int n_out = mesh[0] * mesh[1] * mesh[2]; + double fac = 1. / (double)n_out; + + for (int i = 0; i < nTransform; i++) + { + fftw_execute_dft_c2r(p, in + i * n_in, out + i * n_out); + for (int j = 0; j < n_out; j++) + { + out[i * n_out + j] *= fac; + } + } + + fftw_destroy_plan(p); +} + +void _irfft_3d_ISDF_manydft(complex double *in, double *out, int *mesh, int nTransform) /// not to be very efficient +{ + int n_in = mesh[0] * mesh[1] * (mesh[2] / 2 + 1); + int n_out = mesh[0] * mesh[1] * mesh[2]; + double fac = 1. / (double)n_out; + int mesh_in[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + fftw_plan p = fftw_plan_many_dft_c2r( + 3, mesh, nTransform, in, mesh_in, 1, n_in, out, mesh, 1, n_out, FFTW_ESTIMATE); + fftw_execute(p); + fftw_destroy_plan(p); + for (int i = 0; i < nTransform * n_out; i++) + { + out[i] *= fac; + } +} + +void _irfft_3d_ISDF_parallel(complex double *in, double *out, int *mesh, int nTransform) /// parallel thread mode +{ + fftw_plan p = fftw_plan_dft_c2r_3d(mesh[0], mesh[1], mesh[2], in, out, FFTW_ESTIMATE); + int n_in = mesh[0] * mesh[1] * (mesh[2] / 2 + 1); + int n_out = mesh[0] * mesh[1] * mesh[2]; + double fac = 1. / (double)n_out; + +#pragma omp parallel for schedule(static) + for (int i = 0; i < nTransform; i++) + { + fftw_execute_dft_c2r(p, in + i * n_in, out + i * n_out); + for (int j = 0; j < n_out; j++) + { + out[i * n_out + j] *= fac; + } + } + + fftw_destroy_plan(p); +} \ No newline at end of file diff --git a/pyscf/isdf/fft.h b/pyscf/isdf/fft.h new file mode 100644 index 000000000..995a914e6 --- /dev/null +++ b/pyscf/isdf/fft.h @@ -0,0 +1,27 @@ +/* Copyright 2021- The PySCF Developers. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + * + * Author: Xing Zhang + */ + +#include <complex.h> +#include <fftw3.h> + +#define FFT_PLAN fftw_plan + +FFT_PLAN fft_create_r2c_plan(double* in, double __complex__ * out, int rank, int* mesh); +FFT_PLAN fft_create_c2r_plan(double __complex__ * in, double* out, int rank, int* mesh); +void fft_execute(FFT_PLAN p); +void fft_destroy_plan(FFT_PLAN p); diff --git a/pyscf/isdf/isdf.py b/pyscf/isdf/isdf.py new file mode 100644 index 000000000..e3b1493be --- /dev/null +++ b/pyscf/isdf/isdf.py @@ -0,0 +1,415 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# Xing Zhang +# + +############ sys module ############ + +import os +import sys +import numpy as np +import numpy +import scipy + +############ pyscf module ############ + +import pyscf +from pyscf import lib +from pyscf.lib import logger +from pyscf import pbc +from pyscf.pbc import gto as pbcgto +from pyscf.pbc import scf as pbcscf +from pyscf.pbc import dft as pbcdft +from pyscf.pbc import tools +from pyscf.pbc import df +from pyscf.pbc.dft import gen_grid +from pyscf.pbc.dft import multigrid +from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band +from pyscf.pbc.dft.multigrid.multigrid_pair import MultiGridFFTDF2, _eval_rhoG + +import pyscf.isdf.isdf_ao2mo as isdf_ao2mo +import pyscf.isdf.isdf_jk as isdf_jk +from pyscf.isdf.isdf_jk import _benchmark_time + +############ subroutines ############ + +def _get_rhoR(mydf, dm_kpts, hermi=1): + ''' + get the electron density in real space (on grids) + + ''' + + cell = mydf.cell + kpts = np.zeros((1,3)) + kpts_band = None + + ### step 1, evaluate ao_values on the grid + + grids = mydf.grids + coords = np.asarray(grids.coords).reshape(-1,3) + mesh = grids.mesh + ngrids = np.prod(mesh) + assert ngrids == coords.shape[0] + + ### step 2, evaluate the density on the grid as the weight for k-means + ### TODO: make it linear scaling + + dm_kpts = np.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, _ = dms.shape[:3] + assert nset == 1 + assert nkpts == 1 # only gamma point for now + kpts_band = _format_kpts_band(kpts_band, kpts) + + # density in grid space $\rho(G)=\int_\Omega \rho(R) e^{-iGr} dr$ + rhoG = _eval_rhoG(mydf, dm_kpts, hermi, kpts, deriv=0) + + weight = cell.vol / ngrids + # *(1./weight) because rhoR is scaled by weight in _eval_rhoG. When + # computing rhoR with IFFT, the weight factor is not needed.
+ # the above comment is from pyscf/pbc/dft/multigrid_pair.py + # $\rho(R) = 1/\Omega \int_BZ \rho(G) e^{iGr} dG$ ??? + rhoR = tools.ifft(rhoG.reshape(-1,ngrids), mesh).real * (1./weight) + rhoR = rhoR.flatten() + assert rhoR.size == ngrids + + return rhoR + +def isdf(mydf, dm_kpts, hermi=1, naux=None, c=5, max_iter=100, kpts=np.zeros((1,3)), kpts_band=None, verbose=None): + + ''' + + Args: + mydf : the DF object + dm_kpts (np.ndarray): (nset, nkpts, nao, nao) density matrix in k-space + hermi (int) : int, optional + If :math:`hermi=1`, the task list is built only for + the upper triangle of the matrix. Default is 0. + naux (int) : number of auxiliary basis functions + c (int) : the ratio between the number of auxiliary basis functions and the number of atomic basis functions + if naux is none, then naux is set to c * cell.nao + max_iter (int) : max number of iterations for kmean + verbose (int) : verbosity level + kpts (np.ndarray) : + + Returns: + W (np.ndarray) : (naux,naux) matrix of the ISDF potential + aoRg (np.ndarray) : (naux,ngrids) matrix of the auxiliary basis + aoR (np.ndarray) : (nao, ngrids) matrix of the (scaled) atomic basis in real space + V_R (np.ndarray) : (naux,ngrids) matrix of the ISDF potential in real space + idx (np.ndarray) : (naux,) index of the auxiliary basis in the real space grid + + Ref: + (1) Lu2015 + (2) Hu2023 10.1021/acs.jctc.2c00927 + (3) Sandeep2022 https://pubs.acs.org/doi/10.1021/acs.jctc.2c00720 + + ''' + + t1 = (logger.process_clock(), logger.perf_counter()) + + ### step 1 , evaluate ao_values on the grid + + cell = mydf.cell + grids = mydf.grids + coords = np.asarray(grids.coords).reshape(-1,3) + mesh = grids.mesh + ngrids = np.prod(mesh) + assert ngrids == coords.shape[0] + + log = logger.Logger(sys.stdout, 4) + cput0 = (logger.process_clock(), logger.perf_counter()) + aoR = mydf._numint.eval_ao(cell, coords)[0] + + aoR *= np.sqrt(cell.vol / ngrids) ## NOTE: scaled ! + + print("aoR.shape = ", aoR.shape) + + cput1 = log.timer('eval_ao', *cput0) + if naux is None: + naux = cell.nao * c # number of auxiliary basis functions + + ### step 2, evaluate the density on the grid as the weight for kmean + ### TODO: make it linear scaling + + dm_kpts = np.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + assert nset == 1 + assert nkpts == 1 # only gamma point for now + # kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + kpts_band = _format_kpts_band(kpts_band, kpts) + + # density in grid space $\rho(G)=\int_\Omega \rho(R) e^{-iGr} dr$ + rhoG = _eval_rhoG(mydf, dm_kpts, hermi, kpts, deriv=0) + + weight = cell.vol / ngrids + # *(1./weight) because rhoR is scaled by weight in _eval_rhoG. When + # computing rhoR with IFFT, the weight factor is not needed. + # the above comment is from pyscf/pbc/dft/multigrid_pair.py + # $\rho(R) = 1/\Omega \int_BZ \rho(G) e^{iGr} dG$ ??? 
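+ # i.e. the rhoG returned by _eval_rhoG already carries the quadrature weight vol/ngrids, + # so multiplying the inverse FFT by 1/weight recovers rho(R) on the grid; consistently, + # np.sum(rhoR) * cell.vol / ngrids reproduces the electron count (checked in __main__ below).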
+ rhoR = tools.ifft(rhoG.reshape(-1,ngrids), mesh).real * (1./weight) + rhoR = rhoR.flatten() + assert rhoR.size == ngrids + + ### step 3, k-means clustering to get the interpolation points (IP) + ### TODO: implement QRCP as an option + + cput1 = log.timer('eval_rhoR', *cput1) + from sklearn.cluster import KMeans + # from cuml.cluster import KMeans + kmeans_float = KMeans(n_clusters=naux, + max_iter=max_iter, + # max_samples_per_batch=32768*8//naux, + # output_type='numpy' + ) + kmeans_float.fit(coords, sample_weight=rhoR) + centers = kmeans_float.cluster_centers_ + + cput1 = log.timer('kmeans', *cput1) + + t2 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t1, t2, "kmeans", mydf) + t1 = t2 + + ### step 4, get the auxiliary basis + + a = cell.lattice_vectors() + scaled_centers = np.dot(centers, np.linalg.inv(a)) + + idx = (np.rint(scaled_centers*mesh[None,:]) + mesh[None,:]) % (mesh[None,:]) + idx = idx[:,2] + idx[:,1]*mesh[2] + idx[:,0]*(mesh[1]*mesh[2]) + idx = idx.astype(int) + idx = list(set(idx)) + idx.sort() + idx = np.asarray(idx) + print("idx = ", idx) + + cput1 = log.timer('get idx', *cput1) + + aoRg = aoR[idx] # (nIP, nao), nIP = naux + # A = numpy.dot(aoRg, aoRg.T) ** 2 # (Naux, Naux) + A = np.asarray(lib.dot(aoRg, aoRg.T), order='C') + A = A ** 2 + cput1 = log.timer('get A', *cput1) + + X = np.empty((naux,ngrids)) + blksize = int(10*1e9/8/naux) + for p0, p1 in lib.prange(0, ngrids, blksize): + # B = numpy.dot(aoRg, aoR[p0:p1].T) ** 2 + B = np.asarray(lib.dot(aoRg, aoR[p0:p1].T), order='C') + B = B ** 2 + X[:,p0:p1] = scipy.linalg.lstsq(A, B)[0] + B = None + A = None + + cput1 = log.timer('least squares fit', *cput1) + + t2 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t1, t2, "Construct Xg", mydf) + t1 = t2 + + ### step 5, get the ISDF potential, V(R_g, R') + + V_R = np.empty((naux,ngrids)) + coulG = tools.get_coulG(cell, mesh=mesh) + + blksize1 = int(5*1e9/8/ngrids) + for p0, p1 in lib.prange(0, naux, blksize1): + X_freq = numpy.fft.fftn(X[p0:p1].reshape(-1,*mesh), axes=(1,2,3)).reshape(-1,ngrids) + V_G = X_freq * coulG[None,:] + X_freq = None + V_R[p0:p1] = numpy.fft.ifftn(V_G.reshape(-1,*mesh), axes=(1,2,3)).real.reshape(-1,ngrids) + V_G = None + coulG = None + # V_R *= 2 * np.pi + + cput1 = log.timer('fft', *cput1) + + t2 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t1, t2, "Construct VR", mydf) + t1 = t2 + + W = np.zeros((naux,naux)) + for p0, p1 in lib.prange(0, ngrids, blksize): + W += numpy.dot(X[:,p0:p1], V_R[:,p0:p1].T) + + # for i in range(naux): + # for j in range(i): + # print("W[%5d, %5d] = %15.8e" % (i, j, W[i,j])) + + cput1 = log.timer('get W', *cput1) + + t2 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t1, t2, "Construct WR", mydf) + + return W, aoRg.T, aoR.T, V_R, idx, X + +class ISDF(df.fft.FFTDF): + def __init__(self, cell): + super().__init__(cell=cell) + + def build(self, dm=None, naux=None, c=8, max_iter=128): + ''' + Args: + dm (np.ndarray): (nset, nkpts, nao, nao) density matrix in k-space + naux (int) : number of auxiliary basis functions + c (int) : the ratio between the number of auxiliary basis functions and the number of atomic basis functions; + if naux is None, then naux is set to c * cell.nao + max_iter (int) : max number of iterations for k-means + + Returns: + + ''' + + if naux is None and c is None: + c = 8 + + self.c = c + self.naux = naux + + ## dm provides the density weights for k-means + + if dm is None: + + mf =
pbcdft.RKS(self.cell) + mf.xc = "PBE,PBE" + mf.init_guess = 'atom' # atom guess is fast + mf.with_df = multigrid.MultiGridFFTDF2(self.cell) + dm = mf.get_init_guess(self.cell, 'atom') + + df_tmp = MultiGridFFTDF2(self.cell) + self.W, self.aoRg, self.aoR, self.V_R, _, aux_basis = isdf( + df_tmp, dm, naux=naux, c=c, max_iter=max_iter, verbose=self.cell.verbose) + + ## WARNING: self.aoRG, self.aoR is scaled by a factor of sqrt(cell.vol / ngrids) + + self.naux = self.W.shape[0] + + if self.cell.verbose >= logger.INFO: + logger.info(self, 'naux = %d', self.naux) + print("naux = ", self.naux) + + self.check_sanity() + + ##### functions defined in isdf_ao2mo.py ##### + + get_eri = get_ao_eri = isdf_ao2mo.get_eri + ao2mo = get_mo_eri = isdf_ao2mo.general + ao2mo_7d = isdf_ao2mo.ao2mo_7d # seems to be only called in kadc and kccsd, NOT implemented! + + ##### functions defined in isdf_jk.py ##### + + get_jk = isdf_jk.get_jk_dm + +if __name__ == "__main__": + + cell = pbcgto.Cell() + boxlen = 3.5668 + cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + + cell.atom = ''' + C 0. 0. 0. + C 0.8917 0.8917 0.8917 + C 1.7834 1.7834 0. + C 2.6751 2.6751 0.8917 + C 1.7834 0. 1.7834 + C 2.6751 0.8917 2.6751 + C 0. 1.7834 1.7834 + C 0.8917 2.6751 2.6751 + ''' + cell.basis = 'gth-szv' + cell.pseudo = 'gth-pade' + cell.verbose = 4 + + cell.ke_cutoff = 128 + cell.max_memory = 800 # 800 Mb + cell.precision = 1e-8 # integral precision + cell.use_particle_mesh_ewald = True + + print(cell.energy_nuc()) + print(cell.enuc) + + cell.build() + + print("Number of electrons: ", cell.nelectron) + print("Number of atoms : ", cell.natm) + print("Number of basis : ", cell.nao) + print("Number of images : ", cell.nimgs) + + # make a super cell + + cell = tools.super_cell(cell, [1,1,1]) + + print("Number of electrons: ", cell.nelectron) + print("Number of atoms : ", cell.natm) + print("Number of basis : ", cell.nao) + print("Number of images : ", cell.nimgs) + + # construct DF object + + mf = pbcdft.RKS(cell) + mf.xc = "PBE,PBE" + mf.init_guess = 'atom' # atom guess is fast + mf.with_df = multigrid.MultiGridFFTDF2(cell) + + dm1 = mf.get_init_guess(cell, 'atom') + mydf = MultiGridFFTDF2(cell) + + s1e = mf.get_ovlp(cell) + + print(s1e.shape) + print(dm1.shape) + print(mydf.grids.mesh) + print(mydf.grids.coords.shape) + + # perform ISDF + + rhoR = _get_rhoR(mydf, dm1) + print("rhoR.shape = ", rhoR.shape) + print("nelec from rhoR is ", np.sum(rhoR) * cell.vol / np.prod(cell.mesh)) + + W, aoRg, aoR, V_R, idx, _ = isdf(mydf, dm1, naux=cell.nao*10, max_iter=100, verbose=4) + + print("W.shape = ", W.shape) + print("aoRg.shape = ", aoRg.shape) + print("aoR.shape = ", aoR.shape) + print("V_R.shape = ", V_R.shape) + print("idx.shape = ", idx.shape) + + # check norm + + print(np.sum(aoR[0, :] ** 2)) + ovlp = cell.pbc_intor('cint1e_ovlp_sph') + print(ovlp[0, 0]) + + mydf_eri = df.FFTDF(cell) + eri = mydf_eri.get_eri(compact=False).reshape(cell.nao, cell.nao, cell.nao, cell.nao) + print("eri.shape = ", eri.shape) + + eri_isdf = isdf_ao2mo.isdf_eri_robust_fit(mydf, W, aoRg, aoR, V_R, verbose=4) + + print("eri_isdf.shape = ", eri_isdf.shape) + + for i in range(cell.nao): + for j in range(cell.nao): + for k in range(cell.nao): + for l in range(cell.nao): + if abs(eri[i,j,k,l] - eri_isdf[i,j,k,l]) > 1e-6: + print("eri[{}, {}, {}, {}] = {} != {}".format(i,j,k,l,eri[i,j,k,l], eri_isdf[i,j,k,l]), + "ration = ", eri[i,j,k,l]/eri_isdf[i,j,k,l]) diff --git a/pyscf/isdf/isdf_ao2mo.py b/pyscf/isdf/isdf_ao2mo.py new file mode 100644 
index 000000000..d28b16029
--- /dev/null
+++ b/pyscf/isdf/isdf_ao2mo.py
@@ -0,0 +1,1213 @@
+#!/usr/bin/env python
+# Copyright 2014-2020 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Ning Zhang
+#
+
+############ sys module ############
+
+import numpy, scipy
+import numpy as np
+import ctypes
+
+############ pyscf module ############
+
+from pyscf import lib
+from pyscf import ao2mo
+from pyscf.ao2mo.incore import iden_coeffs
+from pyscf.pbc import tools
+from pyscf.pbc.lib import kpts_helper
+from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, unique
+from pyscf import __config__
+from pyscf.pbc.df.fft_ao2mo import _format_kpts, _iskconserv, _contract_compact
+libisdf = lib.load_library('libisdf')
+
+############ isdf utils ############
+
+from pyscf.isdf.isdf_tools_local import aoR_Holder
+from pyscf.isdf.isdf_jk import _benchmark_time
+import pyscf.isdf.isdf_tools_linearop as lib_isdf
+
+############ subroutines ---- AO2MO ############
+
+def isdf_eri_robust_fit(mydf, W, aoRg, aoR, V_r, verbose=None):
+    r'''
+    Get (AO) electron repulsion integrals (ERI) from ISDF with robust fitting.
+    Illustrates the idea of ISDF with robust fitting in a human-readable way.
+
+    Args:
+        mydf : ISDF object
+        W    : W matrix in Sandeep2022 eq 13
+        aoR  : AO values on the grids (typically a uniform mesh)
+        aoRg : AO values on the interpolation points
+        V_r  : V matrix in Sandeep2022 eq 13
+
+    Return: ERI with s1 symmetry
+
+    NOTE: deprecated reference implementation, kept only for illustration
+
+    Ref:
+
+    (1) Sandeep2022 https://pubs.acs.org/doi/10.1021/acs.jctc.2c00720
+
+    '''
+
+    cell = mydf.cell
+    nao = cell.nao
+    ngrid = np.prod(cell.mesh)
+    vol = cell.vol
+
+    eri = numpy.zeros((nao,nao,nao,nao))
+
+    pair_Rg = np.einsum('ix,jx->ijx', aoRg, aoRg)
+    pair_R  = np.einsum('ix,jx->ijx', aoR, aoR)
+
+    ### step 1, term1
+
+    path = np.einsum_path('ijx,xy,kly->ijkl', pair_Rg, V_r, pair_R, optimize='optimal')[0]
+    eri_tmp = np.einsum('ijx,xy,kly->ijkl', pair_Rg, V_r, pair_R, optimize=path)
+
+    ### step 2, term2
+
+    eri = eri_tmp + eri_tmp.transpose(2,3,0,1)
+
+    ### step 3, term3
+
+    path = np.einsum_path('ijx,xy,kly->ijkl', pair_Rg, W, pair_Rg, optimize='optimal')[0]
+    eri -= np.einsum('ijx,xy,kly->ijkl', pair_Rg, W, pair_Rg, optimize=path)
+
+    return eri * ngrid / vol
+
+
+def isdf_eri(mydf, mo_coeff = None, verbose=None):
+
+    """
+    Perform the AO2MO transformation of the ISDF ERI (with robust fitting),
+    returning integrals with s4 symmetry. Locality is exploited if available.
+
+    Args:
+        mydf : ISDF object
+        mo_coeff : Molecular orbital coefficients.
+
+    Returns:
+        eri : MO-ERI with s4 symmetry.
+
+    TODO:
+        when eri is very small, use DGEMM!
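+
+    Example:
+        A minimal usage sketch (illustrative only; assumes ``mydf`` is a fully
+        built ISDF object and ``mo_coeff`` a (nao, nmo) coefficient array):
+
+        >>> eri_s4 = isdf_eri(mydf, mo_coeff)       # (npair, npair), npair = nmo*(nmo+1)//2
+        >>> eri_s1 = ao2mo.restore(1, eri_s4, nmo)  # unpack to (nmo, nmo, nmo, nmo)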
+ + """ + + #### basic info #### + + direct = mydf.direct + if direct is True: + raise NotImplementedError("direct is not supported in isdf_eri_robust") + with_robust_fitting = mydf.with_robust_fitting + + nao = mydf.cell.nao + naux = mydf.naux + vol = mydf.cell.vol + ngrid = np.prod(mydf.cell.mesh) + natm = mydf.cell.natm + + if mo_coeff is not None: + assert mo_coeff.shape[0] == nao + nmo = mo_coeff.shape[1] + else: + nmo = nao + + size = nmo * (nmo + 1) // 2 + eri = numpy.zeros((size, size)) + + aoR = mydf.aoR + aoRg = mydf.aoRg + assert isinstance(aoR, list) + assert isinstance(aoRg, list) + + if mo_coeff is not None: + + moR = [] + moRg = [] + + for i in range(natm): + + if with_robust_fitting: + ao_involved = aoR[i].ao_involved + mo_coeff_packed = mo_coeff[ao_involved,:].copy() + _moR = lib.ddot(mo_coeff_packed.T, aoR[i].aoR) + mo_involved = np.arange(nmo) + moR.append( + aoR_Holder( + aoR = _moR, + ao_involved = mo_involved, + local_gridID_begin = aoR[i].local_gridID_begin, + local_gridID_end = aoR[i].local_gridID_end, + global_gridID_begin = aoR[i].global_gridID_begin, + global_gridID_end = aoR[i].global_gridID_end) + ) + else: + moR.append(None) + + ao_involved = aoRg[i].ao_involved + mo_coeff_packed = mo_coeff[ao_involved,:].copy() + _moRg = lib.ddot(mo_coeff_packed.T, aoRg[i].aoR) + mo_involved = np.arange(nmo) + moRg.append( + aoR_Holder( + aoR = _moRg, + ao_involved = mo_involved, + local_gridID_begin = aoRg[i].local_gridID_begin, + local_gridID_end = aoRg[i].local_gridID_end, + global_gridID_begin = aoRg[i].global_gridID_begin, + global_gridID_end = aoRg[i].global_gridID_end) + ) + else: + moR = aoR + moRg = aoRg + + if with_robust_fitting: + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in moR if aoR_holder is not None]) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in moR if aoR_holder is not None]) + else: + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in moRg if aoR_holder is not None]) + max_ngrid_involved = None + max_nIP_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in moRg if aoR_holder is not None]) + + ###### loop over basic info to allocate the buf ###### + + aoPairRg_buf = np.zeros((max_nao_involved, max_nao_involved, max_nIP_involved)) + aoPairRg_buf2 = np.zeros((max_nao_involved, max_nao_involved, max_nIP_involved)) + if with_robust_fitting: + aoPairR_buf = np.zeros((max_nao_involved, max_nao_involved, max_ngrid_involved)) + else: + aoPairR_buf = None + + if with_robust_fitting: + V_W_pack_buf = np.zeros((max_nIP_involved, max_ngrid_involved)) + else: + V_W_pack_buf = np.zeros((max_nIP_involved, max_nIP_involved)) + + max_npair = (max_nao_involved * (max_nao_involved + 1)) // 2 + suberi_buf = np.zeros((max_npair, max_npair)) + ddot_res_buf = np.zeros((max_nIP_involved, max_npair)) + + #### involved function #### + + fn_packcol = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol is not None + + fn_unpack_suberi_to_eri = getattr(libisdf, "_unpack_suberi_to_eri", None) + assert fn_unpack_suberi_to_eri is not None + + fn_pack_aoR_to_aoPairR = getattr(libisdf, "_pack_aoR_to_aoPairR_same", None) + assert fn_pack_aoR_to_aoPairR is not None + + ### V_R term ### + + V_R = mydf.V_R + + if with_robust_fitting: + + for partition_i in range(natm): + + aoRg_i = moRg[partition_i] + ao_involved_i = aoRg_i.ao_involved + nao_i = aoRg_i.aoR.shape[0] + global_IP_begin_i = aoRg_i.global_gridID_begin + nIP_i = aoRg_i.aoR.shape[1] + nPair_i = (nao_i * (nao_i + 1)) // 2 + aoPair_i = 
np.ndarray((nPair_i, nIP_i), dtype=np.float64, buffer=aoPairRg_buf) + + fn_pack_aoR_to_aoPairR( + aoRg_i.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(nIP_i) + ) + + for partition_j in range(natm): + + aoR_j = moR[partition_j] + ao_involved_j = aoR_j.ao_involved + nao_j = aoR_j.aoR.shape[0] + global_IP_begin_j = aoR_j.global_gridID_begin + ngrid_j = aoR_j.aoR.shape[1] + nPair_j = (nao_j * (nao_j + 1)) // 2 + aoPair_j = np.ndarray((nPair_j, ngrid_j), dtype=np.float64, buffer=aoPairR_buf) + + fn_pack_aoR_to_aoPairR( + aoR_j.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_j.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_j), + ctypes.c_int(ngrid_j) + ) + + V_packed = np.ndarray((nIP_i, ngrid_j), dtype=np.float64, buffer=V_W_pack_buf) + + fn_packcol( + V_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(ngrid_j), + V_R[global_IP_begin_i:global_IP_begin_i+nIP_i, :].ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(V_R.shape[1]), + ctypes.c_int(global_IP_begin_j), + ctypes.c_int(global_IP_begin_j+ngrid_j) + ) + + ddot_res = np.ndarray((nIP_i, nPair_j), dtype=np.float64, buffer=ddot_res_buf) + lib.ddot(V_packed, aoPair_j.T, c=ddot_res) + sub_eri = np.ndarray((nPair_i, nPair_j), dtype=np.float64, buffer=suberi_buf) + lib.ddot(aoPair_i, ddot_res, c=sub_eri) + + transpose = 1 + fn_unpack_suberi_to_eri( + eri.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nmo), + sub_eri.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ao_involved_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_j), + ao_involved_j.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(transpose) + ) + + ### W term ### + + W = mydf.W + + for partition_i in range(natm): + + aoRg_i = moRg[partition_i] + ao_involved_i = aoRg_i.ao_involved + nao_i = aoRg_i.aoR.shape[0] + global_IP_begin_i = aoRg_i.global_gridID_begin + nIP_i = aoRg_i.aoR.shape[1] + nPair_i = (nao_i * (nao_i + 1)) // 2 + aoPair_i = np.ndarray((nPair_i, nIP_i), dtype=np.float64, buffer=aoPairRg_buf) + + fn_pack_aoR_to_aoPairR( + aoRg_i.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(nIP_i) + ) + + for partition_j in range(partition_i+1): + + aoRg_j = moRg[partition_j] + ao_involved_j = aoRg_j.ao_involved + nao_j = aoRg_j.aoR.shape[0] + global_IP_begin_j = aoRg_j.global_gridID_begin + nIP_j = aoRg_j.aoR.shape[1] + nPair_j = (nao_j * (nao_j + 1)) // 2 + aoPair_j = np.ndarray((nPair_j, nIP_j), dtype=np.float64, buffer=aoPairRg_buf2) + + fn_pack_aoR_to_aoPairR( + aoRg_j.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_j.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_j), + ctypes.c_int(nIP_j) + ) + + ## pack_W ## + + W_packed = np.ndarray((nIP_i, nIP_j), dtype=np.float64, buffer=V_W_pack_buf) + + fn_packcol( + W_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(nIP_j), + W[global_IP_begin_i:global_IP_begin_i+nIP_i, :].ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(W.shape[1]), + ctypes.c_int(global_IP_begin_j), + ctypes.c_int(global_IP_begin_j+nIP_j) + ) + + ddot_res = np.ndarray((nIP_i, nPair_j), dtype=np.float64, buffer=ddot_res_buf) + lib.ddot(W_packed, aoPair_j.T, c=ddot_res) + sub_eri = np.ndarray((nPair_i, nPair_j), dtype=np.float64, buffer=suberi_buf) + + alpha = 1 + if with_robust_fitting: + alpha = -1 + lib.ddot(aoPair_i, ddot_res, c=sub_eri, alpha=alpha) + + transpose = 1 + if partition_i == partition_j: + transpose 
= 0 + + fn_unpack_suberi_to_eri( + eri.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nmo), + sub_eri.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ao_involved_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_j), + ao_involved_j.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(transpose) + ) + + ### del buf ### + + # assert np.allclose(eri, eri.T) + + del aoPairRg_buf + del aoPairRg_buf2 + del aoPairR_buf + + return eri * ngrid / vol + +def isdf_eri_2(mydf, mo_coeff = None, verbose=None): + + """ + Perform AO2MO transformation from ISDF with robust fitting with s4 symmetry + Locality is supported if explored! + + Args: + mydf : + mo_coeff : Molecular orbital coefficients. + + Returns: + eri : MO-ERI with s4 symmetry. + + NOTE: + + For small eri case + + """ + + #### basic info #### + + assert mo_coeff is not None + + direct = mydf.direct + if direct is True: + raise NotImplementedError("direct is not supported in isdf_eri_robust") + with_robust_fitting = mydf.with_robust_fitting + + nao = mydf.cell.nao + naux = mydf.naux + vol = mydf.cell.vol + ngrid = np.prod(mydf.cell.mesh) + natm = mydf.cell.natm + + if mo_coeff is not None: + assert mo_coeff.shape[0] == nao + nmo = mo_coeff.shape[1] + else: + nmo = nao + + size = nmo * (nmo + 1) // 2 + eri = numpy.zeros((size, size)) + + aoR = mydf.aoR + aoRg = mydf.aoRg + assert isinstance(aoR, list) + assert isinstance(aoRg, list) + + if mo_coeff is not None: + + moR = [] + moRg = [] + + for i in range(natm): + + if with_robust_fitting: + ao_involved = aoR[i].ao_involved + mo_coeff_packed = mo_coeff[ao_involved,:].copy() + _moR = lib.ddot(mo_coeff_packed.T, aoR[i].aoR) + mo_involved = np.arange(nmo) + moR.append( + aoR_Holder( + aoR = _moR, + ao_involved = mo_involved, + local_gridID_begin = aoR[i].local_gridID_begin, + local_gridID_end = aoR[i].local_gridID_end, + global_gridID_begin = aoR[i].global_gridID_begin, + global_gridID_end = aoR[i].global_gridID_end) + ) + else: + moR.append(None) + + ao_involved = aoRg[i].ao_involved + mo_coeff_packed = mo_coeff[ao_involved,:].copy() + _moRg = lib.ddot(mo_coeff_packed.T, aoRg[i].aoR) + mo_involved = np.arange(nmo) + moRg.append( + aoR_Holder( + aoR = _moRg, + ao_involved = mo_involved, + local_gridID_begin = aoRg[i].local_gridID_begin, + local_gridID_end = aoRg[i].local_gridID_end, + global_gridID_begin = aoRg[i].global_gridID_begin, + global_gridID_end = aoRg[i].global_gridID_end) + ) + else: + moR = aoR + moRg = aoRg + + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in moR if aoR_holder is not None]) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in moR if aoR_holder is not None]) + max_nIP_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in moRg if aoR_holder is not None]) + + ###### loop over basic info to allocate the buf ###### + + #max_npair = (max_nao_involved * (max_nao_involved + 1)) // 2 + #ddot_res_buf = np.zeros((max_nIP_involved, max_npair)) + max_npair = nmo * (nmo + 1) // 2 + npair = max_npair + suberi = np.zeros((npair, npair)) + ddot_res_buf = np.zeros((naux, npair)) + + aoPairRg_buf = np.zeros((nmo, nmo, max_nIP_involved)) + #aoPairRg_buf2 = np.zeros((max_nao_involved, max_nao_involved, max_nIP_involved)) + aoPairRg = np.zeros((npair, naux)) + + if with_robust_fitting: + aoPairR_buf = np.zeros((nmo, nmo, max_ngrid_involved)) + aoPairR = np.zeros((npair, ngrid)) + else: + aoPairR_buf = None + + #### involved function #### + + fn_packcol = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol is not None + + 
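+    # NOTE (behaviour of the C helpers, inferred from their call sites in
+    # this file; see libisdf for the actual implementations):
+    #   _buildK_packcol2           copy a contiguous column block [b, e) of a
+    #                              row-major matrix into a packed buffer
+    #   _unpack_suberi_to_eri      scatter a (nPair_i, nPair_j) sub-ERI block
+    #                              into the full s4-packed ERI via the
+    #                              involved-AO index maps
+    #   _pack_aoR_to_aoPairR_same  form the packed pair values
+    #                              phi_i(r)*phi_j(r), i >= j, on a grid block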
fn_unpack_suberi_to_eri = getattr(libisdf, "_unpack_suberi_to_eri", None) + assert fn_unpack_suberi_to_eri is not None + + fn_pack_aoR_to_aoPairR = getattr(libisdf, "_pack_aoR_to_aoPairR_same", None) + assert fn_pack_aoR_to_aoPairR is not None + + ### construct aoPairRg, aoPairR ### + + for partition_i in range(natm): + + aoRg_i = moRg[partition_i] + ao_involved_i = aoRg_i.ao_involved + nao_i = aoRg_i.aoR.shape[0] + global_IP_begin_i = aoRg_i.global_gridID_begin + nIP_i = aoRg_i.aoR.shape[1] + nPair_i = (nao_i * (nao_i + 1)) // 2 + assert nPair_i == npair + aoPair_i = np.ndarray((nPair_i, nIP_i), dtype=np.float64, buffer=aoPairRg_buf) + + fn_pack_aoR_to_aoPairR( + aoRg_i.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(nIP_i) + ) + + aoPairRg[:, global_IP_begin_i:global_IP_begin_i+nIP_i] = aoPair_i + + if with_robust_fitting: + + aoR_i = moR[partition_i] + ao_involved_i = aoR_i.ao_involved + nao_i = aoR_i.aoR.shape[0] + global_IP_begin_i = aoR_i.global_gridID_begin + ngrid_i = aoR_i.aoR.shape[1] + nPair_i = (nao_i * (nao_i + 1)) // 2 + assert nPair_i == npair + aoPair_i = np.ndarray((nPair_i, ngrid_i), dtype=np.float64, buffer=aoPairR_buf) + + fn_pack_aoR_to_aoPairR( + aoR_i.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(ngrid_i) + ) + + aoPairR[:, global_IP_begin_i:global_IP_begin_i+ngrid_i] = aoPair_i + + ### V_R term ### + + V_R = mydf.V_R + + if with_robust_fitting: + + lib.ddot(V_R, aoPairR.T, c=ddot_res_buf) + lib.ddot(aoPairRg, ddot_res_buf, c=suberi) + eri += suberi + eri += suberi.T + + ### W term ### + + W = mydf.W + + lib.ddot(W, aoPairRg.T, c=ddot_res_buf) + lib.ddot(aoPairRg, ddot_res_buf, c=suberi) + if with_robust_fitting: + eri -= suberi + else: + eri += suberi + + ### del buf ### + + # assert np.allclose(eri, eri.T) + + del aoPairRg_buf + #del aoPairRg_buf2 + del aoPairR_buf + del aoPairRg + del aoPairR + + return eri * ngrid / vol + +def isdf_eri_ovov(mydf, mo_coeff_o: np.ndarray = None, mo_coeff_v: np.ndarray = None, verbose=None): + + """ + Perform AO2MO transformation from ISDF for specific orbital types (ovov), for MP2 calculation + Locality is supported if explored! + + Args: + mydf : ISDF objects. 
+ mo_coeff_o : Molecular orbital coefficients for occupied orbitals + mo_coeff_v : Molecular orbital coefficients for virtual orbitals + + Return: + eri : ovov part of MO-ERI + + """ + + #### basic info #### + + direct = mydf.direct + if direct is True: + raise NotImplementedError("direct is not supported in isdf_eri_robust") + with_robust_fitting = mydf.with_robust_fitting + + nao = mydf.cell.nao + naux = mydf.naux + vol = mydf.cell.vol + ngrid = np.prod(mydf.cell.mesh) + natm = mydf.cell.natm + + nao_o = mo_coeff_o.shape[1] + nao_v = mo_coeff_v.shape[1] + + size = nao_o * nao_v + eri = numpy.zeros((size, size)) + + aoR = mydf.aoR + aoRg = mydf.aoRg + assert isinstance(aoR, list) + assert isinstance(aoRg, list) + + ############ transformation of moRg/moR ############ + + moR_o = [] + moRg_o = [] + + moR_v = [] + moRg_v = [] + + for i in range(natm): + + if with_robust_fitting: + ao_involved = aoR[i].ao_involved + mo_coeff_packed = mo_coeff_o[ao_involved,:].copy() + _moR = lib.ddot(mo_coeff_packed.T, aoR[i].aoR) + mo_involved = np.arange(nao_o) + moR_o.append( + aoR_Holder( + aoR = _moR, + ao_involved = mo_involved, + local_gridID_begin = aoR[i].local_gridID_begin, + local_gridID_end = aoR[i].local_gridID_end, + global_gridID_begin = aoR[i].global_gridID_begin, + global_gridID_end = aoR[i].global_gridID_end) + ) + + mo_coeff_packed = mo_coeff_v[ao_involved,:].copy() + _moR = lib.ddot(mo_coeff_packed.T, aoR[i].aoR) + mo_involved = np.arange(nao_v) + moR_v.append( + aoR_Holder( + aoR = _moR, + ao_involved = mo_involved, + local_gridID_begin = aoR[i].local_gridID_begin, + local_gridID_end = aoR[i].local_gridID_end, + global_gridID_begin = aoR[i].global_gridID_begin, + global_gridID_end = aoR[i].global_gridID_end) + ) + + else: + moR_o.append(None) + moR_v.append(None) + + ao_involved = aoRg[i].ao_involved + mo_coeff_packed = mo_coeff_o[ao_involved,:].copy() + _moRg = lib.ddot(mo_coeff_packed.T, aoRg[i].aoR) + mo_involved = np.arange(nao_o) + moRg_o.append( + aoR_Holder( + aoR = _moRg, + ao_involved = mo_involved, + local_gridID_begin = aoRg[i].local_gridID_begin, + local_gridID_end = aoRg[i].local_gridID_end, + global_gridID_begin = aoRg[i].global_gridID_begin, + global_gridID_end = aoRg[i].global_gridID_end) + ) + + mo_coeff_packed = mo_coeff_v[ao_involved,:].copy() + _moRg = lib.ddot(mo_coeff_packed.T, aoRg[i].aoR) + mo_involved = np.arange(nao_v) + moRg_v.append( + aoR_Holder( + aoR = _moRg, + ao_involved = mo_involved, + local_gridID_begin = aoRg[i].local_gridID_begin, + local_gridID_end = aoRg[i].local_gridID_end, + global_gridID_begin = aoRg[i].global_gridID_begin, + global_gridID_end = aoRg[i].global_gridID_end) + ) + + ######################################################## + + max_nao_involved = max(nao_o, nao_v) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in moR_o if aoR_holder is not None]) + max_nIP_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in moRg_o if aoR_holder is not None]) + + ###### loop over basic info to allocate the buf ###### + + aoPairRg_buf = np.zeros((nao_o, nao_v, max_nIP_involved)) + aoPairRg_buf2 = np.zeros((nao_o, nao_v, max_nIP_involved)) + if with_robust_fitting: + aoPairR_buf = np.zeros((nao_o, nao_v, max_ngrid_involved)) + else: + aoPairR_buf = None + + if with_robust_fitting: + V_W_pack_buf = np.zeros((max_nIP_involved, max_ngrid_involved)) + else: + V_W_pack_buf = np.zeros((max_nIP_involved, max_nIP_involved)) + + max_npair = nao_o * nao_v + suberi_buf = np.zeros((max_npair, max_npair)) + ddot_res_buf = 
np.zeros((max_nIP_involved, max_npair)) + + #### involved function #### + + fn_packcol = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol is not None + + fn_unpack_suberi_to_eri = getattr(libisdf, "_unpack_suberi_to_eri_ovov", None) + assert fn_unpack_suberi_to_eri is not None + + fn_pack_aoR_to_aoPairR = getattr(libisdf, "_pack_aoR_to_aoPairR_diff", None) + assert fn_pack_aoR_to_aoPairR is not None + + ### V_R term ### + + V_R = mydf.V_R + + if with_robust_fitting: + + for partition_i in range(natm): + + aoRg_i_o = moRg_o[partition_i] + nocc_i = aoRg_i_o.aoR.shape[0] + + aoRg_i_v = moRg_v[partition_i] + nvir_i = aoRg_i_v.aoR.shape[0] + + global_IP_begin_i = aoRg_i_o.global_gridID_begin + nIP_i = aoRg_i_o.aoR.shape[1] + + nPair_i = nocc_i * nvir_i + aoPair_i = np.ndarray((nPair_i, nIP_i), dtype=np.float64, buffer=aoPairRg_buf) + + fn_pack_aoR_to_aoPairR( + aoRg_i_o.aoR.ctypes.data_as(ctypes.c_void_p), + aoRg_i_v.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc_i), + ctypes.c_int(nvir_i), + ctypes.c_int(nIP_i) + ) + + for partition_j in range(natm): + + aoR_j_o = moR_o[partition_j] + nocc_j = aoR_j_o.aoR.shape[0] + + aoR_j_v = moR_v[partition_j] + nvir_j = aoR_j_v.aoR.shape[0] + + global_IP_begin_j = aoR_j_o.global_gridID_begin + ngrid_j = aoR_j_o.aoR.shape[1] + + nPair_j = nocc_j * nvir_j + aoPair_j = np.ndarray((nPair_j, ngrid_j), dtype=np.float64, buffer=aoPairR_buf) + + fn_pack_aoR_to_aoPairR( + aoR_j_o.aoR.ctypes.data_as(ctypes.c_void_p), + aoR_j_v.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_j.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc_j), + ctypes.c_int(nvir_j), + ctypes.c_int(ngrid_j) + ) + + V_packed = np.ndarray((nIP_i, ngrid_j), dtype=np.float64, buffer=V_W_pack_buf) + + fn_packcol( + V_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(ngrid_j), + V_R[global_IP_begin_i:global_IP_begin_i+nIP_i, :].ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(V_R.shape[1]), + ctypes.c_int(global_IP_begin_j), + ctypes.c_int(global_IP_begin_j+ngrid_j) + ) + + ddot_res = np.ndarray((nIP_i, nPair_j), dtype=np.float64, buffer=ddot_res_buf) + lib.ddot(V_packed, aoPair_j.T, c=ddot_res) + sub_eri = np.ndarray((nPair_i, nPair_j), dtype=np.float64, buffer=suberi_buf) + lib.ddot(aoPair_i, ddot_res, c=sub_eri) + + assert nPair_i == nPair_j == (nao_o * nao_v) + + transpose = 1 + fn_unpack_suberi_to_eri( + eri.ctypes.data_as(ctypes.c_void_p), + sub_eri.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nPair_i), + ctypes.c_int(transpose) + ) + + ### W term ### + + W = mydf.W + + for partition_i in range(natm): + + aoRg_i_o = moRg_o[partition_i] + nocc_i = aoRg_i_o.aoR.shape[0] + + aoRg_i_v = moRg_v[partition_i] + nvir_i = aoRg_i_v.aoR.shape[0] + + global_IP_begin_i = aoRg_i_o.global_gridID_begin + nIP_i = aoRg_i_o.aoR.shape[1] + + nPair_i = nocc_i * nvir_i + aoPair_i = np.ndarray((nPair_i, nIP_i), dtype=np.float64, buffer=aoPairRg_buf) + + fn_pack_aoR_to_aoPairR( + aoRg_i_o.aoR.ctypes.data_as(ctypes.c_void_p), + aoRg_i_v.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc_i), + ctypes.c_int(nvir_i), + ctypes.c_int(nIP_i) + ) + + for partition_j in range(partition_i+1): + + aoRg_j_o = moRg_o[partition_j] + nocc_j = aoRg_j_o.aoR.shape[0] + + aoRg_j_v = moRg_v[partition_j] + nvir_j = aoRg_j_v.aoR.shape[0] + + global_IP_begin_j = aoRg_j_o.global_gridID_begin + nIP_j = aoRg_j_o.aoR.shape[1] + + nPair_j = nocc_j * nvir_j + aoPair_j = 
np.ndarray((nPair_j, nIP_j), dtype=np.float64, buffer=aoPairRg_buf2) + + fn_pack_aoR_to_aoPairR( + aoRg_j_o.aoR.ctypes.data_as(ctypes.c_void_p), + aoRg_j_v.aoR.ctypes.data_as(ctypes.c_void_p), + aoPair_j.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc_j), + ctypes.c_int(nvir_j), + ctypes.c_int(nIP_j) + ) + + ## pack_W ## + + W_packed = np.ndarray((nIP_i, nIP_j), dtype=np.float64, buffer=V_W_pack_buf) + + fn_packcol( + W_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(nIP_j), + W[global_IP_begin_i:global_IP_begin_i+nIP_i, :].ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_i), + ctypes.c_int(W.shape[1]), + ctypes.c_int(global_IP_begin_j), + ctypes.c_int(global_IP_begin_j+nIP_j) + ) + + ddot_res = np.ndarray((nIP_i, nPair_j), dtype=np.float64, buffer=ddot_res_buf) + lib.ddot(W_packed, aoPair_j.T, c=ddot_res) + sub_eri = np.ndarray((nPair_i, nPair_j), dtype=np.float64, buffer=suberi_buf) + + assert nPair_i == nPair_j == (nao_o * nao_v) + + alpha = 1 + if with_robust_fitting: + alpha = -1 + lib.ddot(aoPair_i, ddot_res, c=sub_eri, alpha=alpha) + + transpose = 1 + if partition_i == partition_j: + transpose = 0 + + fn_unpack_suberi_to_eri( + eri.ctypes.data_as(ctypes.c_void_p), + sub_eri.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nPair_i), + ctypes.c_int(transpose) + ) + + ### del buf ### + + assert np.allclose(eri, eri.T) + + del aoPairRg_buf + del aoPairRg_buf2 + del aoPairR_buf + + return eri.reshape(nao_o, nao_v, nao_o, nao_v) * ngrid / vol + +def get_eri(mydf, kpts=None, + compact=getattr(__config__, 'pbc_df_ao2mo_get_eri_compact', True)): + + cell = mydf.cell + nao = cell.nao_nr() + kptijkl = _format_kpts(kpts) + if not _iskconserv(cell, kptijkl): + lib.logger.warn(cell, 'isdf_ao2mo: momentum conservation not found in ' + 'the given k-points %s', kptijkl) + return numpy.zeros((nao,nao,nao,nao)) + + # kpti, kptj, kptk, kptl = kptijkl + # q = kptj - kpti + # coulG = tools.get_coulG(cell, q, mesh=mydf.mesh) + # coords = cell.gen_uniform_grids(mydf.mesh) + # max_memory = mydf.max_memory - lib.current_memory()[0] + +#################### + +# gamma point, the integral is real and with s4 symmetry + + if gamma_point(kptijkl): + + eri = isdf_eri(mydf, verbose=mydf.cell.verbose) + + if compact: + return eri + else: + return ao2mo.restore(1, eri, nao) + +#################### +# aosym = s1, complex integrals + + else: + raise NotImplementedError + + +def general(mydf, mo_coeffs, kpts=None, + compact=getattr(__config__, 'pbc_df_ao2mo_general_compact', True)): + '''General MO integral transformation''' + + from pyscf.pbc.df.df_ao2mo import warn_pbc2d_eri + warn_pbc2d_eri(mydf) + cell = mydf.cell + nao = cell.nao_nr() + kptijkl = _format_kpts(kpts) + kpti, kptj, kptk, kptl = kptijkl + if isinstance(mo_coeffs, numpy.ndarray) and mo_coeffs.ndim == 2: + mo_coeffs = (mo_coeffs,) * 4 + mo_coeffs = [numpy.asarray(mo, order='F') for mo in mo_coeffs] + if not _iskconserv(cell, kptijkl): + lib.logger.warn(cell, 'fft_ao2mo: momentum conservation not found in ' + 'the given k-points %s', kptijkl) + return numpy.zeros([mo.shape[1] for mo in mo_coeffs]) + + allreal = not any(numpy.iscomplexobj(mo) for mo in mo_coeffs) + q = kptj - kpti + # coulG = tools.get_coulG(cell, q, mesh=mydf.mesh) + # coords = cell.gen_uniform_grids(mydf.mesh) + max_memory = mydf.max_memory - lib.current_memory()[0] + + if hasattr(mydf, "W2") or (hasattr(mydf, "force_LS_THC") and mydf.force_LS_THC == True): # NOTE: this means that LS_THC_recompression is called, we do not perform ao2mo with robust 
fitting, as it is very expensive! + #print("use_LS_THC_anyway") + use_LS_THC_anyway = True + else: + #print("no_use_LS_THC_anyway") + use_LS_THC_anyway = False + + IsMOERI = (iden_coeffs(mo_coeffs[0], mo_coeffs[1]) and + iden_coeffs(mo_coeffs[0], mo_coeffs[2]) and + iden_coeffs(mo_coeffs[0], mo_coeffs[3])) + if not IsMOERI: + IsOVOV = False + IsGeneral = False + else: + IsOVOV = (iden_coeffs(mo_coeffs[0], mo_coeffs[2]) and + iden_coeffs(mo_coeffs[1], mo_coeffs[3])) + if IsOVOV: + IsGeneral = False + else: + IsGeneral = True + + if gamma_point(kptijkl) and allreal: + + ##### check whether LS-THC anyway ##### + + if use_LS_THC_anyway: + + vol = mydf.cell.vol + ngrid = np.prod(mydf.cell.mesh) + + if hasattr(mydf, "W2"): + eri = LS_THC_moeri(mydf, mydf.W2, mydf.aoRg2, mo_coeffs) * ngrid / vol + else: + eri = LS_THC_moeri(mydf, mydf.W, mydf.aoRg, mo_coeffs) * ngrid / vol + if compact: + if IsMOERI: + return ao2mo.restore(4, eri, nao) + else: + return eri + else: + return eri + + if ((iden_coeffs(mo_coeffs[0], mo_coeffs[1]) and + iden_coeffs(mo_coeffs[0], mo_coeffs[2]) and + iden_coeffs(mo_coeffs[0], mo_coeffs[3]))): + + #### Full MO-ERI #### + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + eri = isdf_eri(mydf, mo_coeffs[0].copy(), verbose=mydf.cell.verbose) + # eri = isdf_eri_2(mydf, mo_coeffs[0].copy(), verbose=mydf.cell.verbose) # requires aoPairR, which is very expensive + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + _benchmark_time(t1, t2, 'isdf_eri', mydf) + + if compact: + return eri + else: + return ao2mo.restore(1, eri, nao) + else: + + #### ovov MO-ERI #### + + if ((iden_coeffs(mo_coeffs[0], mo_coeffs[2]) and + iden_coeffs(mo_coeffs[1], mo_coeffs[3]))): + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + eri = isdf_eri_ovov(mydf, mo_coeffs[0].copy(), mo_coeffs[1].copy(), verbose=mydf.cell.verbose) + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + _benchmark_time(t1, t2, 'isdf_eri_ovov', mydf) + + if compact: + print("compact is not supported in general with ov ov mode") + return eri + else: + return eri + + else: + raise NotImplementedError + + else: + raise NotImplementedError + + return + +def ao2mo_7d(mydf, mo_coeff_kpts, kpts=None, factor=1, out=None): + raise NotImplementedError + +############ subroutines ---- LS-THC ############ + +def LS_THC(mydf, R:np.ndarray): + ''' + Least-Square Tensorhypercontraction decomposition of ERI. + Given an R matrix, compute the Z matrix such that the electron repulsion integral (ERI) can be expressed as eri ~ R R Z R R. + Supports both ISDF w./w.o. robust fitting. + + Args: + mydf : ISDF objects. + R : A matrix used in the computation of the ERI. + + Returns: + Z : eri = R R Z R R. + + Ref: + (1) Martinez2012: Parrish, Hohenstein, Martinez and Sherill. J. Chem. Phys. 
137, 224106 (2012), DOI: https://doi.org/10.1063/1.4768233 + + ''' + + log = lib.logger.Logger(mydf.stdout, mydf.verbose) + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + nGrid_R = R.shape[1] + nao = R.shape[0] + + assert nao == mydf.cell.nao + + ngrid = np.prod(mydf.cell.mesh) + nIP = mydf.naux + naux = mydf.naux + vol = mydf.cell.vol + natm = mydf.cell.natm + + Z = np.zeros((nGrid_R, nGrid_R)) + + #### step 1 construct #### + + RR = lib.ddot(R.T, R) + lib_isdf.square_inPlace(RR) + + # diag RR # + + D_RR, U_RR = scipy.linalg.eigh(RR) + D_RR_inv = (1.0/D_RR).copy() + + ## for debug ## + + log.debug4("***** LS_THC ***** ") + log.debug4("max D_RR = %f", np.max(D_RR)) + log.debug4("min D_RR = %f", np.min(D_RR)) + log.debug4("condition number = %f", np.max(D_RR)/np.min(D_RR)) + + #### step 2 construct R R ERI R R with O(N^3) cost #### + + # build (RX)^{PA} = \sum_mu R_mu^P X_\mu^A with X = aoRg # + + RX = np.zeros((nGrid_R, nIP)) + + aoRg = mydf.aoRg + + if isinstance(aoRg, np.ndarray): + + RX = lib.ddot(R.T, aoRg) + + else: + + assert isinstance(aoRg, list) + + for partition_i in range(natm): + + aoRg_i = aoRg[partition_i] + ao_involved_i = aoRg_i.ao_involved + nao_i = aoRg_i.aoR.shape[0] + global_IP_begin_i = aoRg_i.global_gridID_begin + nIP_i = aoRg_i.aoR.shape[1] + + R_packed = R[ao_involved_i,:].copy() + RX_tmp = lib.ddot(R_packed.T, aoRg_i.aoR) + + RX[:,global_IP_begin_i:global_IP_begin_i+nIP_i] = RX_tmp + + RX = lib_isdf.square_inPlace(RX) + + # build (RY)^{PB} = \sum_mu R_mu^P Y_\mu^B with Y = aoR # + + if mydf.with_robust_fitting: + + if isinstance(mydf.aoR, np.ndarray): + + RY = lib.ddot(R.T, mydf.aoR) + + else: + + assert isinstance(mydf.aoR, list) + + aoR = mydf.aoR + RY = np.zeros((nGrid_R, ngrid)) + for partition_i in range(natm): + + aoR_i = aoR[partition_i] + ao_involved_i = aoR_i.ao_involved + nao_i = aoR_i.aoR.shape[0] + global_gridID_i = aoR_i.global_gridID_begin + ngrid_i = aoR_i.aoR.shape[1] + + R_packed = R[ao_involved_i,:].copy() + RY_tmp = lib.ddot(R_packed.T, aoR_i.aoR) + + RY[:,global_gridID_i:global_gridID_i+ngrid_i] = RY_tmp + + RY = lib_isdf.square_inPlace(RY) + else: + RY = None + + #### V term #### + + with_robust_fitting = mydf.with_robust_fitting + + if with_robust_fitting: + V_R = mydf.V_R + Z_tmp1 = lib.ddot(V_R, RY.T) + lib.ddot(RX, Z_tmp1, c=Z) + Z += Z.T + del Z_tmp1 + + #### W term #### + + W = mydf.W + Z_tmp2 = lib.ddot(W, RX.T) + if with_robust_fitting: + lib.ddot(RX, Z_tmp2, c=Z, alpha=-1, beta=1) + else: + lib.ddot(RX, Z_tmp2, c=Z) + del Z_tmp2 + + Z1 = lib.ddot(U_RR.T, Z) + Z2 = lib.ddot(Z1, U_RR, c=Z) + Z = Z2 + + lib_isdf.d_i_ij_ij(D_RR_inv, Z, out=Z) + lib_isdf.d_ij_j_ij(Z, D_RR_inv, out=Z) + lib.ddot(U_RR, Z, c=Z1) + lib.ddot(Z1, U_RR.T, c=Z) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + log.timer('LS_THC fitting', *t1) + + return Z * ngrid / vol + +def LS_THC_eri(Z:np.ndarray, R:np.ndarray): + + einsum_str = "iP,jP,PQ,kQ,lQ->ijkl" + + path_info = np.einsum_path(einsum_str, R,R,Z,R,R, optimize='optimal') + + return np.einsum(einsum_str,R,R,Z,R,R,optimize=path_info[0]) + +def LS_THC_moeri(mydf, Z:np.ndarray, R:np.ndarray, mo_coeff:np.ndarray): + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + assert len(mo_coeff) == 4 + moRg = [lib.ddot(x.T, R) for x in mo_coeff] + einsum_str = "iP,jP,PQ,kQ,lQ->ijkl" + path_info = np.einsum_path(einsum_str, moRg[0], moRg[1], Z, moRg[2], moRg[3], optimize='optimal') + res = np.einsum(einsum_str, moRg[0], moRg[1], Z, moRg[2], moRg[3], optimize=path_info[0]) 
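+    # The contraction above evaluates, for MO coefficient sets C1..C4
+    # (moRg[i] = Ci.T @ R),
+    #   (pq|rs) = \sum_{PQ} (C1.T R)_{pP} (C2.T R)_{qP} Z_{PQ} (C3.T R)_{rQ} (C4.T R)_{sQ}
+    # i.e. the THC factorization eri ~ R R Z R R projected into the MO basis.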
+ log = lib.logger.Logger(mydf.stdout, mydf.verbose) + log.timer('LS_THC MOERI', *t1) + return res \ No newline at end of file diff --git a/pyscf/isdf/isdf_eval_gto.py b/pyscf/isdf/isdf_eval_gto.py new file mode 100644 index 000000000..2b6d2b568 --- /dev/null +++ b/pyscf/isdf/isdf_eval_gto.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# + +import ctypes +import numpy +from pyscf import lib +from pyscf.gto import moleintor +from pyscf.gto.eval_gto import _get_intor_and_comp, BLKSIZE +from pyscf.pbc.gto import _pbcintor +from pyscf import __config__ + +EXTRA_PREC = getattr(__config__, 'pbc_gto_eval_gto_extra_precision', 1e-2) + +libpbc = _pbcintor.libpbc +libisdf = lib.load_library('libisdf') + +def z2d_InPlace(z): + '''Convert complex array to double array in-place''' + assert(z.dtype == numpy.complex128) + + fn = getattr(libisdf, "NPz2d_InPlace") + assert(fn is not None) + fn(z.ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(z.size)) + z_real = numpy.ndarray(shape=z.shape, dtype=numpy.double, buffer=z) + return z_real + +def _estimate_rcut(cell): + '''Cutoff raidus, above which each shell decays to a value less than the + required precsion''' + log_prec = numpy.log(cell.precision * EXTRA_PREC) + rcut = [] + for ib in range(cell.nbas): + l = cell.bas_angular(ib) + es = cell.bas_exp(ib) + cs = abs(cell.bas_ctr_coeff(ib)).max(axis=1) + r = 5. + r = (((l+2)*numpy.log(r)+numpy.log(cs) - log_prec) / es)**.5 + r[r < 1.] = 1. + r = (((l+2)*numpy.log(r)+numpy.log(cs) - log_prec) / es)**.5 + rcut.append(r.max()) + return numpy.array(rcut) + +def ISDF_eval_gto(cell, eval_name=None, coords=None, comp=None, kpts=numpy.zeros((1,3)), kpt=None, + shls_slice=None, non0tab=None, ao_loc=None, cutoff=None, + out=None, Ls=None, rcut=None): + r'''Evaluate PBC-AO function value on the given grids, + + Args: + eval_name : str + + ========================== ======================= + Function Expression + ========================== ======================= + "GTOval_sph" \sum_T exp(ik*T) |AO> + "GTOval_ip_sph" nabla \sum_T exp(ik*T) |AO> + "GTOval_cart" \sum_T exp(ik*T) |AO> + "GTOval_ip_cart" nabla \sum_T exp(ik*T) |AO> + ========================== ======================= + + atm : int32 ndarray + libcint integral function argument + bas : int32 ndarray + libcint integral function argument + env : float64 ndarray + libcint integral function argument + + coords : 2D array, shape (N,3) + The coordinates of the grids. + + Kwargs: + shls_slice : 2-element list + (shl_start, shl_end). + If given, only part of AOs (shl_start <= shell_id < shl_end) are + evaluated. By default, all shells defined in cell will be evaluated. + non0tab : 2D bool array + mask array to indicate whether the AO values are zero. The mask + array can be obtained by calling :func:`dft.gen_grid.make_mask` + cutoff : float + AO values smaller than cutoff will be set to zero. 
The default + cutoff threshold is ~1e-22 (defined in gto/grid_ao_drv.h) + out : ndarray + If provided, results are written into this array. + + Returns: + A list of 2D (or 3D) arrays to hold the AO values on grids. + + WARNING : only support gamma point calculation !!!! + + ''' + + if eval_name is None: + if cell.cart: + eval_name = 'GTOval_cart_deriv%d' % 0 + else: + eval_name = 'GTOval_sph_deriv%d' % 0 + + if eval_name[:3] == 'PBC': # PBCGTOval_xxx + eval_name, comp = _get_intor_and_comp(cell, eval_name[3:], comp) + else: + eval_name, comp = _get_intor_and_comp(cell, eval_name, comp) + eval_name = 'PBC' + eval_name + + assert comp == 1 + + atm = numpy.asarray(cell._atm, dtype=numpy.int32, order='C') + bas = numpy.asarray(cell._bas, dtype=numpy.int32, order='C') + env = numpy.asarray(cell._env, dtype=numpy.double, order='C') + natm = atm.shape[0] + nbas = bas.shape[0] + if kpts is None: + if kpt is not None: + raise RuntimeError('kpt should be a list of k-points') + kpts_lst = numpy.reshape(kpt, (1,3)) + else: + kpts_lst = numpy.zeros((1,3)) + else: + kpts_lst = numpy.reshape(kpts, (-1,3)) + nkpts = len(kpts_lst) + ngrids = len(coords) + + assert kpts_lst.shape[0] == 1 + + # print("kpts_lst = ", kpts_lst) + + if non0tab is None: + non0tab = numpy.empty(((ngrids+BLKSIZE-1)//BLKSIZE, nbas), + dtype=numpy.uint8) +# non0tab stores the number of images to be summed in real space. +# Initializing it to 255 means all images should be included + non0tab[:] = 0xff + + if ao_loc is None: + ao_loc = moleintor.make_loc(bas, eval_name) + if shls_slice is None: + shls_slice = (0, nbas) + sh0, sh1 = shls_slice + nao = ao_loc[sh1] - ao_loc[sh0] + + if out is None: + out = numpy.empty((nkpts,comp,nao,ngrids), dtype=numpy.complex128) # NOTE THE definition of the shape! + else: + # print("out is given") + out = numpy.ndarray((nkpts,comp,nao,ngrids), dtype=numpy.complex128, + buffer=out) + coords = numpy.asarray(coords, order='F') + + # For atoms near the boundary of the cell, it is necessary (even in low- + # dimensional systems) to include lattice translations in all 3 dimensions. + if Ls is None: + if cell.dimension < 2 or cell.low_dim_ft_type == 'inf_vacuum': + Ls = cell.get_lattice_Ls(dimension=cell.dimension) + else: + Ls = cell.get_lattice_Ls(dimension=3) + Ls = Ls[numpy.argsort(lib.norm(Ls, axis=1))] + expLk = numpy.exp(1j * numpy.asarray(numpy.dot(Ls, kpts_lst.T), order='C')) + if rcut is None: + rcut = _estimate_rcut(cell) + + with cell.with_integral_screen(cutoff): + drv = getattr(libpbc, eval_name) + drv(ctypes.c_int(ngrids), + (ctypes.c_int*2)(*shls_slice), ao_loc.ctypes.data_as(ctypes.c_void_p), + Ls.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(len(Ls)), + expLk.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nkpts), + out.ctypes.data_as(ctypes.c_void_p), + coords.ctypes.data_as(ctypes.c_void_p), + rcut.ctypes.data_as(ctypes.c_void_p), + non0tab.ctypes.data_as(ctypes.c_void_p), + atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(natm), + bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nbas), + env.ctypes.data_as(ctypes.c_void_p)) + + out = out[0] + out = z2d_InPlace(out) + return out[0] + + diff --git a/pyscf/isdf/isdf_fast.py b/pyscf/isdf/isdf_fast.py new file mode 100644 index 000000000..0ec0950dc --- /dev/null +++ b/pyscf/isdf/isdf_fast.py @@ -0,0 +1,1218 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Ning Zhang
+#
+
+############ sys module ############
+
+import copy
+import numpy as np
+import ctypes
+
+############ pyscf module ############
+
+from pyscf import lib
+from pyscf.lib import logger
+import pyscf.pbc.gto as pbcgto
+from pyscf.pbc.gto import Cell
+from pyscf.pbc import tools
+from pyscf.pbc.lib.kpts import KPoints
+from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, member
+from pyscf.gto.mole import *
+from pyscf.pbc.dft import multigrid
+
+############ isdf utils ############
+
+from pyscf.isdf.isdf_jk import _benchmark_time
+import pyscf.isdf.isdf_ao2mo as isdf_ao2mo
+import pyscf.isdf.isdf_jk as isdf_jk
+from pyscf.isdf.isdf_eval_gto import ISDF_eval_gto
+from pyscf.isdf.isdf_tools_kSampling import _kmesh_to_Kpoints
+libisdf = lib.load_library('libisdf')
+
+############ global variables ############
+
+BASIS_CUTOFF = 1e-18  # too small a value may lead to numerical instability
+CRITERION_CALL_PARALLEL_QR = 256
+
+############ subroutines --- select IP and build aux basis ############
+
+def _select_IP_direct(mydf, c:int, m:int, first_natm=None, global_IP_selection=True,
+                      aoR_cutoff = None,
+                      rela_cutoff = 0.0,
+                      no_retriction_on_nIP = False,
+                      use_mpi=False):
+    r''' Select the interpolation points (IP) based on the given criteria.
+
+    Args:
+        mydf : object
+            The interpolative separable density fitting (ISDF) object.
+
+        c : int
+            If rela_cutoff is None or 0.0, controls the number of IPs:
+            at most c * nao points are selected.
+
+        m : int
+            Controls the oversampling of the randomized projection: for each
+            atom, naux_now = int(sqrt(c * nao_atm)) + m random projections
+            are used when screening candidate points.
+
+        rela_cutoff : float
+            The relative cutoff value for IP selection.
+            IPs with values smaller than rela_cutoff * max_QR_value will not be selected.
+            Default is 0.0 (no control via QR values).
+
+    Kwargs:
+        first_natm : int
+            The number of atoms to be considered for IP selection.
+            If not given, all atoms will be considered.
+            It should only be set by the ISDF class with k-sampling, where
+            first_natm is the number of atoms in the first (primitive) cell.
+
+        global_IP_selection : bool
+            Whether to perform global IP selection.
+            If True, IPs will be re-selected after the individual selection of each atom.
+            Default is True.
+
+        aoR_cutoff : float
+            The cutoff value for AO values.
+            Points whose max AO value is smaller than this cutoff will not be
+            considered for IP selection. Default is None.
+
+        no_retriction_on_nIP : bool
+            Whether to remove the restriction on the number of IPs.
+            If True, there will be no limit on the number of selected IPs.
+            Default is False.
+
+        use_mpi : bool
+            Whether to use MPI for parallel computation.
+            Default is False.
+
+    Returns:
+        selected_IP : list
+            The list of selected interpolation points.
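+
+    Example:
+        An illustrative sketch (assuming ``mydf`` is a built ISDF object whose
+        real-space grid partition is available):
+
+        >>> IP_ID = _select_IP_direct(mydf, c=15, m=5, rela_cutoff=1e-3)
+        >>> aoRg = ISDF_eval_gto(mydf.cell, coords=mydf.coords[IP_ID])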
+ + Ref: + + (1) Sandeep2022 https://pubs.acs.org/doi/10.1021/acs.jctc.2c00720 + + ''' + + if use_mpi: + from isdf_tools_mpi import rank, comm_size, comm, allgather, bcast + if rank == 0: + logger.debug4(mydf, "_select_IP_direct: num_threads = %d", lib.num_threads()) + else: + rank = 0 + logger.debug4(mydf, "_select_IP_direct: num_threads = %d", lib.num_threads()) + + ### determine the largest grids point of one atm ### + + natm = mydf.cell.natm + nao = mydf.nao + naux_max = 0 + + nao_per_atm = np.zeros((natm), dtype=np.int32) + for i in range(mydf.nao): + atm_id = mydf.ao2atomID[i] + nao_per_atm[atm_id] += 1 + + for nao_atm in nao_per_atm: + naux_max = max(naux_max, int(np.sqrt(c*nao_atm)) + m) + + nthread = lib.num_threads() + + buf_size_per_thread = mydf.get_buffer_size_in_IP_selection(c, m) + buf_size = buf_size_per_thread + + if hasattr(mydf, "IO_buf"): + buf = mydf.IO_buf + else: + buf = np.zeros((buf_size), dtype=np.float64) + mydf.IO_buf = buf + + if buf.size < buf_size: + mydf.IO_buf = np.zeros((buf_size), dtype=np.float64) + buf = mydf.IO_buf + buf_tmp = np.ndarray((buf_size), dtype=np.float64, buffer=buf) + + ### loop over atm ### + + from pyscf.pbc.dft.multigrid.multigrid_pair import MultiGridFFTDF2 + + df_tmp = MultiGridFFTDF2(mydf.cell) + grids = df_tmp.grids + coords = np.asarray(grids.coords).reshape(-1,3) + assert coords is not None + + results = [] + + fn_colpivot_qr = getattr(libisdf, "ColPivotQRRelaCut", None) + assert(fn_colpivot_qr is not None) + fn_ik_jk_ijk = getattr(libisdf, "NP_d_ik_jk_ijk", None) + assert(fn_ik_jk_ijk is not None) + + weight = np.sqrt(mydf.cell.vol / coords.shape[0]) + + for p0, p1 in lib.prange(0, 1, 1): + + taskinfo = [] + + # clear buffer + + if first_natm is None: + first_natm = natm + + for atm_id in range(first_natm): + + if use_mpi: + if atm_id % comm_size != rank: + continue + + buf_tmp[:buf_size_per_thread] = 0.0 + + grid_ID = np.where(mydf.partition == atm_id)[0] + + offset = 0 + aoR_atm = np.ndarray((nao, grid_ID.shape[0]), dtype=np.complex128, buffer=buf_tmp, offset=offset) + aoR_atm = ISDF_eval_gto(mydf.cell, coords=coords[grid_ID], out=aoR_atm) * weight + + nao_tmp = nao + + if aoR_cutoff is not None: + logger.debug4(mydf, "_select_IP_direct: aoR_cutoff = %12.6e", aoR_cutoff) + max_row = np.max(np.abs(aoR_atm), axis=1) + where = np.where(max_row > mydf.aoR_cutoff)[0] + aoR_atm = aoR_atm[where] + nao_tmp = aoR_atm.shape[0] + + # create buffer for this atm + + dtypesize = buf.dtype.itemsize + + offset += nao_tmp*grid_ID.shape[0] * dtypesize + + nao_atm = nao_per_atm[atm_id] + naux_now = int(np.sqrt(c*nao_atm)) + m + naux2_now = naux_now * naux_now + + R = np.ndarray((naux2_now, grid_ID.shape[0]), dtype=np.float64) + offset += naux2_now*grid_ID.shape[0] * dtypesize + + aoR_atm1 = np.ndarray((naux_now, grid_ID.shape[0]), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += naux_now*grid_ID.shape[0] * dtypesize + + aoR_atm2 = np.ndarray((naux_now, grid_ID.shape[0]), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += naux_now*grid_ID.shape[0] * dtypesize + + aoPairBuffer = np.ndarray( + (naux_now*naux_now, grid_ID.shape[0]), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += naux_now*naux_now*grid_ID.shape[0] * dtypesize + + G1 = np.random.rand(nao_tmp, naux_now) + G1, _ = numpy.linalg.qr(G1) + G1 = G1.T + G2 = np.random.rand(nao_tmp, naux_now) + G2, _ = numpy.linalg.qr(G2) + G2 = G2.T + + lib.dot(G1, aoR_atm, c=aoR_atm1) + lib.dot(G2, aoR_atm, c=aoR_atm2) + + 
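+            # aoPairBuffer[(a,b), g] = (G1 @ aoR_atm)[a,g] * (G2 @ aoR_atm)[b,g]
+            # is a randomized sketch of the AO-pair matrix on this atom's grid
+            # points; the column-pivoted QR below selects its pivot columns
+            # (i.e. grid points) as the interpolation points (cf. Sandeep2022).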
fn_ik_jk_ijk(aoR_atm1.ctypes.data_as(ctypes.c_void_p), + aoR_atm2.ctypes.data_as(ctypes.c_void_p), + aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux_now), + ctypes.c_int(naux_now), + ctypes.c_int(grid_ID.shape[0])) + if global_IP_selection: + if no_retriction_on_nIP: + max_rank = min(naux2_now, grid_ID.shape[0]) + else: + max_rank = min(naux2_now, grid_ID.shape[0], nao_atm * c + m) + else: + if no_retriction_on_nIP: + max_rank = min(naux2_now, grid_ID.shape[0]) + else: + max_rank = min(naux2_now, grid_ID.shape[0], nao_atm * c) + + npt_find = ctypes.c_int(0) + pivot = np.arange(grid_ID.shape[0], dtype=np.int32) + thread_buffer = np.ndarray((nthread+1, grid_ID.shape[0]+1), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += (nthread+1)*(grid_ID.shape[0]+1) * dtypesize + global_buffer = np.ndarray((1, grid_ID.shape[0]), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += grid_ID.shape[0] * dtypesize + + fn_colpivot_qr(aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux2_now), + ctypes.c_int(grid_ID.shape[0]), + ctypes.c_int(max_rank), + ctypes.c_double(1e-14), + ctypes.c_double(rela_cutoff), + pivot.ctypes.data_as(ctypes.c_void_p), + R.ctypes.data_as(ctypes.c_void_p), + ctypes.byref(npt_find), + thread_buffer.ctypes.data_as(ctypes.c_void_p), + global_buffer.ctypes.data_as(ctypes.c_void_p)) + + npt_find = npt_find.value + + cutoff = abs(R[npt_find-1, npt_find-1]) + pivot = pivot[:npt_find] + pivot.sort() + results.extend(list(grid_ID[pivot])) + + logger.debug4(mydf, "_select_IP_direct: ngrid = %d, npt_find = %d, cutoff = %12.6e", grid_ID.shape[0], npt_find, cutoff) + + if use_mpi: + comm.Barrier() + results = allgather(results) + results.sort() + + ### global IP selection, we can use this step to avoid numerical issue ### + + ### but this step is not necessary if locality is explored ### + + if global_IP_selection and rank == 0: + + #if mydf.verbose: + # print("global IP selection") + + bufsize = mydf.get_buffer_size_in_global_IP_selection(len(results), c, m) + + if buf.size < bufsize: + mydf.IO_buf = np.zeros((bufsize), dtype=np.float64) + buf = mydf.IO_buf + if mydf.verbose: + print("reallocate buf of size = ", bufsize) + + dtypesize = buf.dtype.itemsize + + buf_tmp = np.ndarray((bufsize), dtype=np.float64, buffer=buf) + + offset = 0 + aoRg = np.ndarray((nao, len(results)), dtype=np.complex128, buffer=buf_tmp) + aoRg = ISDF_eval_gto(mydf.cell, coords=coords[results], out=aoRg) * weight + + offset += nao*len(results) * dtypesize + + naux_now = int(np.sqrt(c*nao)) + m + naux2_now = naux_now * naux_now + + R = np.ndarray((naux2_now, len(results)), dtype=np.float64) + offset += naux2_now*len(results) * dtypesize + + aoRg1 = np.ndarray((naux_now, len(results)), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += naux_now*len(results) * dtypesize + + aoRg2 = np.ndarray((naux_now, len(results)), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += naux_now*len(results) * dtypesize + + aoPairBuffer = np.ndarray( + (naux_now*naux_now, len(results)), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += naux_now*naux_now*len(results) * dtypesize + + G1 = np.random.rand(nao, naux_now) + G1, _ = numpy.linalg.qr(G1) + G1 = G1.T + G2 = np.random.rand(nao, naux_now) + G2, _ = numpy.linalg.qr(G2) + G2 = G2.T + + lib.dot(G1, aoRg, c=aoRg1) + lib.dot(G2, aoRg, c=aoRg2) + + fn_ik_jk_ijk(aoRg1.ctypes.data_as(ctypes.c_void_p), + aoRg2.ctypes.data_as(ctypes.c_void_p), + aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux_now), 
+ ctypes.c_int(naux_now), + ctypes.c_int(len(results))) + + nao_first = np.sum(nao_per_atm[:first_natm]) + + if no_retriction_on_nIP: + max_rank = min(naux2_now, len(results)) + else: + max_rank = min(naux2_now, len(results), nao_first * c) + + npt_find = ctypes.c_int(0) + pivot = np.arange(len(results), dtype=np.int32) + thread_buffer = np.ndarray((nthread+1, len(results)+1), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += (nthread+1)*(len(results)+1) * dtypesize + global_buffer = np.ndarray((1, len(results)), dtype=np.float64, buffer=buf_tmp, offset=offset) + offset += len(results) * dtypesize + + fn_colpivot_qr(aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux2_now), + ctypes.c_int(len(results)), + ctypes.c_int(max_rank), + ctypes.c_double(1e-14), + ctypes.c_double(rela_cutoff), + pivot.ctypes.data_as(ctypes.c_void_p), + R.ctypes.data_as(ctypes.c_void_p), + ctypes.byref(npt_find), + thread_buffer.ctypes.data_as(ctypes.c_void_p), + global_buffer.ctypes.data_as(ctypes.c_void_p)) + npt_find = npt_find.value + + cutoff = abs(R[npt_find-1, npt_find-1]) + pivot = pivot[:npt_find] + + pivot.sort() + + results = np.array(results, dtype=np.int32) + results = list(results[pivot]) + + logger.debug4(mydf, "_select_IP_direct: ngrid = %d, npt_find = %d, cutoff = %12.6e", len(results), npt_find, cutoff) + + if global_IP_selection and use_mpi: + results = bcast(results) + + return results + +def build_aux_basis(mydf, debug=True, use_mpi=False): + '''build the auxiliary basis for ISDF given IP_ID and aoR. + ''' + + if use_mpi: + from isdf_tools_mpi import rank, bcast, comm + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + # allocate memory for the auxiliary basis + + naux = mydf.IP_ID.shape[0] + mydf.naux = naux + mydf._allocate_jk_buffer(datatype=np.double) + buffer1 = np.ndarray((mydf.naux , mydf.naux), dtype=np.double, buffer=mydf.jk_buffer, offset=0) + + nao = mydf.nao + IP_ID = mydf.IP_ID + aoR = mydf.aoR + + if not hasattr(mydf, "aoRg") or mydf.aoRg is None: + aoRg = numpy.empty((mydf.nao, mydf.IP_ID.shape[0])) + lib.dslice(aoR, IP_ID, out=aoRg) + else: + aoRg = mydf.aoRg + + e = None + h = None + + if not use_mpi or (use_mpi and rank == 0): + A = np.asarray(lib.ddot(aoRg.T, aoRg, c=buffer1), order='C') # buffer 1 size = naux * naux + lib.square_inPlace(A) + + t11 = (lib.logger.process_clock(), lib.logger.perf_counter()) + e, h = scipy.linalg.eigh(A) + t12 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + _benchmark_time(t11, t12, "diag_A", mydf) + + logger.debug4(mydf, "build_aux_basis: condition number = %12.6e", e[-1]/e[0]) + + where = np.where(e > e[-1]*1e-16)[0] + e = e[where] + h = h[:, where] + + if use_mpi: + e = bcast(e) + h = bcast(h) + + mydf.aux_basis = np.asarray(lib.ddot(aoRg.T, aoR), order='C') # buffer 2 size = naux * ngrids + lib.square_inPlace(mydf.aux_basis) + + #fn_build_aux = getattr(libisdf, "Solve_LLTEqualB_Parallel", None) + #assert(fn_build_aux is not None) + + nThread = lib.num_threads() + nGrids = aoR.shape[1] + Bunchsize = nGrids // nThread + + buffer2 = np.ndarray((e.shape[0] , mydf.aux_basis.shape[1]), dtype=np.double, buffer=mydf.jk_buffer, + offset=mydf.naux * mydf.naux * mydf.jk_buffer.dtype.itemsize) + B = np.asarray(lib.ddot(h.T, mydf.aux_basis, c=buffer2), order='C') + lib.d_i_ij_ij(1.0/e, B, out=B) + np.asarray(lib.ddot(h, B, c=mydf.aux_basis), order='C') + + if use_mpi: + comm.Barrier() + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + _benchmark_time(t1, t2, 
"build_auxiliary_basis", mydf) + + mydf.naux = naux + mydf.aoRg = aoRg + +from pyscf.pbc import df + +class PBC_ISDF_Info(df.fft.FFTDF): + ''' Interpolative separable density fitting (ISDF) for periodic systems. + Not recommended as the locality is not explored! + + Examples: + + >>> #### code to construct aoR ommited ### + >>> aoR *= np.sqrt(cell.vol / ngrids) + >>> pbc_isdf = PBC_ISDF_Info(cell, aoR=aoR) + >>> pbc_isdf.build_IP_Sandeep(build_global_basis=True, c=C, global_IP_selection=False) + >>> pbc_isdf.build_auxiliary_Coulomb() + >>> from pyscf.pbc import scf + >>> mf = scf.RHF(cell) + >>> pbc_isdf.direct_scf = mf.direct_scf + >>> mf.with_df = pbc_isdf + >>> mf.verbose = 0 + >>> mf.kernel() + + ''' + + def __init__(self, mol:Cell, + aoR: np.ndarray = None, ## convention: aoR is scaled by np.sqrt(mol.vol / ngrids) + with_robust_fitting=True, + kmesh=None, + get_partition=True, + verbose = None + ): + + if kmesh == None: + kmesh = numpy.asarray([1,1,1], dtype=numpy.int32) + KPoints = _kmesh_to_Kpoints(mol, kmesh) ### WARNING: this subroutine is not correct ! + + super().__init__(cell=mol, kpts=KPoints) + + if verbose is not None: + self.verbose = verbose + + ## the following variables are used in build_sandeep + + self.with_robust_fitting = with_robust_fitting + + self.IP_ID = None + self.aux_basis = None + self.c = None + self.naux = None + self.W = None + self.aoRg = None + self.aoR = aoR + self.grid_begin = 0 + if aoR is not None: + self.aoRT = aoR.T + else: + self.aoRT = None + self.V_R = None + self.cell = mol + self.mesh = mol.mesh + + self.partition = None + + self.natm = mol.natm + self.nao = mol.nao_nr() + + from pyscf.pbc.dft.multigrid.multigrid_pair import MultiGridFFTDF2 + + logger.info(self, "PBC_ISDF_Info: mol.ke_cutoff = %f", mol.ke_cutoff) + + df_tmp = MultiGridFFTDF2(mol) + + if aoR is None: + # df_tmp = MultiGridFFTDF2(mol) + self.coords = np.asarray(df_tmp.grids.coords).reshape(-1,3) + self.ngrids = self.coords.shape[0] + else: + self.ngrids = aoR.shape[1] + assert self.nao == aoR.shape[0] + + self.grid_end = self.ngrids + + ## preallocated buffer for parallel calculation + + self.jk_buffer = None + self.ddot_buf = None + + ao2atomID = np.zeros(self.nao, dtype=np.int32) + ao2atomID = np.zeros(self.nao, dtype=np.int32) + + # only valid for spherical GTO + + ao_loc = 0 + for i in range(mol._bas.shape[0]): + atm_id = mol._bas[i, ATOM_OF] + nctr = mol._bas[i, NCTR_OF] + angl = mol._bas[i, ANG_OF] + nao_now = nctr * (2 * angl + 1) # NOTE: sph basis assumed! 
+ ao2atomID[ao_loc:ao_loc+nao_now] = atm_id + ao_loc += nao_now + + self.ao2atomID = ao2atomID + + # given aoG, determine at given grid point, which ao has the maximal abs value + + if aoR is not None: + self.partition = np.argmax(np.abs(aoR), axis=0) + # map aoID to atomID + self.partition = np.asarray([ao2atomID[x] for x in self.partition]) + grids = df_tmp.grids + self.coords = np.asarray(grids.coords).reshape(-1,3) + self._numints = df_tmp._numint + else: + grids = df_tmp.grids + coords = np.asarray(grids.coords).reshape(-1,3) + NumInts = df_tmp._numint + + coords_now = coords + + if kmesh is not None: + + mesh = mol.mesh + meshPrim = np.array(mesh, dtype=np.int32) // kmesh + coords_now = coords_now.reshape(kmesh[0], meshPrim[0], kmesh[1], meshPrim[1], kmesh[2], meshPrim[2], 3) + coords_now = coords_now.transpose(0, 2, 4, 1, 3, 5, 6).reshape(-1, 3) + coords_now = coords_now[:np.prod(meshPrim), :] + + self.partition = np.zeros(coords_now.shape[0], dtype=np.int32) + + from pyscf.isdf.isdf_eval_gto import ISDF_eval_gto + + if hasattr(self, "IO_buf"): + logger.debug4(self, "PBC_ISDF_Info: IO_buf is already allocated") + else: + logger.debug4(self, "PBC_ISDF_Info: IO_buf is not allocated") + max_memory = max(2000, self.max_memory-lib.current_memory()[0]) + self.IO_buf = np.zeros((int(max_memory*1e6//8),), dtype=np.double) + + logger.debug4(self, "PBC_ISDF_Info: IO_buf.size = %d", self.IO_buf.size) + logger.debug4(self, "PBC_ISDF_Info: coords.shape[0] = %d", coords_now.shape[0]) + logger.debug4(self, "PBC_ISDF_Info: self.nao = %d", self.nao) + + bufsize = min(self.IO_buf.size, 4*1e9/8) // 2 + bunchsize = int(bufsize / (self.nao)) + + assert bunchsize > 0 + + if get_partition and aoR is None: + for p0, p1 in lib.prange(0, coords_now.shape[0], bunchsize): + AoR_Buf = np.ndarray((self.nao, p1-p0), dtype=np.complex128, buffer=self.IO_buf, offset=0) + AoR_Buf = ISDF_eval_gto(self.cell, coords=coords_now[p0:p1], out=AoR_Buf) + res = np.argmax(np.abs(AoR_Buf), axis=0) + self.partition[p0:p1] = np.asarray([ao2atomID[x] for x in res]) + AoR_Buf = None + else: + self.partition = None + + res = None + + self.coords = coords + self._numints = NumInts + + ########### attr used in build K directly with cutoff ########### + + self._build_K_rela_cutoff = None + self._build_K_abs_cutoff = None + self._build_K_distance_cutoff = None + + def _allocate_jk_buffer(self, datatype): + + if self.jk_buffer is None: + + nao = self.nao + ngrids = self.ngrids + naux = self.naux + + logger.debug4(self, "_allocate_jk_buffer: nao = %d, ngrids = %d, naux = %d", nao, ngrids, naux) + buffersize_k = nao * ngrids + naux * ngrids + naux * naux + nao * nao + buffersize_j = nao * ngrids + ngrids + nao * naux + naux + naux + nao * nao + + nThreadsOMP = lib.num_threads() + size_ddot_buf = max((naux*naux)+2, ngrids) * nThreadsOMP + + if hasattr(self, "IO_buf"): + + if self.IO_buf.size < (max(buffersize_k, buffersize_j) + size_ddot_buf): + self.IO_buf = np.zeros((max(buffersize_k, buffersize_j) + size_ddot_buf,), dtype=datatype) + + self.jk_buffer = np.ndarray((max(buffersize_k, buffersize_j),), + dtype=datatype, buffer=self.IO_buf, offset=0) + offset = max(buffersize_k, buffersize_j) * self.jk_buffer.dtype.itemsize + self.ddot_buf = np.ndarray((nThreadsOMP, max((nao*nao)+2, ngrids)), + dtype=datatype, buffer=self.IO_buf, offset=offset) + + else: + + self.jk_buffer = np.ndarray((max(buffersize_k, buffersize_j),), dtype=datatype) + self.ddot_buf = np.zeros((nThreadsOMP, max((nao*nao)+2, ngrids)), dtype=datatype) + + else: + assert 
self.jk_buffer.dtype == datatype
+            assert self.ddot_buf.dtype == datatype
+
+    def set_build_K_cutoff(self, rela_cutoff=None, abs_cutoff=None):
+        ''' set the relative/absolute cutoffs for building the K matrix directly.
+        '''
+        self._build_K_rela_cutoff = rela_cutoff
+        self._build_K_abs_cutoff = abs_cutoff
+
+    def set_build_K_distance_cutoff(self, distance_cutoff=None):
+        ''' set the distance cutoff for building the K matrix directly.
+        '''
+        self._build_K_distance_cutoff = distance_cutoff
+
+    def build(self):
+        raise NotImplementedError
+
+    def build_only_partition(self):
+        raise NotImplementedError
+
+    def get_buffer_size_in_IP_selection(self, c, m=5):
+        natm = self.cell.natm
+        nao_per_atm = np.zeros((natm), dtype=np.int32)
+        for i in range(self.nao):
+            atm_id = self.ao2atomID[i]
+            nao_per_atm[atm_id] += 1
+
+        naux_max = 0
+        for nao_atm in nao_per_atm:
+            naux_max = max(naux_max, int(np.sqrt(c*nao_atm)) + m)
+
+        ngrid_on_atm = np.zeros((self.cell.natm), dtype=np.int32)
+        for atm_id in self.partition:
+            ngrid_on_atm[atm_id] += 1
+
+        naux_max2 = naux_max * naux_max
+
+        ngrid_on_atm = np.max(ngrid_on_atm)
+
+        nThread = lib.num_threads()
+
+        buf_size  = self.nao*ngrid_on_atm               # aoR_atm
+        buf_size += naux_max2*ngrid_on_atm              # R
+        buf_size += naux_max*ngrid_on_atm*2             # aoR_atm1, aoR_atm2
+        buf_size += naux_max*naux_max*ngrid_on_atm      # aoPairBuffer
+        buf_size += (nThread+1)*(ngrid_on_atm+1)
+        buf_size += ngrid_on_atm
+
+        return max(buf_size, 2*self.nao*ngrid_on_atm)
+
+    def get_buffer_size_in_global_IP_selection(self, ngrids_possible, c, m=5):
+
+        nao = self.nao
+        naux_max = int(np.sqrt(c*nao)) + m
+        ngrids_now = ngrids_possible
+        naux_max2 = naux_max * naux_max
+
+        nThread = lib.num_threads()
+
+        buf_size  = self.nao*ngrids_now                 # aoR_atm
+        buf_size += naux_max2*ngrids_now                # R
+        buf_size += naux_max*ngrids_now*2               # aoR_atm1, aoR_atm2
+        buf_size += naux_max*naux_max*ngrids_now        # aoPairBuffer
+        buf_size += (nThread+1)*(ngrids_now+1)
+        buf_size += ngrids_now
+
+        return max(buf_size, 2*self.nao*ngrids_now)
+
+    def get_A_B(self):
+        '''The auxiliary basis is constructed by solving AX = B.
+        '''
+
+        aoR = self.aoR
+        IP_ID = self.IP_ID
+        aoRG = aoR[:, IP_ID]
+
+        A = np.asarray(lib.dot(aoRG.T, aoRG), order='C')
+        A = A ** 2
+        B = np.asarray(lib.dot(aoRG.T, aoR), order='C')
+        B = B ** 2
+
+        return A, B
+
+    def build_IP_Sandeep(self, c=5, m=5,
+                         global_IP_selection=True,
+                         build_global_basis=True,
+                         IP_ID=None,
+                         debug=True):
+        ''' Select the interpolation points (IPs) with Sandeep's method.
+        Ref:
+            (1) Sandeep2022 https://pubs.acs.org/doi/10.1021/acs.jctc.2c00720
+        '''
+
+        # build partition
+
+        ao2atomID = self.ao2atomID
+        partition = self.partition
+        aoR = self.aoR
+        natm = self.natm
+        nao = self.nao
+
+        # for each atm
+
+        if not hasattr(self, "use_mpi"):
+            self.use_mpi = False
+        rank = 0
+
+        t1 = (lib.logger.process_clock(), lib.logger.perf_counter())
+
+        if IP_ID is None:
+            IP_ID = _select_IP_direct(self, c, m, global_IP_selection=global_IP_selection, use_mpi=self.use_mpi)
+            IP_ID.sort()
+            IP_ID = np.array(IP_ID, dtype=np.int32)
+        self.IP_ID = np.array(IP_ID, dtype=np.int32)
+
+        t2 = (lib.logger.process_clock(), lib.logger.perf_counter())
+        if rank == 0:
+            _benchmark_time(t1, t2, "build_IP", self)
+        t1 = t2
+
+        # build the auxiliary basis
+
+        self.c = c
+        build_aux_basis(self)
+
+    def build_auxiliary_Coulomb(self, cell:Cell = None, mesh=None, debug=True):
+        ''' Build the V and W matrices; see eq. (13) of Sandeep2022.
+
+        Ref:
+            (1) Sandeep2022 https://pubs.acs.org/doi/10.1021/acs.jctc.2c00720
+        '''
+
+        self._allocate_jk_buffer(datatype=np.double)
+
+        # build the ddot buffer
+
+        naux = self.naux
+
+        if cell is None:
+            cell = self.cell
+        if mesh is None:
+            mesh = self.cell.mesh
+
+        def construct_V_CCode(aux_basis:np.ndarray, mesh, coul_G):
+
+            coulG_real = coul_G.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1)
+            nThread = lib.num_threads()
+            bunchsize = naux // (2*nThread)
+            bufsize_per_thread = bunchsize * coulG_real.shape[0] * 2
+            bufsize_per_thread = (bufsize_per_thread + 15) // 16 * 16
+            nAux = aux_basis.shape[0]
+            ngrids = aux_basis.shape[1]
+            mesh_int32 = np.array(mesh, dtype=np.int32)
+
+            V = np.zeros((nAux, ngrids), dtype=np.double)
+
+            fn = getattr(libisdf, "_construct_V", None)
+            assert(fn is not None)
+
+            fn(mesh_int32.ctypes.data_as(ctypes.c_void_p),
+               ctypes.c_int(nAux),
+               aux_basis.ctypes.data_as(ctypes.c_void_p),
+               coulG_real.ctypes.data_as(ctypes.c_void_p),
+               V.ctypes.data_as(ctypes.c_void_p),
+               ctypes.c_int(bunchsize),
+               self.jk_buffer.ctypes.data_as(ctypes.c_void_p),
+               ctypes.c_int(bufsize_per_thread))
+
+            return V
+
+        t1 = (lib.logger.process_clock(), lib.logger.perf_counter())
+
+        coulG = tools.get_coulG(cell, mesh=mesh)
+
+        V_R = construct_V_CCode(self.aux_basis, mesh, coulG)
+
+        t2 = (lib.logger.process_clock(), lib.logger.perf_counter())
+        if debug:
+            _benchmark_time(t1, t2, "build_auxiliary_Coulomb_V_R", self)
+        t1 = t2
+
+        W = lib.ddot(a=self.aux_basis, b=V_R.T)
+
+        self.coulG = coulG.copy()
+
+        t2 = (lib.logger.process_clock(), lib.logger.perf_counter())
+        if debug:
+            _benchmark_time(t1, t2, "build_auxiliary_Coulomb_W", self)
+
+        self.V_R = V_R
+        self.W = W
+        self.mesh = mesh
+
+    def check_AOPairError(self):
+
+        assert(self.aoR is not None)
+        assert(self.IP_ID is not None)
+        assert(self.aux_basis is not None)
+
+        aoR = self.aoR
+        aoRg = aoR[:, self.IP_ID]
+        nao = self.nao
+
+        logger.debug4(self, "check_AOPairError")
+
+        for i in range(nao):
+
+            coeff = numpy.einsum('k,jk->jk', aoRg[i, :], aoRg).reshape(-1, self.IP_ID.shape[0])
+            aoPair = numpy.einsum('k,jk->jk', aoR[i, :], aoR).reshape(-1, aoR.shape[1])
+            aoPair_approx = coeff @ self.aux_basis
+
+            diff = aoPair - aoPair_approx
+            diff_pair_abs_max = np.max(np.abs(diff), axis=1)
+
+            for j in range(diff_pair_abs_max.shape[0]):
+                logger.debug4(self, "(%5d, %5d, %15.8e)", i, j, diff_pair_abs_max[j])
+
+    def __del__(self):
+        return
+
+    @property
+    def kpt(self):
+        return np.zeros(3)
+
+    def get_pp(self, kpts=None):
+        if hasattr(self, "PP") and self.PP is not None:
+            return self.PP
+        else:
+
+            use_super_pp = False
+
+            if hasattr(self, "_use_super_pp"):
+                if self._use_super_pp:
+                    use_super_pp = True
+                    t0 = (lib.logger.process_clock(), lib.logger.perf_counter())
+                    self.PP = super().get_pp(kpts=np.zeros(3))
+                    t1 = (lib.logger.process_clock(), lib.logger.perf_counter())
+            if not use_super_pp:
+                t0 = (lib.logger.process_clock(), lib.logger.perf_counter())
+                cell = self.cell.copy()
+                cell.omega = 0.0
+                if hasattr(self, "ke_cutoff_pp"):
+                    cell.ke_cutoff = self.ke_cutoff_pp
+                cell.build()
+                df_tmp = multigrid.MultiGridFFTDF2(cell)
+                v_pp_loc2_nl = df_tmp.get_pp()
+                v_pp_loc1_G = df_tmp.vpplocG_part1
+                v_pp_loc1 = multigrid.multigrid_pair._get_j_pass2(df_tmp, v_pp_loc1_G)
+                self.PP = (v_pp_loc1 + v_pp_loc2_nl)[0]
+                t1 = (lib.logger.process_clock(), lib.logger.perf_counter())
+
+            if self.use_mpi:
+                from pyscf.isdf.isdf_tools_mpi import
rank + if rank == 0: + _benchmark_time(t0, t1, "get_pp", self) + else: + _benchmark_time(t0, t1, "get_pp", self) + + #### kpts #### + + if kpts is not None: + + nkpts = kpts.shape[0] + + if hasattr(self, "kmesh") and self.kmesh is not None: + pass + else: + self.kmesh = np.asarray([1,1,1], dtype=np.int32) + kmesh = np.asarray(self.kmesh, dtype=np.int32) + assert kpts.shape[0] == np.prod(self.kmesh, dtype=np.int32) or kpts.shape[0] == 1 or kpts.ndim == 1 + is_single_kpt = kpts.shape[0] == 1 or kpts.ndim == 1 + + if is_single_kpt: + #### use the calculated one by default #### + if self.use_mpi: + from pyscf.isdf.isdf_tools_mpi import bcast + self.PP = bcast(self.PP, root = 0) + return self.PP + + #### the following is used to test KRHF #### + + ### info used in super().get_pp() ### + + assert hasattr(self, "prim_cell") + + nao_prim = self.cell.nao_nr() // nkpts + assert self.cell.nao_nr() % nkpts == 0 + self.PP = self.PP[:nao_prim, :].copy() + + n_complex = self.kmesh[0] * self.kmesh[1] * (self.kmesh[2]//2+1) + n_cell = np.prod(self.kmesh) + + PP_complex = np.zeros((nao_prim, n_complex * nao_prim), dtype=np.complex128) + PP_real = np.ndarray((nao_prim, n_cell * nao_prim), dtype=np.double, buffer=PP_complex) + PP_real.ravel()[:] = self.PP.ravel() + buf_fft = np.zeros((nao_prim, n_complex, nao_prim), dtype=np.complex128) + + fn1 = getattr(libisdf, "_FFT_Matrix_Col_InPlace", None) + assert fn1 is not None + + fn1( + PP_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + del buf_fft + + from pyscf.isdf.isdf_tools_densitymatrix import pack_JK_in_FFT_space + + PP_complex = PP_complex.conj().copy() + self.PP = pack_JK_in_FFT_space(PP_complex, kmesh, nao_prim) + + if self.use_mpi: + from pyscf.isdf.isdf_tools_mpi import bcast + self.PP = bcast(self.PP, root = 0) + + return self.PP + + def get_nuc(self, kpts=None): + if hasattr(self, "nuc") and self.nuc is not None: + return self.nuc + else: + + t0 = (lib.logger.process_clock(), lib.logger.perf_counter()) + self.nuc = super().get_nuc(kpts=np.zeros(3)) + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + if self.verbose: + _benchmark_time(t0, t1, "get_nuc", self) + + #### kpts #### + + if kpts is not None: + + nkpts = kpts.shape[0] + + if hasattr(self, "kmesh") and self.kmesh is not None: + pass + else: + self.kmesh = np.asarray([1,1,1], dtype=np.int32) + kmesh = np.asarray(self.kmesh, dtype=np.int32) + + assert kpts.shape[0] == np.prod(self.kmesh, dtype=np.int32) or kpts.shape[0] == 1 or kpts.ndim == 1 + + is_single_kpt = kpts.shape[0] == 1 or kpts.ndim == 1 + + if is_single_kpt: + if self.use_mpi: + from pyscf.isdf.isdf_tools_mpi import bcast + self.nuc = bcast(self.nuc, root = 0) + return self.nuc + + #### the following is used in KRHF #### + + ### info used in super().get_pp() ### + + assert hasattr(self, "prim_cell") + + nao_prim = self.cell.nao_nr() // nkpts + assert self.cell.nao_nr() % nkpts == 0 + self.nuc = self.nuc[:nao_prim, :].copy() + + n_complex = self.kmesh[0] * self.kmesh[1] * (self.kmesh[2]//2+1) + n_cell = np.prod(self.kmesh) + + nuc_complex = np.zeros((nao_prim, n_complex * nao_prim), dtype=np.complex128) + nuc_real = np.ndarray((nao_prim, n_cell * nao_prim), dtype=np.double, buffer=nuc_complex) + nuc_real.ravel()[:] = self.nuc.ravel() + buf_fft = np.zeros((nao_prim, n_complex, nao_prim), dtype=np.complex128) + + fn1 = getattr(libisdf, "_FFT_Matrix_Col_InPlace", None) + assert fn1 is not None + 
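+            # the same column-wise FFT packing already applied to self.PP in get_pp above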
+ fn1( + nuc_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + del buf_fft + + from pyscf.isdf.isdf_tools_densitymatrix import pack_JK_in_FFT_space + + nuc_complex = nuc_complex.conj().copy() + self.nuc = pack_JK_in_FFT_space(nuc_complex, kmesh, nao_prim) + + if self.use_mpi: + from pyscf.isdf.isdf_tools_mpi import bcast + self.nuc = bcast(self.nuc, root = 0) + + return self.nuc + + def LS_THC_recompression(self, X:np.ndarray, force_LS_THC=True): + + from isdf_ao2mo import LS_THC + + if force_LS_THC: + self.with_robust_fitting = False + self.force_LS_THC = True + self.W = LS_THC(self, X) / (self.ngrids/self.cell.vol) + self.aoRg = X + self.V_R = None + else: + self.force_LS_THC = False + self.W2 = LS_THC(self, X) / (self.ngrids/self.cell.vol) + self.aoRg2 = X + + def aoRg_full(self): + return self.aoRg, None + + ##### functions defined in isdf_ao2mo.py ##### + + get_eri = get_ao_eri = isdf_ao2mo.get_eri + ao2mo = get_mo_eri = isdf_ao2mo.general + ao2mo_7d = isdf_ao2mo.ao2mo_7d # seems to be only called in kadc and kccsd, NOT implemented! + + ##### functions defined in isdf_jk.py ##### + + get_jk = isdf_jk.get_jk_dm + + + +if __name__ == '__main__': + + C = 15 + + cell = pbcgto.Cell() + boxlen = 3.5668 + cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + cell.atom = ''' + C 0. 0. 0. + C 0.8917 0.8917 0.8917 + C 1.7834 1.7834 0. + C 2.6751 2.6751 0.8917 + C 1.7834 0. 1.7834 + C 2.6751 0.8917 2.6751 + C 0. 1.7834 1.7834 + C 0.8917 2.6751 2.6751 + ''' + +# boxlen = 4.2 +# cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) +# cell.atom = ''' +# Li 0.0 0.0 0.0 +# Li 2.1 2.1 0.0 +# Li 0.0 2.1 2.1 +# Li 2.1 0.0 2.1 +# H 0.0 0.0 2.1 +# H 0.0 2.1 0.0 +# H 2.1 0.0 0.0 +# H 2.1 2.1 2.1 +# ''' + + cell.basis = 'gth-dzvp' + # cell.basis = 'gth-tzvp' + cell.pseudo = 'gth-pade' + cell.verbose = 10 + + # cell.ke_cutoff = 128 # kinetic energy cutoff in a.u. 
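+    # ke_cutoff = 70 matches the setting used in the accompanying isdf example scripts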
+ cell.ke_cutoff = 70 + cell.max_memory = 800 # 800 Mb + cell.precision = 1e-8 # integral precision + cell.use_particle_mesh_ewald = True + + cell.build() + + cell = tools.super_cell(cell, [1, 1, 1]) + + from pyscf.pbc.dft.multigrid.multigrid_pair import MultiGridFFTDF2, _eval_rhoG + + df_tmp = MultiGridFFTDF2(cell) + + grids = df_tmp.grids + coords = np.asarray(grids.coords).reshape(-1,3) + nx = grids.mesh[0] + + mesh = grids.mesh + ngrids = np.prod(mesh) + assert ngrids == coords.shape[0] + + aoR = df_tmp._numint.eval_ao(cell, coords)[0].T # the T is important + aoR *= np.sqrt(cell.vol / ngrids) + + pbc_isdf_info = PBC_ISDF_Info(cell, aoR=aoR) + pbc_isdf_info.build_IP_Sandeep(build_global_basis=True, c=C, global_IP_selection=False) + pbc_isdf_info.build_auxiliary_Coulomb() + + ### perform scf ### + + from pyscf.pbc import scf + + mf = scf.RHF(cell) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 100 + mf.conv_tol = 1e-7 + + print("mf.direct_scf = ", mf.direct_scf) + + mf.kernel() + + print("mf.with_df.IP_ID = ", mf.with_df.IP_ID) + print("mf.with_df.partition = ", mf.with_df.partition) + + for i in range(cell.natm): + print("i = ", i, "partition = ", mf.with_df.partition[mf.with_df.partition == i].shape[0]) + + #exit(1) + + # without robust fitting + + pbc_isdf_info.with_robust_fitting = False + + mf = scf.RHF(cell) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 100 + mf.conv_tol = 1e-7 + mf.kernel() + + mf = scf.RHF(cell) + mf.max_cycle = 100 + mf.conv_tol = 1e-8 + #mf.kernel() + pbc_isdf_info.with_robust_fitting = True + + ##### test the LS_THC_recompression ##### + + _pbc_isdf_info = PBC_ISDF_Info(cell, aoR) + _pbc_isdf_info.build_IP_Sandeep(build_global_basis=True, c=12, global_IP_selection=False) + + pbc_isdf_info.LS_THC_recompression(_pbc_isdf_info.aoRg, force_LS_THC=False) + + mf = scf.RHF(cell) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 10 + mf.conv_tol = 1e-7 + mf.kernel() + + pbc_isdf_info.LS_THC_recompression(_pbc_isdf_info.aoRg) + + mf = scf.RHF(cell) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 10 + mf.conv_tol = 1e-7 + mf.kernel() \ No newline at end of file diff --git a/pyscf/isdf/isdf_jk.py b/pyscf/isdf/isdf_jk.py new file mode 100644 index 000000000..7ebcd5ff5 --- /dev/null +++ b/pyscf/isdf/isdf_jk.py @@ -0,0 +1,598 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +import copy +import numpy as np +import numpy +import ctypes + +from pyscf import lib +from pyscf.lib import logger +from pyscf.pbc import tools +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point +libpbc = lib.load_library('libpbc') + +################################################## +# +# only Gamma Point +# +################################################## + +######### tools ######### + +def _benchmark_time(t1, t2, label, rec): + lib.logger.debug4(rec, "%20s wall time: %12.6f CPU time: %12.6f" % (label, t2[1] - t1[1], t2[0] - t1[0])) + +def _contract_j_dm(mydf, dm, with_robust_fitting=True, use_mpi=False): + ''' + + Args: + mydf : density fitting object + dm : the density matrix + + ''' + + assert use_mpi == False + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + + W = mydf.W + aoRg = mydf.aoRg + aoR = mydf.aoR + ngrid = aoR.shape[1] + + if hasattr(mydf, "V_R"): + V_R = mydf.V_R + else: + V_R = None + naux = aoRg.shape[1] + IP_ID = mydf.IP_ID + + #### step 2. get J term1 and term2 + + buffer = mydf.jk_buffer + buffer1 = np.ndarray((nao,ngrid), dtype=dm.dtype, buffer=buffer, offset=0) + buffer2 = np.ndarray((ngrid), dtype=dm.dtype, buffer=buffer, offset=nao * ngrid * dm.dtype.itemsize) + buffer3 = np.ndarray((nao,naux), dtype=dm.dtype, buffer=buffer, + offset=(nao * ngrid + ngrid) * dm.dtype.itemsize) + buffer4 = np.ndarray((naux), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + ngrid + nao * naux) * dm.dtype.itemsize) + buffer5 = np.ndarray((naux), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + ngrid + nao * naux + naux) * dm.dtype.itemsize) + buffer6 = np.ndarray((nao,nao), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + ngrid + nao * naux + naux + naux) * dm.dtype.itemsize) + buffer7 = np.ndarray((nao,naux), dtype=dm.dtype, buffer=buffer, offset=0) + buffer8 = np.ndarray((naux), dtype=dm.dtype, buffer=buffer, offset=nao * ngrid * dm.dtype.itemsize) + + ## constract dm and aoR + + # need allocate memory, size = nao * ngrid, (buffer 1) + + lib.ddot(dm, aoR, c=buffer1) + tmp1 = buffer1 + + # need allocate memory, size = ngrid, (buffer 2) + + density_R = np.asarray(lib.multiply_sum_isdf(aoR, tmp1, out=buffer2), order='C') + + # need allocate memory, size = nao * naux, (buffer 3) + + # lib.dslice(tmp1, IP_ID, buffer3) + # tmp1 = buffer3 + tmp1 = lib.ddot(dm, aoRg) + + density_Rg = np.asarray(lib.multiply_sum_isdf(aoRg, tmp1, out=buffer4), + order='C') # need allocate memory, size = naux, (buffer 4) + + # This should be the leading term of the computation cost in a single-thread mode. 
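For orientation, the two buffered contractions above assemble the grid densities that drive the Coulomb build. A plain-NumPy reference for the same quantity (an illustrative sketch only; the helper name is hypothetical, and no preallocated buffers are used):

```python
import numpy as np

def grid_density(dm, ao):
    # rho(g) = sum_{i,j} dm[i, j] * ao[i, g] * ao[j, g]
    tmp = dm @ ao                          # (nao, ngrid); the leading cost
    return np.einsum('ig,ig->g', ao, tmp)  # pointwise contraction over AOs

# density_R  corresponds to grid_density(dm, aoR)   (full grid)
# density_Rg corresponds to grid_density(dm, aoRg)  (interpolation points only)
```

Computing `dm @ ao` first keeps the cost at O(nao^2 * ngrid), which is the leading single-thread term noted in the comment above.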
+ + # need allocate memory, size = naux, (buffer 5) + + J = None + + if with_robust_fitting: + J = np.asarray(lib.ddot_withbuffer(V_R, density_R.reshape(-1,1), c=buffer5.reshape(-1,1), buf=mydf.ddot_buf), order='C').reshape(-1) # with buffer, size + + # do not need allocate memory, use buffer 3 + + J = np.asarray(lib.d_ij_j_ij(aoRg, J, out=buffer3), order='C') + + # need allocate memory, size = nao * nao, (buffer 6) + + J = np.asarray(lib.ddot_withbuffer(aoRg, J.T, c=buffer6, buf=mydf.ddot_buf), order='C') + + # do not need allocate memory, use buffer 2 + + J2 = np.asarray(lib.dot(V_R.T, density_Rg.reshape(-1,1), c=buffer2.reshape(-1,1)), order='C').reshape(-1) + + # do not need allocate memory, use buffer 1 + + # J2 = np.einsum('ij,j->ij', aoR, J2) + J2 = np.asarray(lib.d_ij_j_ij(aoR, J2, out=buffer1), order='C') + + # do not need allocate memory, use buffer 6 + + # J += np.asarray(lib.dot(aoR, J2.T), order='C') + lib.ddot_withbuffer(aoR, J2.T, c=J, beta=1, buf=mydf.ddot_buf) + + #### step 3. get J term3 + + # do not need allocate memory, use buffer 2 + + tmp = np.asarray(lib.dot(W, density_Rg.reshape(-1,1), c=buffer8.reshape(-1,1)), order='C').reshape(-1) + + # do not need allocate memory, use buffer 1 but viewed as buffer 7 + + tmp = np.asarray(lib.d_ij_j_ij(aoRg, tmp, out=buffer7), order='C') + + # do not need allocate memory, use buffer 6 + + if with_robust_fitting: + lib.ddot_withbuffer(aoRg, -tmp.T, c=J, beta=1, buf=mydf.ddot_buf) + else: + J = buffer6 + lib.ddot_withbuffer(aoRg, tmp.T, c=J, beta=0, buf=mydf.ddot_buf) + + t2 = (logger.process_clock(), logger.perf_counter()) + + if mydf.verbose: + _benchmark_time(t1, t2, "_contract_j_dm", mydf) + + return J * ngrid / vol + +def _contract_j_dm_fast(mydf, dm, with_robust_fitting=True, use_mpi=False): + + assert use_mpi == False + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + ngrid = np.prod(cell.mesh) + assert ngrid == mydf.ngrids + vol = cell.vol + + W = mydf.W + aoRg = mydf.aoRg + aoR = mydf.aoR + ngrid = aoR.shape[1] + if hasattr(mydf, "V_R"): + V_R = mydf.V_R + else: + V_R = None + naux = mydf.naux + IP_ID = mydf.IP_ID + + mesh = np.array(cell.mesh, dtype=np.int32) + + #### step 0. allocate buffer + + buffer = mydf.jk_buffer + buffer1 = np.ndarray((nao,ngrid), dtype=dm.dtype, buffer=buffer, offset=0) + buffer2 = np.ndarray((ngrid), dtype=dm.dtype, buffer=buffer, offset=nao * ngrid * dm.dtype.itemsize) + buffer3 = np.ndarray((nao,naux), dtype=dm.dtype, buffer=buffer, + offset=(nao * ngrid + ngrid) * dm.dtype.itemsize) + buffer4 = np.ndarray((naux), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + ngrid + nao * naux) * dm.dtype.itemsize) + buffer5 = np.ndarray((naux), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + ngrid + nao * naux + naux) * dm.dtype.itemsize) + buffer6 = np.ndarray((nao,nao), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + ngrid + nao * naux + naux + naux) * dm.dtype.itemsize) + buffer7 = np.ndarray((nao,naux), dtype=dm.dtype, buffer=buffer, offset=0) + buffer8 = np.ndarray((naux), dtype=dm.dtype, buffer=buffer, offset=nao * ngrid * dm.dtype.itemsize) + + #### step 1. 
get density value on real space grid and IPs + + lib.ddot(dm, aoR, c=buffer1) + tmp1 = buffer1 + density_R = np.asarray(lib.multiply_sum_isdf(aoR, tmp1, out=buffer2), order='C') + + if hasattr(mydf, "grid_ID_ordered"): + if (use_mpi and rank == 0) or (use_mpi == False): + density_R_original = np.zeros_like(density_R) + + fn_order = getattr(libpbc, "_Reorder_Grid_to_Original_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(density_R.size), + mydf.grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + density_R.ctypes.data_as(ctypes.c_void_p), + density_R_original.ctypes.data_as(ctypes.c_void_p), + ) + + density_R = density_R_original.copy() + + J = None + + if (use_mpi and rank == 0) or (use_mpi == False): + + fn_J = getattr(libpbc, "_construct_J", None) + assert(fn_J is not None) + + J = np.zeros_like(density_R) + + fn_J( + mesh.ctypes.data_as(ctypes.c_void_p), + density_R.ctypes.data_as(ctypes.c_void_p), + mydf.coulG.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + ) + + if hasattr(mydf, "grid_ID_ordered"): + + J_ordered = np.zeros_like(J) + + fn_order = getattr(libpbc, "_Original_Grid_to_Reorder_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(J.size), + mydf.grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + J_ordered.ctypes.data_as(ctypes.c_void_p), + ) + + J = J_ordered.copy() + + #### step 3. get J + + J = np.asarray(lib.d_ij_j_ij(aoR, J, out=buffer1), order='C') + J = lib.ddot_withbuffer(aoR, J.T, buf=mydf.ddot_buf) + + t2 = (logger.process_clock(), logger.perf_counter()) + + if mydf.verbose: + _benchmark_time(t1, t2, "_contract_j_dm_fast", mydf) + + return J * ngrid / vol + +def _contract_j_dm_wo_robust_fitting(mydf, dm, with_robust_fitting=False, use_mpi=False): + + assert with_robust_fitting == False + assert use_mpi == False + + if use_mpi: + raise NotImplementedError("MPI is not supported in this function") + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(cell.mesh) + + W = mydf.W + aoRg = mydf.aoRg + + naux = aoRg.shape[1] + + tmp1 = lib.ddot(dm, aoRg) + density_Rg = np.asarray(lib.multiply_sum_isdf(aoRg, tmp1), + order='C') + tmp = np.asarray(lib.dot(W, density_Rg.reshape(-1,1)), order='C').reshape(-1) + tmp = np.asarray(lib.d_ij_j_ij(aoRg, tmp), order='C') + + J = lib.ddot(aoRg, tmp.T) + + del tmp1 + tmp1 = None + del tmp + tmp = None + del density_Rg + density_Rg = None + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t1, t2, "_contract_j_dm_wo_robust_fitting", mydf) + + return J * ngrid / vol + +def _contract_k_dm(mydf, dm, with_robust_fitting=True, use_mpi=False): + ''' + + Args: + mydf : + mo_coeffs : the occupied MO coefficients + + ''' + + assert use_mpi == False + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + + cell = mydf.cell + assert cell.nao == nao + ngrid = np.prod(cell.mesh) + assert ngrid == mydf.ngrids + vol = cell.vol + + W = mydf.W + aoRg = mydf.aoRg + aoR = mydf.aoR + ngrid = aoR.shape[1] + if hasattr(mydf, "V_R"): + V_R = mydf.V_R + else: + V_R = None + # naux = aoRg.shape[1] + naux = mydf.naux + IP_ID = mydf.IP_ID + + buffer = mydf.jk_buffer + buffer1 = np.ndarray((nao,ngrid), dtype=dm.dtype, buffer=buffer, 
offset=0) + buffer2 = np.ndarray((naux,ngrid), dtype=dm.dtype, buffer=buffer, offset=nao * ngrid * dm.dtype.itemsize) + buffer3 = np.ndarray((naux,naux), dtype=dm.dtype, buffer=buffer, + offset=(nao * ngrid + naux * ngrid) * dm.dtype.itemsize) + buffer4 = np.ndarray((nao,nao), dtype=dm.dtype, buffer=buffer, offset=(nao * + ngrid + naux * ngrid + naux * naux) * dm.dtype.itemsize) + buffer5 = np.ndarray((naux,nao), dtype=dm.dtype, buffer=buffer, offset=0) + buffer6 = np.ndarray((naux,nao), dtype=dm.dtype, buffer=buffer, offset=nao * ngrid * dm.dtype.itemsize) + + #### step 1. get density value on real space grid and IPs + + # need allocate memory, size = nao * ngrid, this buffer does not need anymore (buffer 1) + + density_RgR = np.asarray(lib.dot(dm, aoR, c=buffer1), order='C') + + # need allocate memory, size = naux * ngrid (buffer 2) + + # density_RgR = np.asarray(lib.dot(aoRg.T, density_RgR, c=buffer2), order='C') + lib.ddot(aoRg.T, density_RgR, c=buffer2) + density_RgR = buffer2 + + # need allocate memory, size = naux * naux (buffer 3) + + density_RgRg = lib.ddot(dm, aoRg) + density_RgRg = lib.ddot(aoRg.T, density_RgRg) + + #### step 2. get K term1 and term2 + + ### todo: optimize the following 4 lines, it seems that they may not parallize! + + # tmp = V_R * density_RgR # pointwise multiplication, TODO: this term should be parallized + # do not need allocate memory, size = naux * ngrid, (buffer 2) + + # tmp = np.asarray(lib.cwise_mul(V_R, density_RgR, out=buffer2), order='C') + + # lib.cwise_mul(V_R, density_RgR, out=buffer2) + + K = None + + if with_robust_fitting: + lib.cwise_mul(V_R, density_RgR, out=buffer2) + tmp = buffer2 + + # do not need allocate memory, size = naux * nao, (buffer 1, but viewed as buffer5) + + K = np.asarray(lib.ddot_withbuffer(tmp, aoR.T, c=buffer5, buf=mydf.ddot_buf), order='C') + + ### the order due to the fact that naux << ngrid # need allocate memory, size = nao * nao, (buffer 4) + + K = np.asarray(lib.ddot_withbuffer(aoRg, K, c=buffer4, buf=mydf.ddot_buf), order='C') + + K += K.T + + #### step 3. get K term3 + + ### todo: optimize the following 4 lines, it seems that they may not parallize! 
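The todo above refers to the pointwise W contraction that follows. As a reference, the exchange matrix assembled by this routine reduces to the dense expression below (a sketch assuming plain NumPy arrays; the function name is illustrative, and the trailing ngrid/vol scaling applied at the end of the routine is omitted):

```python
import numpy as np

def k_isdf_reference(dm, aoR, aoRg, V_R, W, with_robust_fitting=True):
    # dm: (nao, nao), aoR: (nao, ngrid), aoRg: (nao, naux)
    # V_R: (naux, ngrid), W: (naux, naux); '*' is elementwise
    D_RgR  = aoRg.T @ dm @ aoR                  # density on (IP, grid) pairs
    D_RgRg = aoRg.T @ dm @ aoRg                 # density on (IP, IP) pairs
    if with_robust_fitting:
        K  = aoRg @ ((V_R * D_RgR) @ aoR.T)     # V term
        K += K.T
        K -= aoRg @ ((W * D_RgRg) @ aoRg.T)     # W correction
    else:
        K  = aoRg @ ((W * D_RgRg) @ aoRg.T)
    return K
```

Both terms share the pattern `aoRg @ (potential * pair_density) @ ao.T`; only the potential matrix (V_R or W) and the right-hand AO factor differ.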
+ # pointwise multiplication, do not need allocate memory, size = naux * naux, use buffer for (buffer 3) + # tmp = W * density_RgRg + + lib.cwise_mul(W, density_RgRg, out=density_RgRg) + tmp = density_RgRg + + # do not need allocate memory, size = naux * nao, use buffer 2 but viewed as buffer 6 + + tmp = np.asarray(lib.dot(tmp, aoRg.T, c=buffer6), order='C') + + # K -= np.asarray(lib.dot(aoRg, tmp, c=K, beta=1), order='C') # do not need allocate memory, size = nao * nao, (buffer 4) + + if with_robust_fitting: + lib.ddot_withbuffer(aoRg, -tmp, c=K, beta=1, buf=mydf.ddot_buf) + else: + K = buffer4 + lib.ddot_withbuffer(aoRg, tmp, c=K, beta=0, buf=mydf.ddot_buf) + + t2 = (logger.process_clock(), logger.perf_counter()) + + if mydf.verbose: + _benchmark_time(t1, t2, "_contract_k_dm", mydf) + + if K is None: + K = np.zeros((nao, nao)) + + return K * ngrid / vol + +def _contract_k_dm_wo_robust_fitting(mydf, dm, with_robust_fitting=False, use_mpi=False): + + assert with_robust_fitting == False + + if use_mpi: + raise NotImplementedError("MPI is not supported in this function") + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(cell.mesh) + + W = mydf.W + aoRg = mydf.aoRg + + naux = aoRg.shape[1] + + density_RgRg = lib.ddot(dm, aoRg) + density_RgRg = lib.ddot(aoRg.T, density_RgRg) + + lib.cwise_mul(W, density_RgRg, out=density_RgRg) + tmp = density_RgRg + tmp = np.asarray(lib.dot(tmp, aoRg.T), order='C') + if hasattr(mydf, "ddot_buf") and mydf.ddot_buf is not None: + K = lib.ddot_withbuffer(aoRg, tmp, buf=mydf.ddot_buf) + else: + K = lib.ddot(aoRg, tmp) + + t2 = (logger.process_clock(), logger.perf_counter()) + + # if mydf.verbose: + _benchmark_time(t1, t2, "_contract_k_dm_wo_robust_fitting", mydf) + + del tmp + tmp = None + del density_RgRg + density_RgRg = None + + return K * ngrid / vol # take care this factor + +def get_jk_dm(mydf, dm, hermi=1, kpt=np.zeros(3), + kpts_band=None, with_j=True, with_k=True, omega=None, + use_mpi = False, **kwargs): + + '''JK for given k-point''' + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 or dm.shape[0] == 2 + #dm = dm[0] + else: + assert dm.ndim == 2 + dm = dm.reshape(1, dm.shape[0], dm.shape[1]) + + nset = dm.shape[0] + + if hasattr(mydf, 'Ls'): + from pyscf.pbc.df.isdf.isdf_tools_densitymatrix import symmetrize_dm + dm = symmetrize_dm(dm, mydf.Ls) + else: + if hasattr(mydf, 'kmesh'): + from pyscf.pbc.df.isdf.isdf_tools_densitymatrix import symmetrize_dm + dm = symmetrize_dm(dm, mydf.kmesh) + + #### perform the calculation #### + + if mydf.jk_buffer is None: # allocate the buffer for get jk + mydf._allocate_jk_buffer(dm.dtype) + + if "exxdiv" in kwargs: + exxdiv = kwargs["exxdiv"] + else: + exxdiv = None + + #vj = vk = None + vj = np.zeros_like(dm) + vk = np.zeros_like(dm) + + if kpts_band is not None and abs(kpt-kpts_band).sum() > 1e-9: + raise NotImplementedError("ISDF does not support kpts_band != kpt") + + log = logger.Logger(mydf.stdout, mydf.verbose) + t1 = (logger.process_clock(), logger.perf_counter()) + + j_real = gamma_point(kpt) + k_real = gamma_point(kpt) and not np.iscomplexobj(dm) + + assert j_real + assert k_real + + mem_now = lib.current_memory()[0] + max_memory = max(2000, (mydf.max_memory - mem_now)) + + log.debug1('max_memory = %d MB (%d in use)', max_memory, mem_now) + + for iset in range(nset): + + if with_j: + if 
mydf.with_robust_fitting: + vj[iset] = _contract_j_dm_fast(mydf, dm[iset], mydf.with_robust_fitting, use_mpi) + else: + vj[iset] = _contract_j_dm_wo_robust_fitting(mydf, dm[iset], mydf.with_robust_fitting, use_mpi) + if with_k: + if mydf.with_robust_fitting: + vk[iset] = _contract_k_dm(mydf, dm[iset], mydf.with_robust_fitting, use_mpi) + else: + vk[iset] = _contract_k_dm_wo_robust_fitting(mydf, dm[iset], mydf.with_robust_fitting, use_mpi) + if exxdiv == 'ewald': + print("WARNING: ISDF does not support ewald") + + ##### the following code is added to deal with _ewald_exxdiv_for_G0 ##### + + from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks, _ewald_exxdiv_for_G0 + + kpts = kpt.reshape(1,3) + kpts = np.asarray(kpts) + dm_kpts = dm.copy() + dm_kpts = lib.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + if nset > 2: + logger.warn(mydf, 'nset > 2, please confirm what you are doing, for RHF nset == 1, for UHF nset == 2') + assert nkpts == 1 + + kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + nband = len(kpts_band) + assert nband == 1 + + if is_zero(kpts_band) and is_zero(kpts): + vk = vk.reshape(nset,nband,nao,nao) + else: + raise NotImplementedError("ISDF does not support kpts_band != 0") + + if exxdiv == 'ewald': + _ewald_exxdiv_for_G0(mydf.cell, kpts, dms, vk, kpts_band=kpts_band) + + vk = vk.reshape(nset,nao,nao) + + t1 = log.timer('sr jk', *t1) + + return vj, vk \ No newline at end of file diff --git a/pyscf/isdf/isdf_libdmet_tran_2e.py b/pyscf/isdf/isdf_libdmet_tran_2e.py new file mode 100644 index 000000000..22c169bbe --- /dev/null +++ b/pyscf/isdf/isdf_libdmet_tran_2e.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +############ sys module ############ + +import numpy, scipy +import numpy as np +import ctypes + +############ pyscf module ############ + +from pyscf import lib +from pyscf import ao2mo +from pyscf.ao2mo.incore import iden_coeffs +from pyscf.pbc import tools +from pyscf.pbc.lib import kpts_helper +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, unique +from pyscf import __config__ +from pyscf.pbc.df.fft_ao2mo import _format_kpts, _iskconserv, _contract_compact +libisdf = lib.load_library('libisdf') + +############ isdf utils ############ + +from pyscf.isdf.isdf_tools_local import aoR_Holder +from pyscf.isdf.isdf_jk import _benchmark_time +from pyscf.isdf.isdf_local_k import PBC_ISDF_Info_Quad_K + + +def _aoR_full_col(mydf): + ''' + return aoR[:, :ngrid_prim] for the supercell system + ''' + + assert isinstance(mydf, PBC_ISDF_Info_Quad_K) + + fn_pack = getattr(libisdf, "_Pack_Matrix_SparseRow_DenseCol", None) + assert fn_pack is not None + + prim_cell = mydf.primCell + prim_mesh = prim_cell.mesh + prim_ngrid = np.prod(prim_mesh) + prim_natm = mydf.natmPrim + + assert len(mydf.aoR) == prim_natm + + res = np.zeros((mydf.nao, prim_ngrid), dtype=np.float64) + + for i in range(prim_natm): + aoR_i = mydf.aoR[i] + ao_involved_i = aoR_i.ao_involved + nao_i = aoR_i.aoR.shape[0] + global_grid_begin_i = aoR_i.global_gridID_begin + ngrid_i = aoR_i.aoR.shape[1] + + fn_pack( + res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(res.shape[0]), + ctypes.c_int(res.shape[1]), + aoR_i.aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(ngrid_i), + ao_involved_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(global_grid_begin_i), + ctypes.c_int(global_grid_begin_i+ngrid_i) + ) + + return res + +def _aoRg_full_col(mydf): + ''' + return aoR[:, :ngrid_prim] for the supercell system + ''' + + assert isinstance(mydf, PBC_ISDF_Info_Quad_K) + + fn_pack = getattr(libisdf, "_Pack_Matrix_SparseRow_DenseCol", None) + assert fn_pack is not None + + prim_cell = mydf.primCell + prim_mesh = prim_cell.mesh + prim_ngrid = np.prod(prim_mesh) + prim_natm = mydf.natmPrim + prim_nIP = mydf.nIP_Prim + + assert len(mydf.aoR) == prim_natm + + res = np.zeros((mydf.nao, prim_nIP), dtype=np.float64) + + for i in range(mydf.natmPrim): + aoRg_i = mydf.aoRg[i] + ao_involved_i = aoRg_i.ao_involved + nao_i = aoRg_i.aoR.shape[0] + global_IP_begin_i = aoRg_i.global_gridID_begin + nIP_i = aoRg_i.aoR.shape[1] + + fn_pack( + res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(res.shape[0]), + ctypes.c_int(res.shape[1]), + aoRg_i.aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(nIP_i), + ao_involved_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(global_IP_begin_i), + ctypes.c_int(global_IP_begin_i+nIP_i) + ) + + return res + +######## copy from libdmet ######## + +def eri_restore(eri, symmetry, nemb): + """ + Restore eri with given permutation symmetry. 
+ """ + spin_pair = eri.shape[0] + if spin_pair == 1: + eri_res = ao2mo.restore(symmetry, eri[0].real, nemb) + else: + if symmetry == 4: + nemb_pair = nemb*(nemb+1) // 2 + if eri.size == spin_pair * nemb_pair * nemb_pair: + eri_res = eri.real.reshape(spin_pair, nemb_pair, nemb_pair) + else: + eri_res = np.empty((spin_pair, nemb_pair, nemb_pair)) + for s in range(spin_pair): + eri_res[s] = ao2mo.restore(symmetry, eri[s].real, nemb) + elif symmetry == 1: + if eri.size == spin_pair * nemb**4: + eri_res = eri.real.reshape(spin_pair, nemb, nemb, nemb, nemb) + else: + eri_res = np.empty((spin_pair, nemb, nemb, nemb, nemb)) + for s in range(spin_pair): + eri_res[s] = ao2mo.restore(symmetry, eri[s].real, nemb) + else: + raise ValueError("Spin unrestricted ERI does not support 8-fold symmetry.") + eri_res = np.asarray(eri_res, order='C') + return eri_res + +def get_emb_eri_isdf(mydf, C_ao_emb:np.ndarray=None, symmetry=4): + + ''' + get eri for embedding system + ''' + + #### preprocess #### + + assert isinstance(mydf, PBC_ISDF_Info_Quad_K) + assert not mydf.direct + + if C_ao_emb.ndim == 2: + C_ao_emb = C_ao_emb.reshape(1, *C_ao_emb.shape) + assert C_ao_emb.ndim == 3 + assert C_ao_emb.dtype == np.float64 ## supercell basis + + nspin, nao_full, nemb = C_ao_emb.shape + + print("nspin = ", nspin) + print("nao_full = ", nao_full) + print("nemb = ", nemb) + + supercell = mydf.cell + print("supercell.nao = ", supercell.nao) + assert supercell.nao == nao_full + + ngrid = mydf.ngrids + vol = supercell.vol + mesh_prim = mydf.primCell.mesh + ngrid_prim = np.prod(mesh_prim) + nao_prim = mydf.nao_prim + nIP_prim = mydf.nIP_Prim + kmesh = mydf.kmesh + nkpts = np.prod(kmesh) + nIP = mydf.naux + + with_robust_fitting = mydf.with_robust_fitting + + #eri = np.zeros((nspin*(nspin+1)//2, nemb, nemb, nemb, nemb), dtype=np.float64) ## the ordering of spin is aa, bb, ab + eri = np.zeros((nspin*(nspin+1)//2, nemb**2, nemb**2), dtype=np.float64) ## the ordering of spin is aa, bb, ab + + ### emb values on grid and IPs ### + + emb_R = [] + emb_Rg= [] + for i in range(nspin): + emb_R.append([]) + emb_Rg.append([]) + + if with_robust_fitting: + aoR_fullcol = _aoR_full_col(mydf) + assert aoR_fullcol.shape == (nao_full, ngrid_prim) + aoRg_fullcol = _aoRg_full_col(mydf) + assert aoRg_fullcol.shape == (nao_full, nIP_prim) + + aoR_tmp = np.zeros_like(aoR_fullcol) + aoRg_tmp = np.zeros_like(aoRg_fullcol) + + for kx in range(kmesh[0]): + for ky in range(kmesh[1]): + for kz in range(kmesh[2]): + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + + ILOC = ix*kmesh[1]*kmesh[2] + iy*kmesh[2] + iz + ix_ = (ix + kx) % kmesh[0] + iy_ = (iy + ky) % kmesh[1] + iz_ = (iz + kz) % kmesh[2] + ILOC_ = ix_*kmesh[1]*kmesh[2] + iy_*kmesh[2] + iz_ + + if with_robust_fitting: + aoR_tmp[ILOC_*nao_prim:(ILOC_+1)*nao_prim,:] = aoR_fullcol[ILOC*nao_prim:(ILOC+1)*nao_prim,:] + aoRg_tmp[ILOC_*nao_prim:(ILOC_+1)*nao_prim,:] = aoRg_fullcol[ILOC*nao_prim:(ILOC+1)*nao_prim,:] + + for i in range(nspin): + if with_robust_fitting: + emb_R[i].append(np.dot(C_ao_emb[i].T, aoR_tmp)) + emb_Rg[i].append(np.dot(C_ao_emb[i].T, aoRg_tmp)) + + + ### V_R term ### + + #V_R = mydf.V_R + #assert V_R.shape == (nIP_prim, ngrid) + + tmp_V = np.zeros((nspin, nIP, nemb*nemb), dtype=np.float64) + + def _construct_tmp_V_W(Is_V=False): + + tmp_V.ravel()[:] = 0.0 + + if Is_V: + V = mydf.V_R + ngrid_per_box = ngrid_prim + _emb_R = emb_R + else: + V = mydf.W + ngrid_per_box = nIP_prim + _emb_R = emb_Rg + + for kx in range(kmesh[0]): + for ky in 
range(kmesh[1]): + for kz in range(kmesh[2]): + + ILOC = kx*kmesh[1]*kmesh[2] + ky*kmesh[2] + kz + + for i in range(nspin): + + _emb_pair = np.einsum('iP,jP->ijP', _emb_R[i][ILOC], _emb_R[i][ILOC]) + _emb_pair = _emb_pair.reshape(nemb*nemb, ngrid_per_box) + # _tmp_V = lib.ddot(V[:,ILOC*ngrid_per_box:(ILOC+1)*ngrid_per_box],_emb_pair.T) + + ## another pass to account for the transposition ## + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + + ix_ = (kx-ix+kmesh[0]) % kmesh[0] + iy_ = (ky-iy+kmesh[1]) % kmesh[1] + iz_ = (kz-iz+kmesh[2]) % kmesh[2] + + ILOC_ = ix_*kmesh[1]*kmesh[2] + iy_*kmesh[2] + iz_ + ILOC = ix *kmesh[1]*kmesh[2] + iy *kmesh[2] + iz + + lib.ddot( + a=V[:,ILOC_*ngrid_per_box:(ILOC_+1)*ngrid_per_box], + b=_emb_pair.T, + alpha=1.0, + c=tmp_V[i][ILOC*nIP_prim:(ILOC+1)*nIP_prim,:], + beta=1.0) + + def _the_last_pass(plus): + + if plus: + alpha = 1 + else: + alpha =-1 + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + + ILOC = ix*kmesh[1]*kmesh[2] + iy*kmesh[2] + iz + + if nspin == 1: + + emb_pair_Rg = np.einsum('iP,jP->ijP', emb_Rg[0][ILOC], emb_Rg[0][ILOC]) + emb_pair_Rg = emb_pair_Rg.reshape(nemb*nemb, nIP_prim) + + lib.ddot( + a = emb_pair_Rg, + b = tmp_V[0][ILOC*nIP_prim:(ILOC+1)*nIP_prim,:], + alpha = alpha, + c = eri[0], + beta = 1 + ) + else: + if nspin == 2: + + emb_pair_Rg_alpha = np.einsum('iP,jP->ijP', emb_Rg[0][ILOC], emb_Rg[0][ILOC]) + emb_pair_Rg_beta = np.einsum('iP,jP->ijP', emb_Rg[1][ILOC], emb_Rg[1][ILOC]) + + lib.ddot( + a = emb_pair_Rg_alpha, + b = tmp_V[0][ILOC*nIP_prim:(ILOC+1)*nIP_prim,:], + alpha = alpha, + c = eri[0], + beta = 1 + ) + + lib.ddot( + a = emb_pair_Rg_beta, + b = tmp_V[1][ILOC*nIP_prim:(ILOC+1)*nIP_prim,:], + alpha = alpha, + c = eri[1], + beta = 1 + ) + + lib.ddot( + a = emb_pair_Rg_alpha, + b = tmp_V[1][ILOC*nIP_prim:(ILOC+1)*nIP_prim,:], + alpha = alpha, + c = eri[2], + beta = 1 + ) + + else: + raise ValueError("nspin > 2 is not supported") + + if with_robust_fitting: + + _construct_tmp_V_W(True) + _the_last_pass(plus=True) + nspinpair = nspin*(nspin+1)//2 + + for i in range(nspinpair): + eri[i] += eri[i].T + + ### W term ### + + _construct_tmp_V_W(False) + if with_robust_fitting: + _the_last_pass(plus=False) + else: + _the_last_pass(plus=True) + + #### post process #### + + # reshape the eri + + eri = eri.reshape(nspin*(nspin+1)//2, nemb, nemb, nemb, nemb) + eri = eri_restore(eri, symmetry, nemb) + + return eri * ngrid / vol + + +if __name__ == "__main__": + + from isdf_tools_cell import build_supercell, build_supercell_with_partition + C = 25 + + verbose = 10 + import pyscf.pbc.gto as pbcgto + + cell = pbcgto.Cell() + boxlen = 3.5668 + cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + prim_a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + atm = [ + ['C', (0. , 0. , 0. )], + ['C', (0.8917 , 0.8917 , 0.8917)], + ['C', (1.7834 , 1.7834 , 0. )], + ['C', (2.6751 , 2.6751 , 0.8917)], + ['C', (1.7834 , 0. , 1.7834)], + ['C', (2.6751 , 0.8917 , 2.6751)], + ['C', (0. 
, 1.7834 , 1.7834)], + ['C', (0.8917 , 2.6751 , 2.6751)], + ] + + KE_CUTOFF = 70 + basis = 'gth-szv' + + prim_cell = build_supercell(atm, prim_a, Ls = [1,1,1], basis=basis, ke_cutoff=KE_CUTOFF) + prim_mesh = prim_cell.mesh + # prim_partition = [[0], [1], [2], [3], [4], [5], [6], [7]] + # prim_partition = [[0,1,2,3,4,5,6,7]] + prim_partition = [[0,1],[2,3],[4,5],[6,7]] + + Ls = [1, 2, 2] + kpts = prim_cell.make_kpts(Ls) + Ls = np.array(Ls, dtype=np.int32) + mesh = [Ls[0] * prim_mesh[0], Ls[1] * prim_mesh[1], Ls[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell, group_partition = build_supercell_with_partition(atm, prim_a, mesh=mesh, + Ls=Ls, + basis=basis, + #pseudo=pseudo, + partition=prim_partition, ke_cutoff=KE_CUTOFF, verbose=verbose) + + pbc_isdf_info = PBC_ISDF_Info_Quad_K(prim_cell, kmesh=Ls, with_robust_fitting=True, aoR_cutoff=1e-8, + # direct=True, + direct=False, + rela_cutoff_QRCP=1e-4, + limited_memory=True, build_K_bunchsize=32) + pbc_isdf_info.build_IP_local(c=C, m=5, group=prim_partition, Ls=[Ls[0]*10, Ls[1]*10, Ls[2]*10]) + pbc_isdf_info.verbose = 10 + pbc_isdf_info.build_auxiliary_Coulomb(debug=True) + + # print("grid_segment = ", pbc_isdf_info.grid_segment) + + from pyscf.pbc import scf + + mf = scf.KRHF(prim_cell, kpts) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 16 + mf.conv_tol = 1e-7 + + mf.kernel() + + nao_full = pbc_isdf_info.cell.nao + nao_emb = nao_full // 5 + C_ao_emb = np.random.rand(nao_full, nao_emb) + + eri_emb = get_emb_eri_isdf(pbc_isdf_info, C_ao_emb, symmetry=4) + + supercell = pbc_isdf_info.cell + + from pyscf.isdf.isdf_local import PBC_ISDF_Info_Quad + + pbc_isdf_info2 = PBC_ISDF_Info_Quad(supercell, with_robust_fitting=True, + aoR_cutoff=1e-8, + direct=False, + # direct=True, + limited_memory=True, build_K_bunchsize=32, + use_occ_RI_K=False, rela_cutoff_QRCP=1e-4) + + pbc_isdf_info2.build_IP_local(c=C, m=5, group=group_partition) + pbc_isdf_info2.build_auxiliary_Coulomb() + + eri_emb_benchmark = pbc_isdf_info2.ao2mo(C_ao_emb) + + assert eri_emb.shape == eri_emb_benchmark.shape + + diff = np.linalg.norm(eri_emb - eri_emb_benchmark) + print("diff = ", diff) + max_diff = np.max(np.abs(eri_emb - eri_emb_benchmark)) + print("max_diff = ", max_diff) + + # print("eri_emb.shape = ", eri_emb.shape) + # print("eri_emb = ", eri_emb[0,0],eri_emb[0,1]) + # print("eri_emb_benchmark = ", eri_emb_benchmark[0,0], eri_emb_benchmark[0,1]) + # for i in range(eri_emb.shape[0]): + # for j in range(eri_emb.shape[1]): + # print(eri_emb[i,j], eri_emb_benchmark[i,j], eri_emb[i,j]/eri_emb_benchmark[i,j]) diff --git a/pyscf/isdf/isdf_local.py b/pyscf/isdf/isdf_local.py new file mode 100644 index 000000000..8c6b39ed4 --- /dev/null +++ b/pyscf/isdf/isdf_local.py @@ -0,0 +1,1692 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +############ sys module ############ + +import copy +import numpy as np +import scipy +import ctypes, sys + +############ pyscf module ############ + +from pyscf import lib +from pyscf.pbc.gto import Cell +from pyscf.pbc import tools +from pyscf.gto.mole import * +libisdf = lib.load_library('libisdf') + +############ isdf utils ############ + +from pyscf.isdf.isdf_jk import _benchmark_time +import pyscf.isdf.isdf_fast as ISDF +from pyscf.isdf.isdf_eval_gto import ISDF_eval_gto +import pyscf.isdf.isdf_tools_local as ISDF_Local_Utils +import pyscf.isdf.isdf_local_jk as ISDF_Local_JK +import pyscf.isdf.isdf_tools_linearop as lib_isdf + +##### all the involved algorithm in ISDF based on aoR_Holder ##### + +USE_SCIPY_QR = False ## true for single-thread mode to compare with Kori's code +USE_SCIPY_CHOLESKY = True +assert USE_SCIPY_CHOLESKY == True + +############ subroutines --- select IP ############ + +############ ls refers to linear scaling ############ + +def select_IP_atm_ls(mydf, + c:int, m:int, + first_natm = None, + rela_cutoff = 0.0, + no_retriction_on_nIP = False, + use_mpi = False): + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size + else: + rank = 0 + comm = None + comm_size = 1 + + assert isinstance(mydf.aoR, list) + assert isinstance(mydf.partition, list) + + ### determine the largest grids point of one atm ### + + log = lib.logger.Logger(mydf.cell.stdout, mydf.cell.verbose) + + natm = mydf.cell.natm + nao = mydf.nao + naux_max = 0 + + nao_per_atm = np.zeros((natm), dtype=np.int32) + for i in range(mydf.nao): + atm_id = mydf.ao2atomID[i] + nao_per_atm[atm_id] += 1 + + for nao_atm in nao_per_atm: + naux_max = max(naux_max, int(np.sqrt(c*nao_atm)) + m) + + nthread = lib.num_threads() + + ### loop over atm ### + + coords = mydf.coords + assert coords is not None + + results = [] + + fn_colpivot_qr = getattr(libisdf, "ColPivotQRRelaCut", None) + assert(fn_colpivot_qr is not None) + fn_ik_jk_ijk = getattr(libisdf, "NP_d_ik_jk_ijk", None) + assert(fn_ik_jk_ijk is not None) + + weight = np.sqrt(mydf.cell.vol / coords.shape[0]) + + if first_natm is None: + first_natm = natm + + group_begin, group_end = ISDF_Local_Utils._range_partition(first_natm, rank, comm_size, use_mpi) + + for i in range(first_natm): + results.append(None) + + aoR_atm1 = None + aoR_atm2 = None + aoPairBuffer = None + R = None + thread_buffer = None + global_buffer = None + + log.debug4("-------------------------------------------") + + for atm_id in range(group_begin, group_end): + + aoR = mydf.aoR[atm_id] + if aoR is None: # it is used to split the task when using MPI + continue + + grid_ID = mydf.partition[atm_id] + aoR_atm = mydf.aoR[atm_id].aoR + nao_tmp = aoR_atm.shape[0] + + # create buffer for this atm + + dtypesize = aoR_atm.dtype.itemsize + nao_atm = nao_per_atm[atm_id] + naux_now = int(np.sqrt(c*nao_atm)) + m + naux2_now = naux_now * naux_now + + R = np.ndarray((naux2_now, grid_ID.shape[0]), dtype=np.float64) + + aoR_atm1 = np.ndarray((naux_now, grid_ID.shape[0]), dtype=np.float64) + aoR_atm2 = np.ndarray((naux_now, grid_ID.shape[0]), dtype=np.float64) + + aoPairBuffer = np.ndarray( + (naux_now*naux_now, grid_ID.shape[0]), dtype=np.float64) + + G1 = np.random.rand(nao_tmp, naux_now) + G1, _ = numpy.linalg.qr(G1) + G1 = G1.T + G2 = np.random.rand(nao_tmp, naux_now) + G2, _ = numpy.linalg.qr(G2) + G2 = G2.T + + lib.dot(G1, aoR_atm, c=aoR_atm1) + lib.dot(G2, aoR_atm, c=aoR_atm2) + + fn_ik_jk_ijk(aoR_atm1.ctypes.data_as(ctypes.c_void_p), + 
aoR_atm2.ctypes.data_as(ctypes.c_void_p), + aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux_now), + ctypes.c_int(naux_now), + ctypes.c_int(grid_ID.shape[0])) + if no_retriction_on_nIP: + max_rank = min(naux2_now, grid_ID.shape[0]) + log.debug4("In select_IP_atm_ls, no_retriction_on_nIP") + else: + max_rank = min(naux2_now, grid_ID.shape[0], nao_atm * c + m) + log.debug4("In select_IP_atm_ls, retriction_on_nIP") + npt_find = ctypes.c_int(0) + pivot = np.arange(grid_ID.shape[0], dtype=np.int32) + thread_buffer = np.ndarray((nthread+1, grid_ID.shape[0]+1), dtype=np.float64) + global_buffer = np.ndarray((1, grid_ID.shape[0]), dtype=np.float64) + + log.debug4("In select_IP_atm_ls, max_rank = %d" % (max_rank)) + log.debug4("In select_IP_atm_ls, naux2_now = %d" % (naux2_now)) + log.debug4("In select_IP_atm_ls, grid_ID.shape = %s" % (grid_ID.shape)) + log.debug4("In select_IP_atm_ls, rela_cutoff = %e" % (rela_cutoff)) + + if USE_SCIPY_QR: + R, pivot = scipy.linalg.qr(aoPairBuffer, pivoting=True, mode='r', check_finite=False, overwrite_a=True) + npt_find = nao_atm * c + m + else: + fn_colpivot_qr(aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux2_now), + ctypes.c_int(grid_ID.shape[0]), + ctypes.c_int(max_rank), + ctypes.c_double(1e-14), + ctypes.c_double(rela_cutoff), + pivot.ctypes.data_as(ctypes.c_void_p), + R.ctypes.data_as(ctypes.c_void_p), + ctypes.byref(npt_find), + thread_buffer.ctypes.data_as(ctypes.c_void_p), + global_buffer.ctypes.data_as(ctypes.c_void_p)) + npt_find = npt_find.value + cutoff = abs(R[npt_find-1, npt_find-1]) + log.debug4("ngrid = %d, npt_find = %d, cutoff = %12.6e" % (grid_ID.shape[0], npt_find, cutoff)) + + pivot = pivot[:npt_find] + pivot.sort() + + atm_IP = grid_ID[pivot] + atm_IP = np.array(atm_IP, dtype=np.int32) + atm_IP.sort() + results[atm_id] = atm_IP + + log.debug4("In select_IP_atm_ls, npt_find = %d" %(npt_find)) + log.debug4("-------------------------------------------") + + del aoR_atm1 + del aoR_atm2 + del aoPairBuffer + del R + del thread_buffer + del global_buffer + + if use_mpi: + results = ISDF_Local_Utils._sync_list(results, first_natm) + + assert len(results) == first_natm + + return results + +def select_IP_group_ls(mydf, aoRg_possible, c:int, m:int, group=None, atm_2_IP_possible = None): + + assert isinstance(aoRg_possible, list) + assert isinstance(group, list) or isinstance(group, np.ndarray) + assert isinstance(atm_2_IP_possible, list) + + assert len(aoRg_possible) == len(atm_2_IP_possible) + # assert len(aoRg_possible) == mydf.natm + + log = lib.logger.Logger(mydf.cell.stdout, mydf.cell.verbose) + + if group is None: + raise ValueError("group must be specified") + + #if mydf.verbose: + # print("In select_IP, num_threads = ", lib.num_threads()) + + nthread = lib.num_threads() + + coords = mydf.coords + + fn_colpivot_qr = getattr(libisdf, "ColPivotQRRelaCut", None) + assert(fn_colpivot_qr is not None) + fn_ik_jk_ijk = getattr(libisdf, "NP_d_ik_jk_ijk", None) + assert(fn_ik_jk_ijk is not None) + + weight = np.sqrt(mydf.cell.vol / coords.shape[0]) + + #### perform QRCP #### + + nao_group = 0 + for atm_id in group: + shl_begin = mydf.shl_atm[atm_id][0] + shl_end = mydf.shl_atm[atm_id][1] + nao_atm = mydf.aoloc_atm[shl_end] - mydf.aoloc_atm[shl_begin] + nao_group += nao_atm + + ##### random projection ##### + + nao = mydf.nao + + # aoR_atm = ISDF_eval_gto(mydf.cell, coords=coords[IP_possible]) * weight + + aoRg_unpacked = [] + for atm_id in group: + aoRg_unpacked.append(aoRg_possible[atm_id]) + if len(aoRg_unpacked) == 
1: + aoRg_packed = aoRg_unpacked[0].aoR + else: + aoRg_packed = ISDF_Local_Utils._pack_aoR_holder(aoRg_unpacked, nao).aoR + + nao = aoRg_packed.shape[0] + + log.debug4("In select_IP_group_ls, nao_group = %d" % (nao_group)) + log.debug4("In select_IP_group_ls, nao = %d" % (nao)) + log.debug4("In select_IP_group_ls, c = %d, m = %d" % (c, m)) + log.debug4("In select_IP_group_ls, rela_cutoff = %e" % (mydf.rela_cutoff_QRCP)) + + # naux_now = int(np.sqrt(c*nao)) + m # seems to be too large + naux_now = int(np.sqrt(c*nao_group)) + m + G1 = np.random.rand(nao, naux_now) + G1, _ = numpy.linalg.qr(G1) + G1 = G1.T + + G2 = np.random.rand(nao, naux_now) + G2, _ = numpy.linalg.qr(G2) + G2 = G2.T + # naux_now = nao + + aoR_atm1 = lib.ddot(G1, aoRg_packed) + naux_now1 = aoR_atm1.shape[0] + aoR_atm2 = lib.ddot(G2, aoRg_packed) + naux_now2 = aoR_atm2.shape[0] + + naux2_now = naux_now1 * naux_now2 + + R = np.ndarray((naux2_now, aoRg_packed.shape[1]), dtype=np.float64) + + aoPairBuffer = np.ndarray((naux2_now, aoRg_packed.shape[1]), dtype=np.float64) + + fn_ik_jk_ijk(aoR_atm1.ctypes.data_as(ctypes.c_void_p), + aoR_atm2.ctypes.data_as(ctypes.c_void_p), + aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux_now1), + ctypes.c_int(naux_now2), + ctypes.c_int(aoRg_packed.shape[1])) + + aoR_atm1 = None + aoR_atm2 = None + del aoR_atm1 + del aoR_atm2 + + IP_possible = [] + for atm_id in group: + if atm_2_IP_possible[atm_id] is None: + continue + IP_possible.extend(atm_2_IP_possible[atm_id]) + IP_possible = np.array(IP_possible, dtype=np.int32) + + if mydf.no_restriction_on_nIP: + max_rank = min(naux2_now, IP_possible.shape[0]) + log.debug4("In select_IP_group_ls, no_restriction_on_nIP") + else: + max_rank = min(naux2_now, IP_possible.shape[0], nao_group * c) + log.debug4("In select_IP_group_ls, restriction_on_nIP") + + log.debug4("In select_IP_group_ls, naux2_now = %d, max_rank = %d" % (naux2_now, max_rank)) + log.debug4("In select_IP_group_ls, IP_possible.shape = %s" % (IP_possible.shape)) + log.debug4("In select_IP_group_ls, nao_group = %d" % (nao_group)) + log.debug4("In select_IP_group_ls, c = %d" % (c)) + log.debug4("In select_IP_group_ls, nao_group * c = %d" % (nao_group * c)) + + npt_find = ctypes.c_int(0) + pivot = np.arange(IP_possible.shape[0], dtype=np.int32) + + thread_buffer = np.ndarray((nthread+1, IP_possible.shape[0]+1), dtype=np.float64) + global_buffer = np.ndarray((1, IP_possible.shape[0]), dtype=np.float64) + + + if not USE_SCIPY_QR: + fn_colpivot_qr(aoPairBuffer.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux2_now), + ctypes.c_int(IP_possible.shape[0]), + ctypes.c_int(max_rank), + ctypes.c_double(1e-14), + ctypes.c_double(mydf.rela_cutoff_QRCP), + pivot.ctypes.data_as(ctypes.c_void_p), + R.ctypes.data_as(ctypes.c_void_p), + ctypes.byref(npt_find), + thread_buffer.ctypes.data_as(ctypes.c_void_p), + global_buffer.ctypes.data_as(ctypes.c_void_p)) + npt_find = npt_find.value + cutoff = abs(R[npt_find-1, npt_find-1]) + log.debug4("ngrid = %d, npt_find = %d, cutoff = %12.6e" % (IP_possible.shape[0], npt_find, cutoff)) + else: + # pivot, rankc = scipy.linalg.lapack.dpstrf(aoPairBuffer)[1:3] + # pivot = pivot[:rankc]-1 + # npt_find = nao_group * c + R, pivot = scipy.linalg.qr(aoPairBuffer, pivoting=True, mode='r', check_finite=False, overwrite_a=True) + npt_find = nao_group * c + + log.debug4("In select_IP_group_ls, npt_find = %d" % (npt_find)) + + pivot = pivot[:npt_find] + pivot.sort() + results = list(IP_possible[pivot]) + results = np.array(results, dtype=np.int32) + + ### clean up 
### + + del aoPairBuffer + del R + del thread_buffer + del global_buffer + del G1 + del G2 + del aoRg_packed + del IP_possible + aoRg_packed = None + IP_possible = None + aoPairBuffer = None + R = None + pivot = None + thread_buffer = None + global_buffer = None + + return results + +def select_IP_local_ls_drive(mydf, c, m, IP_possible_atm, group, use_mpi=False): + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size + else: + rank = 0 + comm = None + comm_size = 1 + + IP_group = [] + + aoRg_possible = mydf.aoRg_possible + + ######### allocate buffer ######### + + natm = mydf.natm + + for i in range(len(group)): + IP_group.append(None) + + if len(group) < natm: + + if use_mpi == False: + for i in range(len(group)): + IP_group[i] = select_IP_group_ls(mydf, aoRg_possible, c, m, group=group[i], atm_2_IP_possible=IP_possible_atm) + else: + group_begin, group_end = ISDF_Local_Utils._range_partition(len(group), rank, comm_size, use_mpi) + for i in range(group_begin, group_end): + IP_group[i] = select_IP_group_ls(mydf, aoRg_possible, c, m, group=group[i], atm_2_IP_possible=IP_possible_atm) + # allgather(IP_group) + + IP_group = ISDF_Local_Utils._sync_list(IP_group, len(group)) + + else: + IP_group = IP_possible_atm + + mydf.IP_group = IP_group + + mydf.IP_flat = [] + mydf.IP_segment = [0] + nIP_now = 0 + for x in IP_group: + mydf.IP_flat.extend(x) + nIP_now += len(x) + mydf.IP_segment.append(nIP_now) + mydf.IP_flat = np.array(mydf.IP_flat, dtype=np.int32) + mydf.naux = mydf.IP_flat.shape[0] + + gridID_2_atmID = mydf.gridID_2_atmID + + partition_IP = [] + for i in range(natm): + partition_IP.append([]) + + for _ip_id_ in mydf.IP_flat: + atm_id = gridID_2_atmID[_ip_id_] + partition_IP[atm_id].append(_ip_id_) + + for i in range(natm): + partition_IP[i] = np.array(partition_IP[i], dtype=np.int32) + partition_IP[i].sort() + + mydf.partition_IP = partition_IP + + ### build ### + + if len(group) < natm: + + coords = mydf.coords + weight = np.sqrt(mydf.cell.vol / mydf.coords.shape[0]) + + del mydf.aoRg_possible + mydf.aoRg_possible = None + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + mydf.aoRg = mydf._construct_build_aoRg(partition_IP, group) + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + _benchmark_time(t1, t2, "build_aoRg", mydf) + + else: + if use_mpi: + mydf.aoRg = mydf.aoRg_possible + else: + mydf.aoRg = mydf.aoRg_possible + + if rank == 0: + memory = ISDF_Local_Utils._get_aoR_holders_memory(mydf.aoRg) + log = lib.logger.Logger(mydf.cell.stdout, mydf.cell.verbose) + log.info("memory to store aoRg is %d " %(memory)) + + return IP_group + +############ subroutines --- build aux bas ############ + +def find_common_elements_positions(arr1, arr2): + position1 = [] + position2 = [] + i, j = 0, 0 + while i < len(arr1) and j < len(arr2): + if arr1[i] < arr2[j]: + i += 1 + elif arr1[i] > arr2[j]: + j += 1 + else: + # positions.append(((i, arr1[i]), (j, arr2[j]))) + position1.append(i) + position2.append(j) + i += 1 + j += 1 + return np.array(position1, dtype=np.int32), np.array(position2, dtype=np.int32) + +def build_aux_basis_ls(mydf, group, IP_group, debug=True, use_mpi=False): + + log = lib.logger.Logger(mydf.cell.stdout, mydf.cell.verbose) + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size + else: + rank = 0 + comm = None + comm_size = 1 + + ###### split task ###### + + ngroup = len(group) + nthread = lib.num_threads() + assert len(IP_group) == ngroup + + group_begin, group_end = 
ISDF_Local_Utils._range_partition(ngroup, rank, comm_size, use_mpi) + + ngroup_local = group_end - group_begin + + if ngroup_local == 0: + log.warn(" WARNING : rank = %d, ngroup_local = 0" % rank) + + mydf.group_begin = group_begin + mydf.group_end = group_end + + ###### build grid_ID_local ###### + + coords = mydf.coords + + ###### build aux basis ###### + + mydf.aux_basis = [] + + for i in range(ngroup): + mydf.aux_basis.append(None) + + if not USE_SCIPY_CHOLESKY: + fn_cholesky = getattr(libisdf, "Cholesky", None) + assert (fn_cholesky is not None) + fn_build_aux = getattr(libisdf, "Solve_LLTEqualB_Parallel", None) + assert(fn_build_aux is not None) + + for i in range(group_begin, group_end): + + aoRg_unpacked = [] + aoR_unpacked = [] + + for atm_id in group[i]: + aoRg_unpacked.append(mydf.aoRg[atm_id]) + aoR_unpacked.append(mydf.aoR[atm_id]) + + aoRg1 = ISDF_Local_Utils._pack_aoR_holder(aoRg_unpacked, mydf.nao) + aoR1 = ISDF_Local_Utils._pack_aoR_holder(aoR_unpacked, mydf.nao) + + if aoRg1.aoR.shape[0] == aoR1.aoR.shape[0]: + aoRg1 = aoRg1.aoR + aoR1 = aoR1.aoR + else: + pos1, pos2 = find_common_elements_positions(aoRg1.ao_involved, aoR1.ao_involved) + assert len(pos1) == aoRg1.aoR.shape[0] + aoRg1 = aoRg1.aoR + aoR1 = aoR1.aoR[pos2,:] + + + A = lib.ddot(aoRg1.T, aoRg1) + lib_isdf.square_inPlace(A) + grid_ID = mydf.partition_group_to_gridID[i] + B = lib.ddot(aoRg1.T, aoR1) + lib_isdf.square_inPlace(B) + + if not USE_SCIPY_CHOLESKY: + print("SCIPY is not called") + fn_cholesky( + A.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(A.shape[0]), + ) + nThread = lib.num_threads() + bunchsize = B.shape[1]//nThread + fn_build_aux( + ctypes.c_int(B.shape[0]), + A.ctypes.data_as(ctypes.c_void_p), + B.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(B.shape[1]), + ctypes.c_int(bunchsize) + ) + else: + # print("SCIPY is called") + C = scipy.linalg.cholesky(A, lower=True, overwrite_a=True, check_finite=False) + B = scipy.linalg.cho_solve((C, True), B, overwrite_b=True, check_finite=False) + + mydf.aux_basis[i] = B.copy() + # exit(1) + + ### sync aux_basis ### + + if use_mpi: + mydf.aux_basis = ISDF_Local_Utils._sync_list(mydf.aux_basis, ngroup) + + del A + A = None + del B + B = None + del aoRg1 + aoRg1 = None + del aoR1 + aoR1 = None + +def build_auxiliary_Coulomb_local_bas_wo_robust_fitting(mydf, debug=True, use_mpi=False): + + if use_mpi: + raise NotImplementedError("use_mpi = True is not supported") + #### NOTE: one should bcast aux_basis first! 
#### + + + t0 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cell = mydf.cell + mesh = cell.mesh + mesh_int32 = np.array(mesh, dtype=np.int32) + + naux = mydf.naux + + ncomplex = mesh[0] * mesh[1] * (mesh[2] // 2 + 1) * 2 + + group_begin = mydf.group_begin + group_end = mydf.group_end + ngroup = len(mydf.group) + + grid_ordering = mydf.grid_ID_ordered + + if mydf.omega is not None: + assert mydf.omega >= 0.0 + + coulG = mydf.coulG.copy() + coulG_real = coulG.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1).copy() + + def construct_V(aux_basis:np.ndarray, buf, V, grid_ID, grid_ordering): + fn = getattr(libisdf, "_construct_V_local_bas", None) + assert(fn is not None) + + nThread = buf.shape[0] + bufsize_per_thread = buf.shape[1] + nrow = aux_basis.shape[0] + ncol = aux_basis.shape[1] + shift_row = 0 + + fn(mesh_int32.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nrow), + ctypes.c_int(ncol), + grid_ID.ctypes.data_as(ctypes.c_void_p), + aux_basis.ctypes.data_as(ctypes.c_void_p), + coulG_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(shift_row), + V.ctypes.data_as(ctypes.c_void_p), + grid_ordering.ctypes.data_as(ctypes.c_void_p), + buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bufsize_per_thread)) + + ####### allocate buf for V ######## + + nThread = lib.num_threads() + bufsize_per_thread = (coulG_real.shape[0] * 2 + mesh[0] * mesh[1] * mesh[2]) + buf = np.zeros((nThread, bufsize_per_thread), dtype=np.double) + + assert len(mydf.aux_basis) == ngroup + + naux_local = 0 + max_naux_bunch = 0 + for i in range(group_begin, group_end): + naux_local += mydf.aux_basis[i].shape[0] + max_naux_bunch = max(max_naux_bunch, mydf.aux_basis[i].shape[0]) + + if hasattr(mydf, "grid_pnt_near_atm"): + max_naux_bunch = max(max_naux_bunch, len(mydf.grid_pnt_near_atm)) + if use_mpi == False or (use_mpi and rank == comm_size - 1): + naux_local += len(mydf.grid_pnt_near_atm) + + V = np.zeros((max_naux_bunch, np.prod(mesh_int32)), dtype=np.double) + + naux = mydf.naux + + W = np.zeros((naux_local, naux), dtype=np.double) + + aux_row_loc = 0 + + if hasattr(mydf, "grid_pnt_near_atm"): + grid_ID_near_atm = mydf.grid_pnt_near_atm + else: + grid_ID_near_atm = [] + grid_ID_near_atm = np.array(grid_ID_near_atm, dtype=np.int32) + for i in range(group_begin, group_end): + + aux_basis_now = mydf.aux_basis[i] + naux_bra = aux_basis_now.shape[0] + grid_ID = mydf.partition_group_to_gridID[i] + + construct_V(aux_basis_now, buf, V, grid_ID, grid_ordering) + + grid_shift = 0 + aux_col_loc = 0 + for j in range(0, ngroup): + grid_ID_now = mydf.partition_group_to_gridID[j] + aux_bas_ket = mydf.aux_basis[j] + naux_ket = aux_bas_ket.shape[0] + ngrid_now = grid_ID_now.size + W[aux_row_loc:aux_row_loc+naux_bra, aux_col_loc:aux_col_loc+naux_ket] = lib.ddot(V[:naux_bra, grid_shift:grid_shift+ngrid_now], aux_bas_ket.T) + grid_shift += ngrid_now + aux_col_loc += naux_ket + print("aux_row_loc = %d, aux_col_loc = %d" % (aux_row_loc, aux_col_loc)) + print("V.shape = ", V[:naux_bra,:].shape) + W[aux_row_loc:aux_row_loc+naux_bra, aux_col_loc:] = V[:naux_bra, grid_shift:] + aux_row_loc += aux_basis_now.shape[0] + + if (use_mpi == False or (use_mpi and rank == comm_size - 1)) and len(grid_ID_near_atm) != 0: + ### construct the final row ### + grid_ID = grid_ID_near_atm + aux_basis_now = np.identity(len(grid_ID), dtype=np.double) + construct_V(aux_basis_now, buf, V, grid_ID, grid_ordering) + grid_shift = 0 + aux_col_loc = 0 + naux_bra = len(grid_ID) + for j in range(0, ngroup): + grid_ID_now = 
mydf.partition_group_to_gridID[j] + aux_bas_ket = mydf.aux_basis[j] + naux_ket = aux_bas_ket.shape[0] + ngrid_now = grid_ID_now.size + W[aux_row_loc:aux_row_loc+naux_bra, aux_col_loc:aux_col_loc+naux_ket] = lib.ddot(V[:naux_bra, grid_shift:grid_shift+ngrid_now], aux_bas_ket.T) + grid_shift += ngrid_now + aux_col_loc += naux_ket + assert aux_row_loc == aux_col_loc + W[aux_row_loc:, aux_col_loc:] = V[:naux_bra, grid_shift:] + + del buf + buf = None + del V + V = None + + mydf.W = W + + if use_mpi: + comm.Barrier() + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if mydf.verbose > 0: + _benchmark_time(t0, t1, 'build_auxiliary_Coulomb', mydf) + +def build_auxiliary_Coulomb_local_bas(mydf, debug=True, use_mpi=False): + + if hasattr(mydf, "grid_pnt_near_atm") and len(mydf.grid_pnt_near_atm) != 0 : + raise NotImplementedError("grid_pnt_near_atm is not supported") + + log = lib.logger.Logger(mydf.cell.stdout, mydf.cell.verbose) + + t0 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cell = mydf.cell + mesh = cell.mesh + + naux = mydf.naux + + ncomplex = mesh[0] * mesh[1] * (mesh[2] // 2 + 1) * 2 + + group_begin = mydf.group_begin + group_end = mydf.group_end + + grid_ordering = mydf.grid_ID_ordered + + def construct_V_CCode(aux_basis:list[np.ndarray], mesh, coul_G): + + coulG_real = coul_G.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1).copy() + nThread = lib.num_threads() + bufsize_per_thread = int((coulG_real.shape[0] * 2 + mesh[0] * mesh[1] * mesh[2]) * 1.1) + bufsize_per_thread = (bufsize_per_thread + 15) // 16 * 16 + + buf = np.zeros((nThread, bufsize_per_thread), dtype=np.double) + + # nAux = aux_basis.shape[0] + + nAux = 0 + for x in aux_basis: + nAux += x.shape[0] + + ngrids = mesh[0] * mesh[1] * mesh[2] + mesh_int32 = np.array(mesh, dtype=np.int32) + V = np.zeros((nAux, ngrids), dtype=np.double) + + fn = getattr(libisdf, "_construct_V_local_bas", None) + assert(fn is not None) + + shift_row = 0 + ngrid_now = 0 + for i in range(len(aux_basis)): + + aux_basis_now = aux_basis[i] + grid_ID = mydf.partition_group_to_gridID[group_begin+i] + assert aux_basis_now.shape[1] == grid_ID.size + ngrid_now += grid_ID.size + + fn(mesh_int32.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(aux_basis_now.shape[0]), + ctypes.c_int(aux_basis_now.shape[1]), + grid_ID.ctypes.data_as(ctypes.c_void_p), + aux_basis_now.ctypes.data_as(ctypes.c_void_p), + coulG_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(shift_row), + V.ctypes.data_as(ctypes.c_void_p), + grid_ordering.ctypes.data_as(ctypes.c_void_p), + buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bufsize_per_thread)) + + shift_row += aux_basis_now.shape[0] + + del buf + buf = None + + return V + + ########### construct V ########### + + if mydf.omega is not None: + assert mydf.omega >= 0.0 + coulG = mydf.coulG.copy() + V = construct_V_CCode(mydf.aux_basis, mesh, coulG) + + if use_mpi: + + ############# the only communication ############# + + grid_segment = mydf.grid_segment + assert len(grid_segment) == comm_size + 1 + + t0_comm = (lib.logger.process_clock(), lib.logger.perf_counter()) + + sendbuf = [] + for i in range(comm_size): + p0 = grid_segment[i] + p1 = grid_segment[i+1] + sendbuf.append(V[:, p0:p1]) + del V + V = None + V_fullrow = np.vstack(alltoall(sendbuf, split_recvbuf=True)) + del sendbuf + sendbuf = None + + mydf.V_R = V_fullrow + + t1_comm = (lib.logger.process_clock(), lib.logger.perf_counter()) + t_comm = t1_comm[1] - t0_comm[1] + + if mydf.verbose > 0: + log.info("rank = %d, t_comm = %12.6e" % 
(rank, t_comm)) + else: + t_comm = 0.0 + mydf.V_R = V + + ########### construct W ########### + + aux_group_shift = [0] + naux_now = 0 + for i in range(len(mydf.IP_group)): + IP_group_now = mydf.IP_group[i] + naux_now += len(IP_group_now) + aux_group_shift.append(naux_now) + + mydf.W = np.zeros((mydf.naux, mydf.naux), dtype=np.float64) + + grid_shift = 0 + for i in range(group_begin, group_end): + aux_begin = aux_group_shift[i] + aux_end = aux_group_shift[i+1] + ngrid_now = mydf.partition_group_to_gridID[i].size + sys.stdout.flush() + mydf.W[:, aux_begin:aux_end] = lib.ddot(mydf.V_R[:, grid_shift:grid_shift+ngrid_now], mydf.aux_basis[i-group_begin].T) + grid_shift += ngrid_now + + if use_mpi: + comm.Barrier() + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if mydf.verbose > 0: + _benchmark_time(t0, t1, 'build_auxiliary_Coulomb', mydf) + + sys.stdout.flush() + + +class PBC_ISDF_Info_Quad(ISDF.PBC_ISDF_Info): + + ''' Interpolative separable density fitting (ISDF) for periodic systems. + The locality is explored! + + Fitting aux basis is linear scaling! + + Quad stands for quadratic scaling for constructing V and W matrix as well as build K matrix! + + Examples: + + >>> pbc_isdf = PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, aoR_cutoff=1e-8, direct=False, use_occ_RI_K=False) + >>> pbc_isdf.build_IP_local(c=C, m=5) + >>> pbc_isdf.build_auxiliary_Coulomb() + >>> from pyscf.pbc import scf + >>> mf = scf.RHF(cell) + >>> pbc_isdf.direct_scf = mf.direct_scf + >>> mf.with_df = pbc_isdf + >>> mf.verbose = 0 + >>> mf.kernel() + + ''' + + # group_partition refer to the group of atoms to perform local fitting + # if not set then each atom is treated as a group + + def __init__(self, mol:Cell, + with_robust_fitting = True, + kmesh = None, + verbose = None, + rela_cutoff_QRCP = None, + aoR_cutoff = 1e-8, + direct = False, + use_occ_RI_K = False, + limited_memory = False, + build_K_bunchsize = None): + + assert use_occ_RI_K == False + + if verbose is None: + verbose = mol.verbose + + super().__init__( + mol=mol, + aoR=None, + with_robust_fitting=with_robust_fitting, + kmesh=kmesh, + get_partition=False, + verbose=verbose + ) + + self.cell = mol.copy() + cell = self.cell + + #### get other info #### + + shl_atm = [] + + for i in range(self.natm): + shl_atm.append([None, None]) + + for i in range(cell.nbas): + atm_id = cell.bas_atom(i) + if shl_atm[atm_id][0] is None: + shl_atm[atm_id][0] = i + shl_atm[atm_id][1] = i+1 + + self.shl_atm = shl_atm + self.aoloc_atm = cell.ao_loc_nr() + + self.use_mpi = False + + self.aoR_cutoff = aoR_cutoff + + if rela_cutoff_QRCP is None: + self.no_restriction_on_nIP = False + self.rela_cutoff_QRCP = 0.0 + else: + self.no_restriction_on_nIP = True + self.rela_cutoff_QRCP = rela_cutoff_QRCP + + self.aoR = None + self.partition = None + + self.V_W_cutoff = None + + self.direct = direct # whether to use direct method to calculate J and K, if True, the memory usage will be reduced, V W will not be stored + if self.direct: + self.with_robust_fitting = True + + self.with_translation_symmetry = False + self.kmesh = None + + ######### default setting for range separation ######### + + # WARNING: not a good design pattern to write this code here! 
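+        # (Hedged reading, inferred from how get_coulG() and
+        #  build_partition_aoR() below consume these fields: omega is the
+        #  range-separation parameter -- None means the full Coulomb kernel
+        #  is used with no SR/LR splitting; use_aft_ao switches to
+        #  analytically Fourier-transformed AOs; the two ke_cutoff fields
+        #  and ft_ao_mesh simply inherit the cell's values here.)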
+ + self.omega = None + self.use_aft_ao = False + self.ke_cutoff_pp = self.cell.ke_cutoff + self.ke_cutoff_ft_ao = self.cell.ke_cutoff + self.ft_ao_mesh = self.mesh.copy() + #self.rsjk = None + #self.cell_rsjk = None + + ########## coul kernel ########## + + self.get_coulG() + self.ovlp = self.cell.pbc_intor('int1e_ovlp') + self.occ_tol = 1e-9 + self.occ_RI_K = use_occ_RI_K + + ########## limited memory ########## + + self._limited_memory = limited_memory + self._build_K_bunchsize = build_K_bunchsize + if build_K_bunchsize is None: + if limited_memory: + from _isdf_local_K_direct import K_DIRECT_NAUX_BUNCHSIZE + self._build_K_bunchsize = K_DIRECT_NAUX_BUNCHSIZE + else: + self._build_K_bunchsize = 10000 * 10000 # infinite in practice + + @property + def first_natm(self): + if self.kmesh is not None: + return self.cell.natm // np.prod(self.kmesh) + else: + return self.cell.natm + + def build_partition_aoR(self, Ls=None): + + if self.aoR is not None and self.partition is not None: + return + + ##### build cutoff info ##### + + self.distance_matrix = ISDF_Local_Utils.get_cell_distance_matrix(self.cell) + weight = np.sqrt(self.cell.vol / self.coords.shape[0]) + precision = self.aoR_cutoff + rcut = ISDF_Local_Utils._estimate_rcut(self.cell, self.coords.shape[0], precision) + rcut_max = np.max(rcut) + atm2_bas = ISDF_Local_Utils._atm_to_bas(self.cell) + self.AtmConnectionInfo = [] + + for i in range(self.cell.natm): + tmp = ISDF_Local_Utils.AtmConnectionInfo(self.cell, i, self.distance_matrix, precision, rcut, rcut_max, atm2_bas) + self.AtmConnectionInfo.append(tmp) + + ##### build partition ##### + + if Ls is None: + lattice_x = self.cell.lattice_vectors()[0][0] + lattice_y = self.cell.lattice_vectors()[1][1] + lattice_z = self.cell.lattice_vectors()[2][2] + + Ls = [int(lattice_x)/3+6, int(lattice_y)/3+6, int(lattice_z)/3+6] + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + # if self.rsjk is not None and self.cell_rsjk is not None: + # self.partition = ISDF_Local_Utils.get_partition(self.cell_rsjk, self.coords, self.AtmConnectionInfo, + # Ls, + # self.with_translation_symmetry, + # self.kmesh, + # self.use_mpi) + # else: + self.partition = ISDF_Local_Utils.get_partition(self.cell, self.coords, self.AtmConnectionInfo, + Ls, + self.with_translation_symmetry, + self.kmesh, + self.use_mpi) + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if not self.use_mpi: + rank = 0 + else: + from pyscf.isdf.isdf_tools_mpi import rank + + if rank == 0: + _benchmark_time(t1, t2, "build_partition", self) + + for i in range(self.natm): + self.partition[i] = np.array(self.partition[i], dtype=np.int32) + self.partition[i].sort() + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + sync_aoR = False + if self.direct: + sync_aoR = True + + ## deal with translation symmetry ## + + first_natm = self.first_natm + + #################################### + + for x in range(self.natm): + # print("len of partition[%d] = %d" % (x, len(self.partition[x]))) + logger.debug4(self, "len of partition[%d] = %d" % (x, len(self.partition[x]))) + + if self.use_aft_ao: + self.aoR = ISDF_Local_Utils.get_aoR_analytic(self.cell, self.coords, self.partition, + None, + first_natm, + self.group, + self.distance_matrix, + self.AtmConnectionInfo, + self.use_mpi, self.use_mpi, sync_aoR) + else: + # assert self.rsjk is None and self.cell_rsjk is None + self.aoR = ISDF_Local_Utils.get_aoR(self.cell, self.coords, self.partition, + None, + first_natm, + self.group, + self.distance_matrix, + 
self.AtmConnectionInfo, + self.use_mpi, self.use_mpi, sync_aoR) + + memory = ISDF_Local_Utils._get_aoR_holders_memory(self.aoR) + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if rank == 0: + _benchmark_time(t1, t2, "build_aoR", self) + + def _allocate_jk_buffer(self, datatype, ngrids_local): + pass + + @property + def max_nao_involved(self): + return np.max([aoR_holder.aoR.shape[0] for aoR_holder in self.aoR if aoR_holder is not None]) + + @property + def max_ngrid_involved(self): + return np.max([aoR_holder.aoR.shape[1] for aoR_holder in self.aoR if aoR_holder is not None]) + + @property + def max_nIP_involved(self): + return np.max([aoR_holder.aoR.shape[1] for aoR_holder in self.aoRg if aoR_holder is not None]) + + @property + def maxsize_group_naux(self): + maxsize_group_naux = 0 + for group_id, atm_ids in enumerate(self.group): + naux_tmp = 0 + for atm_id in atm_ids: + naux_tmp += self.aoRg[atm_id].aoR.shape[1] + maxsize_group_naux = max(maxsize_group_naux, naux_tmp) + return maxsize_group_naux + + def deallocate_k_buffer(self): + if hasattr(self, "build_k_buf") and self.build_k_buf is not None: + del self.build_k_buf + self.build_k_buf = None + if hasattr(self, "build_VW_in_k_buf") and self.build_VW_in_k_buf is not None: + del self.build_VW_in_k_buf + self.build_VW_in_k_buf = None + + def allocate_k_buffer(self, nset=1): + + log = lib.logger.Logger(self.cell.stdout, self.cell.verbose) + + ### TODO: split grid again to reduce the size of buf when robust fitting is true! + # TODO: try to calculate the size when direct is true + + max_nao_involved = self.max_nao_involved + max_ngrid_involved = self.max_ngrid_involved + max_nIP_involved = self.max_nIP_involved + maxsize_group_naux = self.maxsize_group_naux + + allocated = False + + if self.direct: + if hasattr(self, "build_k_buf") and self.build_k_buf is not None: + if hasattr(self, "build_VW_in_k_buf") and self.build_VW_in_k_buf is not None: + allocated = True + else: + if hasattr(self, "build_k_buf") and self.build_k_buf is not None: + allocated = True + + if allocated: + pass + else: + + if self.direct: + + if self._limited_memory: + build_K_bunchsize = min(maxsize_group_naux, self._build_K_bunchsize) + else: + build_K_bunchsize = maxsize_group_naux + + #### compare build_K_bunchsize with those buf used for W matrix #### + + size1 = maxsize_group_naux * self.nao * nset + size2 = maxsize_group_naux * max_nao_involved + self.Density_RgAO_buf = np.zeros((size1+size2,), dtype=np.float64) + + #### allocate build_VW_in_k_buf #### + + mesh = self.cell.mesh + ngrid= np.prod(mesh) + ncomplex = mesh[0] * mesh[1] * (mesh[2]//2+1) + nthread = lib.num_threads() + + build_K_bunchsize = max(maxsize_group_naux * self.naux//ngrid+2, build_K_bunchsize) + build_K_bunchsize = max(maxsize_group_naux * max_nIP_involved//max_ngrid_involved+2, build_K_bunchsize) + self._build_K_bunchsize = build_K_bunchsize + + size0 = (np.prod(self.cell.mesh) + 2 * ncomplex) * nthread + size1 = build_K_bunchsize * np.prod(self.cell.mesh) + size2 = maxsize_group_naux * self.naux + self.build_VW_in_k_buf = np.zeros((size0+size1+size2,), dtype=np.float64) + + #### allocate build_k_buf #### + + size1 = build_K_bunchsize * np.prod(self.cell.mesh) # density RgR + size2 = build_K_bunchsize * max_ngrid_involved # ddot_res_RgR + size3 = maxsize_group_naux * self.nao # K1_tmp1 + #size4 = max_ngrid_involved * max_nao_involved # K1_tmp1_ddot_res + size4 = maxsize_group_naux * self.nao + #size5 = max_ngrid_involved * max_ngrid_involved + size5 = 0 + size6 = self.nao * 
self.nao                                     # K1_final_ddot
+
+                size = size1 + size2 + size3 + size4 + size5 + size6
+
+                self.build_k_buf = np.zeros((size,), dtype=np.float64)
+
+                log.info("In allocate_k_buffer, Density_RgAO_buf  memory = %d bytes" % (self.Density_RgAO_buf.nbytes))
+                log.info("In allocate_k_buffer, build_VW_in_k_buf memory = %d bytes" % (self.build_VW_in_k_buf.nbytes))
+                log.info("In allocate_k_buffer, build_k_buf       memory = %d bytes" % (self.build_k_buf.nbytes))
+
+            else:
+
+                self.Density_RgAO_buf = np.zeros((self.naux, self.nao), dtype=np.float64)
+                max_dim = max(max_nao_involved, max_ngrid_involved, self.nao)
+
+                ### size0 in getting W part of K ###
+
+                size0 = self.naux * max_nIP_involved + self.naux * max_nao_involved + self.naux * max(max_nIP_involved, max_nao_involved)
+
+                ### size1 in getting Density Matrix ###
+
+                size11 = self.nao * max_nIP_involved + self.nao * self.nao
+                size1  = self.naux * self.nao + self.naux * max_dim + self.nao * self.nao
+                size1 += self.naux * max_nao_involved
+                size1  = max(size1, size11)
+
+                ### size2 in getting K ###
+
+                size2 = self.naux * max_nao_involved
+                if self.with_robust_fitting:
+                    size2 += self.naux * max_ngrid_involved + self.naux * max_nao_involved
+                    size2 += self.naux * max_ngrid_involved
+                self.build_k_buf = np.zeros((max(size0, size1, size2),), dtype=np.float64)
+
+    def _construct_build_aoRg(self, IP_group, group=None):
+
+        if group is None:
+            group = []
+            for i in range(self.natm):
+                group.append([i])
+        for i in range(len(group)):
+            group[i] = np.array(group[i], dtype=np.int32)
+            group[i].sort()
+        assert self.natm == len(IP_group)
+
+        aoR_holders_res = []
+        for i in range(self.natm):
+            aoR_holders_res.append(None)
+
+        assert hasattr(self, "partition")
+        assert hasattr(self, "aoR")
+
+        atm_ordering = []
+        for i in range(len(group)):
+            atm_ordering.extend(group[i])
+
+        IP_ID_NOW   = 0
+        GRID_ID_NOW = 0
+
+        IP_loc_in_ordered_grids = []
+
+        for atm_id in atm_ordering:
+            aoR_holder = self.aoR[atm_id]
+            if aoR_holder is None:
+                if IP_group[atm_id] is None:
+                    continue
+                else:
+                    IP_ID_NOW += len(IP_group[atm_id])
+                    continue
+            nIP = len(IP_group[atm_id])
+
+            idx = np.searchsorted(self.partition[atm_id], IP_group[atm_id])
+
+            ao_involved = aoR_holder.ao_involved.copy()
+            aoR = aoR_holder.aoR[:, idx].copy()
+            aoR_holders_res[atm_id] = ISDF_Local_Utils.aoR_Holder(aoR, ao_involved, IP_ID_NOW, IP_ID_NOW+nIP, IP_ID_NOW, IP_ID_NOW+nIP)
+
+            IP_loc_in_ordered_grids.extend(idx+GRID_ID_NOW)
+
+            IP_ID_NOW   += nIP
+            GRID_ID_NOW += len(self.partition[atm_id])
+
+        self.IP_loc_in_ordered_grids = np.array(IP_loc_in_ordered_grids, dtype=np.int32)
+        assert self.IP_loc_in_ordered_grids.ndim == 1
+
+        return aoR_holders_res
+
+    def _determine_c(self):
+        '''
+        Empirical rule used by build_IP_local to choose c when it is not given:
+        the tighter rela_cutoff_QRCP is, the larger c (interpolation points per
+        AO) is allowed to be.
+        '''
+
+        DEFAULT = 15
+        SEGMENT = [1e-2, 1e-3, 1e-4, 1e-5]
+        C       = [10, 20, 30, 35, 40]
+
+        if self.rela_cutoff_QRCP is None:
+            return DEFAULT
+        else:
+            if self.rela_cutoff_QRCP > SEGMENT[0]:
+                return C[0]
+            else:
+                for i in range(1, len(SEGMENT)):
+                    if self.rela_cutoff_QRCP > SEGMENT[i]:
+                        return C[i]
+                return C[-1]
+
+    def build_IP_local(self, c=None, m=5, first_natm=None, group=None, Ls=None, debug=True):
+
+        if c is None:
+            c = self._determine_c()
+
+        if first_natm is None:
+            first_natm = self.natm
+
+        if group is None:
+            group = []
+            for i in range(self.natm):
+                group.append([i])
+
+        self.group = group
+
+        for i in range(len(group)):
+            group[i] = np.array(group[i], dtype=np.int32)
+            group[i].sort()
+
+        # build partition and aoR #
+
+        t1 = (lib.logger.process_clock(),
lib.logger.perf_counter()) + + self.build_partition_aoR(Ls) + + ao2atomID = self.ao2atomID + partition = self.partition + aoR = self.aoR + natm = self.natm + nao = self.nao + + self.partition_atmID_to_gridID = partition + + self.partition_group_to_gridID = [] + for i in range(len(group)): + self.partition_group_to_gridID.append([]) + for atm_id in group[i]: + self.partition_group_to_gridID[i].extend(partition[atm_id]) + self.partition_group_to_gridID[i] = np.array(self.partition_group_to_gridID[i], dtype=np.int32) + # self.partition_group_to_gridID[i].sort() + + ngrids = self.coords.shape[0] + + gridID_2_atmID = np.zeros((ngrids), dtype=np.int32) + + for atm_id in range(natm): + gridID_2_atmID[partition[atm_id]] = atm_id + + self.gridID_2_atmID = gridID_2_atmID + self.grid_ID_ordered = ISDF_Local_Utils._get_grid_ordering(self.partition, self.group, self.use_mpi) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + # if self.verbose and debug: + _benchmark_time(t1, t2, "build_partition_aoR", self) + + t1 = t2 + + if len(group) < first_natm: + IP_Atm = select_IP_atm_ls(self, c+1, m, first_natm, + rela_cutoff=self.rela_cutoff_QRCP, + no_retriction_on_nIP=self.no_restriction_on_nIP, + use_mpi=self.use_mpi) + else: + IP_Atm = select_IP_atm_ls(self, c, m, first_natm, + rela_cutoff=self.rela_cutoff_QRCP, + no_retriction_on_nIP=self.no_restriction_on_nIP, + use_mpi=self.use_mpi) + t3 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + self.aoRg_possible = self._construct_build_aoRg(IP_Atm, None) + + t4 = (lib.logger.process_clock(), lib.logger.perf_counter()) + if self.verbose and debug: + _benchmark_time(t3, t4, "build_aoRg_possible", self) + + select_IP_local_ls_drive(self, c, m, IP_Atm, group, use_mpi=self.use_mpi) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if self.verbose and debug: + _benchmark_time(t1, t2, "select_IP", self) + + t1 = t2 + + build_aux_basis_ls(self, group, self.IP_group, debug=debug, use_mpi=self.use_mpi) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if self.verbose and debug: + _benchmark_time(t1, t2, "build_aux_basis", self) + + sys.stdout.flush() + + def get_coulG(self): + if hasattr(self, "rsjk") and self.rsjk is not None: + + ##### construct coulG_LR , copy from rsjk.py ##### + + if self.rsjk.cell.dimension!=3: + raise NotImplementedError('3D only') + + _, _, kws = self.rsjk.cell.get_Gv_weights(self.mesh) + coulG_SR_at_G0 = np.pi/self.rsjk.omega**2 * kws + kpt = np.zeros(3) + with lib.temporary_env(self.rsjk.cell, dimension=3): + coulG_SR = self.rsjk.weighted_coulG_SR(kpt, False, self.mesh) + G0_idx = 0 + coulG_SR[G0_idx] += coulG_SR_at_G0 + coulG_full = self.rsjk.weighted_coulG(kpt, None, self.mesh, omega=0.0) + self.coulG = coulG_full - coulG_SR + + coulG_bench = tools.get_coulG(self.cell_rsjk, mesh=self.cell_rsjk.mesh, omega=0.0) + + ### find coulG_full with values larger than 1e-6 ### + + idx = np.where(np.abs(coulG_full) > 1e-6) + + G1 = coulG_full[idx].copy() + G2 = coulG_bench[idx].copy() + ratio = G2/G1 + fac = ratio[0] + assert fac == 1.0/kws + assert np.allclose(ratio, fac) + self.coulG *= fac + + else: + self.coulG = tools.get_coulG(self.cell, mesh=self.cell.mesh) + + def diag_dm(self, dm, linear_dep_threshold=1e-16): + '''Solver for generalized eigenvalue problem + + .. math:: HC = SCE + + used only for occ-RI-K, better not merge into PySCF first! 
+ + ''' + # print("ovlp = ", self.ovlp) + + # diagonalize overlap matrix + e, v = scipy.linalg.eigh(self.ovlp) + + mask = e > linear_dep_threshold * e[-1] + e = e[mask] + v = v[:,mask] + v*= np.sqrt(e) + + dm_new_basis = np.dot(v.T, np.dot(dm, v)) + + mo_occ, mo_coeff = scipy.linalg.eigh(dm_new_basis) + + mo_coeff = np.dot(v, mo_coeff) # SC = mocoeff + + v /= np.sqrt(e) + + mo_coeff = np.dot(v.T, mo_coeff) + mo_coeff = (1.0/e).reshape(-1,1) * mo_coeff + mo_coeff = np.dot(v, mo_coeff) + + return mo_occ[::-1], mo_coeff[:,::-1] + + def build_auxiliary_Coulomb(self, debug=True): + + if self.direct == True: + return # do nothing + + ### the cutoff based on distance for V and W is used only for testing now ! ### + + distance_max = np.max(self.distance_matrix) + if self.V_W_cutoff is not None and self.V_W_cutoff > distance_max: + logger.warn(self, "WARNING : V_W_cutoff is larger than the maximum distance in the cell") + self.V_W_cutoff = None # no cutoff indeed + if self.V_W_cutoff is not None: + logger.debug4(self, "PBC_ISDF_Info_Quad:->build_auxiliary_Coulomb: V_W_cutoff = %12.6e" % self.V_W_cutoff) + logger.debug4(self, "PBC_ISDF_Info_Quad:->build_auxiliary_Coulomb: distance_max = %12.6e" % distance_max) + + if self.with_robust_fitting: + build_auxiliary_Coulomb_local_bas(self, debug=debug, use_mpi=self.use_mpi) + else: + build_auxiliary_Coulomb_local_bas_wo_robust_fitting(self, debug=debug, use_mpi=self.use_mpi) + + if self.V_W_cutoff is not None: + + if hasattr(self, "V_R"): + V = self.V_R + + bra_loc = 0 + for atm_i, aoRg_holder in enumerate(self.aoRg): + nbra = aoRg_holder.aoR.shape[1] + ket_loc = 0 + for atm_j, aoR_holder in enumerate(self.aoR): + nket = aoR_holder.aoR.shape[1] + if self.distance_matrix[atm_i, atm_j] > self.V_W_cutoff: + V[bra_loc:bra_loc+nbra, ket_loc:ket_loc+nket] = 0.0 + ket_loc += nket + bra_loc += nbra + + self.V_R = V + + W = self.W + + bra_loc = 0 + for atm_i, aoRg_holder_bra in enumerate(self.aoRg): + nbra = aoRg_holder.aoR.shape[1] + ket_loc = 0 + for atm_j, aoRg_holder_ket in enumerate(self.aoRg): + nket = aoRg_holder.aoR.shape[1] + if self.distance_matrix[atm_i, atm_j] > self.V_W_cutoff: + W[bra_loc:bra_loc+nbra, ket_loc:ket_loc+nket] = 0.0 + ket_loc += nket + bra_loc += nbra + + self.W = W + + get_jk = ISDF_Local_JK.get_jk_dm_quadratic + + def aoR_RangeSeparation(self, CompactAO): + + self.CompactAOList = np.array(CompactAO, dtype=np.int32) + DiffuseAO = [] + for i in range(self.nao): + if i not in CompactAO: + DiffuseAO.append(i) + self.DiffuseAOList = np.array(DiffuseAO, dtype=np.int32) + + IsCompact = np.zeros((self.nao), dtype=bool) + IsCompact[CompactAO] = True + IsCompact[DiffuseAO] = False + self.IsCompact = IsCompact + + for aoR in self.aoR: + aoR.RangeSeparation(IsCompact) + for aoRg in self.aoRg: + aoRg.RangeSeparation(IsCompact) + + def aoRg_full(self): + + fn_pack = getattr(libisdf, "_Pack_Matrix_SparseRow_DenseCol", None) + assert fn_pack is not None + + partition = [] + + res = np.zeros((self.nao, self.naux), dtype=np.float64) + for i in range(self.natm): + aoRg_i = self.aoRg[i] + ao_involved_i = aoRg_i.ao_involved + nao_i = aoRg_i.aoR.shape[0] + global_IP_begin_i = aoRg_i.global_gridID_begin + nIP_i = aoRg_i.aoR.shape[1] + + fn_pack( + res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(res.shape[0]), + ctypes.c_int(res.shape[1]), + aoRg_i.aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_i), + ctypes.c_int(nIP_i), + ao_involved_i.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(global_IP_begin_i), + ctypes.c_int(global_IP_begin_i+nIP_i) + 
) + + partition.append([global_IP_begin_i, global_IP_begin_i+nIP_i]) + + return res, partition + + ### LS_THC fit ### + + def LS_THC_recompression(self, X:np.ndarray, force_LS_THC=True): + + from isdf_ao2mo import LS_THC + + if force_LS_THC: + self.with_robust_fitting = False + self.force_LS_THC = True + self.W = LS_THC(self, X) / (self.ngrids/self.cell.vol) + self.aoRg = X + self.aoR = None + self.V_R = None + else: + self.force_LS_THC = False + self.W2 = LS_THC(self, X) / (self.ngrids/self.cell.vol) + self.aoRg2 = X + + ### check aoR value ### + + def check_aoR(self): + for aoR_holder in self.aoR: + max_abs_index = np.unravel_index(np.argmax(np.abs(aoR_holder.aoR)), aoR_holder.aoR.shape) + value = aoR_holder.aoR[max_abs_index[0]][max_abs_index[1]] + ao_indx = aoR_holder.ao_involved[max_abs_index[0]] + print("max_abs_value = ", value, " with indx = ", ao_indx, max_abs_index[1]+aoR_holder.global_gridID_begin) + + +if __name__ == '__main__': + + C = 15 + from pyscf.lib.parameters import BOHR + from isdf_tools_cell import build_supercell, build_supercell_with_partition + import pyscf.pbc.gto as pbcgto + + verbose = 10 + + cell = pbcgto.Cell() + boxlen = 3.5668 + cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + prim_a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + atm = [ + ['C', (0. , 0. , 0. )], + ['C', (0.8917 , 0.8917 , 0.8917)], + ['C', (1.7834 , 1.7834 , 0. )], + ['C', (2.6751 , 2.6751 , 0.8917)], + ['C', (1.7834 , 0. , 1.7834)], + ['C', (2.6751 , 0.8917 , 2.6751)], + ['C', (0. , 1.7834 , 1.7834)], + ['C', (0.8917 , 2.6751 , 2.6751)], + ] + KE_CUTOFF = 70 + # basis = 'unc-gth-cc-tzvp' + # pseudo = "gth-hf" + basis = 'gth-dzvp' + pseudo = "gth-pade" + prim_cell = build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=KE_CUTOFF, basis=basis, pseudo=pseudo) + prim_partition = [[0,1],[2,3],[4,5],[6,7]] + # prim_partition = [[0], [1], [2], [3], [4], [5], [6], [7]] + # prim_partition = [[0,1,2,3,4,5,6,7]] + # prim_partition = [[0,1,2,3],[4,5,6,7]] + + prim_mesh = prim_cell.mesh + + Ls = [1, 1, 2] + # Ls = [2, 2, 2] + Ls = np.array(Ls, dtype=np.int32) + mesh = [Ls[0] * prim_mesh[0], Ls[1] * prim_mesh[1], Ls[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + # mesh = None ### NOTE: magically, use None will be much slower ? 
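+    # Why the mesh is given explicitly (a sketch, not verified here): taking
+    #     mesh[d] = Ls[d] * prim_mesh[d],  d = 0, 1, 2
+    # keeps the supercell FFT grid commensurate with the primitive-cell grid,
+    # so every periodic image carries an identical set of grid points; with
+    # mesh=None the builder would re-derive the mesh from ke_cutoff alone,
+    # which is presumably the source of the slowdown noted above.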
+ + cell, group_partition = build_supercell_with_partition(atm, prim_a, mesh=mesh, + Ls=Ls, + basis=basis, + pseudo=pseudo, + partition=prim_partition, ke_cutoff=KE_CUTOFF, verbose=verbose) + print("group_partition = ", group_partition) + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + # pbc_isdf_info = PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, aoR_cutoff=1e-8, direct=False, use_occ_RI_K=False) + pbc_isdf_info = PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, + aoR_cutoff=1e-8, + # direct=False, + direct=True, + limited_memory=True, build_K_bunchsize=32, + use_occ_RI_K=False, rela_cutoff_QRCP=3e-3) + pbc_isdf_info.build_IP_local(c=C, m=5, group=group_partition) + pbc_isdf_info.build_auxiliary_Coulomb() + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + _benchmark_time(t1, t2, "build isdf", pbc_isdf_info) + + # pbc_isdf_info.check_aoR() + # exit(1) + + from pyscf.pbc import scf + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + mf = scf.RHF(cell) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 6 + mf.conv_tol = 1e-7 + + mf.kernel() + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + _benchmark_time(t1, t2, "scf", pbc_isdf_info) + sys.stdout.flush() \ No newline at end of file diff --git a/pyscf/isdf/isdf_local_MPI.py b/pyscf/isdf/isdf_local_MPI.py new file mode 100644 index 000000000..320897409 --- /dev/null +++ b/pyscf/isdf/isdf_local_MPI.py @@ -0,0 +1,213 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+# Author: Ning Zhang
+#
+
+import numpy as np
+
+from pyscf import lib
+import pyscf.pbc.gto as pbcgto
+from pyscf.pbc.gto import Cell
+from pyscf.gto.mole import *
+
+from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, allgather, bcast
+import pyscf.isdf.isdf_local as isdf_local
+import pyscf.isdf.isdf_local_k as isdf_local_k
+from pyscf.isdf.isdf_tools_local import flatten_aoR_holder
+
+###############################################################
+
+# debug code #
+
+def dump_attributes(mydf, attr_lst:list[str], dtype=np.int32, filename:str=None):
+
+    res = []
+
+    for attr in attr_lst:
+        assert hasattr(mydf, attr)
+        tmp = getattr(mydf, attr)
+        if isinstance(tmp, list):
+            if all([isinstance(x, np.ndarray) for x in tmp]):
+                tmp = np.concatenate([x.ravel() for x in tmp])
+            else:
+                tmp = np.asarray(tmp, dtype=dtype)
+        else:
+            tmp = np.asarray(tmp, dtype=dtype)
+        res.append(tmp.flatten().astype(dtype))
+
+    res = np.concatenate(res)
+    print("rank = ", rank, res.shape)
+    res.tofile(filename)
+
+def dump_aoR(mydf, filename:str=None):
+
+    res_int   = []
+    res_float = []
+
+    for attr in ["aoR", "aoR1", "aoRg"]:
+        if hasattr(mydf, attr):
+            tmp = getattr(mydf, attr)
+            if tmp is None:
+                print("%s is None" % (attr))
+                continue
+            tmp1, tmp2 = flatten_aoR_holder(tmp)
+            res_int.append(tmp1)
+            res_float.append(tmp2)
+
+    res_int   = np.concatenate(res_int)
+    res_float = np.concatenate(res_float)
+
+    print("rank = ", rank, res_int.shape, res_float.shape)
+    res_int.tofile(filename + "_int.dat")
+    res_float.tofile(filename + "_float.dat")
+
+
+############## MPI version of PBC_ISDF_Info_Quad ##############
+
+class PBC_ISDF_Info_Quad_MPI(isdf_local.PBC_ISDF_Info_Quad):
+    ''' Interpolative separable density fitting (ISDF) for periodic systems, MPI-parallel version.
+
+    Locality of the atomic orbitals is exploited.
+
+    k-point sampling is not supported by this class; use PBC_ISDF_Info_Quad_K_MPI instead.
+
+    '''
+
+    # Quad stands for quadratic scaling
+
+    def __init__(self, mol:Cell,
+                 kmesh             = None,
+                 verbose           = None,
+                 rela_cutoff_QRCP  = None,
+                 aoR_cutoff        = 1e-8,
+                 limited_memory    = False,
+                 build_K_bunchsize = None):
+
+        super().__init__(mol, True, kmesh, verbose, rela_cutoff_QRCP, aoR_cutoff, True,
+                         use_occ_RI_K     = False,
+                         limited_memory   = limited_memory,
+                         build_K_bunchsize = build_K_bunchsize)
+        self.use_mpi = True
+        assert self.use_aft_ao == False
+
+    dump_attributes = dump_attributes
+    dump_aoR        = dump_aoR
+
+###############################################################
+
+############## MPI version of PBC_ISDF_Info_Quad_K ##############
+
+class PBC_ISDF_Info_Quad_K_MPI(isdf_local_k.PBC_ISDF_Info_Quad_K):
+    ''' Interpolative separable density fitting (ISDF) for periodic systems, MPI-parallel version.
+
+    Locality of the atomic orbitals is exploited.
+
+    k-point sampling is supported through the kmesh argument.
+ + ''' + + # Quad stands for quadratic scaling + + def __init__(self, mol:Cell, + kmesh = None, + verbose = None, + rela_cutoff_QRCP = None, + aoR_cutoff = 1e-8, + limited_memory = False, + build_K_bunchsize = None): + + super().__init__(mol, True, kmesh, verbose, rela_cutoff_QRCP, aoR_cutoff, True, + # use_occ_RI_K = False, + limited_memory = limited_memory, + build_K_bunchsize = build_K_bunchsize) + self.use_mpi = True + assert self.use_aft_ao == False + + dump_attributes = dump_attributes + dump_aoR = dump_aoR + +################################################################# + +if __name__ == '__main__': + + C = 15 + from pyscf.lib.parameters import BOHR + from isdf_tools_cell import build_supercell, build_supercell_with_partition + + verbose = 6 + if rank != 0: + verbose = 0 + + prim_a = np.array( + [[14.572056092/2, 0.000000000, 0.000000000], + [0.000000000, 14.572056092/2, 0.000000000], + [0.000000000, 0.000000000, 6.010273939],]) * BOHR + atm = [ +['Cu1', (1.927800, 1.927800, 1.590250)], +['O1', (1.927800, 0.000000, 1.590250)], +['O1', (0.000000, 1.927800, 1.590250)], +['Ca', (0.000000, 0.000000, 0.000000)], + ] + from pyscf.gto.basis import parse_nwchem + fbas="basis2.dat" ## NOTE: you should copy it from examples/isdf to run this scripts + atms = ['O', 'Cu', "Ca"] + basis = {atm:parse_nwchem.load(fbas, atm) for atm in atms} + pseudo = {'Cu1': 'gth-pbe-q19', 'Cu2': 'gth-pbe-q19', 'O1': 'gth-pbe', 'Ca': 'gth-pbe'} + ke_cutoff = 128 + prim_cell = build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=ke_cutoff, basis=basis, pseudo=pseudo) + prim_mesh = prim_cell.mesh + KE_CUTOFF = 128 + + prim_mesh = prim_cell.mesh + prim_partition = [[0], [1], [2], [3]] + + Ls = [2, 2, 1] + Ls = np.array(Ls, dtype=np.int32) + mesh = [Ls[0] * prim_mesh[0], Ls[1] * prim_mesh[1], Ls[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell, group_partition = build_supercell_with_partition(atm, prim_a, mesh=mesh, + Ls=Ls, + basis=basis, pseudo=pseudo, + partition=prim_partition, ke_cutoff=KE_CUTOFF, verbose=verbose) + if rank == 0: + print("group_partition = ", group_partition) + + pbc_isdf_info = PBC_ISDF_Info_Quad_MPI(cell, aoR_cutoff=1e-8, verbose=verbose, limited_memory=True, build_K_bunchsize=16) + pbc_isdf_info.build_IP_local(c=C, m=5, group=group_partition) + pbc_isdf_info.Ls = Ls + pbc_isdf_info.build_auxiliary_Coulomb(debug=True) + + from pyscf.pbc import scf + + if comm_size > 1: + comm.Barrier() + + mf = scf.RHF(cell) + mf = scf.addons.smearing_(mf, sigma=0.2, method='fermi') + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 16 + mf.conv_tol = 0.0 + + dm = mf.init_guess_by_atom() + + if comm_size > 1: + dm = bcast(dm, root=0) + + mf.kernel(dm) + + comm.Barrier() \ No newline at end of file diff --git a/pyscf/isdf/isdf_local_jk.py b/pyscf/isdf/isdf_local_jk.py new file mode 100644 index 000000000..c414df9e1 --- /dev/null +++ b/pyscf/isdf/isdf_local_jk.py @@ -0,0 +1,2112 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# + +############ sys module ############ + +import copy, sys +import ctypes +import numpy as np + +############ pyscf module ############ + +from pyscf import lib +from pyscf.lib import logger +from pyscf.pbc import tools +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point +from pyscf.pbc.df.df_jk import _ewald_exxdiv_for_G0 +from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks +libisdf = lib.load_library('libisdf') + +############ isdf utils ############ + +from pyscf.isdf.isdf_jk import _benchmark_time +from pyscf.isdf._isdf_local_K_direct import _isdf_get_K_direct_kernel_1 +import pyscf.isdf.isdf_tools_linearop as lib_isdf + +############ GLOBAL PARAMETER ############ + +J_MAX_GRID_BUNCHSIZE = 8192 + +################################################## +# +# only Gamma Point +# +################################################## + +### ls = linear scaling + +def _half_J(mydf, dm, use_mpi=False, + first_pass = None, + short_range = False): + + if use_mpi: + assert mydf.direct == True + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce + size = comm.Get_size() + + ######### prepare the parameter ######### + + assert first_pass in [None, "only_dd", "only_cc", "exclude_cc", "all"] + + if first_pass is None: + first_pass = "all" + + first_pass_all = first_pass == "all" + first_pass_has_dd = first_pass in ["all", "only_dd", "exclude_cc"] + first_pass_has_cc = first_pass in ["all", "only_cc"] + first_pass_has_cd = first_pass in ["all", "exclude_cc"] + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(mesh) + + aoR = mydf.aoR + assert isinstance(aoR, list) + naux = mydf.naux + + #### step 0. allocate buffer + + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoR if aoR_holder is not None]) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + ngrids_local = np.sum([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + + density_R = np.zeros((ngrid,), dtype=np.float64) + + dm_buf = np.zeros((max_nao_involved, max_nao_involved), dtype=np.float64) + max_col_buf = min(max_ngrid_involved, J_MAX_GRID_BUNCHSIZE) + ddot_buf = np.zeros((max_nao_involved, max_col_buf), dtype=np.float64) + + fn_multiplysum = getattr(libisdf, "_fn_J_dmultiplysum", None) + assert fn_multiplysum is not None + + ##### get the involved C function ##### + + fn_extract_dm = getattr(libisdf, "_extract_dm_involved_ao", None) + assert fn_extract_dm is not None + + fn_extract_dm2 = getattr(libisdf, "_extract_dm_involved_ao_RS", None) + assert fn_extract_dm is not None + + fn_packadd_dm = getattr(libisdf, "_packadd_local_dm", None) + assert fn_packadd_dm is not None + + #### step 1. 
get density value on real space grid and IPs + + group = mydf.group + ngroup = len(group) + + density_R_tmp = None + + density_R_tmp_buf = np.zeros((max_ngrid_involved,), dtype=np.float64) + + def _get_rhoR( + bra_aoR, + bra_ao_involved, + ket_aoR, + ket_ao_involved, + bra_type, + ket_type + ): + + nbra_ao = bra_aoR.shape[0] + nket_ao = ket_aoR.shape[0] + if bra_type == ket_type: + dm_now = np.ndarray((nbra_ao, nbra_ao), buffer=dm_buf) + fn_extract_dm( + dm.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao), + dm_now.ctypes.data_as(ctypes.c_void_p), + bra_ao_involved.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbra_ao), + ) + + # _density_R_tmp = np.zeros((ket_aoR.shape[1],), dtype=np.float64) + _density_R_tmp = np.ndarray((ket_aoR.shape[1],), buffer=density_R_tmp_buf) + + for p0, p1 in lib.prange(0, ket_aoR.shape[1], J_MAX_GRID_BUNCHSIZE): + ddot_res = np.ndarray((nbra_ao, p1-p0), buffer=ddot_buf) + lib.ddot(dm_now, ket_aoR[:,p0:p1], c=ddot_res) + _res_tmp = np.ndarray((p1-p0,), + dtype =_density_R_tmp.dtype, + buffer=_density_R_tmp, + offset=p0*_density_R_tmp.dtype.itemsize) + fn_multiplysum( + _res_tmp.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbra_ao), + ctypes.c_int(p1-p0), + bra_aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bra_aoR.shape[0]), + ctypes.c_int(bra_aoR.shape[1]), + ctypes.c_int(0), + ctypes.c_int(p0), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbra_ao), + ctypes.c_int(p1-p0), + ctypes.c_int(0), + ctypes.c_int(0)) + return _density_R_tmp + else: + dm_now = np.ndarray((nbra_ao, nket_ao), buffer=dm_buf) + fn_extract_dm2( + dm.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao), + dm_now.ctypes.data_as(ctypes.c_void_p), + bra_ao_involved.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bra_ao_involved.shape[0]), + ket_ao_involved.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ket_ao_involved.shape[0]), + ) + # _density_R_tmp = np.zeros((ket_aoR.shape[1],), dtype=np.float64) + _density_R_tmp = np.ndarray((ket_aoR.shape[1],), buffer=density_R_tmp_buf) + + for p0, p1 in lib.prange(0, ket_aoR.shape[1], J_MAX_GRID_BUNCHSIZE): + ddot_res = np.ndarray((nbra_ao, p1-p0), buffer=ddot_buf) + lib.ddot(dm_now, ket_aoR[:,p0:p1], c=ddot_res) + _res_tmp = np.ndarray((p1-p0,), + dtype =_density_R_tmp.dtype, + buffer=_density_R_tmp, + offset=p0*_density_R_tmp.dtype.itemsize) + fn_multiplysum( + _res_tmp.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbra_ao), + ctypes.c_int(p1-p0), + bra_aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bra_aoR.shape[0]), + ctypes.c_int(bra_aoR.shape[1]), + ctypes.c_int(0), + ctypes.c_int(p0), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbra_ao), + ctypes.c_int(p1-p0), + ctypes.c_int(0), + ctypes.c_int(0)) + + return _density_R_tmp * 2.0 + + for atm_id, aoR_holder in enumerate(aoR): + + if aoR_holder is None: + continue + + if use_mpi: + if atm_id % comm_size != rank: + continue + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + global_gridID_begin = aoR_holder.global_gridID_begin + nCompact = aoR_holder.nCompact + + if first_pass_all: + density_R_tmp = _get_rhoR( + aoR_holder.aoR, + aoR_holder.ao_involved, + aoR_holder.aoR, + aoR_holder.ao_involved, + "all", + "all" + ) + + density_R[global_gridID_begin:global_gridID_begin+ngrids_now] = density_R_tmp + else: + + if first_pass_has_cc: + density_R_tmp = _get_rhoR( + aoR_holder.aoR[:nCompact,:], + aoR_holder.ao_involved[:nCompact], + aoR_holder.aoR[:nCompact,:], + aoR_holder.ao_involved[:nCompact], + "compact", + "compact" + ) 
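+                    # The density is assembled blockwise over the AO partition:
+                    #   rho_xy(r) = sum_{mu in x, nu in y} D_{mu nu} phi_mu(r) phi_nu(r)
+                    # with x, y in {compact, diffuse}. _get_rhoR already doubles
+                    # the mixed (bra != ket) block to cover both D_cd and D_dc,
+                    # so the cc contribution above and the dd/cd contributions
+                    # below are each accumulated exactly once.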
+ + density_R[global_gridID_begin:global_gridID_begin+ngrids_now] += density_R_tmp + + if first_pass_has_dd: + density_R_tmp = _get_rhoR( + aoR_holder.aoR[nCompact:,:], + aoR_holder.ao_involved[nCompact:], + aoR_holder.aoR[nCompact:,:], + aoR_holder.ao_involved[nCompact:], + "diffuse", + "diffuse" + ) + + density_R[global_gridID_begin:global_gridID_begin+ngrids_now] += density_R_tmp + + if first_pass_has_cd: + density_R_tmp = _get_rhoR( + aoR_holder.aoR[:nCompact,:], + aoR_holder.ao_involved[:nCompact], + aoR_holder.aoR[nCompact:,:], + aoR_holder.ao_involved[nCompact:], + "compact", + "diffuse" + ) + density_R[global_gridID_begin:global_gridID_begin+ngrids_now] += density_R_tmp + + # assert local_grid_loc == ngrids_local + + if use_mpi: + density_R = reduce(density_R, root=0) + else: + assert ngrids_local == np.prod(mesh) + + grid_ID_ordered = mydf.grid_ID_ordered + + if (use_mpi and rank == 0) or (use_mpi == False): + density_R_original = np.zeros_like(density_R) + + fn_order = getattr(libisdf, "_Reorder_Grid_to_Original_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(density_R.size), + mydf.grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + density_R.ctypes.data_as(ctypes.c_void_p), + density_R_original.ctypes.data_as(ctypes.c_void_p), + ) + + density_R = density_R_original.copy() + + J = None + + if (use_mpi and rank == 0) or (use_mpi == False): + + fn_J = getattr(libisdf, "_construct_J", None) + assert(fn_J is not None) + + J = np.zeros_like(density_R) + + if short_range: + coulG = mydf.coulG_SR + else: + coulG = mydf.coulG + + fn_J( + mesh.ctypes.data_as(ctypes.c_void_p), + density_R.ctypes.data_as(ctypes.c_void_p), + coulG.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + ) + + J_ordered = np.zeros_like(J) + + fn_order = getattr(libisdf, "_Original_Grid_to_Reorder_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(J.size), + mydf.grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + J_ordered.ctypes.data_as(ctypes.c_void_p), + ) + + J = J_ordered.copy() + + if use_mpi: + J = bcast(J, root=0) + + t2 = (logger.process_clock(), logger.perf_counter()) + + del dm_buf, ddot_buf, density_R + del density_R_tmp + + _benchmark_time(t1, t2, "half_J", mydf) + + return J + +def _contract_j_dm_ls(mydf, dm, + use_mpi = False, + first_pass = None, + second_pass = None, + short_range = False): + + if use_mpi: + assert mydf.direct == True + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce + size = comm.Get_size() + + ###### Prepocess parameter for RS ###### + + assert first_pass in [None, "only_dd", "only_cc", "exclude_cc", "all"] + assert second_pass in [None, "only_dd", "only_cc", "exclude_cc", "all"] + + if short_range: + assert first_pass == "only_dd" + assert second_pass == "only_dd" + + if first_pass is None: + first_pass = "all" + if second_pass is None: + second_pass = "all" + + second_pass_all = second_pass == "all" + second_pass_has_dd = second_pass in ["all", "only_dd", "exclude_cc"] + second_pass_has_cc = second_pass in ["all", "only_cc"] + second_pass_has_cd = second_pass in ["all", "exclude_cc"] + + ####### judge whether to call the original one ####### + + if isinstance(mydf.aoRg, np.ndarray): + has_aoR = False + if hasattr(mydf, "aoR") and mydf.aoR is not None: + assert isinstance(mydf.aoR, np.ndarray) + has_aoR = True + ### call the original get_j ### + from isdf_jk import _contract_j_dm_fast, _contract_j_dm_wo_robust_fitting + if has_aoR: + return 
_contract_j_dm_fast(mydf, dm, use_mpi=use_mpi) + else: + return _contract_j_dm_wo_robust_fitting(mydf, dm, use_mpi=use_mpi) + + ####### Start the calculation ######## + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(mesh) + + aoR = mydf.aoR + assert isinstance(aoR, list) + naux = mydf.naux + + #### step 0. allocate buffer + + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoR if aoR_holder is not None]) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + ngrids_local = np.sum([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + + density_R = np.zeros((ngrid,), dtype=np.float64) + + # max_dim_buf = max(max_ngrid_involved, max_nao_involved) + max_dim_buf = max_nao_involved + ddot_buf = np.zeros((max_dim_buf, max_dim_buf), dtype=np.float64) + aoR_buf1 = np.zeros((max_nao_involved, max_ngrid_involved), dtype=np.float64) + + ##### get the involved C function ##### + + fn_packadd_dm = getattr(libisdf, "_packadd_local_dm", None) + assert fn_packadd_dm is not None + + fn_packadd_dm2 = getattr(libisdf, "_packadd_local_RS", None) + assert fn_packadd_dm2 is not None + + #### step 1 2. get density value on real space grid and IPs + + group = mydf.group + ngroup = len(group) + + J = _half_J(mydf, dm, use_mpi, first_pass, short_range) + + #### step 3. get J + + J_Res = np.zeros((nao, nao), dtype=np.float64) + + ordered_ao_ind = np.arange(nao) + + def _get_j_pass2_ls(_aoR_bra, + _ao_involved_bra, + _aoR_ket, + _ao_involved_ket, + _bra_type, + _ket_type, + _potential, + _Res): + + nao_bra = _aoR_bra.shape[0] + nao_ket = _aoR_ket.shape[0] + + if _bra_type == _ket_type: + + aoR_J_res = np.ndarray(_aoR_ket.shape, buffer=aoR_buf1) + lib_isdf.d_ij_j_ij(_aoR_ket, _potential, out=aoR_J_res) + ddot_res = np.ndarray((nao_ket, nao_ket), buffer=ddot_buf) + lib.ddot(_aoR_ket, aoR_J_res.T, c=ddot_res) + + if nao_ket == nao and np.allclose(_ao_involved_ket, ordered_ao_ind): + _Res += ddot_res + else: + fn_packadd_dm( + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_ket), + _ao_involved_ket.ctypes.data_as(ctypes.c_void_p), + _Res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(_Res.shape[0]) + ) + else: + + ### J_Res = ddot_res + ddot_res.T + + aoR_J_res = np.ndarray(_aoR_ket.shape, buffer=aoR_buf1) + lib_isdf.d_ij_j_ij(_aoR_ket, _potential, out=aoR_J_res) + ddot_res = np.ndarray((nao_bra, nao_ket), buffer=ddot_buf) + lib.ddot(_aoR_bra, aoR_J_res.T, c=ddot_res) + + fn_packadd_dm2( + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_bra), + _ao_involved_bra.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_ket), + _ao_involved_ket.ctypes.data_as(ctypes.c_void_p), + _Res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(_Res.shape[0]) + ) + + + for atm_id, aoR_holder in enumerate(aoR): + + if aoR_holder is None: + continue + + if use_mpi: + if atm_id % comm_size != rank: + continue + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.nao_involved + nao_compact = aoR_holder.nCompact + nao_diffuse = nao_involved - nao_compact + + global_gridID_begin = aoR_holder.global_gridID_begin + + J_tmp = J[global_gridID_begin:global_gridID_begin+ngrids_now] + + if second_pass_all: ### with RS case ### + + _get_j_pass2_ls( + aoR_holder.aoR, + aoR_holder.ao_involved, + 
aoR_holder.aoR,
+ aoR_holder.ao_involved,
+ "all",
+ "all",
+ J_tmp,
+ J_Res
+ )
+
+ else:
+
+ if second_pass_has_cc:
+ _get_j_pass2_ls(
+ aoR_holder.aoR[:nao_compact,:],
+ aoR_holder.ao_involved[:nao_compact],
+ aoR_holder.aoR[:nao_compact,:],
+ aoR_holder.ao_involved[:nao_compact],
+ "compact",
+ "compact",
+ J_tmp,
+ J_Res
+ )
+
+ if second_pass_has_dd:
+ _get_j_pass2_ls(
+ aoR_holder.aoR[nao_compact:,:],
+ aoR_holder.ao_involved[nao_compact:],
+ aoR_holder.aoR[nao_compact:,:],
+ aoR_holder.ao_involved[nao_compact:],
+ "diffuse",
+ "diffuse",
+ J_tmp,
+ J_Res
+ )
+
+ if second_pass_has_cd:
+ _get_j_pass2_ls(
+ aoR_holder.aoR[:nao_compact,:],
+ aoR_holder.ao_involved[:nao_compact],
+ aoR_holder.aoR[nao_compact:,:],
+ aoR_holder.ao_involved[nao_compact:],
+ "compact",
+ "diffuse",
+ J_tmp,
+ J_Res
+ )
+
+ J = J_Res
+
+ if use_mpi:
+ J = reduce(J, root=0)
+
+ t2 = (logger.process_clock(), logger.perf_counter())
+
+ _benchmark_time(t1, t2, "_contract_j_dm_ls", mydf)
+
+ ######### delete the buffer #########
+
+ del ddot_buf
+ del aoR_buf1
+
+ return J * ngrid / vol
+
+def _contract_j_dm_wo_robust_fitting(mydf, dm, use_mpi=False):
+
+ if use_mpi:
+ assert mydf.direct == True
+ from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce
+ size = comm.Get_size()
+
+ ####### judge whether to call the original one #######
+
+ if isinstance(mydf.aoRg, np.ndarray):
+ from pyscf.isdf.isdf_jk import _contract_j_dm_wo_robust_fitting
+ return _contract_j_dm_wo_robust_fitting(mydf, dm, use_mpi=use_mpi)
+
+ ######## start the calculation ########
+
+ t1 = (logger.process_clock(), logger.perf_counter())
+
+ if len(dm.shape) == 3:
+ assert dm.shape[0] == 1
+ dm = dm[0]
+
+ nao = dm.shape[0]
+ cell = mydf.cell
+ assert cell.nao == nao
+ vol = cell.vol
+ mesh = np.array(cell.mesh, dtype=np.int32)
+ ngrid = np.prod(mesh)
+
+ aoRg = mydf.aoRg
+ assert isinstance(aoRg, list)
+ naux = mydf.naux
+ W = mydf.W
+
+ #### step 0. allocate buffer
+
+ max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoRg if aoR_holder is not None])
+ max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoRg if aoR_holder is not None])
+ ngrids_local = np.sum([aoR_holder.aoR.shape[1] for aoR_holder in aoRg if aoR_holder is not None])
+
+ density_Rg = np.zeros((naux,), dtype=np.float64)
+
+ dm_buf = np.zeros((max_nao_involved, max_nao_involved), dtype=np.float64)
+ max_dim_buf = max(max_ngrid_involved, max_nao_involved)
+ ddot_buf = np.zeros((max_dim_buf, max_dim_buf), dtype=np.float64)
+ aoR_buf1 = np.zeros((max_nao_involved, max_ngrid_involved), dtype=np.float64)
+
+ ##### get the involved C function #####
+
+ fn_extract_dm = getattr(libisdf, "_extract_dm_involved_ao", None)
+ assert fn_extract_dm is not None
+
+ fn_packadd_dm = getattr(libisdf, "_packadd_local_dm", None)
+ assert fn_packadd_dm is not None
+
+ #### step 1.
get density value on real space grid and IPs
+
+ group = mydf.group
+ ngroup = len(group)
+
+ density_R_tmp = None
+ ordered_ao_ind = np.arange(nao)
+
+ for atm_id, aoR_holder in enumerate(aoRg):
+
+ if aoR_holder is None:
+ continue
+
+ if use_mpi:
+ if atm_id % comm_size != rank:
+ continue
+
+ ngrids_now = aoR_holder.aoR.shape[1]
+ nao_involved = aoR_holder.aoR.shape[0]
+
+ if nao_involved < nao or (nao_involved == nao and not np.allclose(aoR_holder.ao_involved, ordered_ao_ind)):
+ fn_extract_dm(
+ dm.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_int(nao),
+ dm_buf.ctypes.data_as(ctypes.c_void_p),
+ aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_int(nao_involved),
+ )
+ else:
+ dm_buf.ravel()[:] = dm.ravel()
+
+ dm_now = np.ndarray((nao_involved, nao_involved), buffer=dm_buf)
+
+ ddot_res = np.ndarray((nao_involved, ngrids_now), buffer=ddot_buf)
+
+ lib.ddot(dm_now, aoR_holder.aoR, c=ddot_res)
+ density_R_tmp = lib.multiply_sum_isdf(aoR_holder.aoR, ddot_res)
+
+ global_gridID_begin = aoR_holder.global_gridID_begin
+
+ density_Rg[global_gridID_begin:global_gridID_begin+ngrids_now] = density_R_tmp
+
+ if use_mpi == False:
+ assert ngrids_local == naux
+
+ if use_mpi:
+ density_Rg = reduce(density_Rg, root=0)
+
+ #### step 3. get J
+
+ J = np.asarray(lib.dot(W, density_Rg.reshape(-1,1)), order='C').reshape(-1)
+
+ if use_mpi:
+ J = bcast(J, root=0)
+
+ J_Res = np.zeros((nao, nao), dtype=np.float64)
+
+ for atm_id, aoR_holder in enumerate(aoRg):
+
+ if aoR_holder is None:
+ continue
+
+ if use_mpi:
+ if atm_id % comm_size != rank:
+ continue
+
+ ngrids_now = aoR_holder.aoR.shape[1]
+ nao_involved = aoR_holder.aoR.shape[0]
+
+ global_gridID_begin = aoR_holder.global_gridID_begin
+
+ J_tmp = J[global_gridID_begin:global_gridID_begin+ngrids_now]
+
+ aoR_J_res = np.ndarray(aoR_holder.aoR.shape, buffer=aoR_buf1)
+ lib_isdf.d_ij_j_ij(aoR_holder.aoR, J_tmp, out=aoR_J_res)
+ ddot_res = np.ndarray((nao_involved, nao_involved), buffer=ddot_buf)
+ lib.ddot(aoR_holder.aoR, aoR_J_res.T, c=ddot_res)
+
+ if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind):
+ J_Res += ddot_res
+ else:
+ fn_packadd_dm(
+ ddot_res.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_int(nao_involved),
+ aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p),
+ J_Res.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_int(nao)
+ )
+
+ J = J_Res
+
+ if use_mpi:
+ J = reduce(J, root=0)
+
+ t2 = (logger.process_clock(), logger.perf_counter())
+
+ _benchmark_time(t1, t2, "_contract_j_dm_wo_robust_fitting", mydf)
+
+ ######### delete the buffer #########
+
+ del dm_buf, ddot_buf, density_Rg
+ del density_R_tmp
+ del aoR_buf1
+
+ return J * ngrid / vol
+
+############# quadratic scaling (not cubic!)
############# + +def __get_DensityMatrixonRgAO_qradratic(mydf, dm, + bra_aoR_holder, + bra_type = None, + _res:np.ndarray = None, + verbose = 1): + + assert bra_type in [None, "all", "compact", "diffuse"] + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] <= 4 + # dm = dm[0] + else: + dm = dm.reshape(1, *dm.shape) + + assert dm.shape[1] == dm.shape[2] + nset, nao = dm.shape[0], dm.shape[1] + + ngrid_bra = np.sum([aoR_holder.aoR.shape[1] for aoR_holder in bra_aoR_holder if aoR_holder is not None]) + + max_ngrid_bra = np.max([aoR_holder.aoR.shape[1] for aoR_holder in bra_aoR_holder if aoR_holder is not None]) + max_ao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in bra_aoR_holder if aoR_holder is not None]) + + if _res is None: + res = np.zeros((nset, ngrid_bra, nao), dtype=np.float64) + else: + res = np.ndarray((nset, ngrid_bra, nao), buffer=_res, dtype=np.float64) + + ### allocate buf ### + + offset = 0 + ddot_buf = np.ndarray((max_ngrid_bra, nao), buffer=mydf.build_k_buf, offset=offset) + offset += ddot_buf.size * ddot_buf.dtype.itemsize + dm_pack_buf = np.ndarray((dm.shape[1], dm.shape[2]), buffer=mydf.build_k_buf, offset=offset) + + ### get pack fn ### + + fn_packrow = getattr(libisdf, "_buildK_packrow", None) + assert fn_packrow is not None + fn_packcol = getattr(libisdf, "_buildK_packcol", None) + assert fn_packcol is not None + + ### perform aoR_bra.T * dm + + ordered_ao_ind = np.arange(nao) + grid_shift = None + ngrid_loc = 0 + + for aoR_holder in bra_aoR_holder: + + if aoR_holder is None: + continue + + ngrid_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + nao_compact = aoR_holder.nCompact + + ao_begin_indx = 0 + ao_end_indx = nao_involved + if bra_type == "compact": + ao_end_indx = nao_compact + elif bra_type == "diffuse": + ao_begin_indx = nao_compact + + nao_at_work = ao_end_indx - ao_begin_indx + + for iset in range(nset): + if (nao_at_work) == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + dm_packed = dm[iset] + else: + dm_packed = np.ndarray((nao_at_work, nao), buffer=dm_pack_buf) + fn_packrow( + dm_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_at_work), + ctypes.c_int(nao), + dm[iset].ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao), + ctypes.c_int(nao), + aoR_holder.ao_involved[ao_begin_indx:ao_end_indx].ctypes.data_as(ctypes.c_void_p) + ) + + ddot_res = np.ndarray((ngrid_now, nao), buffer=ddot_buf) + lib.ddot(aoR_holder.aoR[ao_begin_indx:ao_end_indx,:].T, dm_packed, c=ddot_res) + grid_loc_begin = aoR_holder.global_gridID_begin + + if grid_shift is None: + grid_shift = grid_loc_begin + else: + assert grid_loc_begin>=grid_shift + + res[iset, grid_loc_begin-grid_shift:grid_loc_begin-grid_shift+ngrid_now, :] = ddot_res + + t2 = (logger.process_clock(), logger.perf_counter()) + return res + +def _contract_k_dm_quadratic(mydf, dm, with_robust_fitting=True, use_mpi=False): + + if use_mpi: + raise NotImplementedError("MPI is not supported yet.") + + ####### judge whether to call the original one ####### + + if isinstance(mydf.aoRg, np.ndarray): + from isdf_jk import _contract_k_dm, _contract_k_dm_wo_robust_fitting + if mydf.aoR is None: + return _contract_k_dm_wo_robust_fitting(mydf, dm, False, use_mpi=use_mpi) + else: + return _contract_k_dm(mydf, dm, with_robust_fitting, use_mpi=use_mpi) + + ######## start the calculation ######## + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + 
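+ # [editorial sketch] Dense-matrix view of the quadratic-scaling exchange
+ # assembled below, with A = aoRg viewed as an (nao, naux) matrix of AO values
+ # on the interpolation points and * an elementwise product (illustration
+ # only; the real code works on packed per-atom blocks):
+ #
+ #   D_RgAO = A.T @ dm                  # step 1, (naux, nao)
+ #   D_RgRg = D_RgAO @ A                # (naux, naux)
+ #   K      = A @ (W * D_RgRg) @ A.T    # W part, step 2
+ #
+ # With robust fitting, the analogous V_R terms over the full grid are added
+ # in step 3 and the W part enters with a minus sign.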
nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(mesh) + + aoRg = mydf.aoRg + assert isinstance(aoRg, list) + aoR = mydf.aoR + assert isinstance(aoR, list) + + naux = mydf.naux + nao = cell.nao + + #### step 0. allocate buffer + + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoR if aoR_holder is not None]) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + max_nIP_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoRg if aoR_holder is not None]) + + mydf.allocate_k_buffer() + + # ddot_res_buf = np.zeros((naux, max_nao_involved), dtype=np.float64) + ddot_res_buf = mydf.build_k_buf + + ##### get the involved C function ##### + + fn_packadd_row = getattr(libisdf, "_buildK_packaddrow", None) + assert fn_packadd_row is not None + fn_packadd_col = getattr(libisdf, "_buildK_packaddcol", None) + assert fn_packadd_col is not None + + fn_packcol1 = getattr(libisdf, "_buildK_packcol", None) + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol1 is not None + assert fn_packcol2 is not None + + #### step 1. get density matrix value on real space grid and IPs + + Density_RgAO = __get_DensityMatrixonRgAO_qradratic(mydf, dm, aoRg, "all", mydf.Density_RgAO_buf) + Density_RgAO = Density_RgAO[0] + + #### step 2. get K, those part which W is involved + + W = mydf.W + assert W is not None + assert isinstance(W, np.ndarray) + + K1 = np.zeros((naux, nao), dtype=np.float64) + + ####### buf for the first loop ####### + + offset = 0 + ddot_buf1 = np.ndarray((naux, max_nIP_involved), buffer=ddot_res_buf, offset=offset, dtype=np.float64) + offset = ddot_buf1.size * ddot_res_buf.dtype.itemsize + pack_buf = np.ndarray((naux, max_nao_involved), buffer=ddot_res_buf, offset=offset, dtype=np.float64) + offset+= pack_buf.size * pack_buf.dtype.itemsize + ddot_buf2 = np.ndarray((naux, max(max_nIP_involved, max_nao_involved)), buffer=ddot_res_buf, offset=offset, dtype=np.float64) + + ordered_ao_ind = np.arange(nao) + + ### TODO: consider MPI + + nIP_loc = 0 + for aoRg_holder in aoRg: + + if aoRg_holder is None: + continue + + nIP_now = aoRg_holder.aoR.shape[1] + nao_involved = aoRg_holder.aoR.shape[0] + + #### pack the density matrix #### + + if nao_involved == nao and np.allclose(aoRg_holder.ao_involved, ordered_ao_ind): + Density_RgAO_packed = Density_RgAO + else: + # Density_RgAO_packed = Density_RgAO[:, aoRg_holder.ao_involved] + Density_RgAO_packed = np.ndarray((naux, nao_involved), buffer=pack_buf) + fn_packcol1( + Density_RgAO_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(nao_involved), + Density_RgAO.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(nao), + aoRg_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + # W_tmp = Density_RgRg[:, nIP_loc:nIP_loc+nIP_now] * W[:, nIP_loc:nIP_loc+nIP_now] + + ddot_res1 = np.ndarray((naux, nIP_now), buffer=ddot_buf1) + lib.ddot(Density_RgAO_packed, aoRg_holder.aoR, c=ddot_res1) + Density_RgRg = ddot_res1 + W_packed = np.ndarray((naux, nIP_now), buffer=ddot_buf2) + fn_packcol2( + W_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(nIP_now), + W.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(naux), + ctypes.c_int(nIP_loc), + ctypes.c_int(nIP_loc+nIP_now) + ) + lib_isdf.cwise_mul(W_packed, Density_RgRg, out=Density_RgRg) + W_tmp = Density_RgRg + + # ddot + + ddot_res = 
np.ndarray((naux, nao_involved), buffer=ddot_buf2) + lib.ddot(W_tmp, aoRg_holder.aoR.T, c=ddot_res) + + if nao_involved == nao and np.allclose(aoRg_holder.ao_involved, ordered_ao_ind): + K1 += ddot_res + else: + # K1[: , aoRg_holder.ao_involved] += ddot_res + fn_packadd_col( + K1.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K1.shape[0]), + ctypes.c_int(K1.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoRg_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + nIP_loc += nIP_now + # del W_tmp + assert nIP_loc == naux + + K = np.zeros((nao, nao), dtype=np.float64) + + nIP_loc = 0 + for aoRg_holder in aoRg: + + if aoRg_holder is None: + continue + + nIP_now = aoRg_holder.aoR.shape[1] + nao_involved = aoRg_holder.aoR.shape[0] + + K_tmp = K1[nIP_loc:nIP_loc+nIP_now, :] + + ddot_res = np.ndarray((nao_involved, nao), buffer=ddot_res_buf) + lib.ddot(aoRg_holder.aoR, K_tmp, c=ddot_res) + + if nao_involved == nao and np.allclose(aoRg_holder.ao_involved, ordered_ao_ind): + K += ddot_res + else: + # K[aoRg_holder.ao_involved, :] += ddot_res + fn_packadd_row( + K.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K.shape[0]), + ctypes.c_int(K.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoRg_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + nIP_loc += nIP_now + # del K_tmp + assert nIP_loc == naux + + #### step 3. get K, those part which W is not involved, with robust fitting + + if with_robust_fitting: + + K = -K + + ### calcualte those parts where V is involved + + V_R = mydf.V_R + assert V_R is not None + assert isinstance(V_R, np.ndarray) + + # lib_isdf.cwise_mul(V_R, Density_RgR, out=Density_RgR) + + K2 = K1 + K2.ravel()[:] = 0.0 + + # fn_packcol = getattr(libisdf, "_buildK_packcol2", None) + # assert fn_packcol is not None + + ddot_buf1 = np.ndarray((naux, max_nao_involved), buffer=ddot_res_buf) + offset = naux * max_nao_involved * ddot_res_buf.dtype.itemsize + V_tmp_buf = np.ndarray((naux, max_ngrid_involved), buffer=ddot_res_buf, offset=offset) + offset += V_tmp_buf.size * V_tmp_buf.dtype.itemsize + pack_buf = np.ndarray((naux, max_nao_involved), buffer=ddot_res_buf, offset=offset) + offset += pack_buf.size * pack_buf.dtype.itemsize + ddot_buf2 = np.ndarray((naux, max_ngrid_involved), buffer=ddot_res_buf, offset=offset) + + ngrid_loc = 0 + + for aoR_holder in aoR: + + if aoR_holder is None: + continue + + ngrid_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + #### pack the density matrix #### + + if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + Density_RgAO_packed = Density_RgAO + else: + # Density_RgAO_packed = Density_RgAO[:, aoR_holder.ao_involved] + Density_RgAO_packed = np.ndarray((naux, nao_involved), buffer=pack_buf) + fn_packcol1( + Density_RgAO_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(nao_involved), + Density_RgAO.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(nao), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + # V_tmp = Density_RgR[:, ngrid_loc:ngrid_loc+ngrid_now] * V_R[:, ngrid_loc:ngrid_loc+ngrid_now] + + ddot_res2 = np.ndarray((naux, ngrid_now), buffer=ddot_buf2) + lib.ddot(Density_RgAO_packed, aoR_holder.aoR, c=ddot_res2) + Density_RgR = ddot_res2 + V_packed = np.ndarray((naux, ngrid_now), buffer=V_tmp_buf) + fn_packcol2( + V_packed.ctypes.data_as(ctypes.c_void_p), + 
ctypes.c_int(naux), + ctypes.c_int(ngrid_now), + V_R.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(ngrid), + ctypes.c_int(ngrid_loc), + ctypes.c_int(ngrid_loc+ngrid_now) + ) + lib_isdf.cwise_mul(V_packed, Density_RgR, out=Density_RgR) + V_tmp = Density_RgR + + ddot_res = np.ndarray((naux, nao_involved), buffer=ddot_buf1) + lib.ddot(V_tmp, aoR_holder.aoR.T, c=ddot_res) + + if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + K2 += ddot_res + else: + # K2[: , aoR_holder.ao_involved] += ddot_res + fn_packadd_col( + K2.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K2.shape[0]), + ctypes.c_int(K2.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + ngrid_loc += ngrid_now + # del V_tmp + + assert ngrid_loc == ngrid + + K_add = np.zeros((nao, nao), dtype=np.float64) + + nIP_loc = 0 + for aoRg_holder in aoRg: + + if aoRg_holder is None: + continue + + nIP_now = aoRg_holder.aoR.shape[1] + nao_involved = aoRg_holder.aoR.shape[0] + + K_tmp = K2[nIP_loc:nIP_loc+nIP_now, :] # no need to pack, continguous anyway + + ddot_res = np.ndarray((nao_involved, nao), buffer=ddot_res_buf) + lib.ddot(aoRg_holder.aoR, K_tmp, c=ddot_res) + + if nao == nao_involved and np.allclose(aoRg_holder.ao_involved, ordered_ao_ind): + K_add += ddot_res + else: + # K_add[aoRg_holder.ao_involved, :] += ddot_res + fn_packadd_row( + K_add.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K_add.shape[0]), + ctypes.c_int(K_add.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoRg_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + nIP_loc += nIP_now + # del K_tmp + assert nIP_loc == naux + + K_add += K_add.T + + K += K_add + + ######### finally delete the buffer ######### + + del K1 + + t2 = (logger.process_clock(), logger.perf_counter()) + + # if mydf.verbose: + _benchmark_time(t1, t2, "_contract_k_dm_quadratic", mydf) + + return K * ngrid / vol + +def _contract_k_dm_quadratic_direct(mydf, dm, use_mpi=False): + + if use_mpi: + assert mydf.direct == True + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce + size = comm.Get_size() + + t1 = (logger.process_clock(), logger.perf_counter()) + + if dm.ndim == 3: + assert dm.shape[0] <= 4 + # dm = dm[0] + else: + dm = dm.reshape(1, *dm.shape) + + aoR = mydf.aoR + aoRg = mydf.aoRg + + max_nao_involved = mydf.max_nao_involved + max_ngrid_involved = mydf.max_ngrid_involved + max_nIP_involved = mydf.max_nIP_involved + maxsize_group_naux = mydf.maxsize_group_naux + + ####### preparing the data ####### + + nset, nao = dm.shape[0], dm.shape[1] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + mesh_int32 = mesh + ngrid = np.prod(mesh) + + aoRg = mydf.aoRg + assert isinstance(aoRg, list) + aoR = mydf.aoR + assert isinstance(aoR, list) + + naux = mydf.naux + nao = cell.nao + aux_basis = mydf.aux_basis + + grid_ordering = mydf.grid_ID_ordered + + if hasattr(mydf, "coulG") == False: + if mydf.omega is not None: + assert mydf.omega >= 0.0 + # mydf.coulG = tools.get_coulG(cell, mesh=mesh, omega=mydf.omega) + raise NotImplementedError("coulG is not implemented yet.") + + coulG = mydf.coulG + coulG_real = coulG.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1).copy() + + mydf.allocate_k_buffer(nset) + build_k_buf = mydf.build_k_buf + build_VW_buf = 
mydf.build_VW_in_k_buf + + group = mydf.group + assert len(group) == len(aux_basis) + + ######### allocate buffer ######### + + Density_RgAO_buf = mydf.Density_RgAO_buf + + nThread = lib.num_threads() + bufsize_per_thread = (coulG_real.shape[0] * 2 + np.prod(mesh)) + buf_build_V = np.ndarray((nThread, bufsize_per_thread), dtype=np.float64, buffer=build_VW_buf) + + offset_now = buf_build_V.size * buf_build_V.dtype.itemsize + + build_K_bunchsize = min(maxsize_group_naux, mydf._build_K_bunchsize) + + offset_build_now = 0 + offset_Density_RgR_buf = 0 + Density_RgR_buf = np.ndarray((build_K_bunchsize, ngrid), buffer=build_k_buf, offset=offset_build_now) + + offset_build_now += Density_RgR_buf.size * Density_RgR_buf.dtype.itemsize + offset_ddot_res_RgR_buf = offset_build_now + ddot_res_RgR_buf = np.ndarray((build_K_bunchsize, max_ngrid_involved), buffer=build_k_buf, offset=offset_ddot_res_RgR_buf) + + offset_build_now += ddot_res_RgR_buf.size * ddot_res_RgR_buf.dtype.itemsize + offset_K1_tmp1_buf = offset_build_now + K1_tmp1_buf = np.ndarray((maxsize_group_naux, nao), buffer=build_k_buf, offset=offset_K1_tmp1_buf) + + offset_build_now += K1_tmp1_buf.size * K1_tmp1_buf.dtype.itemsize + offset_K1_tmp1_ddot_res_buf = offset_build_now + K1_tmp1_ddot_res_buf = np.ndarray((maxsize_group_naux, nao), buffer=build_k_buf, offset=offset_K1_tmp1_ddot_res_buf) + + offset_build_now += K1_tmp1_ddot_res_buf.size * K1_tmp1_ddot_res_buf.dtype.itemsize + + offset_K1_final_ddot_buf = offset_build_now + K1_final_ddot_buf = np.ndarray((nao, nao), buffer=build_k_buf, offset=offset_K1_final_ddot_buf) + + ########### get involved C function ########### + + fn_packcol1 = getattr(libisdf, "_buildK_packcol", None) + assert fn_packcol1 is not None + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol2 is not None + fn_packadd_col = getattr(libisdf, "_buildK_packaddcol", None) + assert fn_packadd_col is not None + fn_packadd_row = getattr(libisdf, "_buildK_packaddrow", None) + assert fn_packadd_row is not None + + ordered_ao_ind = np.arange(nao) + + ######### begin work ######### + + K1 = np.zeros((nset, nao, nao), dtype=np.float64) # contribution from V matrix + K2 = np.zeros((nset, nao, nao), dtype=np.float64) # contribution from W matrix + + for group_id, atm_ids in enumerate(group): + + if use_mpi: + if group_id % comm_size != rank: + continue + + naux_tmp = 0 + aoRg_holders = [] + for atm_id in atm_ids: + naux_tmp += aoRg[atm_id].aoR.shape[1] + aoRg_holders.append(aoRg[atm_id]) + assert naux_tmp == aux_basis[group_id].shape[0] + + aux_basis_tmp = aux_basis[group_id] + + #### 1. build the involved DM_RgR #### + + Density_RgAO_tmp = np.ndarray((nset, naux_tmp, nao), buffer=Density_RgAO_buf) + offset_density_RgAO_buf = Density_RgAO_tmp.size * Density_RgAO_buf.dtype.itemsize + Density_RgAO_tmp.ravel()[:] = 0.0 + Density_RgAO_tmp = __get_DensityMatrixonRgAO_qradratic(mydf, dm, aoRg_holders, "all", Density_RgAO_tmp, verbose=mydf.verbose) + + #### 2. 
build the V matrix #### + + W_tmp = None + + for iset in range(nset): + + calculate_W_tmp = (iset == 0) + + _W_tmp = _isdf_get_K_direct_kernel_1( + mydf, coulG_real, + group_id, Density_RgAO_tmp[iset], + None, True, calculate_W_tmp, + ##### buffer ##### + buf_build_V, + build_VW_buf, + offset_now, + Density_RgR_buf, + Density_RgAO_buf, + offset_density_RgAO_buf, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + build_K_bunchsize, + ##### other info ##### + use_mpi=use_mpi, + ##### out ##### + K1_or_2=K1[iset]) + + if calculate_W_tmp: + W_tmp = _W_tmp.copy() + + _isdf_get_K_direct_kernel_1( + mydf, coulG_real, + group_id, Density_RgAO_tmp[iset], + W_tmp, False, False, + ##### buffer ##### + buf_build_V, + build_VW_buf, + offset_now, + Density_RgR_buf, + Density_RgAO_buf, + offset_density_RgAO_buf, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + build_K_bunchsize, + ##### other info ##### + use_mpi=use_mpi, + ##### out ##### + K1_or_2=K2[iset]) + + ######### finally delete the buffer ######### + + if use_mpi: + comm.Barrier() + + if use_mpi: + K1 = reduce(K1, root = 0) + K2 = reduce(K2, root = 0) + K = np.zeros_like(K1) + if rank == 0: + for iset in range(nset): + K[iset] = K1[iset] + K1[iset].T - K2[iset] + else: + K = None + K = bcast(K, root = 0) + else: + K = np.zeros_like(K1) + for iset in range(nset): + K[iset] = K1[iset] + K1[iset].T - K2[iset] + + del K1 + del K2 + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t1, t2, "_contract_k_dm_quadratic_direct", mydf) + + return K * ngrid / vol + +############# occ RI ############# + +def get_jk_occRI(mydf, dm, use_mpi=False, with_j=True, with_k=True): + + assert mydf.omega is None or mydf.omega == 0.0 + # assert with_j_occRI is False + + t1 = (logger.process_clock(), logger.perf_counter()) + t0 = t1 + + if mydf.direct: + raise NotImplementedError("get_jk_occRI does not support robust fitting or direct=True") + + if use_mpi: + raise NotImplementedError("get_jk_occRI does not support use_mpi=True") + + # print("dm.shape = ", dm.shape) + + if getattr(dm, 'mo_coeff', None) is not None: + mo_coeff = dm.mo_coeff + mo_occ = dm.mo_occ + else: + raise NotImplementedError("mo_coeff is not provided yet") + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + ##### fetch the basic info ##### + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(mesh) + + aoR = mydf.aoR + aoRg = mydf.aoRg + assert isinstance(aoR, list) + naux = mydf.naux + + weight = np.sqrt(cell.vol/ngrid) + + ######### weighted mo_coeff ######### + + occ_tol = mydf.occ_tol + nocc = np.count_nonzero(mo_occ > occ_tol) + occ_weight = np.sqrt(mo_occ[mo_occ > occ_tol]) + # print("occ_weight = ", occ_weight) + mo_coeff_full = mo_coeff.copy() + mo_coeff_original = mo_coeff[:,mo_occ > occ_tol].copy() + mo_coeff = mo_coeff[:,mo_occ > occ_tol] * occ_weight ## NOTE: it is a weighted mo_coeff + mo_coeff = mo_coeff.copy() ## NOTE: nonsense thing in python + assert mo_coeff.shape[1] == nocc + assert mo_coeff.shape[0] == nao + + # dm2 = np.dot(mo_coeff, mo_coeff.T) + # assert np.allclose(dm, dm2) + + # print("mo_coeff_original = ", mo_coeff_original[:,0]) + # print("mo_coeff = ", mo_coeff[:,0]) + + ####### determine whether to construct moR ####### + + construct_moR = with_j or (with_k and mydf.with_robust_fitting is True) + construct_dmRgRg = with_k + 
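+ # [editorial sketch] occ-RI-K works with occupied-MO values on the grid
+ # instead of the full AO density matrix: with the occupation-weighted C_occ,
+ # dm = C_occ @ C_occ.T, so the grid-space intermediates factorize, e.g.
+ #
+ #   moRg   = C_occ.T @ aoRg     # (nocc, naux)
+ #   dmRgRg = moRg.T @ moRg      # = aoRg.T @ dm @ aoRg
+ #
+ # The construct_* flags around this note only request the intermediates the
+ # chosen J/K path actually needs (dmRgR only for robust fitting).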
construct_dmRgR = with_k and mydf.with_robust_fitting is True + + #### step -2. allocate buffer + + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoR if aoR_holder is not None]) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + ngrids_local = np.sum([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + max_dim_buf = max(max_ngrid_involved, max_nao_involved) + max_nIP_involved = np.max([aoRg_holder.aoR.shape[1] for aoRg_holder in aoRg if aoRg_holder is not None]) + + mydf.deallocate_k_buffer() + + if hasattr(mydf, "moRg") is False: + mydf.moRg = np.zeros((nocc, naux), dtype=np.float64) + else: + if nocc != mydf.moRg.shape[0]: + mydf.moRg = np.zeros((nocc, naux), dtype=np.float64) + + if hasattr(mydf, "K1_packbuf") is False: + mydf.K1_packbuf = np.zeros((nocc, max_ngrid_involved), dtype=np.float64) + else: + if nocc != mydf.K1_packbuf.shape[0]: + mydf.K1_packbuf = np.zeros((nocc, max_ngrid_involved), dtype=np.float64) + + if construct_moR: + if hasattr(mydf, "moR") is False: + mydf.moR = np.zeros((nocc, ngrid), dtype=np.float64) + else: + if nocc != mydf.moR.shape[0]: + mydf.moR = np.zeros((nocc, ngrid), dtype=np.float64) + + if construct_dmRgR: + if hasattr(mydf, "dmRgR") is False: + mydf.dmRgR = np.zeros((naux, ngrid), dtype=np.float64) + if construct_dmRgRg: + if hasattr(mydf, "dmRgRg") is False: + mydf.dmRgRg = np.zeros((naux, naux), dtype=np.float64) + + ddot_buf = np.zeros((max_dim_buf, max_dim_buf), dtype=np.float64) + aoR_buf1 = np.zeros((max_nao_involved, max_ngrid_involved), dtype=np.float64) + moR_buf = np.zeros((nocc, max_ngrid_involved), dtype=np.float64) # which can generated on the fly + mo_coeff_pack_buf = np.zeros((nao, max_nao_involved), dtype=np.float64) + + ####### involved functions ####### + + fn_packrow = getattr(libisdf, "_buildK_packrow", None) + assert fn_packrow is not None + + fn_packadd_row = getattr(libisdf, "_buildK_packaddrow", None) + assert fn_packadd_row is not None + + fn_packcol = getattr(libisdf, "_buildK_packcol", None) + assert fn_packcol is not None + + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol2 is not None + + fn_packcol3 = getattr(libisdf, "_buildK_packcol3", None) + assert fn_packcol3 is not None + + fn_packadd_col = getattr(libisdf, "_buildK_packaddcol", None) + assert fn_packadd_col is not None + + fn_packadd_dm = getattr(libisdf, "_packadd_local_dm", None) + assert fn_packadd_dm is not None + + #### step -1. 
construct moR, moRg, dmRgRg, dmRg #### + + IP_loc_in_ordered_grids = mydf.IP_loc_in_ordered_grids + + def _get_mo_values_on_grids(_aoR_holders, out_): + + for aoR_holder in _aoR_holders: + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + mo_coeff_packed = np.ndarray((nao_involved, nocc), buffer=mo_coeff_pack_buf) + # assert mo_coeff_packed.shape[0] == aoR_holder.ao_involved.shape[0] + # assert mo_coeff_packed.shape[1] == mo_coeff.shape[1] + + fn_packrow( + mo_coeff_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(mo_coeff_packed.shape[0]), + ctypes.c_int(mo_coeff_packed.shape[1]), + mo_coeff.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(mo_coeff.shape[0]), + ctypes.c_int(mo_coeff.shape[1]), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + moR_now = np.ndarray((nocc, ngrids_now), buffer=moR_buf) + lib.ddot(mo_coeff_packed.T, aoR_holder.aoR, c=moR_now) + global_gridID_begin = aoR_holder.global_gridID_begin + fn_packcol3( + out_.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(out_.shape[0]), + ctypes.c_int(out_.shape[1]), + ctypes.c_int(global_gridID_begin), + ctypes.c_int(global_gridID_begin+ngrids_now), + moR_now.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(moR_now.shape[0]), + ctypes.c_int(moR_now.shape[1]) + ) + + + t3 = (logger.process_clock(), logger.perf_counter()) + + if hasattr(mydf, "moR"): + moR = mydf.moR + else: + moR = None + moRg = mydf.moRg + + if construct_moR: + _get_mo_values_on_grids(aoR, moR) + fn_packcol( + moRg.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(moRg.shape[0]), + ctypes.c_int(moRg.shape[1]), + moR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(moR.shape[0]), + ctypes.c_int(moR.shape[1]), + IP_loc_in_ordered_grids.ctypes.data_as(ctypes.c_void_p) + ) + + else: + moR = None + _get_mo_values_on_grids(aoRg, moRg) + + t4 = (logger.process_clock(), logger.perf_counter()) + + #if mydf.verbose: + _benchmark_time(t3, t4, "get_mo over grids", mydf) + #sys.stdout.flush() + + t3 = (logger.process_clock(), logger.perf_counter()) + + if construct_dmRgR: + dmRgR = mydf.dmRgR + lib.ddot(moRg.T, moR, c=dmRgR) + dmRgRg = mydf.dmRgRg + fn_packcol( + dmRgRg.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(naux), + dmRgR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(naux), + ctypes.c_int(ngrid), + IP_loc_in_ordered_grids.ctypes.data_as(ctypes.c_void_p) + ) + else: + dmRgR = None + dmRgRg = mydf.dmRgRg + lib.ddot(moRg.T, moRg, c=dmRgRg) + + t4 = (logger.process_clock(), logger.perf_counter()) + + #if mydf.verbose: + _benchmark_time(t3, t4, "get_dm over grids", mydf) + + #### step 0 get_half_J #### + + if with_j: + + # weighted moR to densityR + + rhoR = np.zeros((ngrid), dtype=np.float64) + + fn_rhoR = getattr(libisdf, "moR_to_Density", None) + assert fn_rhoR is not None + + fn_rhoR( + ctypes.c_int(ngrid), + ctypes.c_int(nocc), + moR.ctypes.data_as(ctypes.c_void_p), + rhoR.ctypes.data_as(ctypes.c_void_p) + ) + + # from rhoG to the potential # + + rhoR_original = np.zeros_like(rhoR) + + fn_order = getattr(libisdf, "_Reorder_Grid_to_Original_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(ngrid), + mydf.grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + rhoR.ctypes.data_as(ctypes.c_void_p), + rhoR_original.ctypes.data_as(ctypes.c_void_p) + ) + + rhoR = rhoR_original + + fn_J = getattr(libisdf, "_construct_J", None) + assert fn_J is not None + + if hasattr(mydf, "coulG") == False: + if mydf.omega is not None: + assert mydf.omega >= 0.0 + print("mydf.omega = ", mydf.omega) + 
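+ # [editorial sketch] fn_J (= _construct_J, fetched above) applies the
+ # Coulomb kernel in reciprocal space; conceptually, on the original grid
+ # ordering (NumPy reference, illustration only):
+ #
+ #   rhoG = np.fft.fftn(rhoR.reshape(mesh))
+ #   J    = np.fft.ifftn(rhoG * coulG.reshape(mesh)).real.ravel()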
# mydf.coulG = tools.get_coulG(cell, mesh=mesh, omega=mydf.omega) + raise ValueError("mydf.coulG is not found.") + + J = np.zeros_like(rhoR) + + fn_J( + mesh.ctypes.data_as(ctypes.c_void_p), + rhoR.ctypes.data_as(ctypes.c_void_p), + mydf.coulG.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p) + ) + + J_ordered = np.zeros_like(J) + + fn_order = getattr(libisdf, "_Original_Grid_to_Reorder_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(ngrid), + mydf.grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + J_ordered.ctypes.data_as(ctypes.c_void_p) + ) + + rhoR = J_ordered.copy() + + else: + rhoR = None + + J_Res = np.zeros((nao, nao), dtype=np.float64) + + ordered_ao_ind = np.arange(nao, dtype=np.int32) + + #### step 1 get_J #### + + t1 = (logger.process_clock(), logger.perf_counter()) + + for aoR_holder in aoR: + + if with_j is False: + continue + + if aoR_holder is None: + continue + + if use_mpi: + if atm_id % comm_size != rank: + continue + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + global_gridID_begin = aoR_holder.global_gridID_begin + rhoR_tmp = rhoR[global_gridID_begin:global_gridID_begin+ngrids_now] + + aoR_rhoR_res = np.ndarray((nao_involved, ngrids_now), buffer=aoR_buf1) + lib_isdf.d_ij_j_ij(aoR_holder.aoR, rhoR_tmp, out=aoR_rhoR_res) + ddot_res = np.ndarray((nao_involved, nao_involved), buffer=ddot_buf) + lib.ddot(aoR_rhoR_res, aoR_holder.aoR.T, c=ddot_res) + + if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + J_Res += ddot_res + else: + fn_packadd_dm( + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_involved), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p), + J_Res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao) + ) + + J = J_Res + + if with_j is False: + J = None + + t2 = (logger.process_clock(), logger.perf_counter()) + + if with_j: + _benchmark_time(t1, t2, "get_j", mydf) + + t1 = (logger.process_clock(), logger.perf_counter()) + + if with_k is False: + K = None + return J * ngrid / vol, K + + K = np.zeros((nocc, nao), dtype=np.float64) + + #### in the following steps, mo should not be weighted #### + + occ_weight_inv = (1.0 / occ_weight).copy() + if moR is not None: + lib.d_i_ij_ij(occ_weight_inv, moR, out=moR) + if moRg is not None: + lib.d_i_ij_ij(occ_weight_inv, moRg, out=moRg) + + #### step 2 get moRg and dmRgRg #### + + ### step 3. 
get_K ### + + lib_isdf.cwise_mul(mydf.W, dmRgRg, out=dmRgRg) + W2 = dmRgRg + if construct_dmRgR: + lib_isdf.cwise_mul(mydf.V_R, dmRgR, out=dmRgR) + V2 = dmRgR + else: + V2 = None + + K1 = lib.ddot(moRg, W2) ### moRg * W2 * aoRg.T + K1_res = np.zeros((nocc, nao), dtype=np.float64) + if mydf.with_robust_fitting: + K2 = lib.ddot(moRg, V2) ### moRg * V2 * aoR.T + K3 = lib.ddot(V2, moR.T) ### aoRg * V2 * moR.T + K2_res = np.zeros((nocc, nao), dtype=np.float64) + K3_res = np.zeros((nao, nocc), dtype=np.float64) + else: + K2 = None + K3 = None + + K = np.zeros((nocc, nao), dtype=np.float64) + K1_packbuf = mydf.K1_packbuf + + ##### construct with aoRg ##### + + for aoR_holder in mydf.aoRg: + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + ########## for (moRg * W2) * aoRg.T ########## + + K1_pack = np.ndarray((nocc, ngrids_now), buffer=K1_packbuf) + + grid_loc_now = aoR_holder.global_gridID_begin + + fn_packcol2( + K1_pack.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc), + ctypes.c_int(ngrids_now), + K1.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc), + ctypes.c_int(naux), + ctypes.c_int(grid_loc_now), + ctypes.c_int(grid_loc_now+ngrids_now) + ) + + ddot_res = np.ndarray((nocc, nao_involved), buffer=ddot_buf) + + lib.ddot(K1_pack, aoR_holder.aoR.T, c=ddot_res) + + if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + K1_res += ddot_res + else: + fn_packadd_col( + K1_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K1_res.shape[0]), + ctypes.c_int(K1_res.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + ########## aoRg * (V2 * moR.T) ########## + + if mydf.with_robust_fitting: + K3_pack = K3[grid_loc_now:grid_loc_now+ngrids_now, :] + ddot_res = np.ndarray((nao_involved, nocc), buffer=ddot_buf) + lib.ddot(aoR_holder.aoR, K3_pack, c=ddot_res) + if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + K3_res += ddot_res + else: + fn_packadd_row( + K3_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K3_res.shape[0]), + ctypes.c_int(K3_res.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + grid_loc_now += ngrids_now + + + if mydf.with_robust_fitting: + + for aoR_holder in mydf.aoR: + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + ########## (moRg * V2) * aoR.T ########## + + K2_pack = np.ndarray((nocc, ngrids_now), buffer=K1_packbuf) + + grid_loc_now = aoR_holder.global_gridID_begin + + fn_packcol2( + K2_pack.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc), + ctypes.c_int(ngrids_now), + K2.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nocc), + ctypes.c_int(ngrid), + ctypes.c_int(grid_loc_now), + ctypes.c_int(grid_loc_now+ngrids_now) + ) + + ddot_res = np.ndarray((nocc, nao_involved), buffer=ddot_buf) + + lib.ddot(K2_pack, aoR_holder.aoR.T, c=ddot_res) + + if nao_involved == nao and np.allclose(aoR_holder.ao_involved, ordered_ao_ind): + K2_res += ddot_res + else: + fn_packadd_col( + K2_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K2_res.shape[0]), + ctypes.c_int(K2_res.shape[1]), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ddot_res.shape[0]), + ctypes.c_int(ddot_res.shape[1]), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p) + ) + + if 
mydf.with_robust_fitting: + K1 = K1_res + K2 = K2_res + K3 = K3_res + K = -K1 + K2 + K3.T + else: + K1 = K1_res + K = K1 + + ### delete buf ### + + del ddot_buf, aoR_buf1, moR_buf, mo_coeff_pack_buf + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t1, t2, "get_k_occRI", mydf) + + # Kiv = K.copy() # for debug + + ##### final step from Kiv -> kuv #### + + t1 = (logger.process_clock(), logger.perf_counter()) + + ovlp = mydf.ovlp + K1 = lib.ddot(mo_coeff_original, K) + K1 = lib.ddot(ovlp, K1) + # print("K.shape = ", K.shape) + # print("mo_coeff_original.shape = ", mo_coeff_original.shape) + Kij = lib.ddot(K, mo_coeff_original) + assert np.allclose(Kij, Kij.T) + K2 = lib.ddot(mo_coeff_original, Kij) + K2 = lib.ddot(ovlp, K2) + K2 = lib.ddot(K2, mo_coeff_original.T) + K2 = lib.ddot(K2, ovlp) + K = K1 + K1.T - K2 + + # Kip = lib.ddot(K, mo_coeff_full) + # Kpq = np.zeros((nao, nao), dtype=np.float64) + # Kpq[:nocc, :] = Kip + # Kpq[nocc:, :nocc] = Kip[:,nocc:].T + # K = lib.ddot(mo_coeff_full, Kpq) + # K = lib.ddot(K, mo_coeff_full.T) + + t2 = (logger.process_clock(), logger.perf_counter()) + t00 = t2 + + _benchmark_time(t1, t2, "get_k_iv_2_uv", mydf) + _benchmark_time(t0, t00, "get_jk_occ-RI-K", mydf) + + del K1, K2, K3 + + return J * ngrid / vol, K * ngrid / vol + + +def get_jk_dm_quadratic(mydf, dm, hermi=1, kpt=np.zeros(3), + kpts_band=None, with_j=True, with_k=True, omega=None, + **kwargs): + + '''JK''' + + ############ deal with occ-RI-K ############ + + use_occ_RI_K = False + + if getattr(mydf, "occ_RI_K", None) is not None: + use_occ_RI_K = mydf.occ_RI_K + + if getattr(dm, '__dict__', None) is not None: + mo_coeff = dm.__dict__['mo_coeff'] + mo_occ = dm.__dict__['mo_occ'] + if mo_coeff is not None: + assert mo_occ is not None + if mo_coeff.ndim == 3: + assert mo_coeff.shape[2] == mo_occ.shape[1] + assert mo_occ.ndim == 2 + else: + assert mo_coeff.shape[1] == mo_occ.shape[0] + assert mo_coeff.ndim == 2 + assert mo_occ.ndim == 1 + # if use_occ_RI_K and mo_coeff is None: + # dm = np.asarray(dm) + # if len(dm.shape) == 3: + # assert dm.shape[0] == 1 + # dm = dm[0] + # mo_occ, mo_coeff = mydf.diag_dm(dm) + # dm = dm.reshape(1, dm.shape[0], dm.shape[1]) + # dm = lib.tag_array(dm, mo_coeff=mo_coeff, mo_occ=mo_occ) + else: + dm = np.asarray(dm) + if len(dm.shape) == 3: + assert dm.shape[0] <= 4 + # if use_occ_RI_K: + # assert dm.shape[0] == 1 + # dm = dm[0] + # mo_occ, mo_coeff = mydf.diag_dm(dm) + # dm = dm.reshape(1, dm.shape[0], dm.shape[1]) + # dm = lib.tag_array(dm, mo_coeff=mo_coeff, mo_occ=mo_occ) + # else: + # mo_occ = None + # mo_coeff = None + mo_occ = None + mo_coeff = None + + # if use_occ_RI_K: + # if mydf.direct == True: + # raise ValueError("ISDF does not support direct=True for occ-RI-K") + + if dm.ndim == 2: + dm = dm.reshape(1, *dm.shape) + + assert dm.ndim == 3 + + ############ end deal with occ-RI-K ############ + + direct = mydf.direct + use_mpi = mydf.use_mpi + + if use_mpi and direct == False: + raise NotImplementedError("ISDF does not support use_mpi and direct=False") + + if len(dm.shape) == 3: + assert dm.shape[0] <= 4 + ## NOTE: 1 for RHF 2 for UHF 3/4 for GHF + + if hasattr(mydf, 'Ls') and mydf.Ls is not None: + from pyscf.isdf.isdf_tools_densitymatrix import symmetrize_dm + dm = symmetrize_dm(dm, mydf.Ls) + else: + if hasattr(mydf, 'kmesh') and mydf.kmesh is not None: + from pyscf.isdf.isdf_tools_densitymatrix import symmetrize_dm + dm = symmetrize_dm(dm, mydf.kmesh) + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, bcast + dm = 
bcast(dm, root=0) + if mo_coeff is not None: + mo_coeff = bcast(mo_coeff, root=0) + if mo_occ is not None: + mo_occ = bcast(mo_occ, root=0) + + dm = lib.tag_array(dm, mo_coeff=mo_coeff, mo_occ=mo_occ) + + nset, nao = dm.shape[:2] + + ############ end deal with dm with tags ############ + + #### perform the calculation #### + + if "exxdiv" in kwargs: + exxdiv = kwargs["exxdiv"] + kwargs.pop("exxdiv") + else: + exxdiv = None + + assert exxdiv in ["ewald", None] + + vj = vk = None + + if kpts_band is not None and abs(kpt-kpts_band).sum() > 1e-9: + raise NotImplementedError("ISDF does not support kpts_band != kpt") + + log = logger.Logger(mydf.stdout, mydf.verbose) + t1 = (logger.process_clock(), logger.perf_counter()) + + j_real = gamma_point(kpt) + k_real = gamma_point(kpt) and not np.iscomplexobj(dm) + + assert j_real + assert k_real + + mem_now = lib.current_memory()[0] + max_memory = max(2000, (mydf.max_memory - mem_now)) + + log.debug1('max_memory = %d MB (%d in use)', max_memory, mem_now) + + # if use_occ_RI_K: + # vj, vk = get_jk_occRI(mydf, dm, use_mpi, with_j, with_k) + # else: + + ### TODO: improve the efficiency ### + + vj = np.zeros_like(dm) + vk = np.zeros_like(dm) + for iset in range(nset): + if with_j and iset<=1: + from pyscf.isdf.isdf_jk import _contract_j_dm + vj[iset] = _contract_j_dm_ls(mydf, dm[iset], use_mpi) + if with_k: + if mydf.direct: + if iset == 0: + vk = _contract_k_dm_quadratic_direct(mydf, dm, use_mpi=use_mpi) + # vk[iset] = _contract_k_dm_quadratic_direct(mydf, dm[iset], use_mpi=use_mpi) + else: + vk[iset] = _contract_k_dm_quadratic(mydf, dm[iset], mydf.with_robust_fitting, use_mpi=use_mpi) + + ##### the following code is added to deal with _ewald_exxdiv_for_G0 ##### + + if not use_mpi or (use_mpi and rank == 0): + + kpts = kpt.reshape(1,3) + kpts = np.asarray(kpts) + dm_kpts = dm.reshape(-1, dm.shape[0], dm.shape[1]).copy() + dm_kpts = lib.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + + assert nset <= 4 + assert nkpts == 1 + + kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + nband = len(kpts_band) + + assert nband == 1 + + if is_zero(kpts_band) and is_zero(kpts): + vk = vk.reshape(nset,nband,nao,nao) + else: + raise NotImplementedError("ISDF does not support kpts_band != 0") + + if exxdiv == 'ewald': + _ewald_exxdiv_for_G0(mydf.cell, kpts, dms, vk, kpts_band=kpts_band) + + vk = vk[:,0,:,:] + + if use_mpi: + vj = bcast(vj, root=0) + vk = bcast(vk, root=0) + + ##### end of dealing with _ewald_exxdiv_for_G0 ##### + + t1 = log.timer('sr jk', *t1) + + return vj, vk + +############# linear scaling implementation ############# \ No newline at end of file diff --git a/pyscf/isdf/isdf_local_k.py b/pyscf/isdf/isdf_local_k.py new file mode 100644 index 000000000..a440ee1a3 --- /dev/null +++ b/pyscf/isdf/isdf_local_k.py @@ -0,0 +1,1378 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +############ sys module ############ + +import copy +from copy import deepcopy +import numpy as np +import ctypes + +############ pyscf module ############ + +from pyscf import lib +from pyscf.pbc.gto import Cell +from pyscf.pbc import tools +from pyscf.gto.mole import * +libisdf = lib.load_library('libisdf') + +############ isdf utils ############ + +from pyscf.isdf.isdf_eval_gto import ISDF_eval_gto +import pyscf.isdf.isdf_local as ISDF_Local +import pyscf.isdf.isdf_tools_local as ISDF_Local_Utils +from pyscf.isdf.isdf_local_k_jk import get_jk_dm_translation_symmetry +from pyscf.isdf.isdf_jk import _benchmark_time + +############ subroutines --- deal with translation symmetry ############ + +### WARNING: the unit cell must be put in the first cell !! ### + +def _expand_partition_prim(partition_prim, kmesh, mesh): + + meshPrim = np.array(mesh) // np.array(kmesh) + + partition = [] + + for i in range(kmesh[0]): + for j in range(kmesh[1]): + for k in range(kmesh[2]): + shift = i * meshPrim[0] * mesh[1] * mesh[2] + j * meshPrim[1] * mesh[2] + k * meshPrim[2] + for data in partition_prim: + partition.append(data + shift) + + return partition + +def _expand_primlist_2_superlist(primlist, kmesh, mesh): + + meshPrim = np.array(mesh) // np.array(kmesh) + + superlist = [] + + for i in range(kmesh[0]): + for j in range(kmesh[1]): + for k in range(kmesh[2]): + shift = i * meshPrim[0] * mesh[1] * mesh[2] + j * meshPrim[1] * mesh[2] + k * meshPrim[2] + superlist.extend(primlist + shift) + + return np.array(superlist, dtype=np.int32) + +def _get_grid_ordering_k(input, kmesh, mesh): + + if isinstance(input, list): + prim_ordering = [] + for data in input: + prim_ordering.extend(data) + return _expand_primlist_2_superlist(prim_ordering, kmesh, mesh) + else: + raise NotImplementedError + +def select_IP_local_ls_k_drive(mydf, c, m, + IP_possible_atm, + group, + build_aoR_FFT = True, + use_mpi = False): + + # assert use_mpi == False + + IP_group = [] + aoRg_possible = mydf.aoRg_possible + + assert len(IP_possible_atm) == mydf.first_natm + + #### do the work #### + + first_natm = mydf.first_natm + + for i in range(len(group)): + IP_group.append(None) + + if len(group) < first_natm: + if use_mpi == False: + for i in range(len(group)): + IP_group[i] = ISDF_Local.select_IP_group_ls( + mydf, aoRg_possible, c, m, + group = group[i], + atm_2_IP_possible=IP_possible_atm + ) + else: + group_begin, group_end = ISDF_Local_Utils._range_partition(len(group), rank, comm_size, use_mpi) + for i in range(group_begin, group_end): + IP_group[i] = ISDF_Local.select_IP_local_ls( + mydf, aoRg_possible, c, m, + group = group[i], + atm_2_IP_possible=IP_possible_atm + ) + IP_group = ISDF_Local_Utils._sync_list(IP_group, len(group)) + else: + IP_group = IP_possible_atm + + mydf.IP_group = IP_group + mydf.IP_flat_prim = [] + mydf.IP_segment_prim = [] + + nIP_now = 0 + + for x in IP_group: + mydf.IP_flat_prim.extend(x) + mydf.IP_segment_prim.append(nIP_now) + nIP_now += len(x) + + mydf.IP_flat = _expand_primlist_2_superlist(mydf.IP_flat_prim, mydf.kmesh, mydf.mesh) + mydf.naux = mydf.IP_flat.shape[0] + mydf.nIP_Prim = len(mydf.IP_flat_prim) + mydf.nGridPrim = len(mydf.grid_ID_ordered_prim) + gridID_2_atmID = mydf.gridID_2_atmID + + partition_IP = [] + for i in range(mydf.cell.natm): + partition_IP.append([]) + + for _ip_id_ in mydf.IP_flat: + atm_id = gridID_2_atmID[_ip_id_] + partition_IP[atm_id].append(_ip_id_) + + for i in range(mydf.cell.natm): + partition_IP[i] = np.array(partition_IP[i], 
dtype=np.int32) + + mydf.IP_segment = [0] + for atm_id in mydf.atm_ordering: + mydf.IP_segment.append(mydf.IP_segment[-1] + len(partition_IP[atm_id])) + mydf.IP_segment = np.array(mydf.IP_segment, dtype=np.int32) + + ### build aoR_IP ### + + #### recalculate it anyway ! #### + + coords = mydf.coords + weight = np.sqrt(mydf.cell.vol / coords.shape[0]) + + del mydf.aoRg_possible + mydf.aoRg_possible = None + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + weight = np.sqrt(mydf.cell.vol / coords.shape[0]) + + mydf.aoRg = ISDF_Local_Utils.get_aoR( + mydf.cell, coords, partition_IP, + first_natm, + mydf.cell.natm, + mydf.group_global, + mydf.distance_matrix, + mydf.AtmConnectionInfo, + False, + mydf.use_mpi, + True) + + assert len(mydf.aoRg) == first_natm + + mydf.aoRg1 = ISDF_Local_Utils.get_aoR( + mydf.cell, coords, partition_IP, + mydf.cell.natm, + first_natm, + mydf.group_global, + mydf.distance_matrix, + mydf.AtmConnectionInfo, + False, + mydf.use_mpi, + True) + + aoRg_activated = [] + for _id_, aoR_holder in enumerate(mydf.aoRg): + if aoR_holder.ao_involved.size == 0: + aoRg_activated.append(False) + else: + aoRg_activated.append(True) + aoRg_activated = np.array(aoRg_activated, dtype=bool) + mydf.aoRg_activated = aoRg_activated + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + #################### build aoRg_FFT #################### + + kmesh = mydf.kmesh + ncell_complex = kmesh[0] * kmesh[1] * (kmesh[2]//2+1) + nao_prim = mydf.nao // np.prod(kmesh) + nbas_prim = mydf.cell.nbas // np.prod(mydf.kmesh) + weight = np.sqrt(mydf.cell.vol / coords.shape[0]) + nIP_Prim = mydf.nIP_Prim + + ### todo make it a list ! ### + + ################# construct aoRg_FFT ################# + + if build_aoR_FFT: + + aoRg_Tmp = ISDF_eval_gto(mydf.cell, coords=coords[mydf.IP_flat], shls_slice=(0, nbas_prim)) * weight + + mydf.aoRg_FFT = np.zeros((nao_prim, ncell_complex*mydf.nIP_Prim), dtype=np.complex128) + mydf.aoRg_FFT_real = np.ndarray((nao_prim, np.prod(kmesh)*mydf.nIP_Prim), dtype=np.double, buffer=mydf.aoRg_FFT, offset=0) + mydf.aoRg_FFT_real.ravel()[:] = aoRg_Tmp.ravel() + + del aoRg_Tmp + + nthread = lib.num_threads() + buffer = np.zeros((nao_prim, ncell_complex*mydf.nIP_Prim), dtype=np.complex128) + + fn = getattr(libisdf, "_FFT_Matrix_Col_InPlace", None) + assert fn is not None + + ''' + fn = _FFT_Matrix_Col_InPlace transform + + (A0 | A1 | A2) --> (A0+A1+A2 | A0+wA1 + w^2 A2 | A0 + w^2 A1+ w A2) + + ''' + + # print("aoRg_FFT.shape = ", mydf.aoRg_FFT.shape) + + kmesh = np.array(kmesh, dtype=np.int32) + + fn( + mydf.aoRg_FFT_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nIP_Prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buffer.ctypes.data_as(ctypes.c_void_p) + ) # no normalization factor ! 
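+ # [editorial sketch] Conceptually, the kernel above performs a DFT over the
+ # kmesh cell index, block-column-wise, on real input (w a primitive root of
+ # unity; only the kmesh[2]//2+1 non-redundant complex slices along the last
+ # k-axis are kept, hence ncell_complex). A NumPy sketch of the same
+ # transform, assuming this reading of the C kernel:
+ #
+ #   blocks = aoRg_real.reshape(nao_prim, n_cell, nIP_Prim)   # (A0 | A1 | ...)
+ #   A_k    = np.fft.fft(blocks, axis=1)    # A_k = sum_R w^{k*R} A_R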
+ + aoRg_packed = [] + for i in range(ncell_complex): + aoRg_packed.append(mydf.aoRg_FFT[:, i*nIP_Prim:(i+1)*nIP_Prim].copy()) + del mydf.aoRg_FFT + mydf.aoRg_FFT = aoRg_packed + else: + mydf.aoRg_FFT = None + + ################# End aoRg_FFT ################# + + #################### build aoR_FFT #################### + + if mydf.with_robust_fitting and build_aoR_FFT: + + ngrids = coords.shape[0] + ngrids_prim = ngrids // np.prod(kmesh) + aoR_tmp = ISDF_eval_gto(mydf.cell, coords=coords[mydf.grid_ID_ordered], shls_slice=(0, nbas_prim)) * weight + mydf.aoR_FFT = np.zeros((nao_prim, ncell_complex*ngrids_prim), dtype=np.complex128) + mydf.aoR_FFT_real = np.ndarray((nao_prim, np.prod(kmesh)*ngrids_prim), dtype=np.double, buffer=mydf.aoR_FFT, offset=0) + mydf.aoR_FFT_real.ravel()[:] = aoR_tmp.ravel() + + del aoR_tmp + + buffer = np.zeros((nao_prim, ncell_complex*ngrids_prim), dtype=np.complex128) + + fn( + mydf.aoR_FFT_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(ngrids_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buffer.ctypes.data_as(ctypes.c_void_p) + ) + + aoR_packed = [] + for i in range(ncell_complex): + aoR_packed.append(mydf.aoR_FFT[:, i*ngrids_prim:(i+1)*ngrids_prim].copy()) + del mydf.aoR_FFT + mydf.aoR_FFT = aoR_packed + # mydf.aoR = None + del buffer + else: + mydf.aoR_FFT = None + # build aoR # + +def build_auxiliary_Coulomb_local_bas_k(mydf, debug=True, use_mpi=False): + + if use_mpi: + raise NotImplementedError + + t0 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + cell = mydf.cell + mesh = mydf.mesh + + naux = mydf.naux + + ncomplex = mesh[0] * mesh[1] * (mesh[2] // 2 + 1) * 2 + + grid_ordering = mydf.grid_ID_ordered + + assert mydf.omega is None or mydf.omega == 0.0 + coulG = tools.get_coulG(cell, mesh=mesh) + mydf.coulG = coulG.copy() + coulG_real = coulG.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1).copy() + + nThread = lib.num_threads() + bufsize_per_thread = int((coulG_real.shape[0] * 2 + mesh[0] * mesh[1] * mesh[2]) * 1.1) + buf = np.empty((nThread, bufsize_per_thread), dtype=np.double) + + def construct_V_CCode(aux_basis:list[np.ndarray], + # buf:np.ndarray, + V=None, shift_row=None): + + nThread = buf.shape[0] + bufsize_per_thread = buf.shape[1] + + nAux = 0 + for x in aux_basis: + nAux += x.shape[0] + + ngrids = mesh[0] * mesh[1] * mesh[2] + mesh_int32 = np.array(mesh, dtype=np.int32) + + if V is None: + assert shift_row is None + V = np.zeros((nAux, ngrids), dtype=np.double) + + fn = getattr(libisdf, "_construct_V_local_bas", None) + assert(fn is not None) + + if shift_row is None: + shift_row = 0 + # ngrid_now = 0 + + for i in range(len(aux_basis)): + + aux_basis_now = aux_basis[i] + grid_ID = mydf.partition_group_to_gridID[i] + # ngrid_now += grid_ID.size + # print("i = ", i) + # print("shift_row = ", shift_row) + # print("aux_bas_now = ", aux_basis_now.shape) + # print("ngrid_now = ", grid_ID.size) + # print("buf = ", buf.shape) + # print("ngrid_ordering = ", grid_ordering.size) + # sys.stdout.flush() + assert aux_basis_now.shape[1] == grid_ID.size + + fn(mesh_int32.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(aux_basis_now.shape[0]), + ctypes.c_int(aux_basis_now.shape[1]), + grid_ID.ctypes.data_as(ctypes.c_void_p), + aux_basis_now.ctypes.data_as(ctypes.c_void_p), + coulG_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(shift_row), + V.ctypes.data_as(ctypes.c_void_p), + grid_ordering.ctypes.data_as(ctypes.c_void_p), + buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(bufsize_per_thread)) + + shift_row 
+= aux_basis_now.shape[0] + + return V + + + V = construct_V_CCode(mydf.aux_basis, V=None, shift_row=None) + + if mydf.with_robust_fitting: + mydf.V_R = V + + ########### construct W ########### + + naux_bra = 0 + for x in mydf.aux_basis: + naux_bra += x.shape[0] + + naux = mydf.naux + + assert naux % naux_bra == 0 + assert naux // naux_bra == np.prod(mydf.kmesh) + + mydf.W = np.zeros((naux_bra, naux), dtype=np.double) + + ngroup = len(mydf.aux_basis) + aux_bra_shift = 0 + kmesh = mydf.kmesh + + for i in range(ngroup): + + aux_ket_shift = 0 + grid_shift = 0 + naux_bra = mydf.aux_basis[i].shape[0] + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + for j in range(ngroup): + aux_basis_ket = mydf.aux_basis[j] + ngrid_now = aux_basis_ket.shape[1] + naux_ket = aux_basis_ket.shape[0] + mydf.W[aux_bra_shift:aux_bra_shift+naux_bra, aux_ket_shift:aux_ket_shift+naux_ket] = lib.ddot( + V[aux_bra_shift:aux_bra_shift+naux_bra, grid_shift:grid_shift+ngrid_now], + aux_basis_ket.T + ) + aux_ket_shift += naux_ket + grid_shift += ngrid_now + + aux_bra_shift += naux_bra + + assert grid_shift == np.prod(mesh) + + del buf + buf = None + + assert V.shape[0] == mydf.naux // np.prod(mydf.kmesh) + assert V.shape[1] == np.prod(mesh) + assert mydf.W.shape[0] == mydf.naux // np.prod(mydf.kmesh) + assert mydf.W.shape[1] == mydf.naux + + if mydf.with_robust_fitting == False: + del V + +##### get_jk ##### + +class PBC_ISDF_Info_Quad_K(ISDF_Local.PBC_ISDF_Info_Quad): + + # Quad stands for quadratic scaling + + def __init__(self, + mol:Cell, # means the primitive cell + with_robust_fitting=True, + kmesh =None, + verbose =None, + rela_cutoff_QRCP =None, + aoR_cutoff =1e-8, + direct =False, + limited_memory =False, + build_K_bunchsize =None, + ): + + ### extract the info from the primitive cell ### + + atm = [] + + #### TODO: remove the following restriction on the structure of lattice #### + + assert mol.a[0][1] == 0.0 + assert mol.a[0][2] == 0.0 + assert mol.a[1][0] == 0.0 + assert mol.a[1][2] == 0.0 + assert mol.a[2][0] == 0.0 + assert mol.a[2][1] == 0.0 + + from pyscf.lib.parameters import BOHR + + for i in range(mol.natm): + coords = mol.atom_coord(i) + coords = np.array(coords) * BOHR + atm.append([mol.atom_symbol(i), tuple(coords)]) + + prim_mesh = mol.mesh + mesh = np.array(prim_mesh) * np.array(kmesh) + + nelectron = np.sum(mol.nelectron) + + from pyscf.isdf.isdf_tools_cell import build_supercell + supercell = build_supercell( + atm, + mol.a, + spin = nelectron*np.prod(kmesh) % 2, + mesh = mesh, + Ls = kmesh, + basis = mol.basis, + pseudo = mol.pseudo, + ke_cutoff = mol.ke_cutoff, + max_memory = mol.max_memory, + verbose = mol.verbose + ) + + self.prim_cell = mol + + # print("supercell.mesh = ", supercell.mesh) + + super().__init__(supercell, with_robust_fitting, None, verbose, rela_cutoff_QRCP, aoR_cutoff, direct, use_occ_RI_K=False, + limited_memory=limited_memory, build_K_bunchsize=build_K_bunchsize) + + self.kmesh = kmesh + + self.kpts = self.prim_cell.make_kpts(kmesh) + + assert self.mesh[0] % kmesh[0] == 0 + assert self.mesh[1] % kmesh[1] == 0 + assert self.mesh[2] % kmesh[2] == 0 + + # print("self.mesh = ", self.mesh) + # exit(1) + + #### information relating primitive cell and supercell + + self.meshPrim = np.array(self.mesh) // np.array(self.kmesh) + self.natm = self.cell.natm + self.natmPrim = self.cell.natm // np.prod(self.kmesh) + + self.with_translation_symmetry = True + + from pyscf.isdf.isdf_tools_cell import build_primitive_cell + self.primCell = 
build_primitive_cell(self.cell, self.kmesh) + self.nao_prim = self.nao // np.prod(self.kmesh) + assert self.nao_prim == self.primCell.nao_nr() + + ##### rename everything with prefix _supercell #### + + def build_partition_aoR(self, Ls=None): + ''' + + build the partition of grid points and the AO values on the grids + + the partition assigns each grid point to an atom; + + it is hence a list of lists of grid indices + + ''' + + if self.aoR is not None and self.partition is not None: + return + + log = lib.logger.Logger(self.stdout, self.verbose) + + ##### build cutoff info ##### + + self.distance_matrix = ISDF_Local_Utils.get_cell_distance_matrix(self.cell) + weight = np.sqrt(self.cell.vol / self.coords.shape[0]) + precision = self.aoR_cutoff + rcut = ISDF_Local_Utils._estimate_rcut(self.cell, self.coords.shape[0], precision) + rcut_max = np.max(rcut) + atm2_bas = ISDF_Local_Utils._atm_to_bas(self.cell) + self.AtmConnectionInfo = [] + + for i in range(self.cell.natm): + tmp = ISDF_Local_Utils.AtmConnectionInfo(self.cell, i, self.distance_matrix, precision, rcut, rcut_max, atm2_bas) + self.AtmConnectionInfo.append(tmp) + + #### information dealing with grids, build partition #### + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if Ls is None: + Ls = [ + int(self.cell.lattice_vectors()[0][0]/2)+1, + int(self.cell.lattice_vectors()[1][1]/2)+1, + int(self.cell.lattice_vectors()[2][2]/2)+1 + ] + + self.partition_prim = ISDF_Local_Utils.get_partition( + self.cell, self.coords, + self.AtmConnectionInfo, + Ls, + self.with_translation_symmetry, + self.kmesh, + self.use_mpi + ) ## the grid ids in self.partition_prim are w.r.t. the supercell ## + + for i in range(len(self.partition_prim)): + self.partition_prim[i] = np.array(self.partition_prim[i], dtype=np.int32) + + assert len(self.partition_prim) == self.natmPrim ## the grid id is the global grid id + + self.partition = _expand_partition_prim(self.partition_prim, self.kmesh, self.mesh) + + assert len(self.partition) == self.natm + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + #### + + if not self.use_mpi: + rank = 0 + _benchmark_time(t1, t2, "build_partition", self) + else: + from pyscf.isdf.isdf_tools_mpi import rank, bcast + if rank == 0: + _benchmark_time(t1, t2, "build_partition", self) + + #### build aoR #### + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + sync_aoR = False + if self.direct: + sync_aoR = True + + ## deal with translation symmetry ## + + first_natm = self.first_natm + natm = self.cell.natm + + ### we need three types of aoR ### + + # this type of aoR is used in get J and select IP + + weight = np.sqrt(self.cell.vol / self.coords.shape[0]) + + self.aoR = ISDF_Local_Utils.get_aoR(self.cell, self.coords, self.partition, + first_natm, + natm, + self.group_global, + self.distance_matrix, + self.AtmConnectionInfo, + self.use_mpi, self.use_mpi, sync_aoR) ### full col, store aoR[:, :ngrid_prim] + + + memory = ISDF_Local_Utils._get_aoR_holders_memory(self.aoR) ### full col + assert len(self.aoR) == first_natm + + if rank == 0: + log.info("In ISDF-K build_partition_aoR aoR memory: %d " % (memory)) + + + weight = np.sqrt(self.cell.vol / self.coords.shape[0]) + self.aoR1 = ISDF_Local_Utils.get_aoR(self.cell, self.coords, self.partition, + None, + first_natm, + self.group_global, + self.distance_matrix, + self.AtmConnectionInfo, + self.use_mpi, self.use_mpi, sync_aoR) ### full row, store aoR[:nao_prim, :] + + memory = ISDF_Local_Utils._get_aoR_holders_memory(self.aoR1) ### 
full row + assert len(self.aoR1) == natm + + if rank == 0: + log.info("In ISDF-K build_partition_aoR aoR1 memory: %s", memory) + + partition_activated = None + + ##### the following info is used in get_J ##### + + if rank == 0: + partition_activated = [] + for _id_, aoR_holder in enumerate(self.aoR1): + if aoR_holder.ao_involved.size == 0: + partition_activated.append(False) + else: + partition_activated.append(True) + partition_activated = np.array(partition_activated, dtype=bool) + + if self.use_mpi: + partition_activated = bcast(partition_activated) + + self.partition_activated = partition_activated + self.partition_activated_id = [] + for i in range(len(partition_activated)): + if partition_activated[i]: + self.partition_activated_id.append(i) + self.partition_activated_id = np.array(self.partition_activated_id, dtype=np.int32) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + if rank == 0: + _benchmark_time(t1, t2, "build_aoR", self) + + def set_group(self, group=None): + + first_natm = self.first_natm + if group is None: + group = [] + for i in range(first_natm): + group.append([i]) + + ## check the group ## + + natm_involved = 0 + for data in group: + for atm_id in data: + assert atm_id < first_natm + natm_involved += len(data) + assert natm_involved == first_natm + + for i in range(len(group)): + group[i] = np.array(group[i], dtype=np.int32) + + assert len(group) <= first_natm + + self.group = group + + self.group_global = [] + shift = 0 + self.atm_ordering = [] + for ix in range(self.kmesh[0]): + for iy in range(self.kmesh[1]): + for iz in range(self.kmesh[2]): + for data in self.group: + self.group_global.append(data + shift) + self.atm_ordering.extend(data + shift) + shift += self.natmPrim + self.atm_ordering = np.array(self.atm_ordering, dtype=np.int32) + + self.atm_id_2_group = np.zeros((self.cell.natm), dtype=np.int32) + for i in range(len(self.group_global)): + for atm_id in self.group_global[i]: + self.atm_id_2_group[atm_id] = i + + def build_IP_local(self, c=5, m=5, group=None, Ls = None, debug=True): + + assert self.use_aft_ao == False + + self.set_group(group) + first_natm = self.first_natm + + # build partition and aoR # + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + self.build_partition_aoR(None) + + self.grid_segment = [0] + for atm_id in self.atm_ordering: + loc_now = self.grid_segment[-1] + len(self.partition[atm_id]) + self.grid_segment.append(loc_now) + self.grid_segment = np.array(self.grid_segment, dtype=np.int32) + + ao2atomID = self.ao2atomID + partition = self.partition + aoR = self.aoR + natm = self.natm + nao = self.nao + + self.partition_atmID_to_gridID = partition + + self.partition_group_to_gridID = [] + for i in range(len(group)): + self.partition_group_to_gridID.append([]) + for atm_id in group[i]: + self.partition_group_to_gridID[i].extend(partition[atm_id]) + self.partition_group_to_gridID[i] = np.array(self.partition_group_to_gridID[i], dtype=np.int32) + + ngrids = self.coords.shape[0] + + gridID_2_atmID = np.zeros((ngrids), dtype=np.int32) + + for atm_id in range(self.cell.natm): + gridID_2_atmID[partition[atm_id]] = atm_id + + self.gridID_2_atmID = gridID_2_atmID + self.grid_ID_ordered = _get_grid_ordering_k(self.partition_group_to_gridID, self.kmesh, self.mesh) + self.grid_ID_ordered_prim = self.grid_ID_ordered[:ngrids//np.prod(self.kmesh)].copy() + self.partition_group_to_gridID = _expand_partition_prim(self.partition_group_to_gridID, self.kmesh, self.mesh) + + for i in 
range(len(self.grid_ID_ordered_prim)): + grid_ID = self.grid_ID_ordered_prim[i] + + ix = grid_ID // (self.mesh[1] * self.mesh[2]) + iy = (grid_ID % (self.mesh[1] * self.mesh[2])) // self.mesh[2] + iz = grid_ID % self.mesh[2] + + # assert ix < self.meshPrim[0] + # assert iy < self.meshPrim[1] + # assert iz < self.meshPrim[2] + + self.grid_ID_ordered_prim[i] = ix * self.meshPrim[1] * self.meshPrim[2] + iy * self.meshPrim[2] + iz + + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + #if self.verbose and debug: + if not self.use_mpi: + rank = 0 + else: + from pyscf.isdf.isdf_tools_mpi import rank + + if rank == 0: + _benchmark_time(t1, t2, "build_partition_aoR", self) + + t1 = t2 + + if len(group) < first_natm: + IP_Atm = ISDF_Local.select_IP_atm_ls( + self, + c+1, m, + first_natm, + rela_cutoff = self.rela_cutoff_QRCP, + no_retriction_on_nIP = self.no_restriction_on_nIP, + use_mpi = self.use_mpi + ) + else: + IP_Atm = ISDF_Local.select_IP_atm_ls( + self, + c, m, + first_natm, + rela_cutoff = self.rela_cutoff_QRCP, + no_retriction_on_nIP = self.no_restriction_on_nIP, + use_mpi = self.use_mpi + ) + + self.IP_Atm = IP_Atm + + t3 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + weight = np.sqrt(self.cell.vol / self.coords.shape[0]) + + self.aoRg_possible = ISDF_Local_Utils.get_aoR( + self.cell, self.coords, + IP_Atm, + first_natm, + natm, + self.group, + self.distance_matrix, + self.AtmConnectionInfo, + self.use_mpi, self.use_mpi, True + ) + + assert len(self.aoRg_possible) == first_natm + + t4 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + #if self.verbose and debug: + if rank == 0: + _benchmark_time(t3, t4, "build_aoRg_possible", self) + + build_aoR_FFT = (self.direct == False) + + select_IP_local_ls_k_drive( + self, c, m, + self.IP_Atm, self.group, + build_aoR_FFT = build_aoR_FFT, + use_mpi = self.use_mpi + ) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + #if self.verbose and debug: + if rank == 0: + _benchmark_time(t1, t2, "select_IP", self) + + t1 = t2 + + ISDF_Local.build_aux_basis_ls( + self, group, self.IP_group, debug=debug, use_mpi=self.use_mpi) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + # if self.verbose and debug: + if rank == 0: + _benchmark_time(t1, t2, "build_aux_basis", self) + + t1 = t2 + sys.stdout.flush() + + def build_auxiliary_Coulomb(self, debug=True): + + if self.direct == False: + build_auxiliary_Coulomb_local_bas_k(self, debug=debug, use_mpi=self.use_mpi) + + ################ testing code ################ + + # def test_ + + ################ allocate buffer ################ + + def _get_bufsize_get_j(self): + + # if self.with_robust_fitting == False: + if True: + + naux = self.naux + nao = self.nao + nIP_Prim = self.nIP_Prim + nao_prim = self.nao // np.prod(self.kmesh) + + size_buf3 = nao * naux + naux + naux + nao * nao + size_buf4 = nao * nIP_Prim + size_buf4 += nIP_Prim + size_buf4 += nao_prim * nao + size_buf4 += nIP_Prim + size_buf4 += nao_prim * nao_prim + size_buf4 += nao_prim * nIP_Prim * 3 + + return max(size_buf3, size_buf4) + + # else: + # raise NotImplementedError + + def _get_bufsize_get_k(self): + + # if self.with_robust_fitting == False: + if self.with_robust_fitting == False: + + naux = self.naux + nao = self.nao + nIP_Prim = self.nIP_Prim + nao_prim = self.nao // np.prod(self.kmesh) + ncell_complex = self.kmesh[0] * self.kmesh[1] * (self.kmesh[2]//2+1) + + #### size of density matrix #### + + size_dm = nao_prim * nao_prim * ncell_complex * 2 + size_dm += nIP_Prim 
* nIP_Prim * ncell_complex * 2 + + #### size of buf to construct dm #### + + size_buf5 = nao_prim * nao_prim * 2 * 2 + size_buf5 += nao_prim * nIP_Prim * 2 * 2 + + size_fft_buf = nIP_Prim * nIP_Prim * ncell_complex * 2 + + #### size of buf to construct K #### + + size_buf6 = nao_prim * nao_prim * ncell_complex * 2 # k-buf + size_buf6 += nIP_Prim * nIP_Prim * 2 # buf_A + size_buf6 += nao_prim * nIP_Prim * 2 *2 # buf_B/C + size_buf6 += nao_prim * nao_prim * 2 # buf_D + + return size_dm + max(size_buf5, size_buf6, size_fft_buf) + + else: + + naux = self.naux + nao = self.nao + nIP_Prim = self.nIP_Prim + nGrid_Prim = self.nGridPrim + nao_prim = self.nao // np.prod(self.kmesh) + ncell_complex = self.kmesh[0] * self.kmesh[1] * (self.kmesh[2]//2+1) + + #### size of density matrix #### + + size_dm = nao_prim * nao_prim * ncell_complex * 2 + size_dm += nIP_Prim * nGrid_Prim * ncell_complex * 2 + + #### size of buf to construct dm #### + + size_buf5 = nao_prim * nao_prim * 2 + size_buf5 += nao_prim * nIP_Prim * 2 + size_buf5 += nao_prim * nGrid_Prim * 2 * 2 + size_buf5 += nIP_Prim * nGrid_Prim * 2 + + size_fft_buf = nIP_Prim * nGrid_Prim * ncell_complex * 2 + + #### size of buf to construct K #### + + size_buf6 = nao_prim * nao_prim * ncell_complex * 2 # k-buf + size_buf6 += nIP_Prim * nGrid_Prim * 2 # buf_A + size_buf6 += nao_prim * nGrid_Prim * 2 # buf_B + size_buf6 += nao_prim * nIP_Prim * 2 * 2 # buf_B2/C + size_buf6 += nao_prim * nao_prim * 2 # buf_D + + return size_dm + max(size_buf5, size_buf6, size_fft_buf) + + def _allocate_jk_buffer(self, dtype=np.float64): + + if self.jk_buffer is not None: + return + + num_threads = lib.num_threads() + + nIP_Prim = self.nIP_Prim + nGridPrim = self.nGridPrim + ncell_complex = self.kmesh[0] * self.kmesh[1] * (self.kmesh[2]//2+1) + nao_prim = self.nao // np.prod(self.kmesh) + naux = self.naux + nao = self.nao + ngrids = nGridPrim * self.kmesh[0] * self.kmesh[1] * self.kmesh[2] + ncell = np.prod(self.kmesh) + + self.outcore = False + + if self.outcore is False: + + ### in build aux basis ### + + size_buf1 = nIP_Prim * ncell_complex*nIP_Prim * 2 + size_buf1+= nIP_Prim * ncell_complex*nGridPrim * 2 * 2 + size_buf1+= num_threads * nGridPrim * 2 + size_buf1+= nIP_Prim * nIP_Prim * 2 + size_buf1+= nIP_Prim * nGridPrim * 2 * 2 + size_buf1 = 0 + + ### in construct W ### + + size_buf2 = nIP_Prim * nIP_Prim * 2 + size_buf2 += nIP_Prim * nGridPrim * 2 * 2 + size_buf2 += nIP_Prim * nIP_Prim * ncell_complex * 2 * 2 + size_buf2 = 0 + + ### in get_j ### + + buf_J = self._get_bufsize_get_j() + buf_J = 0 + + ### in get_k ### + + buf_K = self._get_bufsize_get_k() + + ### ddot_buf ### + + size_ddot_buf = (nIP_Prim*nIP_Prim+2)*num_threads + size_buf = max(size_buf1,size_buf2,buf_J,buf_K) + + if hasattr(self, "IO_buf"): + if self.IO_buf.size < (size_buf+size_ddot_buf): + self.IO_buf = np.zeros((size_buf+size_ddot_buf), dtype=np.float64) + self.jk_buffer = np.ndarray((size_buf), dtype=np.float64, buffer=self.IO_buf, offset=0) + self.ddot_buf = np.ndarray((size_ddot_buf), dtype=np.float64, buffer=self.IO_buf, offset=size_buf) + + else: + + self.jk_buffer = np.ndarray((size_buf), dtype=np.float64) + self.ddot_buf = np.zeros((size_ddot_buf), dtype=np.float64) + + ##### all the following functions are used to deal with translation symmetry when getting j and getting k ##### + + def _get_permutation_column_aoR(self, box_x, box_y, box_z, loc_internal=None): + + assert box_x < self.kmesh[0] + assert box_y < self.kmesh[1] + assert box_z < self.kmesh[2] + + if hasattr(self, 
"aoR_col_permutation") is False: + self.aoR_col_permutation = [] + for i in range(np.prod(self.kmesh)): + self.aoR_col_permutation.append(None) + + loc = box_x * self.kmesh[1] * self.kmesh[2] + box_y * self.kmesh[2] + box_z + + if self.aoR_col_permutation[loc] is None: + ### construct the permutation matrix ### + permutation = [] + for aoR_holder in self.aoR: + ao_involved = aoR_holder.ao_involved + ao_permutated = [] + for ao_id in ao_involved: + box_id = ao_id // self.nao_prim + nao_id = ao_id % self.nao_prim + box_x_ = box_id // (self.kmesh[1] * self.kmesh[2]) + box_y_ = (box_id % (self.kmesh[1] * self.kmesh[2])) // self.kmesh[2] + box_z_ = box_id % self.kmesh[2] + box_x_new = (box_x + box_x_) % self.kmesh[0] + box_y_new = (box_y + box_y_) % self.kmesh[1] + box_z_new = (box_z + box_z_) % self.kmesh[2] + nao_id_new = box_x_new * self.kmesh[1] * self.kmesh[2] * self.nao_prim + box_y_new * self.kmesh[2] * self.nao_prim + box_z_new * self.nao_prim + nao_id + ao_permutated.append(nao_id_new) + # print("ao_permutated = ", ao_permutated) + permutation.append(np.array(ao_permutated, dtype=np.int32)) + self.aoR_col_permutation[loc] = permutation + + if loc_internal is not None: + return self.aoR_col_permutation[loc][loc_internal] + else: + return self.aoR_col_permutation[loc] + + def _get_permutation_column_aoRg(self, box_x, box_y, box_z, loc_internal=None): + + assert box_x < self.kmesh[0] + assert box_y < self.kmesh[1] + assert box_z < self.kmesh[2] + + if hasattr(self, "aoRg_col_permutation") is False: + self.aoRg_col_permutation = [] + for i in range(np.prod(self.kmesh)): + self.aoRg_col_permutation.append(None) + + loc = box_x * self.kmesh[1] * self.kmesh[2] + box_y * self.kmesh[2] + box_z + + if self.aoRg_col_permutation[loc] is None: + ### construct the permutation matrix ### + permutation = [] + for aoRg_holder in self.aoRg: + ao_involved = aoRg_holder.ao_involved + ao_permutated = [] + for ao_id in ao_involved: + box_id = ao_id // self.nao_prim + nao_id = ao_id % self.nao_prim + box_x_ = box_id // (self.kmesh[1] * self.kmesh[2]) + box_y_ = (box_id % (self.kmesh[1] * self.kmesh[2])) // self.kmesh[2] + box_z_ = box_id % self.kmesh[2] + box_x_new = (box_x + box_x_) % self.kmesh[0] + box_y_new = (box_y + box_y_) % self.kmesh[1] + box_z_new = (box_z + box_z_) % self.kmesh[2] + nao_id_new = box_x_new * self.kmesh[1] * self.kmesh[2] * self.nao_prim + box_y_new * self.kmesh[2] * self.nao_prim + box_z_new * self.nao_prim + nao_id + ao_permutated.append(nao_id_new) + permutation.append(np.array(ao_permutated, dtype=np.int32)) + self.aoRg_col_permutation[loc] = permutation + + if loc_internal is not None: + return self.aoRg_col_permutation[loc][loc_internal] + else: + return self.aoRg_col_permutation[loc] + + def _get_aoRg_Row(self, box_x, box_y, box_z): + + assert box_x < self.kmesh[0] + assert box_y < self.kmesh[1] + assert box_z < self.kmesh[2] + + if box_x == 0 and box_y == 0 and box_z == 0: + return self.aoRg1 + else: + Res = [] + for ix in range(self.kmesh[0]): + for iy in range(self.kmesh[1]): + for iz in range(self.kmesh[2]): + ix_ = (ix - box_x + self.kmesh[0]) % self.kmesh[0] + iy_ = (iy - box_y + self.kmesh[1]) % self.kmesh[1] + iz_ = (iz - box_z + self.kmesh[2]) % self.kmesh[2] + loc_ = ix_ * self.kmesh[1] * self.kmesh[2] + iy_ * self.kmesh[2] + iz_ + for i in range(loc_*self.natmPrim, (loc_+1)*self.natmPrim): + Res.append(self.aoRg1[i]) + return Res + + #### subroutine to deal with _ewald_exxdiv_for_G0 + + def get_jk(self, _dm, hermi=1, kpts=None, kpts_band=None, + with_j=True, 
with_k=True, omega=None, exxdiv=None): + + dm = deepcopy(_dm) + + if self.use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, bcast, comm + dm = bcast(dm, root=0) + + if omega is not None: # J/K for RSH functionals + raise NotImplementedError + # with self.range_coulomb(omega) as rsh_df: + # return rsh_df.get_jk(dm, hermi, kpts, kpts_band, with_j, with_k, + # omega=None, exxdiv=exxdiv) + + from pyscf.pbc.df.aft import _check_kpts + + kpts, is_single_kpt = _check_kpts(self, kpts) + + if is_single_kpt: + assert np.allclose(kpts[0], np.zeros(3)) + assert not self.use_mpi + vj, vk = get_jk_dm_translation_symmetry(self, dm, hermi, kpts[0], kpts_band, + with_j, with_k, exxdiv=exxdiv) + else: + + ### first construct J and K ### + + from pyscf.isdf.isdf_local_k_jk import _contract_j_dm_k_ls, _get_k_kSym_robust_fitting_fast, _get_k_kSym, _get_k_kSym_direct, _get_k_kSym_direct_mimic_MPI + from pyscf.pbc.df.df_jk import _ewald_exxdiv_for_G0, _format_dms, _format_kpts_band, _format_jks + + ### preprocess dm ### + + if dm.ndim == 3: + dm = dm.reshape(1, *dm.shape) + nset = dm.shape[0] + vj = np.zeros_like(dm, dtype=np.complex128) + vk = np.zeros_like(dm, dtype=np.complex128) + + for iset in range(nset): + if iset<=1: + vj[iset] = _contract_j_dm_k_ls(self, dm[iset], self.use_mpi) + if self.with_robust_fitting: + if self.direct: + # vk[iset] = _get_k_kSym_direct(self, dm[iset]) + if iset == 0: + # if self.use_mpi: + vk = _get_k_kSym_direct(self, dm, self.use_mpi) + #else: + #vk = _get_k_kSym_direct_mimic_MPI(self, dm, self.use_mpi) + else: + vk[iset] = _get_k_kSym_robust_fitting_fast(self, dm[iset]) + else: + vk[iset] = _get_k_kSym(self, dm[iset]) + + # if self.use_mpi: + # from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size + # for i in range(comm_size): + # if i == rank: + # print("rank == ", rank) + # print("vk = ", vk[0][0][0,:32]) + # print("vk = ", vk[0][0][:32,0]) + # comm.Barrier() + # else: + # print("vk = ", vk[0][0][0,:32]) + # print("vk = ", vk[0][0][:32,0]) + + ### post process J and K ### + + if not self.use_mpi or (self.use_mpi and rank == 0): + + kpts = np.asarray(kpts) + dm_kpts = lib.asarray(dm, order='C') + assert dm_kpts.ndim == 4 + assert dm_kpts.shape[1] == len(kpts) + assert dm_kpts.shape[2] == dm_kpts.shape[3] + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + assert nset <= 4 + + kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + nband = len(kpts_band) + assert nband == nkpts + + vk_kpts = vk.reshape(nset, nband, nao, nao) + + cell = self.prim_cell + + if exxdiv == 'ewald': + _ewald_exxdiv_for_G0(cell, kpts, dms, vk_kpts, kpts_band=kpts_band) + + vk = _format_jks(vk_kpts, dm_kpts, input_band, kpts) + vj_kpts = vj.reshape(nset, nband, nao, nao) + vj = _format_jks(vj_kpts, dm_kpts, input_band, kpts) + + #print("vk = ", vk[0][0][0,:32]) + #print("vk = ", vk[0][0][:32,0]) + + if nset == 1: + + vj = vj[0] + vk = vk[0] + + + if self.use_mpi: + + vj = bcast(vj, root = 0) + vk = bcast(vk, root = 0) + + comm.Barrier() + + # for i in range(comm_size): + # if i == rank: + # print("rank == ", rank) + # print("vk = ", vk[0][0,:32]) + # print("vk = ", vk[0][:32,0]) + # else: + # print("vk = ", vk[0][0,:32]) + # print("vk = ", vk[0][:32,0]) + + return vj, vk + +if __name__ == "__main__": + + from isdf_tools_cell import build_supercell, build_supercell_with_partition + C = 25 + + verbose = 10 + import pyscf.pbc.gto as pbcgto + + cell = pbcgto.Cell() + boxlen = 3.5668 + cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + 
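# NOTE: the self-test below builds a diamond primitive cell, expands it over + # a 1x1x8 k-mesh, unpacks the translation-symmetry-packed aoR via + # _get_permutation_column_aoR and compares it against a direct evaluation of + # the AOs on the ordered grid, and finally runs KRHF with the ISDF object + # attached as with_df; a supercell RHF run serves as the reference. +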
prim_a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + atm = [ + ['C', (0. , 0. , 0. )], + ['C', (0.8917 , 0.8917 , 0.8917)], + ['C', (1.7834 , 1.7834 , 0. )], + ['C', (2.6751 , 2.6751 , 0.8917)], + ['C', (1.7834 , 0. , 1.7834)], + ['C', (2.6751 , 0.8917 , 2.6751)], + ['C', (0. , 1.7834 , 1.7834)], + ['C', (0.8917 , 2.6751 , 2.6751)], + ] + + KE_CUTOFF = 70 + + prim_cell = build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=KE_CUTOFF) + prim_mesh = prim_cell.mesh + # prim_partition = [[0], [1], [2], [3], [4], [5], [6], [7]] + # prim_partition = [[0,1,2,3,4,5,6,7]] + prim_partition = [[0,1],[2,3],[4,5],[6,7]] + + Ls = [1, 1, 8] + kpts = prim_cell.make_kpts(Ls) + Ls = np.array(Ls, dtype=np.int32) + mesh = [Ls[0] * prim_mesh[0], Ls[1] * prim_mesh[1], Ls[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell, group_partition = build_supercell_with_partition(atm, prim_a, mesh=mesh, + Ls=Ls, + #basis=basis, pseudo=pseudo, + partition=prim_partition, ke_cutoff=KE_CUTOFF, verbose=verbose) + + # pbc_isdf_info = PBC_ISDF_Info_Quad_K(cell, kmesh=Ls, with_robust_fitting=True, aoR_cutoff=1e-8, direct=False, rela_cutoff_QRCP=3e-3) + pbc_isdf_info = PBC_ISDF_Info_Quad_K(prim_cell, kmesh=Ls, with_robust_fitting=True, aoR_cutoff=1e-8, + direct=True, + # direct=False, + rela_cutoff_QRCP=3e-3, + limited_memory=True, + build_K_bunchsize=32) + pbc_isdf_info.build_IP_local(c=C, m=5, group=prim_partition, Ls=[Ls[0]*10, Ls[1]*10, Ls[2]*10]) + pbc_isdf_info.verbose = 10 + + weight = np.sqrt(cell.vol / pbc_isdf_info.coords.shape[0]) + aoR_benchmark = ISDF_eval_gto(cell, coords=pbc_isdf_info.coords[pbc_isdf_info.grid_ID_ordered]) * weight + + naux_prim = 0 + for data in pbc_isdf_info.aoRg: + naux_prim += data.aoR.shape[1] + print("naux_prim = ", naux_prim) + print("naux = ", pbc_isdf_info.naux) + + aoR_unpacked = np.zeros_like(aoR_benchmark) + ngrid = 0 + for ix in range(Ls[0]): + for iy in range(Ls[1]): + for iz in range(Ls[2]): + perm_col = pbc_isdf_info._get_permutation_column_aoR(ix, iy, iz) + for _loc_, data in enumerate(pbc_isdf_info.aoR): + aoR_unpacked[perm_col[_loc_], ngrid:ngrid+data.aoR.shape[1]] = data.aoR + ngrid += data.aoR.shape[1] + assert ngrid == np.prod(mesh) + diff = aoR_benchmark - aoR_unpacked + where = np.where(np.abs(diff) > 1e-4) + print("where = ", where) + print("diff = ", np.linalg.norm(diff)/np.sqrt(aoR_unpacked.size)) + + ngrid_prim = np.prod(prim_mesh) + diff = aoR_benchmark[:, :ngrid_prim] - aoR_unpacked[:,:ngrid_prim] + print("diff.shape = ", diff.shape) + print("diff = ", np.linalg.norm(diff)/np.sqrt(diff.size)) + where = np.where(np.abs(diff) > 1e-4) + print("where = ", where) + + grid_ID_prim = pbc_isdf_info.grid_ID_ordered[:ngrid_prim] + grid_ID_prim2 = [] + for i in range(pbc_isdf_info.natmPrim): + grid_ID_prim2.extend(pbc_isdf_info.partition[i]) + grid_ID_prim2 = np.array(grid_ID_prim2, dtype=np.int32) + assert np.allclose(grid_ID_prim, grid_ID_prim2) + + # pbc_isdf_info.build_auxiliary_Coulomb(debug=True) + + from pyscf.pbc import scf + + mf = scf.KRHF(prim_cell, kpts) + # mf = scf.KUHF(prim_cell, kpts) + # pbc_isdf_info.kpts = np.array([[0,0,0]]) + # mf = scf.addons.smearing_(mf, sigma=0.2, method='fermi') + pbc_isdf_info.set_build_K_distance_cutoff(30.0) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 16 + mf.conv_tol = 1e-7 + + mf.kernel() + + # exit(1) + + ######### benchmark ######### + + pbc_isdf_info = ISDF_Local.PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, aoR_cutoff=1e-8, direct=True, 
rela_cutoff_QRCP=1e-3, use_occ_RI_K=False) + pbc_isdf_info.build_IP_local(c=C, m=5, group=group_partition, Ls=[Ls[0]*10, Ls[1]*10, Ls[2]*10]) + # pbc_isdf_info.build_IP_local(c=C, m=5, group=group_partition, Ls=[Ls[0]*3, Ls[1]*3, Ls[2]*3]) + pbc_isdf_info.Ls = Ls + pbc_isdf_info.build_auxiliary_Coulomb(debug=True) + + aoR_unpacked = [] + for aoR_holder in pbc_isdf_info.aoR: + aoR_unpacked.append(aoR_holder.todense(cell.nao_nr())) + aoR_unpacked = np.concatenate(aoR_unpacked, axis=1) + grid_ordered = pbc_isdf_info.grid_ID_ordered + aoR_benchmark = ISDF_eval_gto(cell, coords=pbc_isdf_info.coords[grid_ordered]) * weight + diff = aoR_benchmark - aoR_unpacked + print("diff = ", np.linalg.norm(diff)/np.sqrt(aoR_unpacked.size)) + # exit(1) + + mf = scf.RHF(cell) + pbc_isdf_info.direct_scf = mf.direct_scf + mf.with_df = pbc_isdf_info + mf.max_cycle = 16 + mf.conv_tol = 1e-7 + mf.kernel() \ No newline at end of file diff --git a/pyscf/isdf/isdf_local_k_jk.py b/pyscf/isdf/isdf_local_k_jk.py new file mode 100644 index 000000000..f23206a93 --- /dev/null +++ b/pyscf/isdf/isdf_local_k_jk.py @@ -0,0 +1,2083 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# + +############ sys module ############ + +import copy +import numpy as np +import ctypes + +############ pyscf module ############ + +from pyscf import lib +from pyscf.pbc import tools +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point +from pyscf.gto.mole import * +from pyscf.pbc.df.df_jk import _ewald_exxdiv_for_G0, _format_dms, _format_kpts_band, _format_jks + +from pyscf.isdf.isdf_jk import _benchmark_time +from pyscf.isdf.isdf_tools_densitymatrix import pack_JK, pack_JK_in_FFT_space +from pyscf.isdf.isdf_local_jk import J_MAX_GRID_BUNCHSIZE, __get_DensityMatrixonRgAO_qradratic +from pyscf.isdf.isdf_tools_kSampling import _RowCol_FFT_bench +from pyscf.isdf._isdf_local_K_direct import _isdf_get_K_direct_kernel_1 +libisdf = lib.load_library('libisdf') +import pyscf.isdf.isdf_tools_linearop as lib_isdf + +############ subroutines ############ + +def _preprocess_dm(mydf, dm): + + log = lib.logger.Logger(mydf.cell.stdout, mydf.cell.verbose) + + in_real_space = True + + kmesh = np.asarray(mydf.kmesh, dtype=np.int32) + ncell_complex = kmesh[0] * kmesh[1] * (kmesh[2]//2+1) + + if len(dm.shape) == 3: + if dm.shape[0] == 1: + if dm.dtype == np.float64: + dm = dm[0].real + else: + in_real_space = False + dm = dm[0].real + else: + + #print("dm.shape = ", dm.shape) + #print("dm = ", dm) + #print("dtype = ", dm.dtype) + + in_real_space = False + + if dm.dtype == np.float64: + #assert kmesh[0] in [1, 2] + #assert kmesh[1] in [1, 2] + #assert kmesh[2] in [1, 2] + dm = np.asarray(dm, dtype=np.complex128) + + assert dm.dtype == np.complex128 + assert dm.shape[1] == dm.shape[2] + assert dm.shape[0] == np.prod(kmesh) + + nao_prim = dm.shape[1] + nkpts = dm.shape[0] + + #dm_complex = np.transpose(dm, axes=(1, 0, 2)).copy() + #dm_complex = 
dm_complex.reshape(nao_prim, -1) + + ### check the symmetry ### + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]): + loc1 = ix * kmesh[1] * kmesh[2] + iy * kmesh[2] + iz + loc2 = (kmesh[0] - ix) % kmesh[0] * kmesh[1] * kmesh[2] + (kmesh[1] - iy) % kmesh[1] * kmesh[2] + (kmesh[2] - iz) % kmesh[2] + #print("loc1 = ", loc1, "loc2 = ", loc2) + #print("dm[loc1] = ", dm[loc1]) + #print("dm[loc2] = ", dm[loc2]) + diff = np.linalg.norm(dm[loc1] - dm[loc2].conj()) / np.sqrt(dm.size) + # print("diff = ", diff) ## NOTE: should be very small + # assert diff < 1e-7 + if diff > 1e-7: + log.debug4("warning, the input density matrix is not symmetric.") + log.debug4("k1 = (%d, %d, %d) " % (ix, iy, iz)) + log.debug4("k2 = (%d, %d, %d) " % ((kmesh[0] - ix) % kmesh[0], (kmesh[1] - iy) % kmesh[1], (kmesh[2] - iz) % kmesh[2])) + # log.debug4("kmesh = ", kmesh) + log.debug4("diff = %15.6f" % (diff)) + dm_complex = np.zeros((ncell_complex, nao_prim, nao_prim), dtype=np.complex128) + loc = 0 + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2]//2+1): + loc1 = ix * kmesh[1] * kmesh[2] + iy * kmesh[2] + iz + loc2 = (kmesh[0] - ix) % kmesh[0] * kmesh[1] * kmesh[2] + (kmesh[1] - iy) % kmesh[1] * kmesh[2] + (kmesh[2] - iz) % kmesh[2] + # dm_complex[loc].ravel()[:] = dm[loc1].ravel()[:] + dm_input = ((dm[loc1] + dm[loc2].conj()) / 2.0).copy() + dm_complex[loc].ravel()[:] = dm_input.ravel()[:] + loc += 1 + + dm_complex = np.transpose(dm_complex, axes=(1, 0, 2)).copy() + dm_complex = dm_complex.conj().copy() + + #print("dm_complex.shape = ", dm_complex.shape) + #print("dm_complex = ", dm_complex[:, 0, :]) + #print("dm_complex = ", dm_complex[:, 1, :]) + + ### do the FFT ### + + dm_real = np.ndarray((nao_prim, nkpts * nao_prim), dtype=np.float64, buffer=dm_complex) + buf_fft = np.zeros((nao_prim, ncell_complex, nao_prim), dtype=np.complex128) + + fn2 = getattr(libisdf, "_iFFT_Matrix_Col_InPlace", None) + assert fn2 is not None + + fn2( + dm_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + #print("dm_real = ", dm_real) + #print("dm_complex = ", dm_complex) + + dm = pack_JK(dm_real, kmesh, nao_prim) + + #print("dm.shape = ", dm.shape) + + return dm, in_real_space + +def _contract_j_dm_k_ls(mydf, _dm, use_mpi=False): + + dm, in_real_space = _preprocess_dm(mydf, _dm) + + if use_mpi: + assert mydf.direct == True + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce + size = comm_size + # raise NotImplementedError("MPI is not supported yet.") + dm = bcast(dm, root=0) + + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + ngrid = np.prod(mesh) + ngrid_prim = ngrid // np.prod(mydf.kmesh) + + aoR = mydf.aoR + assert isinstance(aoR, list) + naux = mydf.naux + aoR1 = mydf.aoR1 + assert isinstance(aoR1, list) + + kmesh = np.array(mydf.kmesh, dtype=np.int32) + ncell = np.prod(kmesh) + ncell_complex = kmesh[0] * kmesh[1] * (kmesh[2]//2+1) + + #### step 0. 
allocate buffer + + max_nao_involved = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoR if aoR_holder is not None]) + max_nao_involved1 = np.max([aoR_holder.aoR.shape[0] for aoR_holder in aoR1 if aoR_holder is not None]) + max_nao_involved = max(max_nao_involved, max_nao_involved1) + max_ngrid_involved = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoR if aoR_holder is not None]) + max_ngrid_involved1 = np.max([aoR_holder.aoR.shape[1] for aoR_holder in aoR1 if aoR_holder is not None]) + max_ngrid_involved = max(max_ngrid_involved, max_ngrid_involved1) + + density_R_prim = np.zeros((ngrid_prim,), dtype=np.float64) + + dm_buf = np.zeros((max_nao_involved, max_nao_involved), dtype=np.float64) + max_dim_buf = max_nao_involved + max_col_buf = min(max_ngrid_involved, J_MAX_GRID_BUNCHSIZE) + aoR_buf1 = np.zeros((max_nao_involved, max_ngrid_involved), dtype=np.float64) + + ##### get the involved C function ##### + + fn_extract_dm = getattr(libisdf, "_extract_dm_involved_ao", None) + assert fn_extract_dm is not None + + fn_packadd_dm = getattr(libisdf, "_packadd_local_dm", None) + assert fn_packadd_dm is not None + + fn_multiplysum = getattr(libisdf, "_fn_J_dmultiplysum", None) + assert fn_multiplysum is not None + + #### step 1. get density value on real space grid and IPs + + density_R_tmp = None + ddot_buf = np.zeros((max_nao_involved, max_col_buf), dtype=np.float64) + + for atm_id, aoR_holder in enumerate(aoR): + + if aoR_holder is None: + continue + + if use_mpi: + if atm_id % comm_size != rank: + continue + + ngrids_now = aoR_holder.aoR.shape[1] + nao_involved = aoR_holder.aoR.shape[0] + + if nao_involved < nao: + fn_extract_dm( + dm.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao), + dm_buf.ctypes.data_as(ctypes.c_void_p), + aoR_holder.ao_involved.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_involved), + ) + else: + dm_buf.ravel()[:] = dm.ravel() + + dm_now = np.ndarray((nao_involved, nao_involved), buffer=dm_buf) + global_gridID_begin = aoR_holder.global_gridID_begin + + for p0, p1 in lib.prange(0, ngrids_now, J_MAX_GRID_BUNCHSIZE): + ddot_res = np.ndarray((nao_involved, p1-p0), buffer=ddot_buf) + lib.ddot(dm_now, aoR_holder.aoR[:,p0:p1], c=ddot_res) + # density_R_tmp = lib.multiply_sum_isdf(aoR_holder.aoR[:,p0:p1], ddot_res) + _res_tmp = np.ndarray((p1-p0,), + dtype =density_R_prim.dtype, + buffer=density_R_prim, + offset=(global_gridID_begin+p0)*density_R_prim.dtype.itemsize) + # density_R_prim[global_gridID_begin+p0:global_gridID_begin+p1] = density_R_tmp + fn_multiplysum( + _res_tmp.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_involved), + ctypes.c_int(p1-p0), + aoR_holder.aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(aoR_holder.aoR.shape[0]), + ctypes.c_int(aoR_holder.aoR.shape[1]), + ctypes.c_int(0), + ctypes.c_int(p0), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_involved), + ctypes.c_int(p1-p0), + ctypes.c_int(0), + ctypes.c_int(0)) + # ddot_res = np.ndarray((nao_involved, ngrids_now), buffer=ddot_buf) + # lib.ddot(dm_now, aoR_holder.aoR, c=ddot_res) + # density_R_tmp = lib.multiply_sum_isdf(aoR_holder.aoR, ddot_res) + # density_R_prim[global_gridID_begin:global_gridID_begin+ngrids_now] = density_R_tmp + + if use_mpi: + density_R_prim = reduce(density_R_prim, root=0) + + grid_ID_ordered = mydf.grid_ID_ordered_prim + + if (use_mpi and rank == 0) or (use_mpi == False): + + density_R_original = np.zeros_like(density_R_prim) + + fn_order = getattr(libisdf, "_Reorder_Grid_to_Original_Grid", None) + assert fn_order is not None + + 
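# Scatter the density from the ISDF (atom-partition) grid ordering back to + # the original lexicographic ordering of the primitive FFT mesh, so that the + # Coulomb kernel below can be applied with FFTs on that mesh. A presumably + # equivalent numpy sketch (illustrative only, not the code path used): + # density_R_original[mydf.grid_ID_ordered_prim] = density_R_prim +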
fn_order( + ctypes.c_int(density_R_prim.size), + mydf.grid_ID_ordered_prim.ctypes.data_as(ctypes.c_void_p), + density_R_prim.ctypes.data_as(ctypes.c_void_p), + density_R_original.ctypes.data_as(ctypes.c_void_p), + ) + + density_R_prim = density_R_original.copy() + + J = None + + ddot_buf = np.zeros((max_nao_involved, max_nao_involved), dtype=np.float64) + + if (use_mpi and rank == 0) or (use_mpi == False): + + fn_J = getattr(libisdf, "_construct_J", None) + assert(fn_J is not None) + + if hasattr(mydf, "coulG_prim") == False: + assert mydf.omega is None or mydf.omega == 0.0 + mydf.coulG_prim = tools.get_coulG(mydf.primCell, mesh=mydf.primCell.mesh) + + J = np.zeros_like(density_R_prim) + + mesh_prim = np.array(mydf.primCell.mesh, dtype=np.int32) + + fn_J( + mesh_prim.ctypes.data_as(ctypes.c_void_p), + density_R_prim.ctypes.data_as(ctypes.c_void_p), + mydf.coulG_prim.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + ) + + J_ordered = np.zeros_like(J) + + fn_order = getattr(libisdf, "_Original_Grid_to_Reorder_Grid", None) + assert fn_order is not None + + fn_order( + ctypes.c_int(J.size), + grid_ID_ordered.ctypes.data_as(ctypes.c_void_p), + J.ctypes.data_as(ctypes.c_void_p), + J_ordered.ctypes.data_as(ctypes.c_void_p), + ) + + J = J_ordered.copy() + + if use_mpi: + J = bcast(J, root=0) + + #### step 3. get J , using translation symmetry ### + + nao_prim = mydf.nao_prim + J_Res = np.zeros((nao_prim, nao), dtype=np.float64) + + partition_activated_ID = mydf.partition_activated_id + + kmesh = np.asarray(mydf.kmesh, dtype=np.int32) + natm_prim = mydf.natmPrim + + grid_segment = mydf.grid_segment + + fn_packadd_J = getattr(libisdf, "_buildJ_k_packaddrow", None) + assert fn_packadd_J is not None + + for task_id, box_id in enumerate(partition_activated_ID): + + if use_mpi: + if task_id % comm_size != rank: + continue + + box_loc1 = box_id // natm_prim + box_loc2 = box_id % natm_prim + + box_x = box_loc1 // (kmesh[1] * kmesh[2]) + box_y = box_loc1 % (kmesh[1] * kmesh[2]) // kmesh[2] + box_z = box_loc1 % kmesh[2] + + aoR_holder_bra = aoR1[box_id] + + permutation = mydf._get_permutation_column_aoR(box_x, box_y, box_z, box_loc2) + + aoR_holder_ket = aoR[box_loc2] + + J_tmp = J[grid_segment[box_loc2]:grid_segment[box_loc2+1]] + + assert aoR_holder_ket.aoR.shape[1] == J_tmp.size + + aoR_J_res = np.ndarray(aoR_holder_bra.aoR.shape, buffer=aoR_buf1) + lib_isdf.d_ij_j_ij(aoR_holder_bra.aoR, J_tmp, out=aoR_J_res) + + nao_bra = aoR_holder_bra.aoR.shape[0] + nao_ket = aoR_holder_ket.aoR.shape[0] + + ddot_res = np.ndarray((nao_bra, nao_ket), buffer=ddot_buf) + lib.ddot(aoR_J_res, aoR_holder_ket.aoR.T, c=ddot_res) + + #### pack and add the result to J_Res + + fn_packadd_J( + J_Res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao), + ddot_res.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_bra), + ctypes.c_int(nao_ket), + aoR_holder_bra.ao_involved.ctypes.data_as(ctypes.c_void_p), + permutation.ctypes.data_as(ctypes.c_void_p), + ) + + J = J_Res + if use_mpi: + J = reduce(J, root=0) + + ######### delete the buffer ######### + + del dm_buf, ddot_buf, density_R_prim + del density_R_tmp + del aoR_buf1 + + if not use_mpi or (use_mpi and rank == 0): + + J *= ngrid / vol + + if in_real_space: + J = pack_JK(J, mydf.kmesh, nao_prim) + else: + ## transform J back to FFT space ## + fn1 = getattr(libisdf, "_FFT_Matrix_Col_InPlace", None) + assert fn1 is not None + J_complex = np.ndarray((nao_prim,nao_prim*ncell_complex), dtype=np.complex128) + fft_buf = 
np.ndarray((nao_prim,nao_prim*ncell_complex), dtype=np.complex128) + J_real = np.ndarray((nao_prim,nao_prim*ncell), dtype=np.float64, buffer=J_complex) + J_real.ravel()[:] = J.ravel()[:] + fn1( + J_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + fft_buf.ctypes.data_as(ctypes.c_void_p) + ) + del fft_buf + ## pack J in FFT space ## + J_complex = J_complex.conj().copy() + J = pack_JK_in_FFT_space(J_complex, mydf.kmesh, nao_prim) + + if use_mpi: + J = bcast(J, root=0) + + t2 = (logger.process_clock(), logger.perf_counter()) + + if not use_mpi or (use_mpi and rank == 0): + _benchmark_time(t1, t2, "_contract_j_dm_k_ls", mydf) + + return J + +def _get_k_kSym_robust_fitting_fast(mydf, _dm): + + ''' + NOTE: this is a slow version; to be abandoned. + ''' + + #### preprocess #### + + dm, in_real_space = _preprocess_dm(mydf, _dm) + + mydf._allocate_jk_buffer(dm.dtype) + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + ngrid = np.prod(cell.mesh) + vol = cell.vol + + W = mydf.W + naux = mydf.naux + + kmesh = np.array(mydf.kmesh, dtype=np.int32) + mesh = mydf.mesh + meshPrim = np.array(mesh) // np.array(kmesh) + nGridPrim = mydf.nGridPrim + ncell = np.prod(kmesh) + ncell_complex = kmesh[0] * kmesh[1] * (kmesh[2]//2+1) + nIP_prim = mydf.nIP_Prim + nao_prim = nao // ncell + + #### allocate buffer #### + + + offset = 0 + + DM_complex = np.ndarray((nao_prim,nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + # DM_complex = np.ndarray((nao_prim,nao_prim*ncell_complex), dtype=np.complex128) + DM_real = np.ndarray((nao_prim,nao), dtype=np.float64, buffer=DM_complex) + DM_real.ravel()[:] = dm[:nao_prim, :].ravel()[:] + offset += DM_complex.size * DM_complex.itemsize + + offset_after_dm = offset + + DM_RgRg_complex = np.ndarray((nIP_prim,nIP_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + DM_RgRg_real = np.ndarray((nIP_prim,nIP_prim*ncell), dtype=np.float64, buffer=DM_RgRg_complex) + offset += DM_RgRg_complex.size * DM_RgRg_complex.itemsize + + offset_after_DM = offset + + #### get D #### + + #_get_DM_RgRg_real(mydf, DM_real, DM_complex, DM_RgRg_real, DM_RgRg_complex, offset) + + fn1 = getattr(libisdf, "_FFT_Matrix_Col_InPlace", None) + assert fn1 is not None + + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol2 is not None + fn_packcol3 = getattr(libisdf, "_buildK_packcol3", None) + assert fn_packcol3 is not None + + fn_copy = getattr(libisdf, "_buildK_copy", None) + assert fn_copy is not None + + buf_fft = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + t3 = (logger.process_clock(), logger.perf_counter()) + + fn1( + DM_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "_fft1", mydf) + + buf_A = np.ndarray((nao_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + offset2 = offset + (nao_prim * nao_prim) * buf_A.itemsize + buf_B = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset2) + + offset3 = offset2 + (nao_prim * nIP_prim) * buf_B.itemsize + buf_C = np.ndarray((nao_prim, 
nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset3) + + offset4 = offset3 + (nao_prim * nIP_prim) * buf_C.itemsize + buf_D = np.ndarray((nIP_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset4) + + aoRg_FFT = mydf.aoRg_FFT + + t3 = (logger.process_clock(), logger.perf_counter()) + + if isinstance(aoRg_FFT, list): + + for i in range(ncell_complex): + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + # buf_A[:] = DM_complex[:, k_begin:k_end] + fn_packcol2( + buf_A.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(2*nao_prim), + DM_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(DM_complex.shape[0]), + ctypes.c_int(2*DM_complex.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end) # 2 due to complex number + ) + + # buf_B[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim] + # buf_B.ravel()[:] = aoRg_FFT[i].ravel()[:] + fn_copy( + buf_B.ctypes.data_as(ctypes.c_void_p), + aoRg_FFT[i].ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(2*buf_B.size) # 2 due to complex number + ) + + lib.dot(buf_A, buf_B, c=buf_C) + lib.dot(buf_B.T.conj(), buf_C, c=buf_D) + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + # DM_RgRg_complex[:, k_begin:k_end] = buf_D + fn_packcol3( + DM_RgRg_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(DM_RgRg_complex.shape[0]), + ctypes.c_int(2*DM_RgRg_complex.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end), + buf_D.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(buf_D.shape[0]), + ctypes.c_int(2*buf_D.shape[1]), + ) + + else: + + raise NotImplementedError("not implemented yet.") + + for i in range(ncell_complex): + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + buf_A[:] = DM_complex[:, k_begin:k_end] + buf_B[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim] + + lib.dot(buf_A, buf_B, c=buf_C) + lib.dot(buf_B.T.conj(), buf_C, c=buf_D) + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + DM_RgRg_complex[:, k_begin:k_end] = buf_D + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "DM_RgRg_complex", mydf) + + t3 = t4 + + buf_fft = np.ndarray((nIP_prim, nIP_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + fn2 = getattr(libisdf, "_iFFT_Matrix_Col_InPlace", None) + assert fn2 is not None + + fn2( + DM_RgRg_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(nIP_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "DM_RgRg_complex 2", mydf) + t3 = t4 + + # inplace multiplication + + lib_isdf.cwise_mul(mydf.W, DM_RgRg_real, out=DM_RgRg_real) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "lib.cwise_mul 2", mydf) + t3 = t4 + + offset = offset_after_DM + + buf_fft = np.ndarray((nIP_prim, nIP_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + fn1( + DM_RgRg_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(nIP_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "DM_RgRg_real", mydf) + t3 = t4 + + K_complex_buf = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + K_real_buf = np.ndarray((nao_prim, nao_prim*ncell), dtype=np.float64, buffer=mydf.jk_buffer, offset=offset) + 
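# Second pass: contract the element-wise product W*D_RgRg (already FFTed + # above) back to the AO basis one complex k-point at a time, + # K(k) = C(k) @ (W*D)(k) @ C(k).conj().T, with C(k) the (nao_prim, nIP_prim) + # block of aoRg_FFT for this k; the buffers below hold per-k intermediates. +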
offset += (nao_prim * nao_prim * ncell_complex) * K_complex_buf.itemsize + offset_now = offset + + buf_A = np.ndarray((nIP_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + offset_now += (nIP_prim * nIP_prim) * buf_A.itemsize + buf_B = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + offset_now += (nao_prim * nIP_prim) * buf_B.itemsize + buf_C = np.ndarray((nIP_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + offset_now += (nIP_prim * nao_prim) * buf_C.itemsize + buf_D = np.ndarray((nao_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + + if isinstance(aoRg_FFT, list): + for i in range(ncell_complex): + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + # buf_A.ravel()[:] = DM_RgRg_complex[:, k_begin:k_end].ravel()[:] + fn_packcol2( + buf_A.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(2*nIP_prim), + DM_RgRg_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(DM_RgRg_complex.shape[0]), + ctypes.c_int(2*DM_RgRg_complex.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end) + ) + + # buf_B.ravel()[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim].ravel()[:] + # buf_B.ravel()[:] = aoRg_FFT[i].ravel()[:] + fn_copy( + buf_B.ctypes.data_as(ctypes.c_void_p), + aoRg_FFT[i].ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(2*buf_B.size) # 2 due to complex number + ) + + + lib.dot(buf_A, buf_B.T.conj(), c=buf_C) + lib.dot(buf_B, buf_C, c=buf_D) + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + # K_complex_buf[:, k_begin:k_end] = buf_D + + fn_packcol3( + K_complex_buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K_complex_buf.shape[0]), + ctypes.c_int(2*K_complex_buf.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end), + buf_D.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(buf_D.shape[0]), + ctypes.c_int(2*buf_D.shape[1]), + ) + + else: + + raise NotImplementedError("not implemented yet.") + + for i in range(ncell_complex): + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + buf_A.ravel()[:] = DM_RgRg_complex[:, k_begin:k_end].ravel()[:] + buf_B.ravel()[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim].ravel()[:] + + lib.dot(buf_A, buf_B.T.conj(), c=buf_C) + lib.dot(buf_B, buf_C, c=buf_D) + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + K_complex_buf[:, k_begin:k_end] = buf_D + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "K_complex_buf", mydf) + t3 = t4 + + #if in_real_space: + + buf_fft = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + fn2( + K_complex_buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "K_real_buf", mydf) + t3 = t4 + + K_real_buf *= (ngrid / vol) + + K = -pack_JK(K_real_buf, kmesh, nao_prim, output=None) # "-" due to robust fitting + + #else: + # K = -pack_JK_in_FFT_space(K_complex_buf, kmesh, nao_prim) / np.prod(kmesh) + + ########### do the same thing on V ########### + + DM_RgR_complex = np.ndarray((nIP_prim,nGridPrim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_after_dm) + DM_RgR_real = np.ndarray((nIP_prim,nGridPrim*ncell), dtype=np.float64, buffer=DM_RgR_complex) + + offset_now = offset_after_dm + DM_RgR_complex.size * 
DM_RgR_complex.itemsize + + aoR_FFT = mydf.aoR_FFT + + offset_A = offset_now + buf_A = np.ndarray((nao_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_A) + offset_B = offset_A + buf_A.size * buf_A.itemsize + buf_B = np.ndarray((nao_prim, nGridPrim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_B) + offset_B2 = offset_B + buf_B.size * buf_B.itemsize + buf_B2 = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_B2) + offset_C = offset_B2 + buf_B2.size * buf_B2.itemsize + buf_C = np.ndarray((nao_prim, nGridPrim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_C) + offset_D = offset_C + buf_C.size * buf_C.itemsize + buf_D = np.ndarray((nIP_prim, nGridPrim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_D) + + if isinstance(aoRg_FFT, list): + assert isinstance(aoR_FFT, list) + + for i in range(ncell_complex): + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + # buf_A[:] = DM_complex[:, k_begin:k_end] + fn_packcol2( + buf_A.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(2*nao_prim), + DM_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(DM_complex.shape[0]), + ctypes.c_int(2*DM_complex.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end) + ) + + # buf_B[:] = aoR_FFT[:, i*nGridPrim:(i+1)*nGridPrim] + # buf_B2[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim] + # buf_B.ravel()[:] = aoR_FFT[i].ravel()[:] + # buf_B2.ravel()[:] = aoRg_FFT[i].ravel()[:] + fn_copy( + buf_B.ctypes.data_as(ctypes.c_void_p), + aoR_FFT[i].ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(2*buf_B.size) # 2 due to complex number + ) + fn_copy( + buf_B2.ctypes.data_as(ctypes.c_void_p), + aoRg_FFT[i].ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(2*buf_B2.size) # 2 due to complex number + ) + + + lib.dot(buf_A, buf_B, c=buf_C) + lib.dot(buf_B2.T.conj(), buf_C, c=buf_D) + + k_begin = i * nGridPrim + k_end = (i + 1) * nGridPrim + + # DM_RgR_complex[:, k_begin:k_end] = buf_D + fn_packcol3( + DM_RgR_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(DM_RgR_complex.shape[0]), + ctypes.c_int(2*DM_RgR_complex.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end), + buf_D.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(buf_D.shape[0]), + ctypes.c_int(2*buf_D.shape[1]), + ) + + else: + + raise NotImplementedError("not implemented yet.") + + for i in range(ncell_complex): + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + buf_A[:] = DM_complex[:, k_begin:k_end] + buf_B[:] = aoR_FFT[:, i*nGridPrim:(i+1)*nGridPrim] + buf_B2[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim] + + lib.dot(buf_A, buf_B, c=buf_C) + lib.dot(buf_B2.T.conj(), buf_C, c=buf_D) + + k_begin = i * nGridPrim + k_end = (i + 1) * nGridPrim + + DM_RgR_complex[:, k_begin:k_end] = buf_D + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "DM_RgR_complex", mydf) + t3 = t4 + + buf_A = None + buf_B = None + buf_B2 = None + buf_C = None + buf_D = None + + offset_now_fft = offset_now + + buf_fft = np.ndarray((nIP_prim, nGridPrim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now_fft) + + fn2( + DM_RgR_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(nGridPrim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "DM_RgR_real", mydf) + t3 = t4 + + # inplace multiplication + + lib_isdf.cwise_mul(mydf.V_R, 
DM_RgR_real, out=DM_RgR_real) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "cwise_mul", mydf) + t3 = t4 + + fn1( + DM_RgR_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(nGridPrim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "DM_RgR_complex 2", mydf) + t3 = t4 + + buf_fft = None + + offset_K = offset_now + + K_complex_buf = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_K) + K_real_buf = np.ndarray((nao_prim, nao_prim*ncell), dtype=np.float64, buffer=K_complex_buf) + + offset_after_K = offset_K + K_complex_buf.size * K_complex_buf.itemsize + + offset_A = offset_K + K_complex_buf.size * K_complex_buf.itemsize + buf_A = np.ndarray((nIP_prim, nGridPrim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_A) + offset_B = offset_A + buf_A.size * buf_A.itemsize + buf_B = np.ndarray((nao_prim, nGridPrim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_B) + offset_B2 = offset_B + buf_B.size * buf_B.itemsize + buf_B2 = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_B2) + offset_C = offset_B2 + buf_B2.size * buf_B2.itemsize + buf_C = np.ndarray((nIP_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_C) + offset_D = offset_C + buf_C.size * buf_C.itemsize + buf_D = np.ndarray((nao_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_D) + + if isinstance(aoRg_FFT, list): + + for i in range(ncell_complex): + + k_begin = i * nGridPrim + k_end = (i + 1) * nGridPrim + + # buf_A.ravel()[:] = DM_RgR_complex[:, k_begin:k_end].ravel()[:] + fn_packcol2( + buf_A.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(2*nGridPrim), + DM_RgR_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(DM_RgR_complex.shape[0]), + ctypes.c_int(2*DM_RgR_complex.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end) + ) + + # buf_B.ravel()[:] = aoR_FFT[i].ravel()[:] + # buf_B2.ravel()[:] = aoRg_FFT[i].ravel()[:] + fn_copy( + buf_B.ctypes.data_as(ctypes.c_void_p), + aoR_FFT[i].ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(2*buf_B.size) # 2 due to complex number + ) + fn_copy( + buf_B2.ctypes.data_as(ctypes.c_void_p), + aoRg_FFT[i].ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(2*buf_B2.size) # 2 due to complex number + ) + + + lib.dot(buf_A, buf_B.T.conj(), c=buf_C) + lib.dot(buf_B2, buf_C, c=buf_D) + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + # K_complex_buf[:, k_begin:k_end] = buf_D + fn_packcol3( + K_complex_buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(K_complex_buf.shape[0]), + ctypes.c_int(2*K_complex_buf.shape[1]), + ctypes.c_int(2*k_begin), + ctypes.c_int(2*k_end), + buf_D.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(buf_D.shape[0]), + ctypes.c_int(2*buf_D.shape[1]), + ) + + else: + + raise NotImplementedError("not implemented yet.") + + for i in range(ncell_complex): + + k_begin = i * nGridPrim + k_end = (i + 1) * nGridPrim + + buf_A.ravel()[:] = DM_RgR_complex[:, k_begin:k_end].ravel()[:] + # print("buf_A = ", buf_A[:5,:5]) + buf_B.ravel()[:] = aoR_FFT[:, i*nGridPrim:(i+1)*nGridPrim].ravel()[:] + # print("buf_B = ", buf_B[:5,:5]) + buf_B2.ravel()[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim].ravel()[:] + # print("buf_B2 = ", buf_B2[:5,:5]) + + lib.dot(buf_A, buf_B.T.conj(), c=buf_C) + lib.dot(buf_B2, 
t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "K_complex_buf 1", mydf) + t3 = t4 + + buf_A = None + buf_B = None + buf_B2 = None + buf_C = None + buf_D = None + + offset_now = offset_after_K + + buf_fft = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + + fn2( + K_complex_buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + t4 = (logger.process_clock(), logger.perf_counter()) + _benchmark_time(t3, t4, "K_complex_buf 2", mydf) + t3 = t4 + + buf_fft = None + + K_real_buf *= (ngrid / vol) + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t1, t2, "_contract_k_dm", mydf) + + t1 = t2 + + K2 = pack_JK(K_real_buf, kmesh, nao_prim, output=None) + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t1, t2, "_pack_JK", mydf) + + K += K2 + K2.T + + if not in_real_space: + + K = K[:nao_prim,:].copy() + + K_complex = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128) + K_real = np.ndarray((nao_prim, nao_prim*ncell), dtype=np.float64, buffer=K_complex) + K_real.ravel()[:] = K.ravel()[:] + buf_fft = np.zeros_like(K_complex) + + fn1( + K_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + K_complex = K_complex.conj().copy() + K_complex = pack_JK_in_FFT_space(K_complex, kmesh, nao_prim) + K = K_complex + + DM_RgR_complex = None + DM_RgR_real = None + + return K + +def _get_k_kSym(mydf, _dm): + + #### preprocess #### + + dm, in_real_space = _preprocess_dm(mydf, _dm) + + mydf._allocate_jk_buffer(dm.dtype) + t1 = (logger.process_clock(), logger.perf_counter()) + + if len(dm.shape) == 3: + assert dm.shape[0] == 1 + dm = dm[0] + + nao = dm.shape[0] + cell = mydf.cell + assert cell.nao == nao + ngrid = np.prod(cell.mesh) + vol = cell.vol + + W = mydf.W + naux = mydf.naux + + kmesh = np.array(mydf.kmesh, dtype=np.int32) + mesh = mydf.mesh + meshPrim = np.array(mesh) // np.array(kmesh) + nGridPrim = mydf.nGridPrim + ncell = np.prod(kmesh) + ncell_complex = kmesh[0] * kmesh[1] * (kmesh[2]//2+1) + nIP_prim = mydf.nIP_Prim + nao_prim = nao // ncell + + #### allocate buffer #### + + offset = 0 + DM_RgRg_complex = np.ndarray((nIP_prim,nIP_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + DM_RgRg_real = np.ndarray((nIP_prim,nIP_prim*ncell), dtype=np.float64, buffer=mydf.jk_buffer, offset=offset) + + offset += (nIP_prim * nIP_prim * ncell_complex) * DM_RgRg_complex.itemsize + DM_complex = np.ndarray((nao_prim,nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + DM_real = np.ndarray((nao_prim,nao), dtype=np.float64, buffer=mydf.jk_buffer, offset=offset) + DM_real.ravel()[:] = dm[:nao_prim, :].ravel()[:] + offset += (nao_prim * nao_prim * ncell_complex) * DM_complex.itemsize + + #### get D #### + + fn1 = getattr(libisdf, "_FFT_Matrix_Col_InPlace", None) + assert fn1 is not None + + buf_fft = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, 
buffer=mydf.jk_buffer, offset=offset) + + fn1( + DM_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + buf_A = np.ndarray((nao_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + offset2 = offset + (nao_prim * nao_prim) * buf_A.itemsize + buf_B = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset2) + + offset3 = offset2 + (nao_prim * nIP_prim) * buf_B.itemsize + buf_C = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset3) + + offset4 = offset3 + (nao_prim * nIP_prim) * buf_C.itemsize + buf_D = np.ndarray((nIP_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset4) + + aoRg_FFT = mydf.aoRg_FFT + + if isinstance(aoRg_FFT, list): + for i in range(ncell_complex): + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + buf_A[:] = DM_complex[:, k_begin:k_end] + # buf_B[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim] + buf_B = aoRg_FFT[i] + + lib.dot(buf_A, buf_B, c=buf_C) + lib.dot(buf_B.T.conj(), buf_C, c=buf_D) + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + DM_RgRg_complex[:, k_begin:k_end] = buf_D + else: + for i in range(ncell_complex): + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + buf_A[:] = DM_complex[:, k_begin:k_end] + buf_B[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim] + + lib.dot(buf_A, buf_B, c=buf_C) + lib.dot(buf_B.T.conj(), buf_C, c=buf_D) + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + DM_RgRg_complex[:, k_begin:k_end] = buf_D + + buf_fft = np.ndarray((nIP_prim, nIP_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + fn2 = getattr(libisdf, "_iFFT_Matrix_Col_InPlace", None) + assert fn2 is not None + + fn2( + DM_RgRg_complex.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(nIP_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + # inplace multiplication + + lib_isdf.cwise_mul(mydf.W, DM_RgRg_real, out=DM_RgRg_real) + + offset = nIP_prim * nIP_prim * ncell_complex * DM_RgRg_complex.itemsize + + buf_fft = np.ndarray((nIP_prim, nIP_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + fn1( + DM_RgRg_real.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nIP_prim), + ctypes.c_int(nIP_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + K_complex_buf = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + K_real_buf = np.ndarray((nao_prim, nao_prim*ncell), dtype=np.float64, buffer=mydf.jk_buffer, offset=offset) + offset += (nao_prim * nao_prim * ncell_complex) * K_complex_buf.itemsize + offset_now = offset + + buf_A = np.ndarray((nIP_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + offset_now += (nIP_prim * nIP_prim) * buf_A.itemsize + buf_B = np.ndarray((nao_prim, nIP_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + offset_now += (nao_prim * nIP_prim) * buf_B.itemsize + buf_C = np.ndarray((nIP_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + offset_now += (nIP_prim * nao_prim) * buf_C.itemsize + buf_D = np.ndarray((nao_prim, nao_prim), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset_now) + + if isinstance(aoRg_FFT, list): + + for i in range(ncell_complex): + + k_begin = i * 
nIP_prim + k_end = (i + 1) * nIP_prim + + buf_A.ravel()[:] = DM_RgRg_complex[:, k_begin:k_end].ravel()[:] + # buf_B.ravel()[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim].ravel()[:] + buf_B = aoRg_FFT[i] + + lib.dot(buf_A, buf_B.T.conj(), c=buf_C) + lib.dot(buf_B, buf_C, c=buf_D) + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + K_complex_buf[:, k_begin:k_end] = buf_D + else: + + for i in range(ncell_complex): + + k_begin = i * nIP_prim + k_end = (i + 1) * nIP_prim + + buf_A.ravel()[:] = DM_RgRg_complex[:, k_begin:k_end].ravel()[:] + buf_B.ravel()[:] = aoRg_FFT[:, i*nIP_prim:(i+1)*nIP_prim].ravel()[:] + + lib.dot(buf_A, buf_B.T.conj(), c=buf_C) + lib.dot(buf_B, buf_C, c=buf_D) + + k_begin = i * nao_prim + k_end = (i + 1) * nao_prim + + K_complex_buf[:, k_begin:k_end] = buf_D + + buf_fft = np.ndarray((nao_prim, nao_prim*ncell_complex), dtype=np.complex128, buffer=mydf.jk_buffer, offset=offset) + + K_complex_buf *= (ngrid / vol) + + #print("K_complex_buf = ", K_complex_buf) + + if in_real_space: + + fn2( + K_complex_buf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nao_prim), + ctypes.c_int(nao_prim), + kmesh.ctypes.data_as(ctypes.c_void_p), + buf_fft.ctypes.data_as(ctypes.c_void_p) + ) + + K = pack_JK(K_real_buf, kmesh, nao_prim, output=None) + + else: + + K_complex_buf = K_complex_buf.conj().copy() ### NOTE: convention problem + K = pack_JK_in_FFT_space(K_complex_buf, kmesh, nao_prim, output=None) + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t1, t2, "_contract_k_dm", mydf) + + return K + +def _get_k_kSym_direct(mydf, _dm, use_mpi=False): + + if use_mpi: + assert mydf.direct == True + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce + size = comm.Get_size() + + t1 = (logger.process_clock(), logger.perf_counter()) + t0 = (logger.process_clock(), logger.perf_counter()) + + ############# preprocess ############# + + dm = None + + if (use_mpi and rank == 0) or not use_mpi: + + dm = [] + nset = _dm.shape[0] + + for iset in range(nset): + _dm_tmp, in_real_space = _preprocess_dm(mydf, _dm[iset]) + dm.append(_dm_tmp) + if in_real_space: + if np.prod(mydf.kmesh) == 1: + in_real_space = False + assert not in_real_space + + dm = np.asarray(dm) + + if use_mpi: + dm = bcast(dm, root=0) + + if len(dm.shape) == 3: + assert dm.shape[0] <= 4 + else: + dm = dm.reshape(1, *dm.shape) + + aoR = mydf.aoR + aoRg = mydf.aoRg + + max_nao_involved = mydf.max_nao_involved + max_ngrid_involved = mydf.max_ngrid_involved + max_nIP_involved = mydf.max_nIP_involved + maxsize_group_naux = mydf.maxsize_group_naux + + ####### preparing the data ####### + + nset, nao = dm.shape[0], dm.shape[1] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + mesh_int32 = mesh + ngrid = np.prod(mesh) + + aoRg = mydf.aoRg + assert isinstance(aoRg, list) + aoR = mydf.aoR + assert isinstance(aoR, list) + + naux = mydf.naux + nao = cell.nao + nao_prim = mydf.nao_prim + aux_basis = mydf.aux_basis + kmesh = np.array(mydf.kmesh, dtype=np.int32) + nkpts = np.prod(kmesh) + + grid_ordering = mydf.grid_ID_ordered + + if hasattr(mydf, "coulG") == False: + if mydf.omega is not None: + assert mydf.omega >= 0.0 + # mydf.coulG = tools.get_coulG(cell, mesh=mesh, omega=mydf.omega) + raise NotImplementedError("coulG is not implemented yet.") + + coulG = mydf.coulG + coulG_real = coulG.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1).copy() + + mydf.allocate_k_buffer(nset) + build_k_buf = mydf.build_k_buf + build_VW_buf = 
mydf.build_VW_in_k_buf + + group = mydf.group + assert len(group) == len(aux_basis) + + ######### allocate buffer ######### + + Density_RgAO_buf = mydf.Density_RgAO_buf + + nThread = lib.num_threads() + bufsize_per_thread = (coulG_real.shape[0] * 2 + np.prod(mesh)) + buf_build_V = np.ndarray((nThread, bufsize_per_thread), dtype=np.float64, buffer=build_VW_buf) + + offset_now = buf_build_V.size * buf_build_V.dtype.itemsize + + build_K_bunchsize = min(maxsize_group_naux, mydf._build_K_bunchsize) + + offset_build_now = 0 + offset_Density_RgR_buf = 0 + Density_RgR_buf = np.ndarray((build_K_bunchsize, ngrid), buffer=build_k_buf, offset=offset_build_now) + + offset_build_now += Density_RgR_buf.size * Density_RgR_buf.dtype.itemsize + offset_ddot_res_RgR_buf = offset_build_now + ddot_res_RgR_buf = np.ndarray((build_K_bunchsize, max_ngrid_involved), buffer=build_k_buf, offset=offset_ddot_res_RgR_buf) + + offset_build_now += ddot_res_RgR_buf.size * ddot_res_RgR_buf.dtype.itemsize + offset_K1_tmp1_buf = offset_build_now + K1_tmp1_buf = np.ndarray((maxsize_group_naux, nao), buffer=build_k_buf, offset=offset_K1_tmp1_buf) + + offset_build_now += K1_tmp1_buf.size * K1_tmp1_buf.dtype.itemsize + offset_K1_tmp1_ddot_res_buf = offset_build_now + K1_tmp1_ddot_res_buf = np.ndarray((maxsize_group_naux, nao), buffer=build_k_buf, offset=offset_K1_tmp1_ddot_res_buf) + + offset_build_now += K1_tmp1_ddot_res_buf.size * K1_tmp1_ddot_res_buf.dtype.itemsize + + offset_K1_final_ddot_buf = offset_build_now + K1_final_ddot_buf = np.ndarray((nao, nao), buffer=build_k_buf, offset=offset_K1_final_ddot_buf) + + ########### get involved C function ########### + + fn_packcol1 = getattr(libisdf, "_buildK_packcol", None) + assert fn_packcol1 is not None + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol2 is not None + fn_packadd_col = getattr(libisdf, "_buildK_packaddcol", None) + assert fn_packadd_col is not None + fn_packadd_row = getattr(libisdf, "_buildK_packaddrow", None) + assert fn_packadd_row is not None + + ordered_ao_ind = np.arange(nao) + + ######### begin work ######### + + K1 = np.zeros((nset, nao_prim, nao), dtype=np.float64) # contribution from V matrix + K2 = np.zeros((nset, nao_prim, nao), dtype=np.float64) # contribution from W matrix + + from pyscf.isdf._isdf_local_K_direct import reset_profile_buildK_time, add_cputime_RgAO, add_walltime_RgAO, log_profile_buildK_time + + reset_profile_buildK_time() + + ######## distribution task among different process ######## + + task_info = [] + + nIP_prim = mydf.nIP_Prim + + if use_mpi: + nIP_bunchsize = (nIP_prim + comm_size) // comm_size + bunch_begin = rank * nIP_bunchsize + bunch_end = min(nIP_prim, (rank + 1) * nIP_bunchsize) + + else: + bunch_begin = 0 + bunch_end = nIP_prim + + iIP = 0 + + for group_id, atm_ids in enumerate(group): + + naux_tmp = 0 + for atm_id in atm_ids: + naux_tmp += aoRg[atm_id].aoR.shape[1] + assert naux_tmp == aux_basis[group_id].shape[0] + assert iIP + naux_tmp <= nIP_prim + + ### judge whether [iIP, iIP+naux_tmp) intersects with [bunch_begin, bunch_end) ### + + if iIP >= bunch_end or iIP + naux_tmp <= bunch_begin: + task_info.append((None, None)) + else: + if bunch_begin <= iIP: + group_begin = 0 + else: + group_begin = bunch_begin - iIP + if bunch_end >= iIP + naux_tmp: + group_end = naux_tmp + else: + group_end = bunch_end - iIP + task_info.append((group_begin, group_end)) + + iIP += naux_tmp + + #if use_mpi: + # print("rank = ", rank, "task_info = ", task_info) + + 
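+ # A small worked example of the interval logic above (hypothetical numbers): + # with nIP_prim = 10 and comm_size = 2, nIP_bunchsize = (10 + 2) // 2 = 6, so + # rank 0 owns IP rows [0, 6) and rank 1 owns [6, 10); a group whose IPs span + # [4, 8) then receives task_info = (0, 2) on rank 0 and (2, 4) on rank 1, + # i.e. each rank builds only its own slice of that group's auxiliary rows. + 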
########################################################### + + for group_id, atm_ids in enumerate(group): + + if task_info[group_id][0] is None: + continue + + #if use_mpi: + # if group_id % comm_size != rank: + # continue + + naux_tmp = 0 + aoRg_holders = [] + for atm_id in atm_ids: + naux_tmp += aoRg[atm_id].aoR.shape[1] + aoRg_holders.append(aoRg[atm_id]) + assert naux_tmp == aux_basis[group_id].shape[0] + + aux_basis_tmp = aux_basis[group_id] + + #### 1. build the involved DM_RgR #### + + t1 = (logger.process_clock(), logger.perf_counter()) + + Density_RgAO_tmp = np.ndarray((nset, naux_tmp, nao), buffer=Density_RgAO_buf) + offset_density_RgAO_buf = Density_RgAO_tmp.size * Density_RgAO_buf.dtype.itemsize + Density_RgAO_tmp.ravel()[:] = 0.0 + Density_RgAO_tmp = __get_DensityMatrixonRgAO_qradratic(mydf, dm, aoRg_holders, "all", Density_RgAO_tmp, verbose=mydf.verbose) + + #build_k_buf.ravel()[:] = 0.0 + #build_VW_buf.ravel()[:] = 0.0 + + t2 = (logger.process_clock(), logger.perf_counter()) + + add_cputime_RgAO(t2[0] - t1[0]) + add_walltime_RgAO(t2[1] - t1[1]) + + #### 2. build the V matrix #### + + W_tmp = None + + for iset in range(nset): + + calculate_W_tmp = (iset == 0) + + #build_k_buf.ravel()[:] = 0.0 + #build_VW_buf.ravel()[:] = 0.0 + + _W_tmp = _isdf_get_K_direct_kernel_1( + mydf, coulG_real, + group_id, Density_RgAO_tmp[iset], + None, True, calculate_W_tmp, + ##### buffer ##### + buf_build_V, + build_VW_buf, + offset_now, + Density_RgR_buf, + Density_RgAO_buf, + offset_density_RgAO_buf, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + #maxsize_group_naux, + build_K_bunchsize, + ##### other info ##### + use_mpi=use_mpi, + begin_id=task_info[group_id][0], + end_id =task_info[group_id][1], + ##### out ##### + K1_or_2=K1[iset]) + + if calculate_W_tmp: + W_tmp = _W_tmp.copy() + + #build_k_buf.ravel()[:] = 0.0 + #build_VW_buf.ravel()[:] = 0.0 + + _isdf_get_K_direct_kernel_1( + mydf, coulG_real, + group_id, Density_RgAO_tmp[iset], + W_tmp, False, False, + ##### buffer ##### + buf_build_V, + build_VW_buf, + offset_now, + Density_RgR_buf, + Density_RgAO_buf, + offset_density_RgAO_buf, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + #maxsize_group_naux, + build_K_bunchsize, + ##### other info ##### + use_mpi=use_mpi, + begin_id=task_info[group_id][0], + end_id =task_info[group_id][1], + ##### out ##### + K1_or_2=K2[iset]) + + if (use_mpi and rank == 0) or not use_mpi: + log_profile_buildK_time(mydf) + + ######### finally delete the buffer ######### + + if use_mpi: + comm.Barrier() + + if use_mpi: + K1 = reduce(K1, root = 0) + K2 = reduce(K2, root = 0) + if rank == 0: + # K = K1 + K1.T - K2 + K1_packed = [] + K2_packed = [] + for iset in range(nset): + #K1 = pack_JK(K1, kmesh, nao_prim) + #K2 = pack_JK(K2, kmesh, nao_prim) + K1_packed.append(pack_JK(K1[iset], kmesh, nao_prim)) + K2_packed.append(pack_JK(K2[iset], kmesh, nao_prim)) + K1 = np.array(K1_packed) + K2 = np.array(K2_packed) + K = np.zeros_like(K1) + # K = K1 + K1.T - K2 + for iset in range(nset): + K[iset] = K1[iset] + K1[iset].T - (K2[iset] + K2[iset].T)/2.0 + else: + K = None + K = bcast(K, root = 0) + else: + # K = K1 + K1.T - K2 + K1_packed = [] + K2_packed = [] + for iset in range(nset): + #K1 = pack_JK(K1, kmesh, nao_prim) + #K2 = pack_JK(K2, kmesh, nao_prim) + K1_packed.append(pack_JK(K1[iset], kmesh, nao_prim)) + K2_packed.append(pack_JK(K2[iset], kmesh, nao_prim)) + K1 = np.array(K1_packed) + K2 = 
np.array(K2_packed) + K = np.zeros_like(K1) + # K = K1 + K1.T - K2 + for iset in range(nset): + K[iset] = K1[iset] + K1[iset].T - (K2[iset] + K2[iset].T)/2.0 + + del K1 + del K2 + + ############ transform back to K ############ + + if (use_mpi and rank == 0) or not use_mpi: + + K_res = [] + + for iset in range(nset): + Ktmp = _RowCol_FFT_bench(K[iset, :nao_prim, :], kmesh, inv=True, TransBra=False, TransKet=True) + K_res.append(Ktmp) + + K = np.asarray(K_res) + K *= nkpts + K *= ngrid / vol + + Res = [] + for iset in range(nset): + Res.append([]) + for i in range(np.prod(kmesh)): + for iset in range(nset): + Res[iset].append(K[iset, :, i*nao_prim:(i+1)*nao_prim]) + + K = np.array(Res) + + if use_mpi: + K = bcast(K, root=0) + + t2 = (logger.process_clock(), logger.perf_counter()) + + if (use_mpi and rank == 0) or not use_mpi: + _benchmark_time(t0, t2, "_contract_k_dm_quadratic_direct", mydf) + + return K + +def get_jk_dm_translation_symmetry(mydf, dm, hermi=1, kpt=np.zeros(3), + kpts_band=None, with_j=True, with_k=True, omega=None, + **kwargs): + + '''JK for given k-point''' + + direct = mydf.direct + use_mpi = mydf.use_mpi + + if use_mpi: + raise NotImplementedError("ISDF does not support use_mpi") + + if len(dm.shape) == 3: + assert dm.shape[0] <= 4 + else: + assert dm.ndim == 2 + dm = dm.reshape(1, dm.shape[0], dm.shape[1]) + + if hasattr(mydf, 'kmesh') and mydf.kmesh is not None: + from pyscf.isdf.isdf_tools_densitymatrix import symmetrize_dm + dm = symmetrize_dm(dm, mydf.kmesh) + + if use_mpi: + dm = bcast(dm, root=0) + + nset = dm.shape[0] + + #### perform the calculation #### + + exxdiv = kwargs.get("exxdiv", None) + + vj = np.zeros_like(dm) + vk = np.zeros_like(dm) + + if kpts_band is not None and abs(kpt-kpts_band).sum() > 1e-9: + raise NotImplementedError("ISDF does not support kpts_band != kpt") + + log = logger.Logger(mydf.stdout, mydf.verbose) + t1 = (logger.process_clock(), logger.perf_counter()) + + j_real = gamma_point(kpt) + k_real = gamma_point(kpt) and not np.iscomplexobj(dm) + + assert j_real + assert k_real + + mem_now = lib.current_memory()[0] + max_memory = max(2000, (mydf.max_memory - mem_now)) + + log.debug1('max_memory = %d MB (%d in use)', max_memory, mem_now) + + for iset in range(nset): + if with_j: + vj[iset] = _contract_j_dm_k_ls(mydf, dm[iset], use_mpi) + if with_k: + if mydf.direct: + raise NotImplementedError + else: + if mydf.with_robust_fitting: + vk[iset] = _get_k_kSym_robust_fitting_fast(mydf, dm[iset]) + else: + vk[iset] = _get_k_kSym(mydf, dm[iset]) + if exxdiv == 'ewald': + print("WARNING: ISDF does not support ewald") + + if exxdiv == 'ewald': + if np.allclose(kpt, np.zeros(3)): + # from pyscf.pbc.df.df_jk import _ewald_exxdiv_for_G0, _format_dms, _format_kpts_band, _format_jks + kpts = kpt.reshape(1,3) + kpts = np.asarray(kpts) + dm_kpts = dm.copy() + dm_kpts = lib.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + assert nset <= 4 + kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + nband = len(kpts_band) + assert nband == 1 + if is_zero(kpts_band) and is_zero(kpts): + vk = vk.reshape(nset,nband,nao,nao) + else: + raise NotImplementedError("ISDF does not support kpts_band != 0") + 
_ewald_exxdiv_for_G0(mydf.cell, kpts, dms, vk, kpts_band=kpts_band) + #vk = vk[0,0] + vk = vk.reshape(nset,nao,nao) + else: + logger.warn(mydf, 'get_jk_dm_k_quadratic: Exxdiv for k-point is not supported') + + t1 = log.timer('sr jk', *t1) + + return vj, vk + +def _get_k_kSym_direct_mimic_MPI(mydf, _dm, use_mpi=False): + + if use_mpi: + raise NotImplementedError + assert mydf.direct == True + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, bcast, reduce + size = comm.Get_size() + + t1 = (logger.process_clock(), logger.perf_counter()) + t0 = (logger.process_clock(), logger.perf_counter()) + + ############# preprocess ############# + + dm = [] + nset = _dm.shape[0] + for iset in range(nset): + _dm_tmp, in_real_space = _preprocess_dm(mydf, _dm[iset]) + dm.append(_dm_tmp) + if in_real_space: + if np.prod(mydf.kmesh) == 1: + in_real_space = False + assert not in_real_space + dm = np.asarray(dm) + + if len(dm.shape) == 3: + assert dm.shape[0] <= 4 + else: + dm = dm.reshape(1, *dm.shape) + + aoR = mydf.aoR + aoRg = mydf.aoRg + + max_nao_involved = mydf.max_nao_involved + max_ngrid_involved = mydf.max_ngrid_involved + max_nIP_involved = mydf.max_nIP_involved + maxsize_group_naux = mydf.maxsize_group_naux + + ####### preparing the data ####### + + nset, nao = dm.shape[0], dm.shape[1] + cell = mydf.cell + assert cell.nao == nao + vol = cell.vol + mesh = np.array(cell.mesh, dtype=np.int32) + mesh_int32 = mesh + ngrid = np.prod(mesh) + + aoRg = mydf.aoRg + assert isinstance(aoRg, list) + aoR = mydf.aoR + assert isinstance(aoR, list) + + naux = mydf.naux + nao = cell.nao + nao_prim = mydf.nao_prim + aux_basis = mydf.aux_basis + kmesh = np.array(mydf.kmesh, dtype=np.int32) + nkpts = np.prod(kmesh) + + grid_ordering = mydf.grid_ID_ordered + + if hasattr(mydf, "coulG") == False: + if mydf.omega is not None: + assert mydf.omega >= 0.0 + raise NotImplementedError("coulG is not implemented yet.") + + coulG = mydf.coulG + coulG_real = coulG.reshape(*mesh)[:, :, :mesh[2]//2+1].reshape(-1).copy() + + mydf.allocate_k_buffer(nset) + build_k_buf = mydf.build_k_buf + build_VW_buf = mydf.build_VW_in_k_buf + + group = mydf.group + assert len(group) == len(aux_basis) + + ######### allocate buffer ######### + + Density_RgAO_buf = mydf.Density_RgAO_buf + + nThread = lib.num_threads() + bufsize_per_thread = (coulG_real.shape[0] * 2 + np.prod(mesh)) + # buf_build_V = np.ndarray((nThread, bufsize_per_thread), dtype=np.float64, buffer=build_VW_buf) + buf_build_V = np.ndarray((nThread, bufsize_per_thread), dtype=np.float64) + + offset_now = buf_build_V.size * buf_build_V.dtype.itemsize + + build_K_bunchsize = min(maxsize_group_naux, mydf._build_K_bunchsize) + + offset_build_now = 0 + offset_Density_RgR_buf = 0 + Density_RgR_buf = np.ndarray((build_K_bunchsize, ngrid), buffer=build_k_buf, offset=offset_build_now) + + offset_build_now += Density_RgR_buf.size * Density_RgR_buf.dtype.itemsize + offset_ddot_res_RgR_buf = offset_build_now + ddot_res_RgR_buf = np.ndarray((build_K_bunchsize, max_ngrid_involved), buffer=build_k_buf, offset=offset_ddot_res_RgR_buf) + + offset_build_now += ddot_res_RgR_buf.size * ddot_res_RgR_buf.dtype.itemsize + offset_K1_tmp1_buf = offset_build_now + K1_tmp1_buf = np.ndarray((maxsize_group_naux, nao), buffer=build_k_buf, offset=offset_K1_tmp1_buf) + + offset_build_now += K1_tmp1_buf.size * K1_tmp1_buf.dtype.itemsize + offset_K1_tmp1_ddot_res_buf = offset_build_now + K1_tmp1_ddot_res_buf = np.ndarray((maxsize_group_naux, nao), buffer=build_k_buf, offset=offset_K1_tmp1_ddot_res_buf) + + 
offset_build_now += K1_tmp1_ddot_res_buf.size * K1_tmp1_ddot_res_buf.dtype.itemsize + + offset_K1_final_ddot_buf = offset_build_now + K1_final_ddot_buf = np.ndarray((nao, nao), buffer=build_k_buf, offset=offset_K1_final_ddot_buf) + + ########### get involved C function ########### + + fn_packcol1 = getattr(libisdf, "_buildK_packcol", None) + assert fn_packcol1 is not None + fn_packcol2 = getattr(libisdf, "_buildK_packcol2", None) + assert fn_packcol2 is not None + fn_packadd_col = getattr(libisdf, "_buildK_packaddcol", None) + assert fn_packadd_col is not None + fn_packadd_row = getattr(libisdf, "_buildK_packaddrow", None) + assert fn_packadd_row is not None + + ordered_ao_ind = np.arange(nao) + + ######### begin work ######### + + K1 = np.zeros((nset, nao_prim, nao), dtype=np.float64) # contribution from V matrix + K2 = np.zeros((nset, nao_prim, nao), dtype=np.float64) # contribution from W matrix + + from pyscf.isdf._isdf_local_K_direct import reset_profile_buildK_time, add_cputime_RgAO, add_walltime_RgAO, log_profile_buildK_time + + reset_profile_buildK_time() + + ######## distribution task among different process ######## + + if hasattr(mydf, "fake_comm_size"): + COMM_SIZE = mydf.fake_comm_size + else: + COMM_SIZE = 2 + + print("COMM_SIZE = ", COMM_SIZE) + + for rank in range(COMM_SIZE): + + K1_tmp = np.zeros((nset, nao_prim, nao), dtype=np.float64) + K2_tmp = np.zeros((nset, nao_prim, nao), dtype=np.float64) + + task_info = [] + + nIP_prim = mydf.nIP_Prim + nIP_bunchsize = (nIP_prim + COMM_SIZE) // COMM_SIZE + bunch_begin = rank * nIP_bunchsize + bunch_end = min(nIP_prim, (rank + 1) * nIP_bunchsize) + + iIP = 0 + for group_id, atm_ids in enumerate(group): + + naux_tmp = 0 + for atm_id in atm_ids: + naux_tmp += aoRg[atm_id].aoR.shape[1] + assert naux_tmp == aux_basis[group_id].shape[0] + assert iIP + naux_tmp <= nIP_prim + + ### judge whether [iIP, iIP+naux_tmp) intersects with [bunch_begin, bunch_end) ### + + if iIP >= bunch_end or iIP + naux_tmp <= bunch_begin: + task_info.append((None, None)) + else: + if bunch_begin <= iIP: + group_begin = 0 + else: + group_begin = bunch_begin - iIP + if bunch_end >= iIP + naux_tmp: + group_end = naux_tmp + else: + group_end = bunch_end - iIP + task_info.append((group_begin, group_end)) + + iIP += naux_tmp + + if use_mpi: + print("rank = ", rank, "task_info = ", task_info) + + ########################################################### + + for group_id, atm_ids in enumerate(group): + + if task_info[group_id][0] is None: + continue + + #if use_mpi: + # if group_id % comm_size != rank: + # continue + + naux_tmp = 0 + aoRg_holders = [] + for atm_id in atm_ids: + naux_tmp += aoRg[atm_id].aoR.shape[1] + aoRg_holders.append(aoRg[atm_id]) + assert naux_tmp == aux_basis[group_id].shape[0] + + aux_basis_tmp = aux_basis[group_id] + + #### 1. build the involved DM_RgR #### + + t1 = (logger.process_clock(), logger.perf_counter()) + + Density_RgAO_tmp = np.ndarray((nset, naux_tmp, nao), buffer=Density_RgAO_buf) + offset_density_RgAO_buf = Density_RgAO_tmp.size * Density_RgAO_buf.dtype.itemsize + Density_RgAO_buf.ravel()[:] = 0.0 + # Density_RgAO_tmp.ravel()[:] = 0.0 + Density_RgAO_tmp = __get_DensityMatrixonRgAO_qradratic(mydf, dm, aoRg_holders, "all", Density_RgAO_tmp, verbose=mydf.verbose) + + #build_k_buf.ravel()[:] = 0.0 + #build_VW_buf.ravel()[:] = 0.0 + + t2 = (logger.process_clock(), logger.perf_counter()) + + add_cputime_RgAO(t2[0] - t1[0]) + add_walltime_RgAO(t2[1] - t1[1]) + + #### 2. 
build the V matrix #### + + W_tmp = None + + for iset in range(nset): + + calculate_W_tmp = (iset == 0) + + build_k_buf.ravel()[:] = 0.0 + build_VW_buf.ravel()[:] = 0.0 + + _W_tmp = _isdf_get_K_direct_kernel_1( + mydf, coulG_real, + group_id, Density_RgAO_tmp[iset], + None, True, calculate_W_tmp, + ##### buffer ##### + buf_build_V, + build_VW_buf, + offset_now, + Density_RgR_buf, + Density_RgAO_buf, + offset_density_RgAO_buf, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + #maxsize_group_naux, + build_K_bunchsize, + ##### other info ##### + use_mpi=use_mpi, + begin_id=task_info[group_id][0], + end_id =task_info[group_id][1], + ##### out ##### + K1_or_2=K1_tmp[iset]) + + if calculate_W_tmp: + W_tmp = _W_tmp.copy() + + build_k_buf.ravel()[:] = 0.0 + build_VW_buf.ravel()[:] = 0.0 + + _isdf_get_K_direct_kernel_1( + mydf, coulG_real, + group_id, Density_RgAO_tmp[iset], + W_tmp, False, False, + ##### buffer ##### + buf_build_V, + build_VW_buf, + offset_now, + Density_RgR_buf, + Density_RgAO_buf, + offset_density_RgAO_buf, + ddot_res_RgR_buf, + K1_tmp1_buf, + K1_tmp1_ddot_res_buf, + K1_final_ddot_buf, + ##### bunchsize ##### + #maxsize_group_naux, + build_K_bunchsize, + ##### other info ##### + use_mpi=use_mpi, + begin_id=task_info[group_id][0], + end_id =task_info[group_id][1], + ##### out ##### + K1_or_2=K2_tmp[iset]) + + log_profile_buildK_time(mydf) + + ### reduce ### + + K1 += K1_tmp + K2 += K2_tmp + + ######### finally delete the buffer ######### + + # K = K1 + K1.T - K2 + K1_packed = [] + K2_packed = [] + for iset in range(nset): + K1_packed.append(pack_JK(K1[iset], kmesh, nao_prim)) + K2_packed.append(pack_JK(K2[iset], kmesh, nao_prim)) + K1 = np.array(K1_packed) + K2 = np.array(K2_packed) + K = np.zeros_like(K1) + # K = K1 + K1.T - K2 + for iset in range(nset): + K[iset] = K1[iset] + K1[iset].T - (K2[iset] + K2[iset].T)/2.0 + + del K1 + del K2 + + ############ transform back to K ############ + + K_res = [] + for iset in range(nset): + Ktmp = _RowCol_FFT_bench(K[iset, :nao_prim, :], kmesh, inv=True, TransBra=False, TransKet=True) + K_res.append(Ktmp) + K = np.asarray(K_res) + K *= nkpts + K *= ngrid / vol + Res = [] + for iset in range(nset): + Res.append([]) + for i in range(np.prod(kmesh)): + for iset in range(nset): + Res[iset].append(K[iset, :, i*nao_prim:(i+1)*nao_prim]) + K = np.array(Res) + + t2 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(t0, t2, "_contract_k_dm_quadratic_direct", mydf) + + return K \ No newline at end of file diff --git a/pyscf/isdf/isdf_posthf.py b/pyscf/isdf/isdf_posthf.py new file mode 100644 index 000000000..85fd519a2 --- /dev/null +++ b/pyscf/isdf/isdf_posthf.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +############ sys module ############ + +import numpy +import numpy as np +import ctypes + +############ pyscf module ############ + +import pyscf +from pyscf import lib +from pyscf import ao2mo +from pyscf.ao2mo.incore import iden_coeffs +from pyscf.pbc import tools +from pyscf.pbc.lib import kpts_helper +from pyscf.lib import logger +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, unique +from pyscf import __config__ +from pyscf.pbc.df.fft_ao2mo import _format_kpts, _iskconserv, _contract_compact +import pyscf.pbc.gto as pbcgto +from pyscf.cc.rccsd import _ChemistsERIs, RCCSD +libpbc = lib.load_library('libpbc') + +############ isdf utils ############ + +from pyscf.isdf.isdf_jk import _benchmark_time +from pyscf.isdf import isdf_local as ISDF +from pyscf.isdf.isdf_tools_cell import build_supercell, build_supercell_with_partition +from pyscf.isdf.isdf_ao2mo import LS_THC, LS_THC_eri + +#################################### + +### post-HF with ISDF ERIs (NOT THC-POSTHF!) + +#################################### + +############ subroutines ---- deal with CC ############ + +def _make_isdf_eris_incore(mycc, my_isdf:ISDF.PBC_ISDF_Info_Quad, mo_coeff=None): + + cput0 = (logger.process_clock(), logger.perf_counter()) + eris = _ChemistsERIs() + eris._common_init_(mycc, mo_coeff) + nocc = eris.nocc + nmo = eris.fock.shape[0] + + eri1 = my_isdf.ao2mo(mo_coeff, compact=False).reshape(nmo,nmo,nmo,nmo) + eris.oooo = eri1[:nocc,:nocc,:nocc,:nocc].copy() + eris.ovoo = eri1[:nocc,nocc:,:nocc,:nocc].copy() + eris.ovov = eri1[:nocc,nocc:,:nocc,nocc:].copy() + eris.oovv = eri1[:nocc,:nocc,nocc:,nocc:].copy() + eris.ovvo = eri1[:nocc,nocc:,nocc:,:nocc].copy() + eris.ovvv = eri1[:nocc,nocc:,nocc:,nocc:].copy() + eris.vvvv = eri1[nocc:,nocc:,nocc:,nocc:].copy() + logger.timer(mycc, 'CCSD integral transformation', *cput0) + + cput1 = (logger.process_clock(), logger.perf_counter()) + + _benchmark_time(cput0, cput1, "CCSD integral transformation", my_isdf) + + return eris + +def RCCSD_isdf(mf, frozen=0, mo_coeff=None, mo_occ=None, run=True, cc2=False): + mycc = RCCSD(mf, frozen=frozen, mo_coeff=mo_coeff, mo_occ=mo_occ) + mycc.cc2 = cc2 + if mo_coeff is None: + mo_coeff = mf.mo_coeff + eris_ccsd = _make_isdf_eris_incore(mycc, mf.with_df, mo_coeff=mo_coeff) + if run: + mycc.kernel(eris=eris_ccsd) + return mycc, eris_ccsd + +if __name__ == '__main__': + + for c in [15]: + for N in [1]: + + print("Testing c = ", c, "N = ", N, "...") + + cell = pbcgto.Cell() + boxlen = 3.5668 + cell.a = np.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]]) + + cell.atom = [ + ['C', (0. , 0. , 0. )], + ['C', (0.8917 , 0.8917 , 0.8917)], + ['C', (1.7834 , 1.7834 , 0. )], + ['C', (2.6751 , 2.6751 , 0.8917)], + ['C', (1.7834 , 0. , 1.7834)], + ['C', (2.6751 , 0.8917 , 2.6751)], + ['C', (0. 
, 1.7834 , 1.7834)], + ['C', (0.8917 , 2.6751 , 2.6751)], + ] + + cell.basis = 'gth-szv' + cell.pseudo = 'gth-pade' + cell.verbose = 10 + cell.ke_cutoff = 128 + cell.max_memory = 800 # 800 Mb + cell.precision = 1e-8 # integral precision + cell.use_particle_mesh_ewald = True + + verbose = 10 + + prim_cell = build_supercell(cell.atom, cell.a, Ls = [1,1,1], ke_cutoff=cell.ke_cutoff, basis=cell.basis, pseudo=cell.pseudo, verbose=10) + prim_partition = [[0,1,2,3], [4,5,6,7]] + prim_mesh = prim_cell.mesh + + Ls = [1, 1, N] + Ls = np.array(Ls, dtype=np.int32) + mesh = [Ls[0] * prim_mesh[0], Ls[1] * prim_mesh[1], Ls[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell, group_partition = build_supercell_with_partition( + cell.atom, cell.a, mesh=mesh, + Ls=Ls, + basis=cell.basis, + pseudo=cell.pseudo, + partition=prim_partition, ke_cutoff=cell.ke_cutoff, verbose=verbose) + + ####### bench mark MP2 ####### + + import numpy + from pyscf.pbc import gto, scf, mp + + mf = scf.RHF(cell) + # mf.kernel() + mypt = mp.RMP2(mf) + # mypt.kernel() + + ####### isdf MP2 can perform directly! ####### + + myisdf = ISDF.PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, aoR_cutoff=1e-8, direct=False) + myisdf.verbose = 10 + myisdf.build_IP_local(c=c, m=5, group=group_partition, Ls=[Ls[0]*10, Ls[1]*10, Ls[2]*10]) + myisdf.build_auxiliary_Coulomb(debug=True) + + mf_isdf = scf.RHF(cell) + myisdf.direct_scf = mf_isdf.direct_scf + mf_isdf.with_df = myisdf + mf_isdf.max_cycle = 8 + mf_isdf.conv_tol = 1e-8 + mf_isdf.kernel() + + isdf_pt = mp.RMP2(mf_isdf) + isdf_pt.kernel() + + mf_isdf.with_df.LS_THC_recompression(mf_isdf.with_df.aoRg_full()[0], force_LS_THC=False) + isdf_pt = mp.RMP2(mf_isdf) + isdf_pt.kernel() + + ######################## CCSD ######################## + + ## benchmark ## + + mycc = pyscf.cc.CCSD(mf) + # mycc.kernel() + + mycc_isdf, eris_ccsd = RCCSD_isdf(mf_isdf, run=False, cc2=False) + mycc_isdf.kernel(eris=eris_ccsd) + + eip,cip = mycc_isdf.ipccsd(nroots=2, eris=eris_ccsd) + eea,cea = mycc_isdf.eaccsd(nroots=2, eris=eris_ccsd) + + print("eip = ", eip) + print("eea = ", eea) + + ####### THC-DF ####### + + _myisdf = ISDF.PBC_ISDF_Info_Quad(cell, with_robust_fitting=True, aoR_cutoff=1e-8, direct=False, use_occ_RI_K=False) + _myisdf.build_IP_local(c=15, m=5, group=group_partition, Ls=[Ls[0]*10, Ls[1]*10, Ls[2]*10]) + R,_ = _myisdf.aoRg_full() + Z = LS_THC(myisdf, R) + eri_LS_THC = LS_THC_eri(Z, R) + print("eri_LS_THC = ", eri_LS_THC[0,0,0,0]) + eri_benchmark = myisdf.get_eri(compact=False) + print("eri_benchmark = ", eri_benchmark[0,0,0,0]) + diff = np.linalg.norm(eri_LS_THC - eri_benchmark) + print("diff = ", diff/np.sqrt(eri_benchmark.size)) \ No newline at end of file diff --git a/pyscf/isdf/isdf_tools_cell.py b/pyscf/isdf/isdf_tools_cell.py new file mode 100644 index 000000000..5bd1da04d --- /dev/null +++ b/pyscf/isdf/isdf_tools_cell.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +import sys + +import numpy +import numpy as np +import copy + +from pyscf.pbc.gto import Cell +import pyscf.pbc.gto as pbcgto + + +def build_supercell(prim_atm, + prim_a, + spin=0, + charge=0, + mesh=None, + Ls = [1,1,1], + basis='gth-dzvp', + pseudo='gth-pade', + ke_cutoff=70, + max_memory=2000, + precision=1e-8, + use_particle_mesh_ewald=True, + verbose=4): + + Cell = pbcgto.Cell() + + assert prim_a[0, 1] == 0.0 + assert prim_a[0, 2] == 0.0 + assert prim_a[1, 0] == 0.0 + assert prim_a[1, 2] == 0.0 + assert prim_a[2, 0] == 0.0 + assert prim_a[2, 1] == 0.0 + + Supercell_a = prim_a * np.array(Ls) + Cell.a = Supercell_a + + atm = [] + + for ix in range(Ls[0]): + for iy in range(Ls[1]): + for iz in range(Ls[2]): + shift = [ix * prim_a[0, 0], iy * prim_a[1, 1], iz * prim_a[2, 2]] + for atom in prim_atm: + atm.append([atom[0], (atom[1][0] + shift[0], atom[1][1] + shift[1], atom[1][2] + shift[2])]) + + Cell.atom = atm + Cell.basis = basis + Cell.pseudo = pseudo + Cell.ke_cutoff = ke_cutoff + Cell.max_memory = max_memory + Cell.precision = precision + Cell.use_particle_mesh_ewald = use_particle_mesh_ewald + Cell.verbose = verbose + Cell.unit = 'angstorm' + Cell.spin = spin + Cell.charge = charge + + Cell.build(mesh=mesh) + + return Cell + +def build_primitive_cell(supercell:Cell, kmesh): + + Cell = pbcgto.Cell() + + # assert prim_a[0, 1] == 0.0 + # assert prim_a[0, 2] == 0.0 + # assert prim_a[1, 0] == 0.0 + # assert prim_a[1, 2] == 0.0 + # assert prim_a[2, 0] == 0.0 + # assert prim_a[2, 1] == 0.0 + + prim_a = np.array( [supercell.a[0]/kmesh[0], supercell.a[1]/kmesh[1], supercell.a[2]/kmesh[2]], dtype=np.float64 ) + + #print("supercell.a = ", supercell.a) + #print("prim_a = ", prim_a) + + Cell.a = prim_a + + atm = supercell.atom[:supercell.natm//np.prod(kmesh)] + + Cell.atom = atm + Cell.basis = supercell.basis + Cell.pseudo = supercell.pseudo + Cell.ke_cutoff = supercell.ke_cutoff + Cell.max_memory = supercell.max_memory + Cell.precision = supercell.precision + Cell.use_particle_mesh_ewald = supercell.use_particle_mesh_ewald + Cell.verbose = supercell.verbose + Cell.unit = supercell.unit + + mesh = np.array(supercell.mesh) // np.array(kmesh) + + Cell.build(mesh=mesh) + + return Cell + +def build_supercell_with_partition(prim_atm, + prim_a, + mesh=None, + Ls = [1,1,1], + partition = None, + basis='gth-dzvp', + pseudo='gth-pade', + ke_cutoff=70, + max_memory=2000, + precision=1e-8, + use_particle_mesh_ewald=True, + verbose=4): + + cell = build_supercell(prim_atm, prim_a, mesh=mesh, Ls=Ls, basis=basis, pseudo=pseudo, ke_cutoff=ke_cutoff, max_memory=max_memory, precision=precision, use_particle_mesh_ewald=use_particle_mesh_ewald, verbose=verbose) + + natm_prim = len(prim_atm) + + if partition is None: + partition = [] + for i in range(natm_prim): + partition.append([i]) + + partition_supercell = [] + + for ix in range(Ls[0]): + for iy in range(Ls[1]): + for iz in range(Ls[2]): + cell_id = ix * Ls[1] * Ls[2] + iy * Ls[2] + iz + for sub_partition in partition: + partition_supercell.append([x + cell_id * natm_prim for x in sub_partition]) + + return cell, partition_supercell diff --git a/pyscf/isdf/isdf_tools_densitymatrix.py b/pyscf/isdf/isdf_tools_densitymatrix.py new file mode 100644 index 000000000..a5bfd0cf4 --- /dev/null +++ b/pyscf/isdf/isdf_tools_densitymatrix.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# + +import sys + +import numpy +import numpy as np + +from pyscf.pbc.gto import Cell +import pyscf.pbc.gto as pbcgto + +def symmetrize_dm(dm:np.ndarray, Ls): + ''' + + generate translation symmetrized density matrix (by average) + + Args : + dm : np.ndarray, density matrix, shape = (nao, nao) + Ls : list, supercell dimension, shape = (3,), or kmesh in k-sampling + + Returns : + dm_symm : np.ndarray, symmetrized density matrix, shape = (nao, nao) + ''' + + is_single_dm = False + + if dm.ndim == 2: + is_single_dm = True + dm = dm.reshape(1, dm.shape[0], dm.shape[1]) + + ncell = np.prod(Ls) + nao = dm.shape[1] + nset = dm.shape[0] + nao_prim = nao // ncell + dm_symm = np.zeros((nset,nao,nao), dtype=dm.dtype) + + for i in range(Ls[0]): + for j in range(Ls[1]): + for k in range(Ls[2]): + + dm_symmized_buf = np.zeros((nset,nao_prim,nao_prim), dtype=dm.dtype) + + for i_row in range(Ls[0]): + for j_row in range(Ls[1]): + for k_row in range(Ls[2]): + + loc_row = i_row * Ls[1] * Ls[2] + j_row * Ls[2] + k_row + loc_col = ((i + i_row) % Ls[0]) * Ls[1] * Ls[2] + ((j + j_row) % Ls[1]) * Ls[2] + (k + k_row) % Ls[2] + + b_begin = loc_row * nao_prim + b_end = (loc_row + 1) * nao_prim + + k_begin = loc_col * nao_prim + k_end = (loc_col + 1) * nao_prim + + dm_symmized_buf += dm[:,b_begin:b_end, k_begin:k_end] + + dm_symmized_buf /= ncell + + for i_row in range(Ls[0]): + for j_row in range(Ls[1]): + for k_row in range(Ls[2]): + + loc_row = i_row * Ls[1] * Ls[2] + j_row * Ls[2] + k_row + loc_col = ((i + i_row) % Ls[0]) * Ls[1] * Ls[2] + ((j + j_row) % Ls[1]) * Ls[2] + (k + k_row) % Ls[2] + + b_begin = loc_row * nao_prim + b_end = (loc_row + 1) * nao_prim + + k_begin = loc_col * nao_prim + k_end = (loc_col + 1) * nao_prim + + dm_symm[:,b_begin:b_end, k_begin:k_end] = dm_symmized_buf + + if is_single_dm: + return dm_symm[0] + else: + return dm_symm + +def pack_JK(input_mat:np.ndarray, Ls, nao_prim, output=None): + + ''' + pack matrix in real space + ''' + + assert input_mat.dtype == np.float64 + ncell = np.prod(Ls) + # print("ncell = ", ncell) + # print("Ls = ", Ls) + # print("nao_prim = ", nao_prim) + # print("input_mat.shape = ", input_mat.shape) + assert input_mat.shape[0] == nao_prim + assert input_mat.shape[1] == nao_prim * ncell + + if output is None: + output = np.zeros((ncell*nao_prim, ncell*nao_prim), dtype=np.float64) + else: + assert output.shape == (ncell*nao_prim, ncell*nao_prim) + + for ix_row in range(Ls[0]): + for iy_row in range(Ls[1]): + for iz_row in range(Ls[2]): + + loc_row = ix_row * Ls[1] * Ls[2] + iy_row * Ls[2] + iz_row + + b_begin = loc_row * nao_prim + b_end = (loc_row + 1) * nao_prim + + for ix_col in range(Ls[0]): + for iy_col in range(Ls[1]): + for iz_col in range(Ls[2]): + + loc_col = ix_col * Ls[1] * Ls[2] + iy_col * Ls[2] + iz_col + + k_begin = loc_col * nao_prim + k_end = (loc_col + 1) * nao_prim + + ix = (ix_col - ix_row) % Ls[0] + iy = (iy_col - iy_row) % Ls[1] + iz = (iz_col - iz_row) % Ls[2] + + loc_col2 
= ix * Ls[1] * Ls[2] + iy * Ls[2] + iz + + k_begin2 = loc_col2 * nao_prim + k_end2 = (loc_col2 + 1) * nao_prim + + output[b_begin:b_end, k_begin:k_end] = input_mat[:, k_begin2:k_end2] + + return output + +def pack_JK_in_FFT_space(input_mat:np.ndarray, kmesh, nao_prim, output=None): + + ''' + pack matrix in k-space + ''' + + ncomplex = kmesh[0] * kmesh[1] * (kmesh[2] // 2 + 1) + assert input_mat.dtype == np.complex128 + assert input_mat.shape[0] == nao_prim + #print("input_mat.shape = ", input_mat.shape) + #print("nao_prim = ", nao_prim) + #print("ncomplex = ", ncomplex) + assert input_mat.shape[1] == nao_prim * ncomplex + + nkpts = np.prod(kmesh) + + if output is None: + output = np.zeros((nao_prim, nao_prim*nkpts), dtype=np.complex128) + else: + assert output.shape == (nao_prim, nao_prim*nkpts) or output.shape == (nkpts, nao_prim, nao_prim) + + output = output.reshape(nkpts, nao_prim, nao_prim) + + loc = 0 + + for ix in range(kmesh[0]): + for iy in range(kmesh[1]): + for iz in range(kmesh[2] // 2 + 1): + loc1 = ix * kmesh[1] * kmesh[2] + iy * kmesh[2] + iz + #loc2 = ix * kmesh[1] * kmesh[2] + iy * kmesh[2] + (kmesh[2] - iz) % kmesh[2] + loc2 = (kmesh[0] - ix) % kmesh[0] * kmesh[1] * kmesh[2] + (kmesh[1] - iy) % kmesh[1] * kmesh[2] + (kmesh[2] - iz) % kmesh[2] + if loc1 == loc2: + output[loc1] = input_mat[:, loc*nao_prim:(loc+1)*nao_prim] + imag_part = np.imag(output[loc1]) + if np.max(np.abs(imag_part)) > 1e-8: + print("Warning: max abs of imag_part = ", np.max(np.abs(imag_part))) + else: + output[loc1] = input_mat[:, loc*nao_prim:(loc+1)*nao_prim] + output[loc2] = input_mat[:, loc*nao_prim:(loc+1)*nao_prim].conj() + loc += 1 + + return output + diff --git a/pyscf/isdf/isdf_tools_kSampling.py b/pyscf/isdf/isdf_tools_kSampling.py new file mode 100644 index 000000000..26901c96b --- /dev/null +++ b/pyscf/isdf/isdf_tools_kSampling.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +import numpy as np +from pyscf import lib +from pyscf.pbc.lib.kpts import KPoints +from pyscf.gto.mole import * + +def _extract_grid_primitive_cell(cell_a, mesh, Ls, coords): + """ + Extract the primitive-cell grid information from the supercell grid information. + """ + + assert cell_a[0, 1] == 0.0 + assert cell_a[0, 2] == 0.0 + assert cell_a[1, 0] == 0.0 + assert cell_a[1, 2] == 0.0 + assert cell_a[2, 0] == 0.0 + assert cell_a[2, 1] == 0.0 + + ngrids = np.prod(mesh) + assert ngrids == coords.shape[0] + + Lx = Ls[0] + Ly = Ls[1] + Lz = Ls[2] + + nx, ny, nz = mesh + + coords = coords.reshape(nx, ny, nz, 3) + + assert nx % Lx == 0 + assert ny % Ly == 0 + assert nz % Lz == 0 + + nx_prim = nx // Lx + ny_prim = ny // Ly + nz_prim = nz // Lz + + ngrids_prim = nx_prim * ny_prim * nz_prim + + res_dict = {} + + res = [] + + prim_grid = coords[:nx_prim, :ny_prim, :nz_prim].reshape(-1, 3) + + for ix in range(Lx): + for iy in range(Ly): + for iz in range(Lz): + x_0 = ix * nx_prim + x_1 = (ix + 1) * nx_prim + y_0 = iy * ny_prim + y_1 = (iy + 1) * ny_prim + z_0 = iz * nz_prim + z_1 = (iz + 1) * nz_prim + + grid_tmp = coords[x_0:x_1, y_0:y_1, z_0:z_1].reshape(-1, 3) + + shift_bench = np.zeros((3), dtype=np.float64) + shift_bench[0] = ix * cell_a[0, 0] / Lx + shift_bench[1] = iy * cell_a[1, 1] / Ly + shift_bench[2] = iz * cell_a[2, 2] / Lz + + shifts = grid_tmp - prim_grid + + # sanity check: every point of this box is the primitive grid shifted by + # shift_bench, possibly up to whole lattice vectors; do not shift grid_tmp + # itself, to avoid numerical error + for ID in range(shifts.shape[0]): + shift = shifts[ID] + if not np.allclose(shift, shift_bench): + tmp = shift - shift_bench + mx = round(tmp[0] / cell_a[0, 0]) + my = round(tmp[1] / cell_a[1, 1]) + mz = round(tmp[2] / cell_a[2, 2]) + assert np.allclose(tmp[0], mx * cell_a[0, 0]) + assert np.allclose(tmp[1], my * cell_a[1, 1]) + assert np.allclose(tmp[2], mz * cell_a[2, 2]) + + res.append(grid_tmp) + res_dict[(ix, iy, iz)] = grid_tmp + res = np.array(res).reshape(-1, 3) + return res, res_dict
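+ # Illustrative sketch (assumed shapes only) of the extraction above: for a + # supercell mesh (nx, ny, nz) tiled by Ls = [Lx, Ly, Lz] primitive cells, the + # grid block of box (ix, iy, iz) is simply the sub-cube + # coords.reshape(nx, ny, nz, 3)[ix*nx_prim:(ix+1)*nx_prim, + # iy*ny_prim:(iy+1)*ny_prim, + # iz*nz_prim:(iz+1)*nz_prim].reshape(-1, 3), + # i.e. the primitive grid shifted by (ix*a_x/Lx, iy*a_y/Ly, iz*a_z/Lz). + 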
+def _split_partition(Voroini_partition, mesh, Ls): + """ + Split a Voronoi partition of the supercell grid into primitive-cell blocks. + """ + ngrids = np.prod(mesh) + assert ngrids == Voroini_partition.size + + Lx = Ls[0] + Ly = Ls[1] + Lz = Ls[2] + + nx, ny, nz = mesh + + Voroini_partition_reshaped = Voroini_partition.reshape(nx, ny, nz) + + assert nx % Lx == 0 + assert ny % Ly == 0 + assert nz % Lz == 0 + + nx_prim = nx // Lx + ny_prim = ny // Ly + nz_prim = nz // Lz + + ngrids_prim = nx_prim * ny_prim * nz_prim + + res_dict = {} + + for ix in range(Lx): + for iy in range(Ly): + for iz in range(Lz): + x_0 = ix * nx_prim + x_1 = (ix + 1) * nx_prim + y_0 = iy * ny_prim + y_1 = (iy + 1) * ny_prim + z_0 = iz * nz_prim + z_1 = (iz + 1) * nz_prim + + grid_tmp = Voroini_partition_reshaped[x_0:x_1, y_0:y_1, z_0:z_1].reshape(-1) + res_dict[(ix, iy, iz)] = grid_tmp + + return res_dict + +def _RowCol_FFT_bench(input, Ls, inv=False, TransBra = True, TransKet = True): + """ + Block FFT of a 2D array whose rows (bra) and/or columns (ket) are grouped + into ncell = prod(Ls) translation blocks. + """ + + A = input + ncell = np.prod(Ls) + + if TransKet: + assert A.shape[1] % ncell == 0 + if TransBra: + assert A.shape[0] % ncell == 0 + + NPOINT_KET = A.shape[1] // ncell + + if TransKet: + A = A.reshape(A.shape[0], -1, NPOINT_KET) # nbra, nBox, NPOINT + A = A.transpose(0, 2, 1) # nbra, NPOINT, nBox + shape_tmp = A.shape + A = A.reshape(A.shape[0] * NPOINT_KET, *Ls) + # perform 3d fft + if inv: + A = np.fft.ifftn(A, axes=(1, 2, 3)) + else: + A = np.fft.fftn(A, axes=(1, 2, 3)) + A = A.reshape(shape_tmp) + A = A.transpose(0, 2, 1) + A = A.reshape(A.shape[0], -1) + print("finish transform ket") + # transform bra + NPOINT_BRA = A.shape[0] // ncell + if TransBra: + A = A.reshape(-1, NPOINT_BRA, A.shape[1]) + A = A.transpose(1, 2, 0) + shape_tmp = A.shape + A = A.reshape(-1, *Ls) + if inv: + A = np.fft.fftn(A, axes=(1, 2, 3)) + else: + A = np.fft.ifftn(A, axes=(1, 2, 3)) + A = A.reshape(shape_tmp) + A = A.transpose(2, 0, 1) + A = A.reshape(-1, A.shape[2]) + print("finish transform bra") + return A + +def _RowCol_FFT_ColFull_bench(input, Ls, mesh): + """ + Same as _RowCol_FFT_bench, but the columns run over the full supercell grid. + """ + A = input + ncell = np.prod(Ls) + nGrids = np.prod(mesh) + assert A.shape[1] == nGrids + assert A.shape[0] % ncell == 0 + A = A.reshape(A.shape[0], *mesh) + # perform 3d fft + A = np.fft.fftn(A, axes=(1, 2, 3)) + A = A.reshape(A.shape[0], -1) + print("finish transform ket") + # transform bra + NPOINT_BRA = A.shape[0] // ncell + A = A.reshape(-1, NPOINT_BRA, A.shape[1]) + A = A.transpose(1, 2, 0) + shape_tmp = A.shape + A = A.reshape(-1, *Ls) + A = np.fft.ifftn(A, axes=(1, 2, 3)) + A = A.reshape(shape_tmp) + A = A.transpose(2, 0, 1) + A = A.reshape(-1, A.shape[2]) + print("finish transform bra") + return A + +def _kmesh_to_Kpoints(cell, mesh): + + from pyscf.pbc.lib.kpts import KPoints + + kpts = [] + + for i in range(mesh[0]): + for j in range(mesh[1]): + for k in range(mesh[2]): + kpts.append([1.0/float(mesh[0]) * float(i), + 1.0/float(mesh[1]) * float(j), + 1.0/float(mesh[2]) * float(k)]) + + kpts = np.array(kpts) + 
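+ # Usage sketch (hypothetical 2x2x2 mesh): _kmesh_to_Kpoints(cell, [2, 2, 2]) + # collects the eight scaled points (i/2, j/2, k/2) for i, j, k in {0, 1}; note + # that pyscf's KPoints takes absolute k-points, so scaled coordinates would + # normally be converted via cell.get_abs_kpts first. + 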
+ return KPoints(cell, kpts) \ No newline at end of file diff --git a/pyscf/isdf/isdf_tools_linearop.py b/pyscf/isdf/isdf_tools_linearop.py new file mode 100644 index 000000000..c09a95aa9 --- /dev/null +++ b/pyscf/isdf/isdf_tools_linearop.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Author: Ning Zhang +# + +############ sys module ############ + +import copy +import numpy as np +import numpy +import scipy +import ctypes, sys +from pyscf import lib +libisdf = lib.load_library('libisdf') + +def square_inPlace(a): + + assert(a.dtype == numpy.double) + fn = getattr(libisdf, "NPdsquare_inPlace", None) + assert(fn is not None) + + fn(a.ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(a.size)) + + return a + +def d_i_ij_ij(a, b, out=None): + assert(a.dtype == b.dtype) + assert(a.shape[0] == b.shape[0]) + assert(a.ndim == 1) + assert(b.ndim == 2) + + if a.dtype != numpy.double: + raise NotImplementedError + else: + fn = getattr(libisdf, "NPd_i_ij_ij", None) + assert(fn is not None) + + if out is None: + out = numpy.empty_like(b) # the result of i,ij->ij has the shape of b + + fn(out.ctypes.data_as(ctypes.c_void_p), + a.ctypes.data_as(ctypes.c_void_p), + b.ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(b.shape[0]), + ctypes.c_size_t(b.shape[1])) + + return out + +def d_ij_j_ij(a, b, out=None): + assert(a.dtype == b.dtype) + assert(a.shape[1] == b.shape[0]) + assert(a.ndim == 2) + assert(b.ndim == 1) + + if a.dtype != numpy.double: + raise NotImplementedError + else: + fn = getattr(libisdf, "NPd_ij_j_ij", None) + assert(fn is not None) + + if out is None: + out = numpy.empty_like(a) + + fn(out.ctypes.data_as(ctypes.c_void_p), + a.ctypes.data_as(ctypes.c_void_p), + b.ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(a.shape[0]), + ctypes.c_size_t(a.shape[1])) + + return out + +def cwise_mul(a, b, out=None): + assert(a.size == b.size) + assert(a.dtype == b.dtype) + + if a.dtype != numpy.double: + raise NotImplementedError + else: + fn = getattr(libisdf, "NPdcwisemul", None) + assert(fn is not None) + + if out is None: + out = numpy.empty_like(a) + + fn(out.ctypes.data_as(ctypes.c_void_p), + a.ctypes.data_as(ctypes.c_void_p), + b.ctypes.data_as(ctypes.c_void_p), + ctypes.c_size_t(a.size)) + + return out \ No newline at end of file diff --git a/pyscf/isdf/isdf_tools_local.py b/pyscf/isdf/isdf_tools_local.py new file mode 100644 index 000000000..30ed5ecc0 --- /dev/null +++ b/pyscf/isdf/isdf_tools_local.py @@ -0,0 +1,1139 @@ +#!/usr/bin/env python +# Copyright 2014-2020 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Author: Ning Zhang +# + +########## pyscf module ########## + +import copy +from functools import reduce +import numpy as np +import pyscf +from pyscf import lib +import pyscf.pbc.gto as pbcgto +from pyscf.pbc.gto import Cell +from pyscf.pbc import tools +from pyscf.pbc.lib.kpts import KPoints +from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, member +from pyscf.gto.mole import * +import pyscf.pbc.df.ft_ao as ft_ao +from pyscf.pbc.df import aft, rsdf_builder, aft_jk + +########## isdf module ########## + +from pyscf.isdf.isdf_jk import _benchmark_time +import pyscf.isdf.isdf_ao2mo as isdf_ao2mo +import pyscf.isdf.isdf_jk as isdf_jk +from pyscf.isdf.isdf_eval_gto import ISDF_eval_gto + +########## sys module ########## + +import ctypes, sys +from multiprocessing import Pool +libisdf = lib.load_library('libisdf') + +########## global parameter ########## + +DISTANCE_CUTOFF = 16 # suitable for cuprates ! + +############ build atm connection graph ############ + +class AtmConnectionInfo: + def __init__(self, cell:Cell, atmID, distance_matrix, precision, rcut, rcut_max, atm_to_bas): + ''' + rcut: the cutoff radius of each bas + ''' + + self.precision = precision + self.atmID = atmID + self.atmID_connection = np.where(distance_matrix[atmID] < rcut_max)[0] + self.distance = distance_matrix[atmID][self.atmID_connection] + self.atm_connected_info = list(zip(self.atmID_connection, self.distance)) + # sort by distance + self.atm_connected_info.sort(key=lambda x: x[1]) + self.bas_range = np.arange(atm_to_bas[atmID][0], atm_to_bas[atmID][1]) + self.bas_cut = rcut[atm_to_bas[atmID][0]:atm_to_bas[atmID][1]] + + def __repr__(self): + return "atmID = %d, atm_connected_info = %s, bas_range = %s, bas_cut = %s" % (self.atmID, self.atm_connected_info, self.bas_range, self.bas_cut) + +class aoR_Holder: + def __init__(self, aoR, ao_involved, local_gridID_begin, local_gridID_end, global_gridID_begin, global_gridID_end): + ''' + currently local_gridID_begin, local_gridID_end is not useful + ''' + + assert aoR.shape[0] == len(ao_involved) + assert (local_gridID_end - local_gridID_begin) == (global_gridID_end - global_gridID_begin) + assert aoR.shape[1] <= (global_gridID_end - global_gridID_begin) + # assert aoR.shape[1] == local_gridID_end - local_gridID_begin + # assert aoR.shape[1] == global_gridID_end - global_gridID_begin + # if aoR.shape[1] != (global_gridID_end - global_gridID_begin): + self.ngrid_tot = global_gridID_end - global_gridID_begin + self.ngrid_kept = aoR.shape[1] + + self.aoR = aoR + self.ao_involved = np.array(ao_involved, dtype=np.int32) + self.nao_involved = len(ao_involved) + self.local_gridID_begin = local_gridID_begin + self.local_gridID_end = local_gridID_end + self.global_gridID_begin = global_gridID_begin + self.global_gridID_end = global_gridID_end + self.nCompact = self.nao_involved ## by default all orbitals are compact + + ## build ao_involved segment ## + + self.ao_involved_sorted = np.sort(self.ao_involved) + self.aoR = self.aoR[np.argsort(self.ao_involved)] + self.ao_involved = self.ao_involved_sorted + + # diff = np.diff(self.ao_involved) + # segment_indices = np.where(diff > 1)[0] + 1 + # segments = np.split(self.ao_involved, segment_indices) + # self.segments = [] + # if len(segments) == 1 and len(segments[0]) == 0: + # self.segments.append(0) + # else: + # loc_begin = 0 + # for segment in segments: + # self.segments.append(loc_begin) + # self.segments.append(segment[0]) + # self.segments.append(segment[-1]+1) + # loc_begin += len(segment) + # 
self.segments.append(loc_begin) + # self.segments = np.array(self.segments, dtype=np.int32) + # segments = None + + def RangeSeparation(self, IsCompact:np.ndarray): + ordering_C = [] + ordering_D = [] + nao_involved = len(self.ao_involved) + for i in range(nao_involved): + if IsCompact[self.ao_involved[i]]: + ordering_C.append(i) + else: + ordering_D.append(i) + self.nCompact = len(ordering_C) + ordering = ordering_C + ordering.extend(ordering_D) + ordering = np.array(ordering, dtype=np.int32) + self.aoR = self.aoR[ordering].copy() + self.ao_involved = self.ao_involved[ordering].copy() + # print("ordering = ", ordering) + # print("nCompact = ", self.nCompact) + for i in range(self.nCompact): + assert IsCompact[self.ao_involved[i]] + + def size(self): + return self.aoR.nbytes + self.ao_involved.nbytes + # + self.segments.nbytes + + def todense(self, nao): + aoR = np.zeros((nao, self.aoR.shape[1])) + aoR[self.ao_involved] = self.aoR + return aoR + +def _get_aoR_holders_memory(aoR_holders:list[aoR_Holder]): + + return sum([_aoR_holder.size() for _aoR_holder in aoR_holders if _aoR_holder is not None]) + +def flatten_aoR_holder(aoR_holders:list[aoR_Holder]): + res_int = [] + res_float = [] + for _aoR_holder in aoR_holders: + res_int.extend(_aoR_holder.ao_involved) + res_int.extend([_aoR_holder.local_gridID_begin, _aoR_holder.local_gridID_end, _aoR_holder.global_gridID_begin, _aoR_holder.global_gridID_end]) + res_float.extend(_aoR_holder.aoR.ravel()) + res_int = np.array(res_int, dtype=np.int32) + res_float = np.array(res_float, dtype=np.float64) + return res_int, res_float + +def _pack_aoR_holder(aoR_holders:list[aoR_Holder], nao): + + has_involved = [False] * nao + + nGrid = 0 + for _aoR_holder in aoR_holders: + if _aoR_holder is None: + continue + for i in _aoR_holder.ao_involved: + has_involved[i] = True + # nGrid += _aoR_holder.aoR.shape[1] + nGrid += _aoR_holder.ngrid_tot + + ao2loc = [-1] * nao + loc_now = 0 + for ao_id, involved in enumerate(has_involved): + if involved: + ao2loc[ao_id] = loc_now + loc_now += 1 + nao_involved = loc_now + + aoR_packed = np.zeros((nao_involved, nGrid)) + + fn_pack = getattr(libisdf, "_Pack_Matrix_SparseRow_DenseCol", None) + assert fn_pack is not None + + + grid_begin_id = 0 + for _aoR_holder in aoR_holders: + if _aoR_holder is None: + continue + loc_packed = np.zeros((_aoR_holder.aoR.shape[0]), dtype=np.int32) + # grid_end_id = grid_begin_id + _aoR_holder.aoR.shape[1] + grid_end_id = grid_begin_id + _aoR_holder.ngrid_tot + for loc, ao_id in enumerate(_aoR_holder.ao_involved): + loc_packed[loc] = ao2loc[ao_id] + # aoR_packed[loc_packed, grid_begin_id:grid_end_id] = _aoR_holder.aoR + fn_pack( + aoR_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(aoR_packed.shape[0]), + ctypes.c_int(aoR_packed.shape[1]), + _aoR_holder.aoR.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(_aoR_holder.aoR.shape[0]), + ctypes.c_int(_aoR_holder.aoR.shape[1]), + loc_packed.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(grid_begin_id), + ctypes.c_int(grid_end_id) + ) + grid_begin_id = grid_end_id + ao_packed_invovled = np.array([i for i in range(nao) if has_involved[i]], dtype=np.int32) + + assert nGrid == grid_begin_id + local_gridID_begin = 0 + local_gridID_end = nGrid + global_gridID_begin = 0 + global_gridID_end = nGrid + + return aoR_Holder(aoR_packed, ao_packed_invovled, local_gridID_begin, local_gridID_end, global_gridID_begin, global_gridID_end) + +# get the rcut # + +def _atm_to_bas(cell:Cell): + + shl_atm = [] + + natm = cell.natm + + for i in range(natm): + 
shl_atm.append([None, None]) + + for i in range(cell.nbas): + atm_id = cell.bas_atom(i) + if shl_atm[atm_id][0] is None: + shl_atm[atm_id][0] = i + shl_atm[atm_id][1] = i+1 + + return shl_atm + +def _estimate_rcut(cell, ngrids, precision): + + ''' + Cutoff radius, above which each shell decays to a value less than the + required precision + ''' + + weight = numpy.sqrt(cell.vol/ngrids) # note the weight ! + log_prec = numpy.log(precision/weight) + rcut = [] + for ib in range(cell.nbas): + l = cell.bas_angular(ib) + es = cell.bas_exp(ib) + cs = abs(cell.bas_ctr_coeff(ib)).max(axis=1) + r = 5. + r = (((l+2)*numpy.log(r)+numpy.log(cs) - log_prec) / es)**.5 + r[r < 1.] = 1. + r = (((l+2)*numpy.log(r)+numpy.log(cs) - log_prec) / es)**.5 + rcut.append(r.max()) + return numpy.array(rcut) + +# the distance graph # + +def _distance_translation(pa:np.ndarray, pb:np.ndarray, a): + ''' + calculate the distance between pa and pb, taking the periodic boundary condition into account (minimum image, assuming an orthogonal lattice) + ''' + + dx = pa[0] - pb[0] + dx1 = dx - a[0][0] + dx2 = dx + a[0][0] + dx = abs(dx) + dx1 = abs(dx1) + dx2 = abs(dx2) + dx = min(dx, dx1, dx2) + + dy = pa[1] - pb[1] + dy1 = dy - a[1][1] + dy2 = dy + a[1][1] + dy = abs(dy) + dy1 = abs(dy1) + dy2 = abs(dy2) + dy = min(dy, dy1, dy2) + + dz = pa[2] - pb[2] + dz1 = dz - a[2][2] + dz2 = dz + a[2][2] + dz = abs(dz) + dz1 = abs(dz1) + dz2 = abs(dz2) + dz = min(dz, dz1, dz2) + + return np.sqrt(dx**2 + dy**2 + dz**2) + +def get_cell_distance_matrix(cell:Cell): + ''' + get the distance matrix of the cell + ''' + a = cell.lattice_vectors() + n = cell.natm + distance_matrix = np.zeros((n, n)) + for i in range(n): + for j in range(i+1, n): + distance_matrix[i][j] = _distance_translation(cell.atom_coord(i), cell.atom_coord(j), a) + distance_matrix[j][i] = distance_matrix[i][j] + return distance_matrix + +############ algorithm based on the distance graph and AtmConnectionInfo ############ + +def get_partition(cell:Cell, coords, AtmConnectionInfoList:list[AtmConnectionInfo], + Ls=[3,3,3], + with_translation_symmetry=False, + kmesh=None, + use_mpi=False): # by default split the cell into a 3x3x3 grid of boxes + + ''' get the partition of grid points; each group of grid points is associated with one atom. 
+ ''' + + ##### this step is super fast ##### + + ##### we simply perform it on root and broadcast it to all other processes ##### + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, allgather, bcast, reduce, gather, alltoall, _comm_bunch, allgather_list, bcast_pickel + + if with_translation_symmetry and kmesh is None: + raise ValueError("kmesh must be provided if with_translation_symmetry is True") + + log = lib.logger.Logger(cell.stdout, cell.verbose) + + if use_mpi == False or (use_mpi and rank == 0): + #print("************* get_partition *************") + log.debug4("************* get_partition *************") + + ##### construct the box info ##### + + mesh = cell.mesh + lattice_vector = cell.lattice_vectors() + lattice_vector = np.array(lattice_vector) + + meshPrim = None + if with_translation_symmetry: + meshPrim = np.array(mesh) // np.array(kmesh) + + mesh_box = np.array([0,0,0]) + nbox = np.array([0,0,0]) + if mesh[0] % Ls[0] != 0: + mesh_box[0] = mesh[0] // Ls[0] + 1 + nbox[0] = mesh[0] // mesh_box[0] + 1 + else: + mesh_box[0] = mesh[0] // Ls[0] + nbox[0] = mesh[0] // mesh_box[0] + if mesh[1] % Ls[1] != 0: + mesh_box[1] = mesh[1] // Ls[1] + 1 + nbox[1] = mesh[1] // mesh_box[1] + 1 + else: + mesh_box[1] = mesh[1] // Ls[1] + nbox[1] = mesh[1] // mesh_box[1] + if mesh[2] % Ls[2] != 0: + mesh_box[2] = mesh[2] // Ls[2] + 1 + nbox[2] = mesh[2] // mesh_box[2] + 1 + else: + mesh_box[2] = mesh[2] // Ls[2] + nbox[2] = mesh[2] // mesh_box[2] + + Ls_box = [lattice_vector[0] / mesh[0] * mesh_box[0], lattice_vector[1] / mesh[1] * mesh_box[1], lattice_vector[2] / mesh[2] * mesh_box[2]] + + # print("Ls = ", Ls) + # print("mesh = ", mesh) + # print("mesh_box = ", mesh_box) + # print("Ls_box = ", Ls_box) + + assert Ls_box[0][0] < 3.0 + assert Ls_box[1][1] < 3.0 + assert Ls_box[2][2] < 3.0 # the box cannot be too large + + ##### helper functions ##### + + def get_box_id(x, y, z): + ix = int(x // Ls_box[0][0]) + iy = int(y // Ls_box[1][1]) + iz = int(z // Ls_box[2][2]) + return (ix, iy, iz) + + def get_box_id_from_coord(coord): + return get_box_id(coord[0], coord[1], coord[2]) + + def get_mesh_id(ix, iy, iz): + return ix * mesh[1] * mesh[2] + iy * mesh[2] + iz + + ##### build info between atm and box id ##### + + atm_box_id = [] + box_2_atm = {} + + atm_coords = [] + + for i in range(cell.natm): + box_id = get_box_id_from_coord(cell.atom_coord(i)) + atm_box_id.append(box_id) + if box_id not in box_2_atm: + box_2_atm[box_id] = [i] + else: + box_2_atm[box_id].append(i) + atm_coords.append(cell.atom_coord(i)) + + atm_coords = np.array(atm_coords) + distance = np.zeros((cell.natm,), dtype=np.float64) + + fn_calculate_distance = getattr(libisdf, "distance_between_point_atms", None) + assert fn_calculate_distance is not None + + fn_calculate_distance2 = getattr(libisdf, "distance_between_points_atms", None) + assert fn_calculate_distance2 is not None + + ######## a rough partition of the cell based on distance only ######## + + natm_tmp = cell.natm + if with_translation_symmetry: + natm_tmp = cell.natm // np.prod(kmesh) + partition_rough = [] + for i in range(natm_tmp): + partition_rough.append([]) + + grid_id_global = np.arange(mesh[0] * mesh[1] * mesh[2], dtype=np.int32).reshape(mesh[0], mesh[1], mesh[2]) + + for ix in range(nbox[0]): + for iy in range(nbox[1]): + for iz in range(nbox[2]): + + if use_mpi and rank != 0: + continue + + box_id = (ix, iy, iz) + + #### construct the grid ID #### + + mesh_x_begin = min(ix * mesh_box[0], mesh[0]) + mesh_x_end = min((ix+1) * 
mesh_box[0], mesh[0]) + + if mesh_x_begin == mesh_x_end: + continue + + mesh_y_begin = min(iy * mesh_box[1], mesh[1]) + mesh_y_end = min((iy+1) * mesh_box[1], mesh[1]) + + if mesh_y_begin == mesh_y_end: + continue + + mesh_z_begin = min(iz * mesh_box[2], mesh[2]) + mesh_z_end = min((iz+1) * mesh_box[2], mesh[2]) + + if mesh_z_begin == mesh_z_end: + continue + + IsValidBox=True + if with_translation_symmetry: + if mesh_x_begin >= meshPrim[0]: + IsValidBox=False + if mesh_y_begin >= meshPrim[1]: + IsValidBox=False + if mesh_z_begin >= meshPrim[2]: + IsValidBox=False + if not IsValidBox: + continue + + if with_translation_symmetry: + mesh_x_end = min(mesh_x_end, meshPrim[0]) + mesh_y_end = min(mesh_y_end, meshPrim[1]) + mesh_z_end = min(mesh_z_end, meshPrim[2]) + + grid_ID = grid_id_global[mesh_x_begin:mesh_x_end, mesh_y_begin:mesh_y_end, mesh_z_begin:mesh_z_end].flatten() + + grid_ID.sort() + grid_ID = np.array(grid_ID, dtype=np.int32) + + # print("grid_ID = ", grid_ID) + + if box_id in box_2_atm: + partition_rough[box_2_atm[box_id][0]%natm_tmp].extend(grid_ID) + else: + # random pickup one coord in the box # + + grid_ID_random_pick = grid_ID[np.random.randint(0, len(grid_ID))] + grid_coord = coords[grid_ID_random_pick] + grid_coord = np.array(grid_coord) + + fn_calculate_distance( + distance.ctypes.data_as(ctypes.c_void_p), + grid_coord.ctypes.data_as(ctypes.c_void_p), + atm_coords.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(cell.natm), + lattice_vector.ctypes.data_as(ctypes.c_void_p) + ) + + atm_id = np.argmin(distance) + partition_rough[atm_id%natm_tmp].extend(grid_ID) + + if use_mpi: + comm.Barrier() + + if use_mpi == False or (use_mpi == True and rank == 0): + len_grid_involved = 0 + for atm_id, x in enumerate(partition_rough): + # print("atm %d involved %d grids" % (atm_id, len(x))) + len_grid_involved += len(x) + if with_translation_symmetry: + assert len_grid_involved == np.prod(mesh) // np.prod(kmesh) + else: + assert len_grid_involved == mesh[0] * mesh[1] * mesh[2] + + ######## refine the partition based on the AtmConnectionInfo ######## + + partition = [] + natm_tmp = cell.natm + if with_translation_symmetry: + natm_tmp = cell.natm // np.prod(kmesh) + assert cell.natm % np.prod(kmesh) == 0 + for i in range(natm_tmp): + partition.append([]) + + ao_loc = cell.ao_loc_nr() + # print("nao_intot = ", ao_loc[-1]) + + from copy import deepcopy + lattice_vector = deepcopy(cell.lattice_vectors()) + + # print("lattice_vector = ", lattice_vector) + + if with_translation_symmetry: + # print("lattice_vector = ", lattice_vector) + lattice_vector = np.array(lattice_vector) / np.array(kmesh) + # print("lattice_vector = ", lattice_vector) + + for atm_id in range(natm_tmp): + + atm_involved = [] + + if use_mpi and rank != 0: + continue + + ## pick up atms with distance < DISTANCE_CUTOFF ## + + for atm_id_other, distance in AtmConnectionInfoList[atm_id].atm_connected_info: + # print("atm %d distance = %f" % (atm_id_other, distance)) + if distance < DISTANCE_CUTOFF: + atm_involved.append(atm_id_other % natm_tmp) + if len(atm_involved) >= 16: ## up to 16 atms + break + atm_involved.sort() + atm_involved = list(set(atm_involved)) + atm_involved = np.array(atm_involved, dtype=np.int32) + # print("atm %d involved atm = %s" % (atm_id, atm_involved)) + + ## get the involved ao ## + + atm_coords_involved = [] + + nao_involved = 0 + for atm_id_other in atm_involved: + shl_begin = AtmConnectionInfoList[atm_id_other].bas_range[0] + shl_end = AtmConnectionInfoList[atm_id_other].bas_range[-1]+1 + nao_involved 
+= ao_loc[shl_end] - ao_loc[shl_begin] + atm_coords_involved.append(cell.atom_coord(atm_id_other)) + + atm_coords_involved = np.array(atm_coords_involved) + + grid_ID = partition_rough[atm_id] + + ## determine the partition by distance ## + + coords_now = coords[grid_ID].copy() + distance = np.zeros((len(grid_ID), len(atm_involved)), dtype=np.float64) + fn_calculate_distance2( + distance.ctypes.data_as(ctypes.c_void_p), + coords_now.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(len(grid_ID)), + atm_coords_involved.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(len(atm_involved)), + lattice_vector.ctypes.data_as(ctypes.c_void_p) + ) + argmin_distance = np.argmin(distance, axis=1) + for grid_id, _atm_id_ in zip(grid_ID, argmin_distance): + partition[atm_involved[_atm_id_]%natm_tmp].append(grid_id) + + if use_mpi == False or (use_mpi == True and rank == 0): + len_grid_involved = 0 + for atm_id, x in enumerate(partition): + len_grid_involved += len(x) + if with_translation_symmetry: + assert len_grid_involved == np.prod(mesh) // np.prod(kmesh) + else: + assert len_grid_involved == mesh[0] * mesh[1] * mesh[2] + + del partition_rough + + if use_mpi: + partition_sendbuf = [np.array(x, dtype=np.int32) for x in partition] + partition = [] + for x in partition_sendbuf: + partition.append(bcast(x)) + del partition_sendbuf + + if (use_mpi and rank == 0) or use_mpi == False: + #print("************* end get_partition *************") + log.debug4("************* end get_partition *************") + + return partition + +def _range_partition(ngroup, rank, comm_size, use_mpi=False): + + ''' given ngroup tasks, split them into comm_size parts, and return the range of tasks for the rank-th process + ''' + + if use_mpi == False: + return 0, ngroup + else: + from pyscf.isdf.isdf_tools_mpi import comm_size + if ngroup % comm_size == 0: + ngroup_local = ngroup // comm_size + return rank * ngroup_local, (rank+1) * ngroup_local + else: + ngroup_local = ngroup // comm_size + 1 + + ## solve equation a * ngroup_local + b * (ngroup_local - 1) = ngroup ## + ## a + b = comm_size ## + + b = (ngroup_local * comm_size - ngroup) + a = comm_size - b + + if rank < a: + return rank * ngroup_local, (rank+1) * ngroup_local + else: + return a * ngroup_local + (rank - a) * (ngroup_local - 1), a * ngroup_local + (rank - a + 1) * (ngroup_local - 1) + +def _range_partition_array(ngroup, comm_size, use_mpi=False): + + if use_mpi == False: + return np.array([0, ngroup], dtype=np.int32) + else: + from pyscf.isdf.isdf_tools_mpi import comm_size + if ngroup % comm_size == 0: + ngroup_local = ngroup // comm_size + for i in range(comm_size): + if i == 0: + res = np.array([0, ngroup_local], dtype=np.int32) + else: + res = np.vstack((res, np.array([i * ngroup_local, (i+1) * ngroup_local], dtype=np.int32))) + else: + ngroup_local = ngroup // comm_size + 1 + + ## solve equation a * ngroup_local + b * (ngroup_local - 1) = ngroup ## + ## a + b = comm_size ## + + b = (ngroup_local * comm_size - ngroup) + a = comm_size - b + + for i in range(comm_size): + if i < a: + if i == 0: + res = np.array([0, ngroup_local], dtype=np.int32) + else: + res = np.vstack((res, np.array([i * ngroup_local, (i+1) * ngroup_local], dtype=np.int32))) + else: + if i == a: + res = np.vstack((res, np.array([a * ngroup_local, a * ngroup_local + (ngroup_local - 1)], dtype=np.int32))) + else: + res = np.vstack((res, np.array([a * ngroup_local + (i - a) * (ngroup_local - 1), a * ngroup_local + (i - a + 1) * (ngroup_local - 1)], dtype=np.int32))) + + if comm_size == 1: + res = 
res.reshape(1, 2) + return res + +def _get_grid_ordering(atmid_to_gridID, group, use_mpi=False): + + ''' given the grid points associated to each atom, return the reordering of grid points according to the ID of atms. + ''' + + grid_ordering = [] + for i in range(len(group)): + for atmid in group[i]: + grid_ordering.extend(atmid_to_gridID[atmid]) + + return np.array(grid_ordering, dtype=np.int32) + +def _get_grid_partition(atmid_to_gridID, group, use_mpi=False): + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import comm_size + + ngrid = np.sum([len(x) for x in atmid_to_gridID]) + + if use_mpi == False: + return np.array([0, ngrid], dtype=np.int32) + else: + group_partition_array = _range_partition_array(len(group), comm_size, use_mpi) + + grid_partition = [0] + for i in range(comm_size): + group_begin = group_partition_array[i][0] + group_end = group_partition_array[i][1] + + ngrid_local = 0 + for j in range(group_begin, group_end): + for atmid in group[j]: + ngrid_local += len(atmid_to_gridID[atmid]) + + grid_partition.append(grid_partition[-1] + ngrid_local) + + return np.array(grid_partition, dtype=np.int32) + +def _get_atm_2_grid_segment(atmid_to_gridID, group): + + natm = len(atmid_to_gridID) + assert sum([len(x) for x in group]) == natm or (natm % sum([len(x) for x in group])) == 0 + + res = [] + for _ in range(natm): + res.append([None, None]) + + grid_loc_now = 0 + for j in range(len(group)): + for atmid in group[j]: + res[atmid][0] = grid_loc_now + res[atmid][1] = grid_loc_now + len(atmid_to_gridID[atmid]) + grid_loc_now += len(atmid_to_gridID[atmid]) + + return res + +def _sync_list(list_data, ngroup): + + # if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, comm_size, bcast + + ### check data ### + + if len(list_data) != ngroup: + raise ValueError("the length of list_data is not equal to ngroup") + + group_begin, group_end = _range_partition(ngroup, rank, comm_size, True) + + for i in range(group_begin): + assert list_data[i] is None + for i in range(group_end, ngroup): + assert list_data[i] is None + for i in range(group_begin, group_end): + assert list_data[i] is not None + + ### generate groupid_2_root ### + + groupid_2_root = [] + + range_partition_array = _range_partition_array(ngroup, comm_size, True) + + for j in range(comm_size): + group_begin = range_partition_array[j][0] + group_end = range_partition_array[j][1] + for i in range(group_begin, group_end): + groupid_2_root.append(j) + + ### sync ### + + for i in range(ngroup): + if rank == groupid_2_root[i]: + sys.stdout.flush() + list_data[i] = bcast(list_data[i], root=groupid_2_root[i]) + + return list_data + +def _sync_aoR(aoR_holders, natm): + + ''' used in MPI + ''' + + assert len(aoR_holders) == natm + + aoR = [] + bas_id = [] + grid_ID_begin = [] + for i in range(natm): + if aoR_holders[i] is not None: + aoR.append(aoR_holders[i].aoR) + bas_id.append(aoR_holders[i].ao_involved) + grid_ID_begin.append(np.asarray([aoR_holders[i].global_gridID_begin],dtype=np.int32)) + else: + aoR.append(None) + bas_id.append(None) + grid_ID_begin.append(None) + + aoR = _sync_list(aoR, natm) + bas_id = _sync_list(bas_id, natm) + grid_ID_begin = _sync_list(grid_ID_begin, natm) + + aoR_holders = [] + + for i in range(natm): + aoR_holders.append( + aoR_Holder(aoR[i], bas_id[i], grid_ID_begin[i][0], grid_ID_begin[i][0] + aoR[i].shape[1], grid_ID_begin[i][0], grid_ID_begin[i][0] + aoR[i].shape[1]) + ) + + return aoR_holders + +def _build_submol(cell:Cell, atm_invovled): + + import pyscf.pbc.gto as pbcgto + + subcell = 
pbcgto.Cell() + subcell.a = cell.a + + atm = [] + for atm_id in atm_invovled: + atm.append(cell.atom[atm_id]) + + subcell.atom = atm + subcell.basis = cell.basis + subcell.pseudo = cell.pseudo + subcell.verbose = 0 + subcell.ke_cutoff = cell.ke_cutoff + subcell.max_memory = cell.max_memory + subcell.precision = cell.precision + subcell.use_particle_mesh_ewald = cell.use_particle_mesh_ewald + subcell.mesh = cell.mesh + subcell.unit = cell.unit + subcell.build(mesh = cell.mesh) + + return subcell + +def get_aoR(cell:Cell, coords, partition, + first_npartition = None, + first_natm=None, group=None, + distance_matrix=None, AtmConnectionInfoList:list[AtmConnectionInfo]=None, + distributed = False, use_mpi=False, sync_res = False): + + if first_natm is None: + first_natm = cell.natm + if first_npartition is None: + first_npartition = len(partition) + + ## aoR is stored in a distributed fashion ## + + log = lib.logger.Logger(cell.stdout, cell.verbose) + + if use_mpi: + from pyscf.isdf.isdf_tools_mpi import rank, comm, comm_size, allgather, bcast, reduce, gather, alltoall, _comm_bunch, allgather_list, bcast_pickel + if rank == 0: + log.debug4("************* get_aoR *************") + else: + rank = 0 + comm_size = 1 + log.debug4("************* get_aoR *************") + + weight = np.sqrt(cell.vol / coords.shape[0]) + + RcutMax = -1e10 + + for _info_ in AtmConnectionInfoList: + RcutMax = max(RcutMax, np.max(_info_.bas_cut)) + + precision = AtmConnectionInfoList[0].precision + + aoR_holder = [] + + if group is None: + group = [] + for i in range(cell.natm): + group.append([i]) + + for _ in range(first_npartition): + aoR_holder.append(None) + + grid_partition = _get_grid_partition(partition, group, use_mpi) + + atm_2_grid_segment = _get_atm_2_grid_segment(partition, group) + + local_gridID_begin = 0 + global_gridID_begin = grid_partition[rank] + ao_loc = cell.ao_loc_nr() + + atm_begin, atm_end = _range_partition(first_npartition, rank, comm_size, use_mpi) + + for atm_id in range(atm_begin, atm_end): + + grid_ID = partition[atm_id] + + if len(grid_ID) == 0: + aoR_holder[atm_id] = None + continue + + ##### find the involved atms within RcutMax ##### + + if first_natm != cell.natm: + atm_involved = np.arange(first_natm) # with kmesh ! + else: + if first_npartition == len(partition): + atm_involved = [] + for atm_id_other, distance in AtmConnectionInfoList[atm_id].atm_connected_info: + if distance < RcutMax and atm_id_other < first_natm: + atm_involved.append(atm_id_other) + atm_involved.sort() + else: + atm_involved = np.arange(cell.natm) # with kmesh ! 
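+
+        # (editorial sketch, illustrative only) the pruning applied below keeps an
+        # AO row only if it is non-negligible somewhere on this atom's grid patch,
+        # in the spirit of:
+        #
+        #     max_row = np.max(np.abs(aoR), axis=1)       # per-AO max on the patch
+        #     keep    = np.where(max_row > precision)[0]  # rows above threshold
+        #     aoR, bas_id = aoR[keep], bas_id[keep]
+        #
+        # so aoR stays a compact (nao_kept, ngrid_patch) block, with bas_id mapping
+        # its rows back to global AO indices.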
+ + ##### get the involved ao ##### + + nao_involved = 0 + for atm_id_other in atm_involved: + shl_begin = AtmConnectionInfoList[atm_id_other].bas_range[0] + shl_end = AtmConnectionInfoList[atm_id_other].bas_range[-1]+1 + nao_involved += ao_loc[shl_end] - ao_loc[shl_begin] + + bas_id = [] + + ao_loc_now = 0 + + shell_slice = [] + shl_end_test = 0 + for atm_id_other in atm_involved: + shl_begin = AtmConnectionInfoList[atm_id_other].bas_range[0] + shl_end = AtmConnectionInfoList[atm_id_other].bas_range[-1]+1 + bas_id.extend(np.arange(ao_loc[shl_begin], ao_loc[shl_end])) + + bas_id = np.array(bas_id) + + subcell = _build_submol(cell, atm_involved) + aoR = ISDF_eval_gto(subcell, coords=coords[grid_ID]) * weight + + assert aoR.shape[0] == len(bas_id) + + ##### screening the aoR, TODO: in C ##### + + max_row = np.max(np.abs(aoR), axis=1) + where = np.where(max_row > precision)[0] + if len(where) < aoR.shape[0] * 0.9: + aoR = aoR[where] + bas_id = np.array(bas_id)[where] + + global_gridID_begin = atm_2_grid_segment[atm_id][0] + aoR_holder[atm_id] = aoR_Holder(aoR, bas_id, local_gridID_begin, local_gridID_begin+len(grid_ID), global_gridID_begin, global_gridID_begin+len(grid_ID)) + + assert global_gridID_begin == atm_2_grid_segment[atm_id][0] + assert global_gridID_begin + len(grid_ID) == atm_2_grid_segment[atm_id][1] + + local_gridID_begin += len(grid_ID) + global_gridID_begin += len(grid_ID) + + del aoR + + if use_mpi and sync_res: + # aoR_holder = _sync_aoR(aoR_holder, cell.natm) + aoR_holder = _sync_aoR(aoR_holder, first_npartition) + + if use_mpi: + if rank == 0: + log.debug4("************* end get_aoR *************") + else: + log.debug4("************* end get_aoR *************") + + return aoR_holder + +def get_aoR_analytic(cell:Cell, coords, partition, + first_npartition = None, + first_natm=None, group=None, + distance_matrix=None, AtmConnectionInfoList:list[AtmConnectionInfo]=None, + distributed = False, use_mpi=False, sync_res = False): + + ''' AO values on grid points using FFT, evaluating analytic AO integrals + ''' + + assert use_mpi == False + assert first_natm is None or first_natm == cell.natm + + if group is None: + group = [] + for i in range(cell.natm): + group.append([i]) + + precision = AtmConnectionInfoList[0].precision + mesh = cell.mesh + ngrids = np.prod(mesh) + weight = cell.vol/ngrids + weight2 = np.sqrt(cell.vol / ngrids) + + blksize = 2e9//16 + nao_max_bunch = int(blksize // ngrids) + + Gv = cell.get_Gv() + + ######## pack info ######## + + aoR_unpacked = [] + ao_invovled_unpacked = [] + atm_ordering = [] + for group_idx in group: + group_idx.sort() + atm_ordering.extend(group_idx) + grid_begin_unpacked = [] + grid_end_unpacked = [] + grid_ID_now = 0 + for atm_id in atm_ordering: + grid_ID = partition[atm_id] + grid_begin_unpacked.append(grid_ID_now) + grid_end_unpacked.append(grid_ID_now + len(grid_ID)) + grid_ID_now += len(grid_ID) + aoR_unpacked.append([]) + ao_invovled_unpacked.append([]) + + ao_loc = cell.ao_loc_nr() + + task_sl_loc = [0] + ao_loc_now = 0 + for i in range(cell.nbas): + ao_loc_end = ao_loc[i+1] + if ao_loc_end - ao_loc_now > nao_max_bunch: + task_sl_loc.append(i) + ao_loc_now = ao_loc[i] + task_sl_loc.append(cell.nbas) + print("task_sl_loc = ", task_sl_loc) + nTask = len(task_sl_loc) - 1 + print("nTask = ", nTask) + + for task_id in range(nTask): + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + shloc = (task_sl_loc[task_id], task_sl_loc[task_id+1]) + aoG = ft_ao.ft_ao(cell, Gv, shls_slice=shloc).T + + ### implementation 1 ### + 
# aoR_test = numpy.fft.ifftn(aoG.reshape(-1, *mesh), axes=(1,2,3)).real / (weight) + # aoR = aoR_test.reshape(-1, ngrids) * weight2 + + ### implementation 2 ### + aoR_test = None + aoG = aoG.conj() * np.sqrt(1/cell.vol) + aoG = aoG.reshape(-1, *mesh) + aoR = numpy.fft.fftn(aoG, axes=(1,2,3)).real * np.sqrt(1/float(ngrids)) + aoR = aoR.reshape(-1, ngrids) + + bas_id = np.arange(ao_loc[shloc[0]], ao_loc[shloc[1]]) + + for atm_id, atm_partition in enumerate(partition): + aoR_tmp = aoR[:, atm_partition].copy() + ### prune the aoR ### + where = np.where(np.max(np.abs(aoR_tmp), axis=1) > precision)[0] + aoR_tmp = aoR_tmp[where].copy() + bas_id_tmp = bas_id[where].copy() + aoR_unpacked[atm_id].append(aoR_tmp) + ao_invovled_unpacked[atm_id].append(bas_id_tmp) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + # no rank guard here: this routine asserts use_mpi == False and `rank` is undefined in this scope + _benchmark_time(t1, t2, "get_aoR_analytic: task %d" % task_id) + + t1 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + aoR_holder = [] + + for atm_id in range(len(aoR_unpacked)): + aoR_holder_tmp = np.concatenate(aoR_unpacked[atm_id], axis=0) + bas_id = np.concatenate(ao_invovled_unpacked[atm_id], axis=0) + aoR_holder.append(aoR_Holder(aoR_holder_tmp, bas_id, grid_begin_unpacked[atm_id], grid_end_unpacked[atm_id], grid_begin_unpacked[atm_id], grid_end_unpacked[atm_id])) + + t2 = (lib.logger.process_clock(), lib.logger.perf_counter()) + + del aoR_unpacked + del ao_invovled_unpacked + del aoR_tmp + del aoR_holder_tmp + del bas_id + del aoR_test + del aoR + del aoG + + + _benchmark_time(t1, t2, "get_aoR_analytic: merge") + + return aoR_holder + +if __name__ == '__main__': + + from pyscf.lib.parameters import BOHR + + TARGET_PRECISION = 1e-9 + + prim_a = np.array( + [[14.572056092/2, 0.000000000, 0.000000000], + [0.000000000, 14.572056092/2, 0.000000000], + [0.000000000, 0.000000000, 6.010273939],]) * BOHR + atm = [ +['Cu1', (1.927800, 1.927800, 1.590250)], +['O1', (1.927800, 0.000000, 1.590250)], +['O1', (0.000000, 1.927800, 1.590250)], +['Ca', (0.000000, 0.000000, 0.000000)], + ] + + basis = { + 'Cu1':'gth-dzvp-molopt-sr', 'Cu2':'gth-dzvp-molopt-sr', 'O1': 'gth-dzvp-molopt-sr', 'Ca':'gth-dzvp-molopt-sr' + } + pseudo = {'Cu1': 'gth-pbe-q19', 'Cu2': 'gth-pbe-q19', 'O1': 'gth-pbe', 'Ca': 'gth-pbe'} + + + ke_cutoff = 128 + + from isdf_tools_cell import build_supercell + + prim_cell = build_supercell(atm, prim_a, Ls = [1,1,1], ke_cutoff=ke_cutoff, basis=basis, pseudo=pseudo, verbose=10) + prim_mesh = prim_cell.mesh + + supercell = [2, 2, 1] + + mesh = [supercell[0] * prim_mesh[0], supercell[1] * prim_mesh[1], supercell[2] * prim_mesh[2]] + mesh = np.array(mesh, dtype=np.int32) + + cell = build_supercell(atm, prim_a, Ls = supercell, ke_cutoff=ke_cutoff, mesh=mesh, basis=basis, pseudo=pseudo, verbose=10) + + print(cell.atom) + print(cell.basis) + + from pyscf.pbc.dft.multigrid.multigrid_pair import MultiGridFFTDF2 + + df_tmp = MultiGridFFTDF2(cell) + grids = df_tmp.grids + coords = np.asarray(grids.coords).reshape(-1,3) + assert coords is not None + + distance_matrix = get_cell_distance_matrix(cell) + + weight = np.sqrt(cell.vol / coords.shape[0]) + + precision = TARGET_PRECISION + rcut = _estimate_rcut(cell, coords.shape[0], precision) + rcut_max = np.max(rcut) + + print("rcut = ", rcut) + print("precision = ", precision) + print("max_rcut = ", np.max(rcut)) \ No newline at end of file diff --git a/pyscf/isdf/isdf_tools_mpi.py b/pyscf/isdf/isdf_tools_mpi.py new file mode 100644 index 000000000..02ef01654 --- /dev/null +++ 
b/pyscf/isdf/isdf_tools_mpi.py @@ -0,0 +1,358 @@ +################### the MPI module ########################## + +import sys +from pyscf import lib +import mpi4py +from mpi4py import MPI +import numpy +import numpy as np + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +comm_size = comm.Get_size() + +## some tools copied from mpi4pyscf ## + +INT_MAX = 2147483647 +BLKSIZE = INT_MAX // 64 + 1 + +def _comm_bunch(size_of_comm, force_even=False): + if size_of_comm % comm_size == 0: + res = size_of_comm // comm_size + else: + res = (size_of_comm // comm_size) + 1 + if force_even: + if res % 2 == 1 : + res += 1 + return res + +def _assert(condition): + if not condition: + import traceback + sys.stderr.write(''.join(traceback.format_stack()[:-1])) + comm.Abort() + +def _segment_counts(counts, p0, p1): + counts_seg = counts - p0 + counts_seg[counts<=p0] = 0 + counts_seg[counts> p1] = p1 - p0 + return counts_seg + +def allgather(sendbuf, split_recvbuf=False): + sendbuf = numpy.asarray(sendbuf, order='C') + shape = sendbuf.shape + attr = comm.allgather((shape, sendbuf.dtype.char)) + rshape = [x[0] for x in attr] + counts = numpy.array([numpy.prod(x) for x in rshape]) + mpi_dtype = numpy.result_type(*[x[1] for x in attr]).char + _assert(sendbuf.dtype.char == mpi_dtype or sendbuf.size == 0) + + displs = numpy.append(0, numpy.cumsum(counts[:-1])) + recvbuf = numpy.empty(sum(counts), dtype=mpi_dtype) + + sendbuf = sendbuf.ravel() + + size_of_recvbuf = recvbuf.size + + print("rank %d size recvbuf %d" % (rank, size_of_recvbuf)) + + if size_of_recvbuf >= INT_MAX: + print("large data size go this branch") + blk_size_small = min((INT_MAX // comm_size),BLKSIZE) + recvbuf_small = numpy.empty(comm_size*blk_size_small, dtype=mpi_dtype) + rdispls_small = numpy.arange(comm_size)*blk_size_small + if rank == 0: + print("blk_size_small = ", blk_size_small) + print("rdispls_small = ", rdispls_small) + sys.stdout.flush() + for p0, p1 in prange(0, numpy.max(counts), blk_size_small): + counts_seg = _segment_counts(counts, p0, p1) + comm.Allgatherv([sendbuf[p0:p1], mpi_dtype], + [recvbuf_small, counts_seg, rdispls_small, mpi_dtype]) + # recvbuf[p0:p1] = recvbuf_small[:p1-p0] + + for i in range(comm_size): + begin = displs[i]+p0 + end = begin + counts_seg[i] + recvbuf[begin:end] = recvbuf_small[i*blk_size_small:i*blk_size_small+counts_seg[i]] + + del recvbuf_small + del rdispls_small + + if split_recvbuf: + return [recvbuf[p0:p0+c].reshape(shape) + for p0,c,shape in zip(displs,counts,rshape)] + else: + return recvbuf + else: + print("small data size go this branch") + print("maxcount = ", numpy.max(counts)) + end = numpy.max(counts) + for p0, p1 in lib.prange(0, end, BLKSIZE): + print("rank %d send p0 p1 %d %d"%(rank,p0,p1)) + counts_seg = _segment_counts(counts, p0, p1) + comm.Allgatherv([sendbuf[p0:p1], mpi_dtype], + [recvbuf, counts_seg, displs+p0, mpi_dtype]) + print("rank %d finish all gather" % (rank)) + if split_recvbuf: + return [recvbuf[p0:p0+c].reshape(shape) + for p0,c,shape in zip(displs,counts,rshape)] + else: + # try: + # return recvbuf.reshape((-1,) + shape[1:]) + # except ValueError: + return recvbuf + # raise ValueError("split_recvbuf is not supported") + +def allgather_list(sendbuf): + + assert isinstance(sendbuf, list) + for _data_ in sendbuf: + assert isinstance(_data_, numpy.ndarray) + + shape = [x.shape for x in sendbuf] + attr = comm.allgather(shape) + attr_flat = [] + for x in attr: + for y in x: + attr_flat.append(y) + + if rank == 0: + for x in attr_flat: + print("x = ", x) + + print("rank %d get here 1" % 
(rank)) + sys.stdout.flush() + + size_tot = np.sum([x.size for x in sendbuf]) + sendbuf_flat = np.empty(size_tot, dtype=sendbuf[0].dtype) + offset = 0 + for x in sendbuf: + sendbuf_flat[offset:offset+x.size] = x.ravel() + offset += x.size + + print("rank %d get here 2" % (rank)) + sys.stdout.flush() + + recvbuf_flat = allgather(sendbuf_flat) + + print("rank %d get here 3" % (rank)) + sys.stdout.flush() + res = [] + + offset = 0 + for x in attr_flat: + res.append(recvbuf_flat[offset:offset+np.prod(x)].reshape(x)) + offset += np.prod(x) + + return res + +def allgather_pickle(sendbuf): + sendbuf_serialized = MPI.pickle.dumps(sendbuf) + sendbuf_serialized = np.frombuffer(sendbuf_serialized, dtype=np.uint8) + received = allgather(sendbuf_serialized, split_recvbuf=True) + received = [MPI.pickle.loads(x.tobytes()) for x in received] + del sendbuf_serialized + return received + +def reduce(sendbuf, op=MPI.SUM, root=0): + sendbuf = numpy.asarray(sendbuf, order='C') + shape, mpi_dtype = comm.bcast((sendbuf.shape, sendbuf.dtype.char),root=root) + _assert(sendbuf.shape == shape and sendbuf.dtype.char == mpi_dtype) + + dtype = sendbuf.dtype.char + recvbuf = numpy.zeros_like(sendbuf) + send_seg = numpy.ndarray(sendbuf.size, dtype=sendbuf.dtype, buffer=sendbuf) + recv_seg = numpy.ndarray(recvbuf.size, dtype=recvbuf.dtype, buffer=recvbuf) + for p0, p1 in lib.prange(0, sendbuf.size, BLKSIZE): + comm.Reduce([send_seg[p0:p1], dtype], + [recv_seg[p0:p1], dtype], op, root) + + if rank == root: + return recvbuf + else: + return sendbuf + +def scatter(sendbuf, root=0): + if rank == root: + mpi_dtype = numpy.result_type(*sendbuf).char + shape = comm.scatter([x.shape for x in sendbuf]) + counts = numpy.asarray([x.size for x in sendbuf]) + comm.bcast((mpi_dtype, counts)) + sendbuf = [numpy.asarray(x, mpi_dtype).ravel() for x in sendbuf] + sendbuf = numpy.hstack(sendbuf) + else: + shape = comm.scatter(None) + mpi_dtype, counts = comm.bcast(None) + + displs = numpy.append(0, numpy.cumsum(counts[:-1])) + recvbuf = numpy.empty(numpy.prod(shape), dtype=mpi_dtype) + + #DONOT use lib.prange. 
lib.prange may terminate early in some processes + for p0, p1 in prange(0, numpy.max(counts), BLKSIZE): # prange takes (start, stop, step) + counts_seg = _segment_counts(counts, p0, p1) + comm.Scatterv([sendbuf, counts_seg, displs+p0, mpi_dtype], + [recvbuf[p0:p1], mpi_dtype], root) + return recvbuf.reshape(shape) + +def bcast(buf, root=0): + buf = numpy.asarray(buf, order='C') + shape, dtype = comm.bcast((buf.shape, buf.dtype.char), root=root) + if rank != root: + buf = numpy.empty(shape, dtype=dtype) + + dtype = buf.dtype.char + buf_seg = numpy.ndarray(buf.size, dtype=buf.dtype, buffer=buf) + for p0, p1 in lib.prange(0, buf.size, BLKSIZE): + comm.Bcast([buf_seg[p0:p1], dtype], root) + return buf + +def bcast_pickel(buf, root=0): + if rank == root: + buf_serialized = MPI.pickle.dumps(buf) + buf_serialized = np.frombuffer(buf_serialized, dtype=np.uint8) + else: + buf_serialized = None + res = bcast(buf_serialized, root) + res = MPI.pickle.loads(res.tobytes()) + return res + +def gather(sendbuf, root=0, split_recvbuf=False): + + sendbuf = numpy.asarray(sendbuf, order='C') + shape = sendbuf.shape + size_dtype = comm.allgather((shape, sendbuf.dtype.char)) + # print(size_dtype) + rshape = [x[0] for x in size_dtype] + counts = numpy.array([numpy.prod(x) for x in rshape]) + + mpi_dtype = numpy.result_type(*[x[1] for x in size_dtype]).char + _assert(sendbuf.dtype == mpi_dtype or sendbuf.size == 0) + + if rank == root: + displs = numpy.append(0, numpy.cumsum(counts[:-1])) + recvbuf = numpy.empty(sum(counts), dtype=mpi_dtype) + + sendbuf = sendbuf.ravel() + for p0, p1 in lib.prange(0, numpy.max(counts), BLKSIZE): + counts_seg = _segment_counts(counts, p0, p1) + comm.Gatherv([sendbuf[p0:p1], mpi_dtype], + [recvbuf, counts_seg, displs+p0, mpi_dtype], root) + if split_recvbuf: + return [recvbuf[p0:p0+c].reshape(shape) + for p0,c,shape in zip(displs,counts,rshape)] + else: + try: + return recvbuf.reshape((-1,) + shape[1:]) + except ValueError: + return recvbuf + else: + send_seg = sendbuf.ravel() + for p0, p1 in lib.prange(0, numpy.max(counts), BLKSIZE): + comm.Gatherv([send_seg[p0:p1], mpi_dtype], None, root) + return sendbuf + +def prange(start, stop, step): + '''Similar to lib.prange. This function ensures that all processes have the + same number of steps. It is required by alltoall communication. 
+ ''' + nsteps = (stop - start + step - 1) // step + nsteps = max(comm.allgather(nsteps)) + for i in range(nsteps): + i0 = min(stop, start + i * step) + i1 = min(stop, i0 + step) + yield i0, i1 + +def alltoall(sendbuf, split_recvbuf=False): + if isinstance(sendbuf, numpy.ndarray): + raise NotImplementedError + mpi_dtype = comm.bcast(sendbuf.dtype.char) + sendbuf = numpy.asarray(sendbuf, mpi_dtype, 'C') + nrow = sendbuf.shape[0] + ncol = sendbuf.size // nrow + segsize = (nrow+comm_size-1) // comm_size * ncol + sdispls = numpy.arange(0, comm_size*segsize, segsize) + sdispls[sdispls>sendbuf.size] = sendbuf.size + scounts = numpy.append(sdispls[1:]-sdispls[:-1], sendbuf.size-sdispls[-1]) + rshape = comm.alltoall(scounts) + else: + _assert(len(sendbuf) == comm_size) + mpi_dtype = comm.bcast(sendbuf[0].dtype.char) + sendbuf = [numpy.asarray(x, mpi_dtype) for x in sendbuf] + rshape = comm.alltoall([x.shape for x in sendbuf]) + scounts = numpy.asarray([x.size for x in sendbuf], dtype=np.int64) + sdispls = numpy.append(0, numpy.cumsum(scounts[:-1])) + sendbuf = numpy.hstack([x.ravel() for x in sendbuf]) + + rcounts = numpy.asarray([numpy.prod(x) for x in rshape], dtype=np.int64) + rdispls = numpy.append(0, numpy.cumsum(rcounts[:-1])) + recvbuf = numpy.empty(sum(rcounts), dtype=mpi_dtype) + + if rank == 0: + print("sdispls = ", sdispls) + print("rcounts = ", rcounts) + print("rdispls = ", rdispls) + + max_counts = max(numpy.max(scounts), numpy.max(rcounts)) + + if rank == 0: + print("max_counts = ", max_counts) + + sendbuf = sendbuf.ravel() + #DONOT use lib.prange. lib.prange may terminate early in some processes + + size_of_sendbuf = sendbuf.size + + # if sdispls[-1] >= INT_MAX: + if size_of_sendbuf >=INT_MAX: + blk_size_small = min((INT_MAX // comm_size),BLKSIZE) + sendbuf_small = numpy.empty(comm_size*blk_size_small, dtype=mpi_dtype) + recvbuf_small = numpy.empty(comm_size*blk_size_small, dtype=mpi_dtype) + sdispls_small = numpy.arange(comm_size)*blk_size_small + if rank == 0: + print("blk_size_small = ", blk_size_small) + print("sdispls_small = ", sdispls_small) + sys.stdout.flush() + for p0, p1 in prange(0, max_counts, blk_size_small): + scounts_seg = _segment_counts(scounts, p0, p1) + rcounts_seg = _segment_counts(rcounts, p0, p1) + + # if rank == 0: + # print("p0 p1 = ", p0, p1) + # print("scounts_seg = ", scounts_seg) + # print("rcounts_seg = ", rcounts_seg) + # sys.stdout.flush() + ### copy data to sendbuf_small + for i in range(comm_size): + begin = sdispls[i]+p0 + end = begin + scounts_seg[i] + sendbuf_small[i*blk_size_small:i*blk_size_small+scounts_seg[i]] = sendbuf[begin:end] + + comm.Alltoallv([sendbuf_small, scounts_seg, sdispls_small, mpi_dtype], + [recvbuf_small, rcounts_seg, sdispls_small, mpi_dtype]) + + for i in range(comm_size): + begin = rdispls[i]+p0 + end = begin + rcounts_seg[i] + recvbuf[begin:end] = recvbuf_small[i*blk_size_small:i*blk_size_small+rcounts_seg[i]] + + sendbuf_small = None + recvbuf_small = None + else: + for p0, p1 in prange(0, max_counts, BLKSIZE): + scounts_seg = _segment_counts(scounts, p0, p1) + rcounts_seg = _segment_counts(rcounts, p0, p1) + # if rank == 0: + # print("scounts_seg = ", scounts_seg) + # print("rcounts_seg = ", rcounts_seg) + comm.Alltoallv([sendbuf, scounts_seg, sdispls+p0, mpi_dtype], + [recvbuf, rcounts_seg, rdispls+p0, mpi_dtype]) + + # return None + + if split_recvbuf: + return [recvbuf[p0:p0+c].reshape(shape) + for p0,c,shape in zip(rdispls, rcounts, rshape)] + else: + return recvbuf + +################### end of the MPI module 
########################## \ No newline at end of file diff --git a/pyscf/isdf/pbc_isdf_V.c b/pyscf/isdf/pbc_isdf_V.c new file mode 100644 index 000000000..a3d166f3f --- /dev/null +++ b/pyscf/isdf/pbc_isdf_V.c @@ -0,0 +1,1567 @@ +#include "fft.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "vhf/fblas.h" +#include <math.h> + +int get_omp_threads(); +int omp_get_thread_num(); + +void _construct_J( + int *mesh, + double *DensityR, + double *CoulG, + double *J) +{ + const int nThread = get_omp_threads(); + // int mesh_complex[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + const int n_real = mesh[0] * mesh[1] * mesh[2]; + // const int n_complex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + const double fac = 1. / (double)n_real; + + fftw_complex *DensityR_complex = fftw_malloc(sizeof(double __complex__) * n_real); + fftw_complex *buf = fftw_malloc(sizeof(double __complex__) * n_real); + fftw_complex *J_complex = fftw_malloc(sizeof(double __complex__) * n_real); + + memset(buf, 0, sizeof(double __complex__) * n_real); + memset(J_complex, 0, sizeof(double __complex__) * n_real); + memset(DensityR_complex, 0, sizeof(double __complex__) * n_real); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (int i = 0; i < n_real; ++i) + { + DensityR_complex[i][0] = DensityR[i]; + } + + fftw_plan p_forward = fftw_plan_dft_3d(mesh[0], mesh[1], mesh[2], DensityR_complex, (fftw_complex *)buf, FFTW_BACKWARD, FFTW_ESTIMATE); + fftw_plan p_backward = fftw_plan_dft_3d(mesh[0], mesh[1], mesh[2], (fftw_complex *)buf, J_complex, FFTW_FORWARD, FFTW_ESTIMATE); + + fftw_execute(p_forward); + + double *ptr = (double *)buf; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (int i = 0; i < n_real; i++) + { + ptr[i * 2] *= CoulG[i] * fac; + ptr[i * 2 + 1] *= CoulG[i] * fac; + } + + fftw_execute(p_backward); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (int i = 0; i < n_real; i++) + { + J[i] = J_complex[i][0]; + } + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + + fftw_free(buf); + fftw_free(DensityR_complex); + fftw_free(J_complex); +} + +void _fn_J_dmultiplysum(double *out, + const int nrow, const int ncol, + const double *a, + const int nrow_a, const int ncol_a, + const int row_a_shift, + const int col_a_shift, + const double *b, + const int nrow_b, const int ncol_b, + const int row_b_shift, + const int col_b_shift) +{ + static const int BUNCHSIZE = 512; + + const double *pa = a + row_a_shift * ncol_a + col_a_shift; + const double *pb = b + row_b_shift * ncol_b + col_b_shift; + + memset(out, 0, sizeof(double) * ncol); + + const int nThread = get_omp_threads(); + const int nBunch = ncol / BUNCHSIZE + 1; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (int i = 0; i < nBunch; i++) + { + int bunch_start = i * BUNCHSIZE; + int bunch_end = (i + 1) * BUNCHSIZE; + if (bunch_end > ncol) + { + bunch_end = ncol; + } + + for (int j = 0; j < nrow; j++) + { + const double *ppa = pa + j * ncol_a; + const double *ppb = pb + j * ncol_b; + for (int k = bunch_start; k < bunch_end; k++) + { + out[k] += ppa[k] * ppb[k]; + } + } + } +} + +void _Pack_Matrix_SparseRow_DenseCol( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + int *RowLoc, + const int ColBegin, + const int ColEnd) +{ + if (ColEnd - ColBegin <= 0) + { + return; + } + + if (ColEnd < (ColBegin + ncol_source)) + { + printf("ColEnd < ColBegin + ncol_source\n"); + exit(1); + } + + if (ColEnd >
ncol_target) + { + printf("ColEnd>ncol_target\n"); + exit(1); + } + + int i; + + for (i = 0; i < nrow_source; i++) + { + int row_loc = RowLoc[i]; + memcpy(target + row_loc * ncol_target + ColBegin, source + i * ncol_source, sizeof(double) * ncol_source); + } +} + +void _Reorder_Grid_to_Original_Grid(int ngrid, int *gridID, double *Density_or_J, + double *out) +{ + int i; + for (i = 0; i < ngrid; i++) + { + out[gridID[i]] = Density_or_J[i]; + } +} + +void _Original_Grid_to_Reorder_Grid( + int ngrid, int *gridID, double *Density_or_J, double *out) +{ + int i; + for (i = 0; i < ngrid; i++) + { + out[i] = Density_or_J[gridID[i]]; + } +} + +void _construct_V_local_bas( + int *mesh, + int nrow, + int ncol, + int *gridID, + double *auxBasis, + double *CoulG, + int row_shift, + double *V, + int *grid_ordering, + double *buf, // use the ptr of the ptr to ensure that the memory for each thread is aligned + const int buffersize // must be a multiple of 16 to ensure memory alignment +) +{ + // printf("nrow: %d, ncol: %d\n", nrow, ncol); + // printf("row_shift: %d\n", row_shift); + + const int nThread = get_omp_threads(); + size_t mesh_complex[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + const size_t n_real = mesh[0] * mesh[1] * mesh[2]; + const size_t n_complex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + const double fac = 1. / (double)n_real; + + // create plan for fft + + fftw_plan p_forward = fftw_plan_dft_r2c(3, mesh, auxBasis, (fftw_complex *)buf, FFTW_ESTIMATE); + fftw_plan p_backward = fftw_plan_dft_c2r(3, mesh, (fftw_complex *)buf, V, FFTW_ESTIMATE); + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf_thread = buf + thread_id * buffersize; + fftw_complex *buf_fft = (fftw_complex *)(buf_thread + n_real); + +#pragma omp for schedule(static) + for (size_t i = 0; i < nrow; i++) + { + // pack + + memset(buf_thread, 0, sizeof(double) * n_real); + + for (size_t j = 0; j < ncol; j++) + { + buf_thread[gridID[j]] = auxBasis[i * ncol + j]; + } + + // forward transform + + fftw_execute_dft_r2c(p_forward, buf_thread, (fftw_complex *)buf_fft); + + // multiply CoulG + + double *ptr = (double *)buf_fft; + + for (size_t j = 0; j < n_complex; j++) + { + *ptr++ *= CoulG[j]; /// TODO: use ISPC to accelerate + *ptr++ *= CoulG[j]; /// TODO: use ISPC to accelerate + } + + // backward transform + + memset(buf_thread, 0, sizeof(double) * n_real); + + fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf_fft, buf_thread); + + // scale + + ptr = V + (i + row_shift) * n_real; + + for (size_t j = 0; j < n_real; j++) + { + // *ptr++ *= fac; /// TODO: use ISPC to accelerate + // ptr[grid_ordering[j]] = buf_thread[j] * fac; + ptr[j] = buf_thread[grid_ordering[j]] * fac; + } + } + } + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); +} + +void _construct_V_kernel(int *mesh_bra, + int *mesh_ket, + int *map_bra_2_ket, + int naux, + double *auxBasis, + double *CoulG, // bra + double *V, + const int BunchSize, + double *buf, // use the ptr of the ptr to ensure that the memory for each thread is aligned + const int buffersize // must be a multiple of 16 to ensure memory alignment +) +{ + // printf("naux = %d\n", naux); + // printf("BunchSize = %d\n", BunchSize); + + // print all the input info + + static const int INC1 = 1; + static const int SMALL_SIZE = 8; + + const int nThread = get_omp_threads(); + const int nBunch = ((naux / BunchSize) / nThread) * nThread; // dispatch evenly + const int nLeft = naux - nBunch * BunchSize; + + // printf("nBunch = 
%d\n", nBunch); + // printf("nLeft = %d\n", nLeft); + + // print the dispatch info + + int mesh_bra_complex[3] = {mesh_bra[0], mesh_bra[1], mesh_bra[2] / 2 + 1}; + int mesh_ket_complex[3] = {mesh_ket[0], mesh_ket[1], mesh_ket[2] / 2 + 1}; + + const int n_real_bra = mesh_bra[0] * mesh_bra[1] * mesh_bra[2]; + const int n_complex_bra = mesh_bra_complex[0] * mesh_bra_complex[1] * mesh_bra_complex[2]; + const int n_real_ket = mesh_ket[0] * mesh_ket[1] * mesh_ket[2]; + const int n_complex_ket = mesh_ket_complex[0] * mesh_ket_complex[1] * mesh_ket_complex[2]; + + if (n_real_bra > n_real_ket) + { + printf("n_real_bra > n_real_ket\n"); + exit(1); + } + + const double fac = 1. / sqrtl((double)n_real_bra * (double)n_real_ket); + + // create plan for fft + + fftw_plan p_forward = fftw_plan_many_dft_r2c( + 3, mesh_bra, BunchSize, auxBasis, mesh_bra, 1, n_real_bra, (fftw_complex *)buf, mesh_bra_complex, 1, n_complex_bra, FFTW_ESTIMATE); + + fftw_plan p_backward = fftw_plan_many_dft_c2r( + 3, mesh_ket, BunchSize, (fftw_complex *)buf, mesh_ket_complex, 1, n_complex_ket, V, mesh_ket, 1, n_real_ket, FFTW_ESTIMATE); + + // execute parallelly sharing the same plan + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf_thread = buf + thread_id * (size_t)buffersize; + size_t bunch_i, bunch_start, bunch_end, j, k; + double *ptr; + +#pragma omp for schedule(static) + for (bunch_i = 0; bunch_i < nBunch; ++bunch_i) + // for (bunch_i = 0; bunch_i < 0; ++bunch_i) + { + bunch_start = bunch_i * BunchSize; + bunch_end = bunch_start + BunchSize; + + // forward transform + + fftw_execute_dft_r2c(p_forward, auxBasis + (size_t)bunch_start * (size_t)n_real_bra, (fftw_complex *)buf_thread); + + // multiply CoulG + + ptr = buf_thread; + + for (j = bunch_start; j < bunch_end; ++j) + { + for (k = 0; k < n_complex_bra; ++k) + { + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + } + } + + if (map_bra_2_ket != NULL) + { + ptr = buf_thread + n_complex_bra * 2 * BunchSize; + memset(ptr, 0, sizeof(double) * n_complex_ket * 2 * BunchSize); + for (j = bunch_start; j < bunch_end; ++j) + { + size_t shift = (j - bunch_start) * n_complex_bra * 2; + for (k = 0; k < n_complex_bra; ++k) + { + ptr[2 * map_bra_2_ket[k]] = buf_thread[shift + 2 * k]; + ptr[2 * map_bra_2_ket[k] + 1] = buf_thread[shift + 2 * k + 1]; + } + ptr += n_complex_ket * 2; + } + ptr = buf_thread + n_complex_bra * 2 * BunchSize; + } + else + { + ptr = buf_thread; + } + + // backward transform + + // fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf_thread, V + (size_t)bunch_start * (size_t)n_real); + fftw_execute_dft_c2r(p_backward, (fftw_complex *)ptr, V + (size_t)bunch_start * (size_t)n_real_ket); + + // scale + + ptr = V + (size_t)bunch_start * (size_t)n_real_ket; + int _size_ = n_real_ket * BunchSize; + dscal_(&_size_, &fac, ptr, &INC1); + + // for (j = bunch_start; j < bunch_end; ++j) + // { + // for (k = 0; k < n_real_ket; ++k) + // { + // *ptr++ *= fac; /// TODO: use ISPC to accelerate + // } + // } + } + } + + // destroy plan + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + + // printf("finish bulk nLeft = %d\n", nLeft); + // fflush(stdout); + + if (nLeft > 0) + { + if ((nLeft <= SMALL_SIZE) && (nLeft <= BunchSize)) + { + // printf("nLeft <= SMALL_SIZE or nLeft <= BunchSize\n"); + // fflush(stdout); + + // use single thread to handle the left + + int bunch_start = nBunch * BunchSize; + int bunch_end = bunch_start + nLeft; + + // create 
plan + + fftw_plan p_forward = fftw_plan_many_dft_r2c( + // 3, mesh, nLeft, auxBasis + bunch_start * n_real, mesh, 1, n_real, (fftw_complex *)buf, mesh_complex, 1, n_complex, FFTW_ESTIMATE); + 3, mesh_bra, nLeft, auxBasis + bunch_start * n_real_bra, mesh_bra, 1, n_real_bra, (fftw_complex *)buf, mesh_bra_complex, 1, n_complex_bra, FFTW_ESTIMATE); + + fftw_plan p_backward = fftw_plan_many_dft_c2r( + // 3, mesh, nLeft, (fftw_complex *)buf, mesh_complex, 1, n_complex, V + bunch_start * n_real, mesh, 1, n_real, FFTW_ESTIMATE); + 3, mesh_ket, nLeft, (fftw_complex *)buf, mesh_ket_complex, 1, n_complex_ket, V + bunch_start * n_real_ket, mesh_ket, 1, n_real_ket, FFTW_ESTIMATE); + + // forward transform + + // fftw_execute_dft_r2c(p_forward, auxBasis + (size_t)bunch_start * (size_t)n_real, (fftw_complex *)buf); + fftw_execute_dft_r2c(p_forward, auxBasis + (size_t)bunch_start * (size_t)n_real_bra, (fftw_complex *)buf); + + // multiply CoulG + + double *ptr = buf; + + for (int j = bunch_start; j < bunch_end; ++j) + { + // for (int k = 0; k < n_complex; ++k) + for (int k = 0; k < n_complex_bra; ++k) + { + *ptr++ *= CoulG[k]; /// + *ptr++ *= CoulG[k]; /// + } + } + + if (map_bra_2_ket != NULL) + { + ptr = buf + n_complex_bra * 2 * nLeft; + memset(ptr, 0, sizeof(double) * n_complex_ket * 2 * nLeft); + for (int j = bunch_start; j < bunch_end; ++j) + { + size_t shift = (j - bunch_start) * n_complex_bra * 2; + for (int k = 0; k < n_complex_bra; ++k) + { + ptr[2 * map_bra_2_ket[k]] = buf[shift + 2 * k]; + ptr[2 * map_bra_2_ket[k] + 1] = buf[shift + 2 * k + 1]; + } + ptr += n_complex_ket * 2; + } + ptr = buf + n_complex_bra * 2 * nLeft; + } + else + { + ptr = buf; + } + + // backward transform + + // fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf, V + (size_t)bunch_start * (size_t)n_real); + fftw_execute_dft_c2r(p_backward, (fftw_complex *)ptr, V + (size_t)bunch_start * (size_t)n_real_ket); + + // scale + + // ptr = V + (size_t)bunch_start * (size_t)n_real; + ptr = V + (size_t)bunch_start * (size_t)n_real_ket; + int _size_ = n_real_ket * nLeft; + dscal_(&_size_, &fac, ptr, &INC1); + + // for (int j = bunch_start; j < bunch_end; ++j) + // { + // for (int k = 0; k < n_real; ++k) + // { + // *ptr++ *= fac; /// + // } + // } + + // destroy plan + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + } + else + { + // printf("nLeft > SMALL_SIZE or nLeft > BunchSize\n"); + + // use parallel thread to handle the left, assume the nTransform is 1 + + int bunch_start = nBunch * BunchSize; + int bunch_end = bunch_start + nLeft; + + // create plan + + // fftw_plan p_forward = fftw_plan_dft_r2c(3, mesh, auxBasis + bunch_start * n_real, (fftw_complex *)buf, FFTW_ESTIMATE); + fftw_plan p_forward = fftw_plan_dft_r2c(3, + // mesh, auxBasis + bunch_start * n_real, (fftw_complex *)buf, FFTW_ESTIMATE); + mesh_bra, auxBasis + bunch_start * n_real_bra, (fftw_complex *)buf, FFTW_ESTIMATE); + + fftw_plan p_backward = fftw_plan_dft_c2r(3, + // mesh, (fftw_complex *)buf, V + bunch_start * n_real, FFTW_ESTIMATE); + mesh_ket, (fftw_complex *)buf, V + bunch_start * n_real_ket, FFTW_ESTIMATE); + + // size_t nbuf_per_thread = ((n_complex * 2 + 15) / 16) * 16; // make sure the memory is aligned + size_t nbuf_per_thread = ((n_complex_bra * 2 + 15) / 16) * 16; // make sure the memory is aligned + if (map_bra_2_ket != NULL) + { + nbuf_per_thread += ((n_complex_ket * 2 + 15) / 16) * 16; + } + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf_thread = buf + thread_id * 
(size_t)nbuf_per_thread; + size_t k; + double *ptr; + +#pragma omp for schedule(static) + for (size_t j = bunch_start; j < bunch_end; ++j) + { + + // forward transform + + // fftw_execute_dft_r2c(p_forward, auxBasis + j * (size_t)n_real, (fftw_complex *)buf_thread); + fftw_execute_dft_r2c(p_forward, auxBasis + j * (size_t)n_real_bra, (fftw_complex *)buf_thread); + + // multiply CoulG + + ptr = buf_thread; + + // for (k = 0; k < n_complex; ++k) + for (k = 0; k < n_complex_bra; ++k) + { + *ptr++ *= CoulG[k]; + *ptr++ *= CoulG[k]; + } + + if (map_bra_2_ket != NULL) + { + ptr = buf_thread + n_complex_bra * 2; + memset(ptr, 0, sizeof(double) * n_complex_ket * 2); + for (k = 0; k < n_complex_bra; ++k) + { + ptr[2 * map_bra_2_ket[k]] = buf_thread[2 * k]; + ptr[2 * map_bra_2_ket[k] + 1] = buf_thread[2 * k + 1]; + } + ptr = buf_thread + n_complex_bra * 2; + } + else + { + ptr = buf_thread; + } + + // backward transform + + // fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf_thread, V + j * (size_t)n_real); + fftw_execute_dft_c2r(p_backward, (fftw_complex *)ptr, V + j * (size_t)n_real_ket); + + // scale + + // ptr = V + j * (size_t)n_real; + ptr = V + j * (size_t)n_real_ket; + int _size_ = n_real_ket; + dscal_(&_size_, &fac, ptr, &INC1); + + // for (k = 0; k < n_real; ++k) + // { + // *ptr++ *= fac; + // } + } + } + + // destroy plan + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + } + } +} + +void _construct_V(int *mesh, + int naux, + double *auxBasis, + double *CoulG, + double *V, + const int BunchSize, + double *buf, // use the ptr of the ptr to ensure that the memory for each thread is aligned + const int buffersize // must be a multiple of 16 to ensure memory alignment +) +{ + _construct_V_kernel(mesh, mesh, NULL, naux, auxBasis, CoulG, V, BunchSize, buf, buffersize); +} + +void _construct_V2(int *mesh, + int naux, + double *auxBasis, + double *CoulG, + double *V, + double *auxBasisFFT, + const int BunchSize, + double *buf, // use the ptr of the ptr to ensure that the memory for each thread is aligned + const int buffersize, // must be a multiple of 16 to ensure memory alignment + const int CONSTRUCT_V) +{ + // printf("CONSTRUCT_V: %d\n", CONSTRUCT_V); + + // print all the input info + + static const int SMALL_SIZE = 8; + + const int nThread = get_omp_threads(); + const int nBunch = ((naux / BunchSize) / nThread) * nThread; // dispatch evenly + const int nLeft = naux - nBunch * BunchSize; + + // print the dispatch info + + int mesh_complex[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + + const int n_real = mesh[0] * mesh[1] * mesh[2]; + const int n_complex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + const double fac = 1. 
/ (double)n_real; + + // create plan for fft + + fftw_plan p_forward = fftw_plan_many_dft_r2c( + 3, mesh, BunchSize, auxBasis, mesh, 1, n_real, (fftw_complex *)buf, mesh_complex, 1, n_complex, FFTW_ESTIMATE); + fftw_plan p_backward = fftw_plan_many_dft_c2r( + 3, mesh, BunchSize, (fftw_complex *)buf, mesh_complex, 1, n_complex, V, mesh, 1, n_real, FFTW_ESTIMATE); + + // execute parallelly sharing the same plan + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf_thread = buf + thread_id * (size_t)buffersize; + size_t bunch_i, bunch_start, bunch_end, j, k; + double *ptr; + +#pragma omp for schedule(static) + for (bunch_i = 0; bunch_i < nBunch; ++bunch_i) + // for (bunch_i = 0; bunch_i < 0; ++bunch_i) + { + bunch_start = bunch_i * BunchSize; + bunch_end = bunch_start + BunchSize; + + // forward transform + + fftw_execute_dft_r2c(p_forward, auxBasis + (size_t)bunch_start * (size_t)n_real, (fftw_complex *)buf_thread); + + // multiply CoulG + + ptr = buf_thread; + + // copy + + memcpy(auxBasisFFT + (size_t)bunch_start * (size_t)n_complex * 2, buf_thread, (size_t)BunchSize * (size_t)n_complex * sizeof(double) * 2); + + if (CONSTRUCT_V > 0) + { + for (j = bunch_start; j < bunch_end; ++j) + { + for (k = 0; k < n_complex; ++k) + { + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + } + } + + // backward transform + + fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf_thread, V + (size_t)bunch_start * (size_t)n_real); + + // scale + + ptr = V + (size_t)bunch_start * (size_t)n_real; + + for (j = bunch_start; j < bunch_end; ++j) + { + for (k = 0; k < n_real; ++k) + { + *ptr++ *= fac; /// TODO: use ISPC to accelerate + } + } + } + } + } + + // destroy plan + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + + if (nLeft > 0) + { + if ((nLeft <= SMALL_SIZE) && (nLeft <= BunchSize)) + // if (1) + { + // use single thread to handle the left + + int bunch_start = nBunch * BunchSize; + int bunch_end = bunch_start + nLeft; + + // create plan + + fftw_plan p_forward = fftw_plan_many_dft_r2c( + 3, mesh, nLeft, auxBasis + bunch_start * n_real, mesh, 1, n_real, (fftw_complex *)buf, mesh_complex, 1, n_complex, FFTW_ESTIMATE); + fftw_plan p_backward = fftw_plan_many_dft_c2r( + 3, mesh, nLeft, (fftw_complex *)buf, mesh_complex, 1, n_complex, V + bunch_start * n_real, mesh, 1, n_real, FFTW_ESTIMATE); + + // forward transform + + fftw_execute_dft_r2c(p_forward, auxBasis + (size_t)bunch_start * (size_t)n_real, (fftw_complex *)buf); + + // multiply CoulG + + double *ptr = buf; + + // copy + + memcpy(auxBasisFFT + (size_t)bunch_start * (size_t)n_complex * 2, buf, (size_t)nLeft * (size_t)n_complex * sizeof(double) * 2); + + if (CONSTRUCT_V > 0) + { + for (int j = bunch_start; j < bunch_end; ++j) + { + for (int k = 0; k < n_complex; ++k) + { + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + } + } + + // backward transform + + fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf, V + (size_t)bunch_start * (size_t)n_real); + + // scale + + ptr = V + (size_t)bunch_start * (size_t)n_real; + + for (int j = bunch_start; j < bunch_end; ++j) + { + for (int k = 0; k < n_real; ++k) + { + *ptr++ *= fac; /// TODO: use ISPC to accelerate + } + } + } + + // destroy plan + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + } + else + { + + // use parallel thread to handle the left, assume the nTransform is 1 + + int bunch_start = 
nBunch * BunchSize; + int bunch_end = bunch_start + nLeft; + + // create plan + + fftw_plan p_forward = fftw_plan_dft_r2c(3, mesh, auxBasis + bunch_start * n_real, (fftw_complex *)buf, FFTW_ESTIMATE); + fftw_plan p_backward = fftw_plan_dft_c2r(3, mesh, (fftw_complex *)buf, V + bunch_start * n_real, FFTW_ESTIMATE); + + size_t nbuf_per_thread = ((n_complex * 2 + 15) / 16) * 16; // make sure the memory is aligned + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf_thread = buf + thread_id * (size_t)nbuf_per_thread; + size_t k; + double *ptr; + +#pragma omp for schedule(static) + for (size_t j = bunch_start; j < bunch_end; ++j) + { + + // forward transform + + fftw_execute_dft_r2c(p_forward, auxBasis + j * (size_t)n_real, (fftw_complex *)buf_thread); + + // multiply CoulG + + ptr = buf_thread; + + // copy + + memcpy(auxBasisFFT + j * (size_t)n_complex * 2, buf_thread, (size_t)n_complex * sizeof(double) * 2); + + if (CONSTRUCT_V > 0) + { + for (k = 0; k < n_complex; ++k) + { + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + *ptr++ *= CoulG[k]; /// TODO: use ISPC to accelerate + } + + // backward transform + + fftw_execute_dft_c2r(p_backward, (fftw_complex *)buf_thread, V + j * (size_t)n_real); + + // scale + + ptr = V + j * (size_t)n_real; + + for (k = 0; k < n_real; ++k) + { + *ptr++ *= fac; /// TODO: use ISPC to accelerate + } + } + } + } + + // destroy plan + + fftw_destroy_plan(p_forward); + fftw_destroy_plan(p_backward); + } + } +} + +void _construct_W_multiG( + int naux, + int p0, + int p1, + double *auxBasisFFT, + double *CoulG) +{ + int ngrid = p1 - p0; + int nThread = get_omp_threads(); + + size_t i; + + const double *ptr_G = CoulG + p0; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (i = 0; i < naux; i++) + { + size_t j; + double *ptr_basis = auxBasisFFT + i * ngrid * 2; + for (j = 0; j < ngrid; j++) + { + ptr_basis[j * 2] *= ptr_G[j]; + ptr_basis[j * 2 + 1] *= ptr_G[j]; + } + } +} + +///////////// get_jk linear scaling ///////////// + +void _extract_dm_involved_ao( + double *dm, + const int nao, + double *res_buf, + const int *ao_involved, + const int nao_involved) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nao_involved; ++i) + { + for (size_t j = 0; j < nao_involved; ++j) + { + res_buf[i * nao_involved + j] = dm[ao_involved[i] * nao + ao_involved[j]]; + } + } +} + +void _extract_dm_involved_ao_RS( + double *dm, + const int nao, + double *res_buf, + const int *bra_ao_involved, + const int bra_nao_involved, + const int *ket_ao_involved, + const int ket_nao_involved) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < bra_nao_involved; ++i) + { + for (size_t j = 0; j < ket_nao_involved; ++j) + { + res_buf[i * ket_nao_involved + j] = dm[bra_ao_involved[i] * nao + ket_ao_involved[j]]; + } + } +} + +void _packadd_local_dm( + double *local_dm, + const int nao_involved, + const int *ao_involved, + double *dm, + const int nao) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nao_involved; ++i) + { + for (size_t j = 0; j < nao_involved; ++j) + { + dm[ao_involved[i] * nao + ao_involved[j]] += local_dm[i * nao_involved + j]; + } + } +} + +void _packadd_local_dm2_add_transpose( + double *local_dm, + const int bra_nao_involved, + const int *bra_ao_involved, + const int 
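/* Self-contained toy example (illustrative, not part of the patch) of the
   gather performed by _extract_dm_involved_ao above: sub[i,j] picks
   dm[ao_involved[i], ao_involved[j]]; the _packadd_local_* routines nearby
   are the matching scatter-add in the other direction. */
#include <stdio.h>

int main(void)
{
    enum { NAO = 3, NSUB = 2 };
    const int ao_involved[NSUB] = {0, 2};
    const double dm[NAO * NAO] = {1, 2, 3,
                                  4, 5, 6,
                                  7, 8, 9};
    double sub[NSUB * NSUB];

    for (int i = 0; i < NSUB; ++i)
        for (int j = 0; j < NSUB; ++j)
            sub[i * NSUB + j] = dm[ao_involved[i] * NAO + ao_involved[j]];

    printf("%g %g %g %g\n", sub[0], sub[1], sub[2], sub[3]); /* 1 3 7 9 */
    return 0;
}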
ket_nao_involved, + const int *ket_ao_involved, + double *dm, + const int nao) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < bra_nao_involved; ++i) + { + for (size_t j = 0; j < ket_nao_involved; ++j) + { + dm[bra_ao_involved[i] * nao + ket_ao_involved[j]] += local_dm[i * ket_nao_involved + j]; + dm[ket_ao_involved[j] * nao + bra_ao_involved[i]] += local_dm[i * ket_nao_involved + j]; + } + } +} + +void _packadd_local_dm2( + double *local_dm, + const int bra_nao_involved, + const int *bra_ao_involved, + const int ket_nao_involved, + const int *ket_ao_involved, + double *dm, + const int nao) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < bra_nao_involved; ++i) + { + for (size_t j = 0; j < ket_nao_involved; ++j) + { + dm[bra_ao_involved[i] * nao + ket_ao_involved[j]] += local_dm[i * ket_nao_involved + j]; + } + } +} + +void _packadd_local_RS( + double *local_dm, + const int bra_nao_involved, + const int *bra_ao_involved, + const int ket_nao_involved, + const int *ket_ao_involved, + double *dm, + const int nao) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < bra_nao_involved; ++i) + { + for (size_t j = 0; j < ket_nao_involved; ++j) + { + dm[bra_ao_involved[i] * nao + ket_ao_involved[j]] += local_dm[i * ket_nao_involved + j]; + dm[ket_ao_involved[j] * nao + bra_ao_involved[i]] += local_dm[i * ket_nao_involved + j]; + } + } +} + +void _buildJ_k_packaddrow( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *rowloc, + const int *colloc) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_source; ++i) + { + size_t row_loc = rowloc[i]; + for (size_t j = 0; j < ncol_source; ++j) + { + target[row_loc * ncol_target + colloc[j]] += source[i * ncol_source + j]; + } + } +} + +void _buildK_packaddrow( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *ao_involved) +{ + int nThread = get_omp_threads(); + + static const int INC = 1; + static const double ONE = 1.0; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_source; ++i) + { + size_t row_loc = ao_involved[i]; + // memcpy(target + row_loc * ncol_target, source + i * ncol_source, sizeof(double) * ncol_source); + daxpy_(&ncol_source, &ONE, source + i * ncol_source, &INC, target + row_loc * ncol_target, &INC); + } +} + +void _buildK_packaddrow_shift_col( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *ao_involved, + const int kmesh, + const int nao_prim, + const int *box_permutation) +{ + int nThread = get_omp_threads(); + + static const int INC = 1; + static const double ONE = 1.0; + + if (ncol_target != (kmesh * nao_prim)) + { + printf("Error: ncol_target!=(kmesh *nao_prim)\n"); + exit(1); + } + + if (ncol_source != ncol_target) + { + printf("Error: ncol_source!=ncol_target\n"); + exit(1); + } + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_source; ++i) + { + size_t row_loc = ao_involved[i]; + // memcpy(target + row_loc * ncol_target, source + i * 
ncol_source, sizeof(double) * ncol_source); + // daxpy_(&ncol_source, &ONE, source + i * ncol_source, &INC, target + row_loc * ncol_target, &INC); + for (size_t j = 0; j < kmesh; ++j) + { + daxpy_(&nao_prim, &ONE, source + i * ncol_source + j * nao_prim, &INC, target + row_loc * ncol_target + box_permutation[j] * nao_prim, &INC); + } + } +} + +void _buildK_packaddcol( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *ao_involved) + +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_source; ++i) + { + for (size_t j = 0; j < ncol_source; ++j) + { + target[i * ncol_target + ao_involved[j]] += source[i * ncol_source + j]; + } + } +} + +void _buildK_packrow( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *ao_involved) +{ + int nThread = get_omp_threads(); + + // static const int INC = 1; + // static const double ONE = 1.0; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_target; ++i) + { + size_t row_loc = ao_involved[i]; + memcpy(target + i * ncol_target, source + row_loc * ncol_source, sizeof(double) * ncol_source); + } +} + +void _buildK_packcol( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *ao_involved) +{ + int nThread = get_omp_threads(); + + // static const int INC = 1; + // static const double ONE = 1.0; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_target; ++i) + { + for (size_t j = 0; j < ncol_target; ++j) + { + target[i * ncol_target + j] = source[i * ncol_source + ao_involved[j]]; + } + } +} + +void _buildK_unpackcol( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int *source_ind) +{ + int nThread = get_omp_threads(); + + // static const int INC = 1; + // static const double ONE = 1.0; + + memset(target, 0, sizeof(double) * nrow_target * ncol_target); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_source; ++i) + { + for (size_t j = 0; j < ncol_source; ++j) + { + target[i * ncol_target + source_ind[j]] = source[i * ncol_source + j]; + } + } +} + +void _buildK_packcol2( + double *target, + const int nrow_target, + const int ncol_target, + double *source, + const int nrow_source, + const int ncol_source, + const int col_indx_begin, + const int col_indx_end) +{ + int nThread = get_omp_threads(); + + // static const int INC = 1; + // static const double ONE = 1.0; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_target; ++i) + { + memcpy(target + i * ncol_target, source + i * ncol_source + col_indx_begin, sizeof(double) * (col_indx_end - col_indx_begin)); + } +} + +void _buildK_packcol3( + double *target, + const int nrow_target, + const int ncol_target, + const int col_indx_begin, + const int col_indx_end, + double *source, + const int nrow_source, + const int ncol_source) +{ + int nThread = get_omp_threads(); + + // static const int INC = 1; + // static const double ONE = 1.0; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < nrow_target; ++i) + { + memcpy(target + i * ncol_target + 
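/* Plain-loop reference (illustrative, not part of the patch) for the
   _buildK_pack/packadd family above: _buildK_packaddrow accumulates each
   compact row into the full-matrix row selected by the index list (the
   production routine does this with one BLAS daxpy_ per row), while the
   pack/unpack variants copy rows or columns through the same kind of map. */
static void packaddrow_ref(double *target, int ncol_target,
                           const double *source, int nrow_source,
                           int ncol_source, const int *row_map)
{
    /* target[row_map[i], 0:ncol_source] += source[i, :] */
    for (int i = 0; i < nrow_source; ++i)
        for (int j = 0; j < ncol_source; ++j)
            target[(size_t)row_map[i] * ncol_target + j] +=
                source[(size_t)i * ncol_source + j];
}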
col_indx_begin, source + i * ncol_source, sizeof(double) * (col_indx_end - col_indx_begin)); + } +} + +void _buildK_copy(double *target, double *source, const size_t size) +{ + memcpy(target, source, sizeof(double) * size); +} + +////////// used in moR to density ////////// + +void moR_to_Density( + const int ngrids, + const int nMO, + const double *moR, + double *rhoR) +{ + int nThread = get_omp_threads(); + + int ngrid_per_thread = (ngrids + nThread - 1) / nThread; + + memset(rhoR, 0, sizeof(double) * ngrids); + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + int grid_start = thread_id * ngrid_per_thread; + grid_start = grid_start < ngrids ? grid_start : ngrids; + int grid_end = (thread_id + 1) * ngrid_per_thread; + grid_end = grid_end < ngrids ? grid_end : ngrids; + + for (int i = 0; i < nMO; i++) + { + for (int j = grid_start; j < grid_end; j++) + { + rhoR[j] += moR[i * ngrids + j] * moR[i * ngrids + j]; + } + } + } +} + +////////// transpose 012 -> 021 ////////// + +void transpose_012_to_021( + double *target, + double *source, + const int n1, + const int n2, + const int n3) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < n1; i++) + { + size_t shift = i * n2 * n3; + double *ptr_target = target + shift; + double *ptr_source = source + shift; + for (size_t j = 0; j < n2; j++) + { + for (size_t k = 0; k < n3; k++) + { + ptr_target[k * n2 + j] = ptr_source[j * n3 + k]; + } + } + } +} + +void transpose_012_to_021_InPlace( + double *target, + const int n1, + const int n2, + const int n3, + double *buf) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < n1; i++) + { + size_t shift = i * n2 * n3; + double *ptr_buf = buf + shift; + double *ptr_source = target + shift; + for (size_t j = 0; j < n2; j++) + { + for (size_t k = 0; k < n3; k++) + { + ptr_buf[k * n2 + j] = ptr_source[j * n3 + k]; + } + } + } + + memcpy(target, buf, sizeof(double) * n1 * n2 * n3); +} + +void contract_ipk_pk_to_ik( + double *A, + double *B, + double *C, + const int n1, + const int n2, + const int n3) +{ + int nThread = get_omp_threads(); + + memset(C, 0, sizeof(double) * n1 * n3); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < n1; i++) + { + double *ptr_A = A + i * n2 * n3; + double *ptr_B = B; + for (size_t j = 0; j < n2; j++) + { + double *ptr_res = C + i * n3; + for (size_t k = 0; k < n3; k++) + { + *ptr_res++ += *ptr_A++ * *ptr_B++; + } + } + } +} + +////////// used in CCCC for LR part in RS-ISDF ////////// + +void _unpack_aoPairR( + double *target, + const int n1, + const int n2, + const int n3, + double *source, + const int m1, + const int m2, + const int m3, + const int m2_begin, + const int m2_end, + const int *grid_involved) +{ + int nThread = get_omp_threads(); + + int ntask = n1 * (m2_end - m2_begin); + + memset(target, 0, sizeof(double) * n1 * n2 * n3); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < ntask; i++) + { + size_t i1 = i / (m2_end - m2_begin); + size_t i2 = i % (m2_end - m2_begin); + + size_t shift_target = i1 * n2 * n3 + i2 * n3; + size_t shift_source = i1 * m2 * m3 + (i2 + m2_begin) * m3; + + for (size_t j = 0; j < m3; ++j) + { + target[shift_target + grid_involved[j]] = source[shift_source + j]; + } + } +} + +void _pack_aoPairR_index1( + double *target, + const int n1, + const int n2, + const int n3, + double 
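/* Tiny self-contained check (illustrative, not part of the patch) of the
   reduction implemented by moR_to_Density above: rho(g) = sum_i moR[i,g]^2,
   with threads partitioned over grid points so no two threads ever write
   the same rhoR entry. */
#include <stdio.h>

int main(void)
{
    enum { NMO = 2, NG = 3 };
    const double moR[NMO * NG] = {1.0, 2.0, 0.0,  /* MO 0 on 3 grid points */
                                  3.0, 0.0, 1.0}; /* MO 1 */
    double rho[NG] = {0};

    for (int i = 0; i < NMO; ++i)
        for (int g = 0; g < NG; ++g)
            rho[g] += moR[i * NG + g] * moR[i * NG + g];

    printf("%g %g %g\n", rho[0], rho[1], rho[2]); /* 10 4 1 */
    return 0;
}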
*source, + const int m1, + const int m2, + const int m3, + const int m2_begin, + const int m2_end) +{ + int nThread = get_omp_threads(); + + // memset(target, 0, sizeof(double) * n1 * n2 * n3); + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (size_t i = 0; i < n1; i++) + { + size_t shift_target = i * n2 * n3; + size_t shift_source = i * m2 * m3 + m2_begin * m3; + + memcpy(target + shift_target, source + shift_source, sizeof(double) * n2 * m3); + } +} + +////////// in determing partition ////////// + +double _distance_translation(double *pa, double *pb, double *a) +{ + double dx, dx1, dx2; + double dy, dy1, dy2; + double dz, dz1, dz2; + + dx = pa[0] - pb[0]; + dx1 = dx - a[0]; + dx2 = dx + a[0]; + dx = fabs(dx); + dx1 = fabs(dx1); + dx2 = fabs(dx2); + dx = fmin(fmin(dx, dx1), dx2); + + dy = pa[1] - pb[1]; + dy1 = dy - a[1]; + dy2 = dy + a[1]; + dy = fabs(dy); + dy1 = fabs(dy1); + dy2 = fabs(dy2); + dy = fmin(fmin(dy, dy1), dy2); + + dz = pa[2] - pb[2]; + dz1 = dz - a[2]; + dz2 = dz + a[2]; + dz = fabs(dz); + dz1 = fabs(dz1); + dz2 = fabs(dz2); + dz = fmin(fmin(dz, dz1), dz2); + + return sqrt(dx * dx + dy * dy + dz * dz); +} + +void distance_between_point_atms( + double *distance, + double *pnt, + double *atm_coords, + const int natm, + const double *lattice_vector) +{ + double a[3]; + a[0] = lattice_vector[0 * 3 + 0]; + a[1] = lattice_vector[1 * 3 + 1]; + a[2] = lattice_vector[2 * 3 + 2]; + +#pragma omp parallel for schedule(static) num_threads(get_omp_threads()) + for (int i = 0; i < natm; i++) + { + distance[i] = _distance_translation(pnt, atm_coords + i * 3, a); + } +} + +void distance_between_points_atms( + double *distance, + double *pnt, + const int npnt, + double *atm_coords, + const int natm, + const double *lattice_vector) +{ + double a[3]; + a[0] = lattice_vector[0 * 3 + 0]; + a[1] = lattice_vector[1 * 3 + 1]; + a[2] = lattice_vector[2 * 3 + 2]; + +#pragma omp parallel for schedule(static) num_threads(get_omp_threads()) + for (size_t i = 0; i < npnt; i++) + { + for (size_t j = 0; j < natm; j++) + { + distance[i * natm + j] = _distance_translation(pnt + i * 3, atm_coords + j * 3, a); + } + } +} + +//////////// further linear algebra operations + +void NPdcwisemul(double *out, double *a, double *b, size_t n) +{ +#pragma omp parallel + { + size_t i; +#pragma omp for schedule(static) + for (i = 0; i < n; i++) + { + out[i] = a[i] * b[i]; + } + } +} + +void NPz2d_InPlace(double complex *in, const size_t n) +{ + // printf("n = %d\n", n); + // fflush(stdout); + + double *out = (double *)in; + + int nThread = get_omp_threads(); + + int BunchSize = n / nThread; + int nLeft = n - BunchSize * nThread; + +#pragma omp parallel num_threads(nThread) + { + size_t i; + + int tid = omp_get_thread_num(); + int start = tid * BunchSize; + int end = start + BunchSize; + + if (tid == nThread - 1) + { + end += nLeft; + } + + double *ptr_real = (double *)(in + start); + double complex *ptr_complex = in + start; + + for (i = 0; i < end - start; i++) + { + ptr_real[i] = creal(ptr_complex[i]); + } + } + + // copy back + + for (int i = 1; i < nThread; i++) + { + int start = i * BunchSize; + int end = start + BunchSize; + + if (i == nThread - 1) + { + end += nLeft; + } + + memcpy(out + start, in + start, sizeof(double) * (end - start)); + } +} + +void NPdsquare_inPlace(double *a, size_t n) +{ +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < n; i++) + { + a[i] = a[i] * a[i]; + } +} + +void NPd_ij_j_ij(double *out, double *a, double *b, size_t nrow, size_t ncol) +{ 
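/* One-axis version (illustrative, not part of the patch) of the
   minimum-image rule used by _distance_translation above for an
   orthorhombic cell: each Cartesian separation is compared against the
   separations shifted by +/- one lattice translation. */
#include <math.h>
#include <stdio.h>

static double min_image_1d(double xa, double xb, double box)
{
    double d = fabs(xa - xb);
    d = fmin(d, fabs(xa - xb - box));
    return fmin(d, fabs(xa - xb + box));
}

int main(void)
{
    /* points 0.5 and 9.8 in a box of length 10 are only 0.7 apart */
    printf("%g\n", min_image_1d(0.5, 9.8, 10.0));
    return 0;
}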
+#pragma omp parallel + { + size_t i, j; + double *pa, *pout; +#pragma omp for schedule(static) + for (i = 0; i < nrow; i++) + { + pa = a + i * ncol; + pout = out + i * ncol; + for (j = 0; j < ncol; j++) + { + pout[j] = pa[j] * b[j]; // out[i,j] = a[i,j] * b[j] + } + } + } +} + +void NPd_i_ij_ij(double *out, double *a, double *b, size_t nrow, size_t ncol) +{ +#pragma omp parallel + { + size_t i, j; + double *pb, *pout; +#pragma omp for schedule(static) + for (i = 0; i < nrow; i++) + { + pb = b + i * ncol; + pout = out + i * ncol; + for (j = 0; j < ncol; j++) + { + pout[j] = a[i] * pb[j]; // out[i,j] = a[i] * b[i,j] + } + } + } +} + diff --git a/pyscf/isdf/pbc_isdf_auxbasis.c b/pyscf/isdf/pbc_isdf_auxbasis.c new file mode 100644 index 000000000..415db61c4 --- /dev/null +++ b/pyscf/isdf/pbc_isdf_auxbasis.c @@ -0,0 +1,482 @@ +#include "vhf/fblas.h" +#include +#include +#include +#include +int get_omp_threads(); +int omp_get_thread_num(); + +void ColPivotQRRelaCut( + double *aoPaironGrid, // (nPair, nGrid) + const int nPair, + const int nGrid, + const int max_rank, + const double cutoff, // abs_cutoff + const double relacutoff, + int *pivot, + double *R, + int *npt_find, + double *thread_buffer, // (nThread, nGrid) + double *global_buffer) // nGrid +{ + static const int INC = 1; + + // printf("nPair: %d\n", nPair); + // printf("nGrid: %d\n", nGrid); + // printf("max_rank: %d\n", max_rank); + // printf("cutoff: %f\n", cutoff); + + double *Q = aoPaironGrid; + + for (int i = 0; i < nGrid; ++i) + { + pivot[i] = i; + } + + int nThread = get_omp_threads(); + *npt_find = 0; + + int *reduce_indx_buffer = (int *)(thread_buffer + nThread * nGrid); + + int i; + + int argmaxnorm = 0; + double maxnorm = 0.0; + + for (i = 0; i < max_rank; i++) + { + // printf("i: %d\n", i); + +#pragma omp parallel num_threads(nThread) + { + + int thread_id = omp_get_thread_num(); + double *buf = thread_buffer + thread_id * nGrid; + memset(buf, 0, sizeof(double) * nGrid); + + int j, k; + + double *dptr; + + //// 1. 
determine the arg of maxinaml norm + +#pragma omp for schedule(static) + for (j = 0; j < nPair; j++) + { + dptr = Q + j * nGrid; + for (k = i; k < nGrid; k++) + { + buf[k] += dptr[k] * dptr[k]; + } + } + + int bunchsize = (nGrid - i) / nThread + 1; + int begin_id = i + thread_id * bunchsize; + int end_id = i + (thread_id + 1) * bunchsize; + if (thread_id == nThread - 1) + { + end_id = nGrid; + } + + if (begin_id >= nGrid) + { + begin_id = nGrid; + } + + if (end_id > nGrid) + { + end_id = nGrid; + } + + memcpy(global_buffer + begin_id, thread_buffer + begin_id, sizeof(double) * (end_id - begin_id)); + + for (j = 1; j < nThread; j++) + { + dptr = thread_buffer + j * nGrid; + for (k = begin_id; k < end_id; ++k) + { + global_buffer[k] += dptr[k]; + } + } + + // get the local max + + if (begin_id < end_id) + { + double max_norm2 = global_buffer[begin_id]; + reduce_indx_buffer[thread_id] = begin_id; + for (j = begin_id + 1; j < end_id; j++) + { + if (global_buffer[j] > max_norm2) + { + max_norm2 = global_buffer[j]; + reduce_indx_buffer[thread_id] = j; + } + } + } + else + { + reduce_indx_buffer[thread_id] = begin_id - 1; + } + + // printf("max_norm2: %.3e\n", max_norm2); + +#pragma omp barrier + +#pragma omp single + { + // printf("--------------------------------\n"); + maxnorm = global_buffer[reduce_indx_buffer[0]]; + argmaxnorm = reduce_indx_buffer[0]; + // printf("maxnorm: %.3e\n", maxnorm); + // printf("argmaxnorm: %d\n", argmaxnorm); + for (j = 1; j < nThread; j++) + { + if (global_buffer[reduce_indx_buffer[j]] > maxnorm) + { + // printf("j = %d\n", j); + // printf("global_buffer[reduce_indx_buffer[j]]: %.3e\n", global_buffer[reduce_indx_buffer[j]]); + + maxnorm = global_buffer[reduce_indx_buffer[j]]; + argmaxnorm = reduce_indx_buffer[j]; + + // printf("maxnorm: %.3e\n", maxnorm); + // printf("argmaxnorm: %d\n", argmaxnorm); + } + } + + // printf("i = %d\n", i); + // printf("argmaxnorm = %d\n", argmaxnorm); + + int tmp; + tmp = pivot[i]; + pivot[i] = pivot[argmaxnorm]; + pivot[argmaxnorm] = tmp; + + // printf("argmaxnorm: %d\n", argmaxnorm); + // printf("tmp = %d\n", tmp); + // printf("pivot[i] = %d\n", pivot[i]); + // printf("pivot[argmaxnorm] = %d\n", pivot[argmaxnorm]); + // printf("--------------------------------\n"); + + maxnorm = sqrt(maxnorm); + R[i * nGrid + i] = maxnorm; + // printf("R[%3d,%3d] = maxnorm = %10.3e\n", i, i, maxnorm); + } + +#pragma omp barrier + + //// 2. switch + + ///// Q + +#pragma omp for schedule(static) nowait + for (j = 0; j < nPair; ++j) + { + dptr = Q + j * nGrid; + double tmp; + tmp = dptr[i]; + dptr[i] = dptr[argmaxnorm]; + dptr[argmaxnorm] = tmp; + dptr[i] /= maxnorm; + } + + ///// R + +#pragma omp for schedule(static) + for (j = 0; j < i; ++j) + { + dptr = R + i * nGrid; + double tmp; + tmp = dptr[i]; + dptr[i] = dptr[argmaxnorm]; + dptr[argmaxnorm] = tmp; + } + + //// 3. 
perform Schimidt decomposition + + ///// calculate the inner product + + memset(buf, 0, sizeof(double) * nGrid); + + int nleft = nGrid - i - 1; + +#pragma omp for schedule(static) + for (j = 0; j < nPair; ++j) + { + dptr = Q + j * nGrid; + daxpy_(&nleft, dptr + i, dptr + i + 1, &INC, buf + i + 1, &INC); + } + + bunchsize = nleft / nThread; + begin_id = i + 1 + thread_id * bunchsize; + end_id = i + 1 + (thread_id + 1) * bunchsize; + if (thread_id == nThread - 1) + { + end_id = nGrid; + } + + memcpy(global_buffer + begin_id, thread_buffer + begin_id, sizeof(double) * (end_id - begin_id)); + + for (j = 1; j < nThread; j++) + { + dptr = thread_buffer + j * nGrid; + for (k = begin_id; k < end_id; ++k) + { + global_buffer[k] += dptr[k]; + } + } + +#pragma omp barrier + + // project out + + double *inner_prod = global_buffer + i + 1; + +#pragma omp for schedule(static) nowait + for (j = 0; j < nPair; ++j) + { + dptr = Q + j * nGrid; + double alpha = -dptr[i]; + daxpy_(&nleft, &alpha, inner_prod, &INC, dptr + i + 1, &INC); + } + + // update R + +#pragma omp single + { + memcpy(R + i * nGrid + i + 1, inner_prod, sizeof(double) * nleft); + } + } + + if ((maxnorm < cutoff) || (maxnorm < R[0] * relacutoff)) + { + break; + } + else + { + (*npt_find)++; + } + } +} + +void ColPivotQR( + double *aoPaironGrid, // (nPair, nGrid) + const int nPair, + const int nGrid, + const int max_rank, + const double cutoff, + int *pivot, + double *R, + int *npt_find, + double *thread_buffer, // (nThread, nGrid) + double *global_buffer) // nGrid +{ + ColPivotQRRelaCut( + aoPaironGrid, nPair, nGrid, max_rank, cutoff, 0.0, pivot, R, npt_find, thread_buffer, global_buffer); +} + +void NP_d_ik_jk_ijk( + const double *A, + const double *B, + double *out, + const int nA, + const int nB, + const int nC) +{ + // printf("nA: %d\n", nA); + // printf("nB: %d\n", nB); + // printf("nC: %d\n", nC); + + int i, j; +#pragma omp parallel for private(i, j) + for (i = 0; i < nA * nB; ++i) + { + int i1 = i / nB; + int i2 = i % nB; + for (j = 0; j < nC; ++j) + { + out[i * nC + j] = A[i1 * nC + j] * B[i2 * nC + j]; + } + } +} + +void NPdsliceFirstCol(double *out, const double *a, size_t ncol_left, size_t nrow, size_t ncol) +{ +#pragma omp parallel + { + size_t i; +#pragma omp for schedule(static) + for (i = 0; i < nrow; i++) + { + memcpy(out + i * ncol_left, a + i * ncol, sizeof(double) * ncol_left); + } + } +} + +void CalculateNormRemained( + const double *InnerProd, // (nIP, nPntPotential) + const int nIP, + const int nPntPotential, + const double *aoPaironGrid, // (nPair, nPntPotential) + const int nPair, + double *thread_buffer, + double *global_buffer) +{ + int nThread = get_omp_threads(); + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf = thread_buffer + thread_id * nPntPotential; + memset(buf, 0, sizeof(double) * nPntPotential); + + int i, j; + + double *dptr; + const double *cdptr; + +#pragma omp for schedule(static) + for (i = 0; i < nPair; i++) + { + cdptr = aoPaironGrid + i * nPntPotential; + for (j = 0; j < nPntPotential; j++) + { + buf[j] += cdptr[j] * cdptr[j]; + } + } + + int bunchsize = nPntPotential / nThread; + int begin_id = thread_id * bunchsize; + int end_id = (thread_id + 1) * bunchsize; + if (thread_id == nThread - 1) + { + end_id = nPntPotential; + } + + memcpy(global_buffer + begin_id, thread_buffer + begin_id, sizeof(double) * (end_id - begin_id)); + + for (i = 1; i < nThread; i++) + { + dptr = thread_buffer + i * nPntPotential; + for (j = begin_id; j < end_id; j++) 
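/* Compact serial reference (illustrative sketch; no threading, R factor
   omitted) for the pivoted QR above: at step i, move the remaining column
   of largest 2-norm into position i, normalize it, and project it out of
   the trailing columns; stop at max_rank or once the pivot norm falls below
   the absolute cutoff or below rel_cut times the first pivot norm. */
#include <math.h>

static int qrcp_serial(double *A /* (m, n), row-major */, int m, int n,
                       int max_rank, double abs_cut, double rel_cut,
                       int *pivot)
{
    double first_norm = 0.0;
    for (int j = 0; j < n; ++j) pivot[j] = j;

    for (int i = 0; i < max_rank; ++i)
    {
        int arg = i; double best = 0.0;
        for (int k = i; k < n; ++k) /* column of largest remaining norm */
        {
            double s = 0.0;
            for (int r = 0; r < m; ++r) s += A[r * n + k] * A[r * n + k];
            if (s > best) { best = s; arg = k; }
        }
        double norm = sqrt(best);
        if (i == 0) first_norm = norm;
        if (norm < abs_cut || norm < rel_cut * first_norm)
            return i; /* number of pivots found so far */

        for (int r = 0; r < m; ++r) /* swap columns i <-> arg, normalize i */
        {
            double t = A[r * n + arg];
            A[r * n + arg] = A[r * n + i];
            A[r * n + i] = t / norm;
        }
        int t = pivot[i]; pivot[i] = pivot[arg]; pivot[arg] = t;

        for (int k = i + 1; k < n; ++k) /* project column i out of the rest */
        {
            double dot = 0.0;
            for (int r = 0; r < m; ++r) dot += A[r * n + i] * A[r * n + k];
            for (int r = 0; r < m; ++r) A[r * n + k] -= dot * A[r * n + i];
        }
    }
    return max_rank;
}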
+ { + global_buffer[j] += dptr[j]; + } + } + + // if (begin_id == 0) + // { + // printf("global_buffer[0]: %f\n", sqrt(global_buffer[0])); + // } + + for (i = 0; i < nIP; i++) + { + const double *dptr = InnerProd + i * nPntPotential; + for (j = begin_id; j < end_id; j++) + { + global_buffer[j] -= dptr[j] * dptr[j]; + } + } + + for (j = begin_id; j < end_id; j++) + { + global_buffer[j] = sqrt(global_buffer[j]); + } + } +} + +void PackAFirstCol( + const double *A, // + double *out, // + const int nRow, + const int nACol, + const int nFirst) +{ +} + +void PackABwithSlice( + const double *A, // + const double *B, // + double *out, // + const int nRow, + const int nACol, + const int nBCol, + const int *SliceB, + const int nSliceB, + double *Packbuf) +{ + int i, j; + int nThread = get_omp_threads(); + + const int nOutCol = nACol + nSliceB; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (i = 0; i < nRow; ++i) + { + memcpy(Packbuf + i * nOutCol, A + i * nACol, sizeof(double) * nACol); + for (j = 0; j < nSliceB; ++j) + { + Packbuf[i * nOutCol + nACol + j] = B[i * nBCol + SliceB[j]]; + } + } + + memcpy(out, Packbuf, sizeof(double) * nRow * nOutCol); +} + +void PackABwithABSlice( + const double *A, // + const double *B, // + double *out, // + const int nRow, + const int nACol, + const int nBCol, + const int *Slice, + const int nSlice, + double *Packbuf, + double *thread_buffer) +{ + int i, j; + int nThread = get_omp_threads(); + + const int nOutCol = nSlice; + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + double *buf = thread_buffer + thread_id * (nACol + nBCol); + +#pragma omp for schedule(static) + for (i = 0; i < nRow; ++i) + { + memcpy(buf, A + i * nACol, sizeof(double) * nACol); + memcpy(buf + nACol, B + i * nBCol, sizeof(double) * nBCol); + for (j = 0; j < nSlice; ++j) + { + Packbuf[i * nSlice + j] = buf[Slice[j]]; + } + } + } + memcpy(out, Packbuf, sizeof(double) * nRow * nOutCol); +} + +void PackAB( + const double *A, // + const double *B, // + double *out, // + const int nRow, + const int nACol, + const int nBCol) +{ + int i, j; + int nThread = get_omp_threads(); + + const int nOutCol = nACol + nBCol; + +#pragma omp parallel for num_threads(nThread) schedule(static) + for (i = 0; i < nRow; ++i) + { + memcpy(out + i * nOutCol, A + i * nACol, sizeof(double) * nACol); + memcpy(out + i * nOutCol + nACol, B + i * nBCol, sizeof(double) * nBCol); + } +} diff --git a/pyscf/isdf/pbc_isdf_eri.c b/pyscf/isdf/pbc_isdf_eri.c new file mode 100644 index 000000000..d63437b0e --- /dev/null +++ b/pyscf/isdf/pbc_isdf_eri.c @@ -0,0 +1,438 @@ +#include "fft.h" +#include +#include +#include +#include "vhf/fblas.h" +#include +#include "np_helper/np_helper.h" +#include + +int get_omp_threads(); +int omp_get_thread_num(); + +void _pack_aoR_to_aoPairR_diff( + double *aoR_i, + double *aoR_j, + double *aoPairR, + int nao_i, + int nao_j, + int ngrid) +{ + int nPair = nao_i * nao_j; +#pragma omp parallel for schedule(static) + for (int i = 0; i < nPair; i++) + { + int i1 = i / nao_j; + int j1 = i % nao_j; + for (int k = 0; k < ngrid; k++) + { + aoPairR[i * ngrid + k] = aoR_i[i1 * ngrid + k] * aoR_j[j1 * ngrid + k]; + } + } +} + +void _pack_aoR_to_aoPairR_same( + double *aoR, + double *aoPairR, + int nao, + int ngrid) +{ + // int nPair = nao * (nao + 1) / 2; + +#pragma omp parallel for schedule(static) + for (int i1 = 0; i1 < nao; ++i1) + { + for (int j1 = 0; j1 <= i1; ++j1) + { + int i = i1 * (i1 + 1) / 2 + j1; + for (int k = 0; k < ngrid; ++k) + { + 
aoPairR[i * ngrid + k] = aoR[i1 * ngrid + k] * aoR[j1 * ngrid + k]; + } + } + } +} + +#define COMBINE2(i, j) ((i) < (j) ? (j) * (j + 1) / 2 + i : i * (i + 1) / 2 + j) + +void _unpack_suberi_to_eri( + double *eri, + const int nao, + double *suberi, + const int nao_bra, + const int *ao_loc_bra, + const int nao_ket, + const int *ao_loc_ket, + const int add_transpose) +{ + int nPair = nao * (nao + 1) / 2; + + int nPair_ket = nao_ket * (nao_ket + 1) / 2; + // int nPair_bra = nao_bra * (nao_bra + 1) / 2; + +#pragma omp parallel for schedule(static) + for (int i1 = 0; i1 < nao_bra; ++i1) + { + for (int j1 = 0; j1 <= i1; ++j1) + { + int i = ao_loc_bra[i1]; + int j = ao_loc_bra[j1]; + int ij = COMBINE2(i, j); + int i1j1 = COMBINE2(i1, j1); + // printf("i1: %d, j1: %d, i: %d, j: %d, ij: %d, i1j1: %d\n", i1, j1, i, j, ij, i1j1); + for (int k1 = 0; k1 < nao_ket; ++k1) + { + for (int l1 = 0; l1 <= k1; ++l1) + { + int k = ao_loc_ket[k1]; + int l = ao_loc_ket[l1]; + int kl = COMBINE2(k, l); + int k1l1 = COMBINE2(k1, l1); + eri[ij * nPair + kl] += suberi[i1j1 * nPair_ket + k1l1]; + } + } + } + } + + if (add_transpose) + { +#pragma omp parallel for schedule(static) + for (int i1 = 0; i1 < nao_bra; ++i1) + { + for (int j1 = 0; j1 <= i1; ++j1) + { + int i = ao_loc_bra[i1]; + int j = ao_loc_bra[j1]; + int ij = COMBINE2(i, j); + int i1j1 = COMBINE2(i1, j1); + for (int k1 = 0; k1 < nao_ket; ++k1) + { + for (int l1 = 0; l1 <= k1; ++l1) + { + int k = ao_loc_ket[k1]; + int l = ao_loc_ket[l1]; + int kl = COMBINE2(k, l); + int k1l1 = COMBINE2(k1, l1); + eri[kl * nPair + ij] += suberi[i1j1 * nPair_ket + k1l1]; + } + } + } + } + } +} + +void _unpack_suberi_to_eri_ovov( + double *eri, + double *suberi, + const int nPair, + const int add_transpose) +{ + static const double ALPHA = 1.0; + static const int INCX = 1; + +#pragma omp parallel for schedule(static) + for (int i = 0; i < nPair; i++) + { + daxpy_(&nPair, &ALPHA, suberi + i * nPair, &INCX, eri + i * nPair, &INCX); + } + + if (add_transpose) + { +#pragma omp parallel for schedule(static) + for (int i = 0; i < nPair; i++) + { + daxpy_(&nPair, &ALPHA, suberi + i * nPair, &INCX, eri + i, &nPair); + } + } +} + +#undef COMBINE2 + +/// sliced operation /// + +void fn_slice_2_0( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int slice_0_0, + const int slice_0_1) +{ + int dim0 = slice_0_1 - slice_0_0; + +#pragma omp parallel for + for (size_t i = slice_0_0; i < slice_0_1; i++) + { + memcpy(tensor_B + (i - slice_0_0) * n1, tensor_A + i * n1, sizeof(double) * n1); + } +} + +void fn_slice_2_1( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int slice_1_0, + const int slice_1_1) +{ + int dim1 = slice_1_1 - slice_1_0; +#pragma omp parallel for + for (size_t i = 0; i < n0; i++) + { + memcpy(tensor_B + i * dim1, tensor_A + i * n1 + slice_1_0, sizeof(double) * dim1); + } +} + +void fn_slice_3_2( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int n2, + const int slice_2_0, + const int slice_2_1) +{ + int dim2 = slice_2_1 - slice_2_0; + +#pragma omp parallel for schedule(static) + for (size_t ij = 0; ij < n0 * n1; ij++) + { + int i = ij / n1; + int j = ij % n1; + memcpy(tensor_B + ij * dim2, tensor_A + i * n1 * n2 + j * n2 + slice_2_0, sizeof(double) * dim2); + } +} + +void fn_slice_3_0_2( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int n2, + const int slice_0_0, + const int slice_0_1, + const int slice_2_0, + const int 
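/* Illustrative check (not part of the patch) of the packed pair index used
   above: COMBINE2(i, j) maps an unordered pair to its offset in the
   lower-triangular ordering (0,0)(1,0)(1,1)(2,0)(2,1)(2,2) -> 0..5, the
   same layout as the nao*(nao+1)/2 compound indices of the ERI blocks
   (macro fully parenthesized here). */
#include <stdio.h>

#define COMBINE2(i, j) ((i) < (j) ? (j) * ((j) + 1) / 2 + (i) : (i) * ((i) + 1) / 2 + (j))

int main(void)
{
    for (int i = 0; i < 3; ++i)
        for (int j = 0; j <= i; ++j)
            printf("(%d,%d)->%d ", i, j, COMBINE2(i, j)); /* 0 1 2 3 4 5 */
    printf("\n");
    return 0;
}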
slice_2_1) +{ + int dim0 = slice_0_1 - slice_0_0; + int dim2 = slice_2_1 - slice_2_0; + +#pragma omp parallel for + for (size_t i = slice_0_0; i < slice_0_1; i++) + { + for (size_t j = 0; j < n1; j++) + { + memcpy(tensor_B + (i - slice_0_0) * n1 * dim2 + j * dim2, + tensor_A + i * n1 * n2 + j * n2 + slice_2_0, sizeof(double) * dim2); + } + } +} + +void fn_slice_4_0_1_2( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int n2, + const int n3, + const int slice_0_0, + const int slice_0_1, + const int slice_1_0, + const int slice_1_1, + const int slice_2_0, + const int slice_2_1) +{ + int dim1 = slice_1_1 - slice_1_0; + int dim2 = slice_2_1 - slice_2_0; + +#pragma omp parallel for + for (size_t i = slice_0_0; i < slice_0_1; i++) + { + for (size_t j = slice_1_0; j < slice_1_1; j++) + { + memcpy(tensor_B + (i - slice_0_0) * dim1 * dim2 * n3 + (j - slice_1_0) * dim2 * n3, + tensor_A + i * n1 * n2 * n3 + j * n2 * n3 + slice_2_0 * n3, sizeof(double) * dim2 * n3); + } + } +} + +void fn_slice_3_1_2( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int n2, + const int slice_1_0, + const int slice_1_1, + const int slice_2_0, + const int slice_2_1) +{ + int dim1 = slice_1_1 - slice_1_0; + int dim2 = slice_2_1 - slice_2_0; + +#pragma omp parallel for + for (size_t i = 0; i < n0; i++) + { + for (size_t j = slice_1_0; j < slice_1_1; j++) + { + memcpy(tensor_B + i * dim1 * dim2 + (j - slice_1_0) * dim2, + tensor_A + i * n1 * n2 + j * n2 + slice_2_0, sizeof(double) * dim2); + } + } +} + +void fn_slice_4_1_2( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int n2, + const int n3, + const int slice_1_0, + const int slice_1_1, + const int slice_2_0, + const int slice_2_1) +{ + int dim1 = slice_1_1 - slice_1_0; + int dim2 = slice_2_1 - slice_2_0; + +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < n0; i++) + { + for (size_t j = slice_1_0; j < slice_1_1; j++) + { + memcpy(tensor_B + i * dim1 * dim2 * n3 + (j - slice_1_0) * dim2 * n3, + tensor_A + i * n1 * n2 * n3 + j * n2 * n3 + slice_2_0 * n3, sizeof(double) * dim2 * n3); + } + } +} + +void fn_slice_3_0_1( + const double *tensor_A, + double *tensor_B, + const int n0, + const int n1, + const int n2, + const int slice_0_0, + const int slice_0_1, + const int slice_1_0, + const int slice_1_1) +{ + int dim0 = slice_0_1 - slice_0_0; + int dim1 = slice_1_1 - slice_1_0; + +#pragma omp parallel for schedule(static) + for (size_t i = slice_0_0; i < slice_0_1; i++) + { + for (size_t j = slice_1_0; j < slice_1_1; j++) + { + memcpy(tensor_B + (i - slice_0_0) * dim1 * n2 + (j - slice_1_0) * n2, + tensor_A + i * n1 * n2 + j * n2, sizeof(double) * n2); + } + } +} + +/// packadd /// + +void fn_packadd_3_1_2( + double *tensor_A, + const double *tensor_B, + const int n0, + const int n1, + const int n2, + const int slice_1_0, + const int slice_1_1, + const int slice_2_0, + const int slice_2_1) +{ + int dim1 = slice_1_1 - slice_1_0; + int dim2 = slice_2_1 - slice_2_0; + +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < n0; i++) + { + for (size_t j = slice_1_0; j < slice_1_1; j++) + { + for (size_t k = slice_2_0; k < slice_2_1; k++) + { + tensor_A[i * n1 * n2 + j * n2 + k] += tensor_B[i * dim1 * dim2 + (j - slice_1_0) * dim2 + (k - slice_2_0)]; + // printf("tensor_A[%d,%d,%d] = %f\n", i, j, k, tensor_A[i * n1 * n2 + j * n2 + k]); + } + } + } +} + +void fn_packadd_3_1( + double *tensor_A, + const double *tensor_B, + const int n0, + const 
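/* Index-level reference (illustrative, not part of the patch) for the
   fn_slice_* family defined here: fn_slice_3_1_2, for example, realizes
   B = A[:, s1:e1, s2:e2] with one memcpy per (i, j) pair; the loops below
   spell out the same copy element by element. */
static void slice_3_1_2_ref(const double *A, double *B,
                            int n0, int n1, int n2,
                            int s1, int e1, int s2, int e2)
{
    const int d1 = e1 - s1, d2 = e2 - s2;
    for (int i = 0; i < n0; ++i)
        for (int j = s1; j < e1; ++j)
            for (int k = s2; k < e2; ++k)
                B[((size_t)i * d1 + (j - s1)) * d2 + (k - s2)] =
                    A[((size_t)i * n1 + j) * n2 + k];
}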
int n1, + const int n2, + const int slice_1_0, + const int slice_1_1) +{ + int dim1 = slice_1_1 - slice_1_0; + + static const int INCX = 1; + static const double ALPHA = 1; + +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < n0; i++) + { + for (size_t j = slice_1_0; j < slice_1_1; j++) + { + daxpy_(&n2, &ALPHA, tensor_B + i * dim1 * n2 + (j - slice_1_0) * n2, &INCX, tensor_A + i * n1 * n2 + j * n2, &INCX); + } + } +} + +void fn_copy( + const double *tensor_A, + double *tensor_B, + const int size) +{ + if (tensor_A != tensor_B) + { + memcpy(tensor_B, tensor_A, sizeof(double) * size); + } +} + +void fn_add( + const double *tensor_A, + double *tensor_B, + const int size) +{ + static const int INCX = 1; + static const double ALPHA = 1; + + const int nthread = get_omp_threads(); + const int bunch_size = size / nthread + 1; + + if (size < 1024) + { + daxpy_(&size, &ALPHA, tensor_A, &INCX, tensor_B, &INCX); + return; + } + +#pragma omp parallel + { + const int ithread = omp_get_thread_num(); + int start = ithread * bunch_size; + int end = start + bunch_size; + start = start > size ? size : start; + end = end > size ? size : end; + const int n = end - start; + + if (n > 0) + { + daxpy_(&n, &ALPHA, tensor_A + start, &INCX, tensor_B + start, &INCX); + } + } +} + +void fn_clean( + double *tensor_A, + const int size) +{ + memset(tensor_A, 0, sizeof(double) * size); +} \ No newline at end of file diff --git a/pyscf/isdf/pbc_isdf_samplek.c b/pyscf/isdf/pbc_isdf_samplek.c new file mode 100644 index 000000000..b144fc157 --- /dev/null +++ b/pyscf/isdf/pbc_isdf_samplek.c @@ -0,0 +1,632 @@ +#include "vhf/fblas.h" +#include +#include +#include +#include +#include +#include "fft.h" +#include + +int get_omp_threads(); +int omp_get_thread_num(); + +void _FFT_Matrix_Col_InPlace(double *matrix, // the size of matrix should be (nRow, nCol* *mesh) + int nRow, int nCol, int *mesh, + double *buf) +{ + int mesh_complex[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + int64_t nComplex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + int64_t nReal = mesh[0] * mesh[1] * mesh[2]; + const int nThread = get_omp_threads(); + + // printf("nThread: %d\n", nThread); + // printf("nRow: %d\n", nRow); + // printf("nCol: %d\n", nCol); + // printf("mesh: %d %d %d\n", mesh[0], mesh[1], mesh[2]); + // printf("nComplex: %d\n", nComplex); + + const int64_t m = nRow; + const int64_t n = nCol * mesh[0] * mesh[1] * mesh[2]; + const int64_t n_complex = nCol * mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + const int64_t nMesh = mesh[0] * mesh[1] * mesh[2]; + const int64_t nMeshComplex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + + // printf("m: %d\n", m); + // printf("n: %d\n", n); + // printf("nMesh: %d\n", nMesh); + // printf("nMeshComplex: %d\n", nMeshComplex); + + // (1) transform (Row, Block, Col) -> (Row, Col, Block) + +#pragma omp parallel for num_threads(nThread) + for (int64_t i = 0; i < m; i++) + { + int64_t iCol = 0; + + for (int64_t iBlock = 0; iBlock < nMesh; iBlock++) + { + for (int64_t j = 0; j < nCol; j++, iCol++) + { + buf[i * n + j * nMesh + iBlock] = matrix[i * n + iCol]; + } + } + } + + // printf("finish (1) \n"); + + // (2) perform FFT on the last dimension + + int64_t nFFT = nRow * nCol; + + double __complex__ *mat_complex = (double __complex__ *)buf; + double __complex__ *buf_complex = (double __complex__ *)matrix; + + // create plan + + const int BunchSize = nFFT / nThread + 1; + +#pragma omp parallel num_threads(nThread) + { + int tid = omp_get_thread_num(); + int64_t start = 
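/* Single-row sketch (illustrative, not part of the patch) of step (1) in
   _FFT_Matrix_Col_InPlace above: permuting (Block, Col) -> (Col, Block)
   makes each of the nCol signals contiguous, matching the unit-stride
   layout handed to fftw_plan_many_dft_r2c in step (2). */
static void interleave_to_blocks(const double *in, double *out,
                                 int nBlock, int nCol)
{
    /* out[c * nBlock + b] = in[b * nCol + c] */
    for (int b = 0; b < nBlock; ++b)
        for (int c = 0; c < nCol; ++c)
            out[(size_t)c * nBlock + b] = in[(size_t)b * nCol + c];
}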
tid * BunchSize; + int64_t end = (tid + 1) * BunchSize; + if (end > nFFT) + { + end = nFFT; + } + + fftw_plan plan = fftw_plan_many_dft_r2c(3, mesh, end - start, buf + start * nReal, mesh, 1, nReal, (fftw_complex *)buf_complex + start * nComplex, mesh_complex, 1, nComplex, FFTW_ESTIMATE); + fftw_execute(plan); + fftw_destroy_plan(plan); + } + + // printf("finish (2) \n"); + + // (3) transform (Row, Col, Block) -> (Row, Block, Col) + + mat_complex = (double __complex__ *)matrix; + buf_complex = (double __complex__ *)buf; + +#pragma omp parallel for num_threads(nThread) + for (int64_t i = 0; i < m; i++) + { + int64_t iCol = 0; + + for (int64_t j = 0; j < nCol; j++) + { + for (int64_t iBlock = 0; iBlock < nMeshComplex; iBlock++, iCol++) + { + buf_complex[i * n_complex + iBlock * nCol + j] = mat_complex[i * n_complex + iCol]; + } + } + } + + // printf("finish (3) \n"); + + memcpy(matrix, buf, sizeof(double __complex__) * m * nCol * mesh_complex[0] * mesh_complex[1] * mesh_complex[2]); + + // printf("finish memcpy \n"); +} + +void _iFFT_Matrix_Col_InPlace(double __complex__ *matrix, // the size of matrix should be (nRow, nCol* *mesh) + int nRow, int nCol, int *mesh, + double __complex__ *buf) +{ + int mesh_complex[3] = {mesh[0], mesh[1], mesh[2] / 2 + 1}; + int64_t nComplex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + int64_t nReal = mesh[0] * mesh[1] * mesh[2]; + const int64_t nThread = get_omp_threads(); + + const int64_t m = nRow; + const int64_t n = nCol * mesh[0] * mesh[1] * mesh[2]; + const int64_t n_Complex = nCol * mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + const int64_t nMesh = mesh[0] * mesh[1] * mesh[2]; + const int64_t nMeshComplex = mesh_complex[0] * mesh_complex[1] * mesh_complex[2]; + const double factor = 1.0 / (double)(nMesh); + + // printf("m: %d\n", m); + // printf("n: %d\n", n); + // printf("n_Complex: %d\n", n_Complex); + // printf("nMesh: %d\n", nMesh); + // printf("nMeshComplex: %d\n", nMeshComplex); + // printf("nThread: %d\n", nThread); + // printf("nRow: %d\n", nRow); + // printf("nCol: %d\n", nCol); + // printf("mesh: %d %d %d\n", mesh[0], mesh[1], mesh[2]); + // printf("nComplex: %d\n", nComplex); + // printf("nReal: %d\n", nReal); + + // (1) transform (Row, Block, Col) -> (Row, Col, Block) + +#pragma omp parallel for num_threads(nThread) + for (int64_t i = 0; i < m; i++) + { + int64_t iCol = 0; + + for (int64_t iBlock = 0; iBlock < nMeshComplex; iBlock++) + { + for (int64_t j = 0; j < nCol; j++, iCol++) + { + buf[i * n_Complex + j * nMeshComplex + iBlock] = matrix[i * n_Complex + iCol]; + } + } + } + + // (2) perform iFFT on the last dimension + + int64_t nFFT = nRow * nCol; + + double *mat_real = (double *)buf; + double *buf_real = (double *)matrix; + + // create plan + + const int64_t BunchSize = nFFT / nThread + 1; + +#pragma omp parallel num_threads(nThread) + { + int64_t tid = omp_get_thread_num(); + int64_t start = tid * BunchSize; + int64_t end = (tid + 1) * BunchSize; + if (end > nFFT) + { + end = nFFT; + } + + fftw_plan plan = fftw_plan_many_dft_c2r(3, mesh, end - start, (fftw_complex *)buf + start * nComplex, mesh_complex, 1, nComplex, buf_real + start * nReal, mesh, 1, nReal, FFTW_ESTIMATE); + fftw_execute(plan); + fftw_destroy_plan(plan); + } + + // (3) transform (Row, Col, Block) -> (Row, Block, Col) + + mat_real = (double *)matrix; + buf_real = (double *)buf; + +#pragma omp parallel for num_threads(nThread) + for (int64_t i = 0; i < m; i++) + { + int64_t iCol = 0; + + for (int64_t j = 0; j < nCol; j++) + { + for (int64_t iBlock 
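/* A note on FFTW and threads (illustrative sketch with hypothetical sizes):
   only fftw_execute and the new-array execute functions are thread-safe;
   planner calls are not, so creating plans inside a parallel region, as
   done here, needs serialization or fftw_make_planner_thread_safe()
   (FFTW >= 3.3.5). The plan-once / execute-many pattern below sidesteps
   the issue, provided every array slice has the same layout and alignment
   as the arrays the plan was created with. */
#include <fftw3.h>

static void batched_r2c(const int mesh[3], int nvec,
                        double *in, fftw_complex *out)
{
    const size_t n_real = (size_t)mesh[0] * mesh[1] * mesh[2];
    const size_t n_cplx = (size_t)mesh[0] * mesh[1] * (mesh[2] / 2 + 1);

    /* plan once, in a single thread */
    fftw_plan plan = fftw_plan_dft_r2c_3d(mesh[0], mesh[1], mesh[2],
                                          in, out, FFTW_ESTIMATE);

#pragma omp parallel for schedule(static)
    for (int v = 0; v < nvec; ++v)
        fftw_execute_dft_r2c(plan, in + (size_t)v * n_real,
                             out + (size_t)v * n_cplx);

    fftw_destroy_plan(plan);
}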
= 0; iBlock < nMesh; iBlock++, iCol++) + { + // printf("i: %d, j: %d, iBlock: %d, iCol: %d %15.8f\n", i, j, iBlock, iCol, mat_real[i * n + iCol]); + buf_real[i * n + iBlock * nCol + j] = mat_real[i * n + iCol] * factor; + } + } + } + + memcpy(mat_real, buf_real, sizeof(double) * m * nCol * mesh[0] * mesh[1] * mesh[2]); +} + +void _FinalFFT( + double __complex__ *a, + const double __complex__ *freq, + int m, int n, int *mesh, + double __complex__ *buf) +{ + const int nThread = get_omp_threads(); + + if (n != mesh[0] * mesh[1] * mesh[2]) + { + fprintf(stderr, "The size of a is not compatible with mesh\n"); + exit(1); + } + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + + double __complex__ *buf_thread = buf + thread_id * n; + + fftw_plan plan = fftw_plan_dft_3d(mesh[0], mesh[1], mesh[2], (fftw_complex *)buf_thread, (fftw_complex *)a, FFTW_FORWARD, FFTW_ESTIMATE); + +#pragma omp for schedule(static, 1) nowait + for (size_t i = 0; i < m; i++) + { + double __complex__ *in = a + i * n; + for (int j = 0; j < n; j++) + { + buf_thread[j] = in[j] * freq[j]; + } + fftw_execute_dft(plan, (fftw_complex *)buf_thread, (fftw_complex *)in); + } + + fftw_destroy_plan(plan); + } +} + +void _FinaliFFT( + double __complex__ *a, + const double __complex__ *freq, + int m, int n, int *mesh, + double __complex__ *buf) +{ + const int nThread = get_omp_threads(); + + double factor = 1.0 / (double)n; + + if (n != mesh[0] * mesh[1] * mesh[2]) + { + printf("n: %d\n", n); + printf("mesh: %d %d %d\n", mesh[0], mesh[1], mesh[2]); + fprintf(stderr, "The size of a is not compatible with mesh\n"); + exit(1); + } + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + + double __complex__ *buf_thread = buf + thread_id * n; + + fftw_plan plan = fftw_plan_dft_3d(mesh[0], mesh[1], mesh[2], (fftw_complex *)buf_thread, (fftw_complex *)a, FFTW_BACKWARD, FFTW_ESTIMATE); + +#pragma omp for schedule(static, 1) nowait + for (size_t i = 0; i < m; i++) + { + double __complex__ *in = a + i * n; + fftw_execute_dft(plan, (fftw_complex *)in, (fftw_complex *)buf_thread); + for (int j = 0; j < n; j++) + { + // buf_thread[j] = in[j] * conj(freq[j]) * factor; + in[j] = buf_thread[j] * conj(freq[j]) * factor; + } + } + + fftw_destroy_plan(plan); + } +} + +void _PermutationConj( + double __complex__ *a, + int m, int n, int *permutation, + double __complex__ *buf) +{ + const int nThread = get_omp_threads(); + +#pragma omp parallel num_threads(nThread) + { + int thread_id = omp_get_thread_num(); + + double __complex__ *buf_thread = buf + thread_id * n; + +#pragma omp for schedule(static, 1) nowait + for (size_t i = 0; i < m; i++) + { + double __complex__ *in = a + i * n; + for (int j = 0; j < n; j++) + { + buf_thread[j] = conj(in[permutation[j]]); + // buf_thread[permutation[j]] = conj(in[j]); + } + memcpy(in, buf_thread, sizeof(double __complex__) * n); + } + } +} + +#define PI 3.14159265358979323846 + +void meshgrid(int *range1, int size1, int *range2, int size2, int *range3, int size3, int *output) +{ +#pragma omp parallel for collapse(3) + for (int i = 0; i < size1; i++) + { + for (int j = 0; j < size2; j++) + { + for (int k = 0; k < size3; k++) + { + output[(i * size2 * size3 + j * size3 + k) * 3 + 0] = range1[i]; + output[(i * size2 * size3 + j * size3 + k) * 3 + 1] = range2[j]; + output[(i * size2 * size3 + j * size3 + k) * 3 + 2] = range3[k]; + } + } + } +} + +void _FREQ( + double __complex__ *FREQ, + const int *meshPrim, + const int *Ls) +{ + int *freq1_q = (int 
*)malloc(meshPrim[0] * sizeof(int)); + int *freq2_q = (int *)malloc(meshPrim[1] * sizeof(int)); + int *freq3_q = (int *)malloc(meshPrim[2] * sizeof(int)); + + for (int i = 0; i < meshPrim[0]; i++) + { + freq1_q[i] = i; + } + for (int i = 0; i < meshPrim[1]; i++) + { + freq2_q[i] = i; + } + for (int i = 0; i < meshPrim[2]; i++) + { + freq3_q[i] = i; + } + + int *freq_q = (int *)malloc(meshPrim[0] * meshPrim[1] * meshPrim[2] * 3 * sizeof(int)); + meshgrid(freq1_q, meshPrim[0], freq2_q, meshPrim[1], freq3_q, meshPrim[2], freq_q); + + int *freq1_Q = (int *)malloc(Ls[0] * sizeof(int)); + int *freq2_Q = (int *)malloc(Ls[1] * sizeof(int)); + int *freq3_Q = (int *)malloc((Ls[2] / 2 + 1) * sizeof(int)); + + for (int i = 0; i < Ls[0]; i++) + { + freq1_Q[i] = i; + } + for (int i = 0; i < Ls[1]; i++) + { + freq2_Q[i] = i; + } + for (int i = 0; i < Ls[2] / 2 + 1; i++) + { + freq3_Q[i] = i; + } + + int *freq_Q = (int *)malloc(Ls[0] * Ls[1] * (Ls[2] / 2 + 1) * 3 * sizeof(int)); + meshgrid(freq1_Q, Ls[0], freq2_Q, Ls[1], freq3_Q, Ls[2] / 2 + 1, freq_Q); + +#pragma omp parallel for collapse(6) + for (int i = 0; i < Ls[0]; i++) + { + for (int j = 0; j < Ls[1]; j++) + { + for (int k = 0; k < Ls[2] / 2 + 1; k++) + { + for (int p = 0; p < meshPrim[0]; p++) + { + for (int q = 0; q < meshPrim[1]; q++) + { + for (int s = 0; s < meshPrim[2]; s++) + { + FREQ[(i * Ls[1] * (Ls[2] / 2 + 1) * meshPrim[0] * meshPrim[1] * meshPrim[2] + + j * (Ls[2] / 2 + 1) * meshPrim[0] * meshPrim[1] * meshPrim[2] + + k * meshPrim[0] * meshPrim[1] * meshPrim[2] + + p * meshPrim[1] * meshPrim[2] + + q * meshPrim[2] + + s)] = freq_Q[(i * Ls[1] * (Ls[2] / 2 + 1) + j * (Ls[2] / 2 + 1) + k) * 3 + 0] * freq_q[(p * meshPrim[1] * meshPrim[2] + q * meshPrim[2] + s) * 3 + 0] / (double)(Ls[0] * meshPrim[0]) + + freq_Q[(i * Ls[1] * (Ls[2] / 2 + 1) + j * (Ls[2] / 2 + 1) + k) * 3 + 1] * freq_q[(p * meshPrim[1] * meshPrim[2] + q * meshPrim[2] + s) * 3 + 1] / (double)(Ls[1] * meshPrim[1]) + + freq_Q[(i * Ls[1] * (Ls[2] / 2 + 1) + j * (Ls[2] / 2 + 1) + k) * 3 + 2] * freq_q[(p * meshPrim[1] * meshPrim[2] + q * meshPrim[2] + s) * 3 + 2] / (double)(Ls[2] * meshPrim[2]); + } + } + } + } + } + } + +#pragma omp parallel for + for (int i = 0; i < Ls[0] * Ls[1] * (Ls[2] / 2 + 1) * meshPrim[0] * meshPrim[1] * meshPrim[2]; i++) + { + FREQ[i] = cexp(-2.0 * PI * I * FREQ[i]); + } + + free(freq1_q); + free(freq2_q); + free(freq3_q); + free(freq_q); + free(freq1_Q); + free(freq2_Q); + free(freq3_Q); + free(freq_Q); +} + +#undef PI + +void _permutation(int nx, int ny, int nz, int shift_x, int shift_y, int shift_z, int *res) +{ + +#pragma omp parallel for collapse(3) + for (int ix = 0; ix < nx; ix++) + { + for (int iy = 0; iy < ny; iy++) + { + for (int iz = 0; iz < nz; iz++) + { + int ix2 = (nx - ix - shift_x) % nx; + int iy2 = (ny - iy - shift_y) % ny; + int iz2 = (nz - iz - shift_z) % nz; + int loc = ix2 * ny * nz + iy2 * nz + iz2; + int loc_now = ix * ny * nz + iy * nz + iz; + res[loc] = loc_now; + } + } + } +} + +void _get_permutation( + const int *meshPrim, + int *res) +{ + int nGridPrim = meshPrim[0] * meshPrim[1] * meshPrim[2]; + +#pragma omp parallel sections + { +#pragma omp section + _permutation(meshPrim[0], meshPrim[1], meshPrim[2], 0, 0, 0, &res[0 * nGridPrim]); + +#pragma omp section + _permutation(meshPrim[0], meshPrim[1], meshPrim[2], 0, 0, 1, &res[1 * nGridPrim]); + +#pragma omp section + _permutation(meshPrim[0], meshPrim[1], meshPrim[2], 0, 1, 0, &res[2 * nGridPrim]); + +#pragma omp section + _permutation(meshPrim[0], meshPrim[1], meshPrim[2], 0, 1, 
+// Given a signed FFT frequency, return its location in the standard FFT
+// output ordering of a mesh of the given size, or -1 if the frequency is
+// not representable on that mesh.
+int _get_loc(
+    const int freq,
+    const int mesh)
+{
+    int max_freq = mesh / 2;
+    int min_freq = -mesh / 2;
+
+    if (mesh % 2 == 0)
+    {
+        max_freq = mesh / 2 - 1;
+        min_freq = -mesh / 2;
+    }
+
+    if (freq > max_freq || freq < min_freq)
+    {
+        return -1;
+    }
+
+    if (freq >= 0)
+    {
+        return freq;
+    }
+    else
+    {
+        int shift = mesh / 2;
+        if (mesh % 2 == 1)
+        {
+            shift += 1;
+        }
+        return (freq - min_freq) + shift;
+    }
+}
+
+// rfft variant: for a real signal both the frequency and the location must
+// be non-negative.
+int _get_loc2(
+    const int freq,
+    const int mesh)
+{
+    int max_freq = (mesh / 2) + 1;
+
+    if (freq >= 0 && freq < max_freq)
+    {
+        return freq;
+    }
+    else
+    {
+        return -1;
+    }
+}
+
+// Inverse of _get_loc: map a location in FFT output ordering back to the
+// signed frequency stored there.
+int _get_freq(
+    const int loc,
+    const int mesh)
+{
+    int mid_loc = mesh / 2;
+    if (mesh % 2 == 1)
+    {
+        mid_loc += 1;
+    }
+
+    if ((loc < 0) || (loc >= mesh))
+    {
+        printf("loc: %d, mesh: %d\n", loc, mesh);
+        exit(1);
+    }
+
+    if (loc < mid_loc)
+    {
+        return loc;
+    }
+    else
+    {
+        return loc - mesh;
+    }
+}
+
+// rfft variant of _get_freq.
+int _get_freq2(
+    const int loc,
+    const int mesh)
+{
+    int loc_max = mesh / 2 + 1;
+
+    if (loc >= 0 && loc < loc_max)
+    {
+        return loc;
+    }
+    else
+    {
+        return -1;
+    }
+}
+
+// For every point of the source mesh, find the location holding the same
+// FFT frequency on the target mesh; res[i] = -1 marks frequencies the
+// target mesh cannot represent.
+void map_fftfreq(int *mesh_source, int *mesh_target, int *res)
+{
+    int nGrid = mesh_source[0] * mesh_source[1] * mesh_source[2];
+
+#pragma omp parallel for
+    for (int i = 0; i < nGrid; i++)
+    {
+        int ix_loc = i / (mesh_source[1] * mesh_source[2]);
+        int iy_loc = (i % (mesh_source[1] * mesh_source[2])) / mesh_source[2];
+        int iz_loc = i % mesh_source[2];
+
+        int ix_freq = _get_freq(ix_loc, mesh_source[0]);
+        int iy_freq = _get_freq(iy_loc, mesh_source[1]);
+        int iz_freq = _get_freq(iz_loc, mesh_source[2]);
+
+        int ix_target = _get_loc(ix_freq, mesh_target[0]);
+        int iy_target = _get_loc(iy_freq, mesh_target[1]);
+        int iz_target = _get_loc(iz_freq, mesh_target[2]);
+
+        if (ix_target == -1 || iy_target == -1 || iz_target == -1)
+        {
+            res[i] = -1;
+        }
+        else
+        {
+            res[i] = ix_target * mesh_target[1] * mesh_target[2] + iy_target * mesh_target[2] + iz_target;
+        }
+    }
+}
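+/*
+ * Example of the mapping above, one axis at a time: take a 4-point source
+ * axis onto a 6-point target axis.  The 4-point FFT ordering stores the
+ * frequencies [0, 1, -2, -1]; on the 6-point mesh (ordering
+ * [0, 1, 2, -3, -2, -1]) those frequencies sit at locations [0, 1, 4, 5].
+ * In the opposite direction the 6-point frequencies 2 and -3 have no
+ * 4-point counterpart, so _get_loc returns -1 and res[i] is set to -1.
+ */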
+// rfft analogue of map_fftfreq: the source grid is the half-spectrum of a
+// real FFT, so the z axis carries the non-negative frequencies only.
+void map_rfftfreq(int *mesh_source, int *mesh_target, int *res)
+{
+    int nGrid = mesh_source[0] * mesh_source[1] * (mesh_source[2] / 2 + 1);
+
+#pragma omp parallel for
+    for (int i = 0; i < nGrid; i++)
+    {
+        int ix_loc = i / (mesh_source[1] * (mesh_source[2] / 2 + 1));
+        int iy_loc = (i % (mesh_source[1] * (mesh_source[2] / 2 + 1))) / (mesh_source[2] / 2 + 1);
+        int iz_loc = i % (mesh_source[2] / 2 + 1);
+
+        int ix_freq = _get_freq(ix_loc, mesh_source[0]);
+        int iy_freq = _get_freq(iy_loc, mesh_source[1]);
+        int iz_freq = _get_freq2(iz_loc, mesh_source[2]);
+
+        if (iz_freq == -1)
+        {
+            printf("iz_loc: %d, mesh_source[2]: %d\n", iz_loc, mesh_source[2]);
+            exit(1);
+        }
+
+        int ix_target = _get_loc(ix_freq, mesh_target[0]);
+        int iy_target = _get_loc(iy_freq, mesh_target[1]);
+        int iz_target = _get_loc2(iz_freq, mesh_target[2]);
+
+        if (ix_target == -1 || iy_target == -1 || iz_target == -1)
+        {
+            res[i] = -1;
+        }
+        else
+        {
+            res[i] = ix_target * mesh_target[1] * (mesh_target[2] / 2 + 1) + iy_target * (mesh_target[2] / 2 + 1) + iz_target;
+        }
+    }
+}
\ No newline at end of file
diff --git a/pyscf/isdf/pbc_isdf_sparse.c b/pyscf/isdf/pbc_isdf_sparse.c
new file mode 100644
index 000000000..8f542ce25
--- /dev/null
+++ b/pyscf/isdf/pbc_isdf_sparse.c
@@ -0,0 +1,419 @@
+#include "fft.h"
+#include <stdlib.h>
+#include "vhf/fblas.h"
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include <stdint.h>
+
+int get_omp_threads();
+int omp_get_thread_num();
+
+// Count, per row, the entries of dm whose magnitude exceeds cutoff.
+// nElmtRow must hold at least nao entries; nNonZeroElmt receives the total.
+void _process_dm(
+    const double *dm,
+    const int nao,
+    const double cutoff,
+    int *nElmtRow,
+    int *nNonZeroElmt)
+{
+    *nNonZeroElmt = 0;
+    int nThread = get_omp_threads();
+
+#pragma omp parallel num_threads(nThread)
+    {
+        int i = 0;
+
+        int NonZeroFound = 0;
+
+#pragma omp for schedule(dynamic)
+        for (i = 0; i < nao; i++)
+        {
+            int nNonZero = 0;
+            for (int j = 0; j < nao; j++)
+            {
+                if (fabs(dm[i * nao + j]) > cutoff)
+                {
+                    nNonZero++;
+                }
+            }
+            nElmtRow[i] = nNonZero;
+            NonZeroFound += nNonZero;
+        }
+
+#pragma omp critical
+        {
+            *nNonZeroElmt += NonZeroFound;
+        }
+    }
+}
+
+// Pack dm into CSR form: RowLoc (nao+1 row offsets), ColIndx (column
+// indices) and dm_sparse (values), using the row counts from _process_dm.
+void _compress_dm(
+    const double *dm,
+    const int nao,
+    const double cutoff,
+    const int *nElmtRow,
+    int *RowLoc,
+    int *ColIndx,
+    double *dm_sparse)
+{
+    RowLoc[0] = 0;
+    for (int i = 0; i < nao; i++)
+    {
+        RowLoc[i + 1] = RowLoc[i] + nElmtRow[i];
+    }
+
+    int nThread = get_omp_threads();
+
+#pragma omp parallel num_threads(nThread)
+    {
+        int i = 0;
+
+        double *dm_ptr;
+        int *indx_ptr;
+
+#pragma omp for schedule(dynamic)
+        for (i = 0; i < nao; i++)
+        {
+            dm_ptr = dm_sparse + RowLoc[i];
+            indx_ptr = ColIndx + RowLoc[i];
+            for (int j = 0; j < nao; j++)
+            {
+                if (fabs(dm[i * nao + j]) > cutoff)
+                {
+                    *dm_ptr++ = dm[i * nao + j];
+                    *indx_ptr++ = j;
+                }
+            }
+        }
+    }
+}
+
+// out = dm_sparse * aoR, parallelized over the rows of the sparse dm.
+void _dm_aoR_spMM(
+    const double *dm_sparse,
+    const int *RowLoc,
+    const int *ColIndx,
+    const double *aoR,
+    const int nao,
+    const int ngrids,
+    double *out)
+{
+    static const int ONE = 1;
+
+    int nThread = get_omp_threads();
+
+#pragma omp parallel num_threads(nThread)
+    {
+        int i = 0;
+
+        double *out_ptr;
+        const double *aoR_ptr;
+        const double *dm_ptr;
+        const int *indx_ptr;
+
+#pragma omp for schedule(dynamic)
+        for (i = 0; i < nao; i++)
+        {
+            out_ptr = out + i * ngrids;
+            dm_ptr = dm_sparse + RowLoc[i];
+            indx_ptr = ColIndx + RowLoc[i];
+            memset(out_ptr, 0, sizeof(double) * ngrids);
+            for (int j = 0; j < RowLoc[i + 1] - RowLoc[i]; j++)
+            {
+                aoR_ptr = aoR + indx_ptr[j] * ngrids;
+                daxpy_(&ngrids, dm_ptr + j, aoR_ptr, &ONE, out_ptr, &ONE);
+            }
+        }
+    }
+}
+
+void NPdcwisemul(double *out, double *a, double *b, size_t n);
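+/*
+ * Usage sketch for the three routines above (kept out of the build with
+ * #if 0; the helper name _example_dm_aoR is illustrative only and not part
+ * of the library).  Error handling is omitted: count the significant
+ * entries, compress the density matrix to CSR, then contract with aoR so
+ * that out[i, :] = sum_j dm[i, j] * aoR[j, :].
+ */
+#if 0
+static void _example_dm_aoR(const double *dm, const double *aoR,
+                            int nao, int ngrids, double cutoff, double *out)
+{
+    int nNonZero = 0;
+    int *nElmtRow = (int *)malloc(sizeof(int) * (nao + 1));
+    _process_dm(dm, nao, cutoff, nElmtRow, &nNonZero);
+
+    int *RowLoc = (int *)malloc(sizeof(int) * (nao + 1));
+    int *ColIndx = (int *)malloc(sizeof(int) * nNonZero);
+    double *dm_sparse = (double *)malloc(sizeof(double) * nNonZero);
+    _compress_dm(dm, nao, cutoff, nElmtRow, RowLoc, ColIndx, dm_sparse);
+
+    _dm_aoR_spMM(dm_sparse, RowLoc, ColIndx, aoR, nao, ngrids, out);
+
+    free(nElmtRow);
+    free(RowLoc);
+    free(ColIndx);
+    free(dm_sparse);
+}
+#endif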
+// Form out = V .* dmRgR elementwise, zero every entry below cutoff, and
+// decide row by row whether the product is sparse enough to store packed.
+void _cwise_product_check_Sparsity(
+    const double *V,
+    const double *dmRgR,
+    double *out,
+    const int naux,
+    const int ngrids,
+    const double cutoff,
+    double *buf,
+    int *UseSparsity,
+    int *IsSparsity)
+{
+    static const double COMPRESS_CRITERION = 0.15;
+
+    srand(time(NULL)); // seed the RNG from the current time
+
+    *UseSparsity = 1;
+    int nThread = get_omp_threads();
+
+    int nNonZeroElmt = 0;
+
+    NPdcwisemul(out, V, dmRgR, naux * ngrids);
+
+#pragma omp parallel num_threads(nThread)
+    {
+        int i = 0;
+        int nNonZero = 0;
+
+#pragma omp for schedule(static) nowait
+        for (i = 0; i < naux * ngrids; i++)
+        {
+            if (fabs(out[i]) > cutoff)
+            {
+                nNonZero++;
+            }
+            else
+            {
+                out[i] = 0.0;
+            }
+        }
+
+#pragma omp critical
+        {
+            nNonZeroElmt += nNonZero;
+        }
+    }
+
+    double sparsity = (double)nNonZeroElmt / (naux * ngrids);
+    printf("sparsity: %8.2f %%\n", sparsity * 100);
+
+    if (sparsity < COMPRESS_CRITERION)
+    {
+        *UseSparsity = 1;
+    }
+    else
+    {
+        *UseSparsity = 0;
+    }
+
+    if (*UseSparsity == 1)
+    {
+        const int nMaxElmt = (int)(ngrids * COMPRESS_CRITERION * 2);
+
+        int nDense = 0;
+
+#pragma omp parallel num_threads(nThread)
+        {
+            int32_t *nElmt_ptr, *indx_ptr;
+            double *Elmt_ptr, *out_ptr;
+
+            int thread_id = omp_get_thread_num();
+            double *buf_thread = buf + thread_id * ngrids;
+
+#pragma omp for schedule(static) nowait
+            for (int i = 0; i < naux; i++)
+            {
+                // pack the row into buf_thread: the element count and the
+                // column indices grow from the front, the values from the back
+                nElmt_ptr = (int32_t *)buf_thread;
+                indx_ptr = (int32_t *)((char *)buf_thread + sizeof(int32_t));
+                Elmt_ptr = buf_thread + ngrids - 1;
+                out_ptr = out + i * ngrids;
+
+                *nElmt_ptr = 0;
+                for (int j = 0; j < ngrids; j++)
+                {
+                    if (fabs(out_ptr[j]) > cutoff)
+                    {
+                        *Elmt_ptr-- = out_ptr[j];
+                        *indx_ptr++ = j;
+                        *nElmt_ptr += 1;
+                    }
+                }
+                if (*nElmt_ptr > nMaxElmt)
+                {
+                    // too many entries: keep this row dense
+                    IsSparsity[i] = 0;
+
+#pragma omp atomic
+                    nDense++;
+                }
+                else
+                {
+                    IsSparsity[i] = 1;
+                    memcpy(out_ptr, buf_thread, sizeof(double) * ngrids);
+                }
+            }
+        }
+        printf("nDense: %d \n", nDense);
+    }
+}
+
+// out = (V .* dm) * aoR^T with aoR stored as (nao, ngrids); rows flagged
+// sparse use the packed layout written by _cwise_product_check_Sparsity.
+void _V_Dm_product_SpMM(
+    const double *V_Dm_Product,
+    const int *IsSparsity,
+    const double *aoR,
+    const int nao,
+    const int naux,
+    const int ngrids,
+    double *out)
+{
+    static const int ONE = 1;
+
+    int nThread = get_omp_threads();
+
+#pragma omp parallel num_threads(nThread)
+    {
+        int i = 0, j = 0, k = 0;
+
+        double *out_ptr;
+
+        const int32_t *nElmt_ptr, *indx_ptr;
+        const double *Elmt_ptr;
+        const double *aoR_ptr;
+
+        int32_t nElmt;
+
+#pragma omp for schedule(static) nowait
+        for (i = 0; i < naux; i++)
+        {
+            if (IsSparsity[i] == 0)
+            {
+                out_ptr = out + i * nao;
+                memset(out_ptr, 0, sizeof(double) * nao);
+
+                for (j = 0; j < nao; j++)
+                {
+                    out_ptr[j] = ddot_(&ngrids, aoR + j * ngrids, &ONE, V_Dm_Product + i * ngrids, &ONE);
+                }
+            }
+            else
+            {
+                // NOTE: this scalar fallback is extremely slow
+                nElmt_ptr = (const int32_t *)(V_Dm_Product + i * ngrids);
+                indx_ptr = nElmt_ptr + 1;
+                Elmt_ptr = V_Dm_Product + (i + 1) * ngrids - 1;
+                nElmt = *nElmt_ptr;
+
+                out_ptr = out + i * nao;
+                memset(out_ptr, 0, sizeof(double) * nao);
+
+                if (nElmt == 0)
+                {
+                    continue;
+                }
+
+                for (j = 0; j < nao; j++)
+                {
+                    aoR_ptr = aoR + j * ngrids;
+                    for (k = 0; k < nElmt; k++)
+                    {
+                        out_ptr[j] += aoR_ptr[indx_ptr[k]] * Elmt_ptr[-k];
+                    }
+                }
+            }
+        }
+    }
+}
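+/*
+ * Layout of a packed sparse row as read by the two SpMM kernels (written
+ * by _cwise_product_check_Sparsity into a row buffer of ngrids doubles):
+ *
+ *   [int32 nElmt][int32 idx[0] .. idx[nElmt-1]] .... [val[nElmt-1] .. val[0]]
+ *    front: count and column indices                 back: values in reverse
+ *
+ * so value k of row i is read as Elmt_ptr[-k] with
+ * Elmt_ptr = V_Dm_Product + (i + 1) * ngrids - 1.
+ */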
+// Variant of _V_Dm_product_SpMM taking the transposed AO values aoRT,
+// stored as (ngrids, nao), so each inner update is a daxpy over nao.
+void _V_Dm_product_SpMM2(
+    const double *V_Dm_Product,
+    const int *IsSparsity,
+    const double *aoRT,
+    const int nao,
+    const int naux,
+    const int ngrids,
+    double *out)
+{
+    static const int ONE = 1;
+
+    int nThread = get_omp_threads();
+
+#pragma omp parallel num_threads(nThread)
+    {
+        int i = 0, j = 0;
+
+        double *out_ptr;
+
+        const int32_t *nElmt_ptr, *indx_ptr;
+        const double *Elmt_ptr;
+        const double *aoR_ptr;
+
+        int32_t nElmt;
+
+#pragma omp for schedule(static) nowait
+        for (i = 0; i < naux; i++)
+        {
+            if (IsSparsity[i] == 0)
+            {
+                out_ptr = out + i * nao;
+                memset(out_ptr, 0, sizeof(double) * nao);
+
+                // accumulate the dense row: summation over all grid points
+                for (j = 0; j < ngrids; j++)
+                {
+                    daxpy_(&nao, V_Dm_Product + i * ngrids + j, aoRT + j * nao, &ONE, out_ptr, &ONE);
+                }
+            }
+            else
+            {
+                nElmt_ptr = (const int32_t *)(V_Dm_Product + i * ngrids);
+                indx_ptr = nElmt_ptr + 1;
+                Elmt_ptr = V_Dm_Product + (i + 1) * ngrids - 1;
+                nElmt = *nElmt_ptr;
+
+                out_ptr = out + i * nao;
+                memset(out_ptr, 0, sizeof(double) * nao);
+
+                if (nElmt == 0)
+                {
+                    continue;
+                }
+
+                // only the stored grid points contribute
+                for (j = 0; j < nElmt; j++)
+                {
+                    aoR_ptr = aoRT + indx_ptr[j] * nao;
+                    daxpy_(&nao, Elmt_ptr - j, aoR_ptr, &ONE, out_ptr, &ONE);
+                }
+            }
+        }
+    }
+}
+
+//////// BASIC OPERATION USED IN GET_JK for k_ISDF ////////
+
+// Elementwise product of a window of a dense global matrix with a local
+// block whose rows are the subset of AOs listed in ao_invovled.
+void dcwisemul_dense_sparse_kernel(
+    double *out,
+    const double *global,
+    const int nrow_global,
+    const int ncol_global,
+    const int row_shift,
+    const int col_shift,
+    const double *local,
+    const int *ao_invovled,
+    const int row_local,
+    const int col_local,
+    const int row_begin,
+    const int row_end,
+    const int col_begin,
+    const int col_end)
+{
+    const int nrow = row_end - row_begin;
+    const int ncol = col_end - col_begin;
+    memset(out, 0, sizeof(double) * nrow * ncol);
+
+    int nthread = get_omp_threads();
+
+    const double *global_head = global + row_shift * ncol_global + col_shift;
+
+#pragma omp parallel for num_threads(nthread) schedule(static)
+    for (int i = 0; i < row_local; i++)
+    {
+        const int irow = ao_invovled[i];
+        if (irow < row_begin || irow >= row_end)
+        {
+            continue; // this AO row lies outside the requested window
+        }
+        const double *p_global = global_head + (irow - row_begin) * ncol_global;
+        const double *p_local = local + i * col_local;
+        double *p_out = out + i * ncol;
+        for (int j = 0; j < ncol; j++)
+        {
+            p_out[j] = p_global[j] * p_local[j];
+        }
+    }
+}
\ No newline at end of file
diff --git a/pyscf/lib/CMakeLists.txt b/pyscf/lib/CMakeLists.txt
index 1e65e1a31..dcd9c5e42 100644
--- a/pyscf/lib/CMakeLists.txt
+++ b/pyscf/lib/CMakeLists.txt
@@ -120,3 +120,16 @@ set_target_properties (clib_pdft PROPERTIES
 CLEAN_DIRECT_OUTPUT 1
 LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}
 OUTPUT_NAME "pdft")
+
+# Build the ISDF library
+add_library(isdf SHARED
+../isdf/pbc_isdf_samplek.c
+../isdf/pbc_isdf_V.c
+../isdf/pbc_isdf_auxbasis.c
+../isdf/pbc_isdf_sparse.c
+../isdf/pbc_isdf_eri.c
+../isdf/fft.c)
+
+target_link_libraries(isdf cgto cint cvhf np_helper fftw3_threads fftw3 ${BLAS_LIBRARIES} ${OPENMP_C_PROPERTIES})
+set_target_properties(isdf PROPERTIES
+LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR})
\ No newline at end of file