# generate_features.py

import argparse
import os
import time
from glob import glob
from pathlib import Path

import pandas as pd
import progressbar
from biopandas.mol2 import PandasMol2
from biopandas.pdb import PandasPdb
from scipy import spatial

# Lookup table from PDB atom names to element symbols. generate_features maps
# the protein "atom_name" column through this dict to (re)build the
# "element_symbol" column; atom names that are missing from the table come out
# of that mapping as NaN and therefore match none of the entries in `elements`.
atom_types = {
    "OE1": "O",
    "HB1": "H",
    "SD": "S",
    "HE": "H",
    "1HD1": "H",
    "1HG1": "H",
    "2HD2": "H",
    "1HD2": "H",
    "CE3": "C",
    "OH": "O",
    "CZ": "C",
    "HG2": "H",
    "HN2": "H",
    "NZ": "N",
    "HN1": "H",
    "3HD2": "H",
    "CD2": "C",
    "2HH2": "H",
    "HH2": "H",
    "O": "O",
    "2HD1": "H",
    "ND1": "N",
    "HH": "H",
    "1HE2": "H",
    "HB": "H",
    "NH2": "N",
    "3HG1": "H",
    "ND2": "N",
    "CZ3": "C",
    "HA2": "H",
    "OG": "O",
    "CG2": "C",
    "CE": "C",
    "SG": "S",
    "NE": "N",
    "CG": "C",
    "CB": "C",
    "HG1": "H",
    "NH1": "N",
    "2HE2": "H",
    "3HD1": "H",
    "1HH2": "H",
    "HD2": "H",
    "HD1": "H",
    "NE1": "N",
    "HB2": "H",
    "HA": "H",
    "3HG2": "H",
    "HN3": "H",
    "HE1": "H",
    "CD": "C",
    "HZ3": "H",
    "OD1": "O",
    "N": "N",
    "H": "H",
    "HA1": "H",
    "2HH1": "H",
    "NE2": "N",
    "CE2": "C",
    "C": "C",
    "OD2": "O",
    "2HG1": "H",
    "CD1": "C",
    "HE3": "H",
    "1HH1": "H",
    "2HG2": "H",
    "HB3": "H",
    "CE1": "C",
    "OXT": "O",
    "CH2": "C",
    "1HG2": "H",
    "HZ1": "H",
    "OG1": "O",
    "HZ2": "H",
    "CA": "C",
    "CG1": "C",
    "HE2": "H",
    "CZ2": "C",
    "OE2": "O",
    "HG": "H",
    "HZ": "H",
}

# Residue-name groups used to partition protein atoms. The position of a group
# in this list is the numeric suffix of the generated feature/column names.
# NOTE(review): the grouping looks like charged / polar / amphipathic /
# hydrophobic residues -- confirm against the method this script implements.
amino_acid_groups = [
    ["ARG", "LYS", "ASP", "GLU"],
    ["GLN", "ASN", "HIS", "SER", "THR", "CYS"],
    ["TRP", "TYR", "MET"],
    ["ILE", "LEU", "PHE", "VAL", "PRO", "GLY", "ALA"],
]

# Element symbols considered on both the ligand and the protein side; one
# feature is produced per (ligand element, protein element, residue group).
# The values of `atom_types` only ever contain H, C, N, O and S, so features
# whose protein element is P, F, Cl, Br or I are always zero.
elements = ["H", "C", "N", "O", "S", "P", "F", "Cl", "Br", "I"]


def generate_columns_name(elements, amino_acid_groups):
    """
       Build the feature names used as column headers of the .csv file.

    Names have the form "<ligand element>_<protein element>_<group index>"
    and are numbered in the order: ligand element (outermost), protein
    element, amino acid group index (innermost).

    Parameters:
       elements (list): Element symbols (strings).
       amino_acid_groups (list): Classification of amino acids; only the
           position of each group is used.

    Returns:
       columns_name(dict): Maps the running feature index (int, from 0) to
           its feature name.
    """
    labels = [
        "_".join((ligand_symbol, protein_symbol, str(group_index)))
        for ligand_symbol in elements
        for protein_symbol in elements
        for group_index, _ in enumerate(amino_acid_groups)
    ]

    return dict(enumerate(labels))


def ligand_specific_element_coordinates(ligand, element, complex=False):
    """
       Return the coordinates of the ligand atoms of one element.

    Parameters:
       ligand (pandas dataframe): Ligand atom table with an "element_symbol"
           column plus the coordinate columns selected by `complex`.
       element (str): Element symbol to select.
       complex (bool): If True the coordinates are read from the PDB-style
           columns "x_coord", "y_coord", "z_coord"; otherwise from the
           mol2-style columns "x", "y", "z".

    Returns:
       coordinates(numpy array): One row per matching atom, three columns.
    """
    # The two input formats only differ in how the coordinate columns are named.
    if complex:
        axis_columns = ["x_coord", "y_coord", "z_coord"]
    else:
        axis_columns = ["x", "y", "z"]

    is_element = ligand["element_symbol"] == element

    return ligand.loc[is_element, axis_columns].to_numpy()


def protein_specific_element_coordinates(protein, element, amino_acid_group):
    """
       Return the coordinates of the protein atoms of one element that belong
       to residues of one amino acid group.

    Parameters:
       protein (pandas dataframe): Protein atom table with "residue_name",
           "element_symbol", "x_coord", "y_coord" and "z_coord" columns.
       element (str): Element symbol to select.
       amino_acid_group (list): Residue names that form the group.

    Returns:
       coordinates(numpy array): One row per matching atom, three columns.
    """
    in_group = protein["residue_name"].isin(amino_acid_group)
    is_element = protein["element_symbol"] == element

    # An atom is kept only when both the residue and the element match.
    selected = protein.loc[in_group & is_element, ["x_coord", "y_coord", "z_coord"]]

    return selected.to_numpy()


def weighting_sum(distances, cutoff, exp):
    """
       Return the distance-weighted contact feature: the sum of 1 / d**exp
       over all distances d that are strictly smaller than the cutoff.

    Parameters:
       distances (numpy array): Distances between ligand and protein atoms.
       cutoff (float): Distance cutoff; distances equal to or larger than the
           cutoff are ignored.
       exp (float): Weighting exponent.

    Returns:
       feature(float): Generated feature; 0.0 when no distance is below the
           cutoff (including empty input).
    """
    selected_distances = distances[distances < cutoff]

    # Evaluate the weights as one array expression instead of a Python-level
    # map/lambda over every element; float() keeps the return type the same
    # for empty and non-empty input.
    return float((1.0 / selected_distances ** exp).sum())


def _load_structures(entry, complex):
    """Read one entry and return its (ligand, protein) atom tables.

    In complex mode `entry` is a single PDB file: its HETATM records are
    used as the ligand and its ATOM records as the protein. Otherwise
    `entry` is a directory and the first "*.mol2" / "*.pdb" files found in it
    are used as ligand / protein.

    Raises whatever the readers raise on unreadable input, and
    FileNotFoundError when a required file is missing.
    """
    if complex:
        pdf = PandasPdb().read_pdb(str(entry))
        pmol = pdf.df["HETATM"]
        ppdb = pdf.df["ATOM"]

    else:
        ligand_files = glob(os.path.join(str(entry), "*.mol2"))
        protein_files = glob(os.path.join(str(entry), "*.pdb"))

        if not ligand_files or not protein_files:
            raise FileNotFoundError(f"No .mol2/.pdb pair found in {entry}")

        pmol = PandasMol2().read_mol2(ligand_files[0]).df
        # mol2 atom types look like "C.3" or "N.am"; the part before the dot
        # is used as the element symbol.
        pmol["element_symbol"] = pmol["atom_type"].apply(lambda x: x.split(".")[0])

        ppdb = PandasPdb().read_pdb(protein_files[0]).df["ATOM"]

    # Protein elements are always derived from the atom names via the
    # module-level lookup table, replacing whatever the file contained.
    ppdb["element_symbol"] = ppdb["atom_name"].map(atom_types)

    return pmol, ppdb


def generate_features(path, exp=2, cutoff=4.5, complex=False, filename="file.csv"):
    """
       Generate the distance-weighted contact features of every structure
       found in a directory and write them to a csv file.

    For each entry one feature is computed per combination of ligand element,
    protein element and amino acid group (in that nesting order, matching
    generate_columns_name). Entries are processed in sorted order so the row
    order of the output is reproducible. Entries whose structure files are
    missing or cannot be read are reported on stdout and skipped.

    Parameters:
       path (str): Directory containing the structures. In complex mode every
           entry is a PDB file; otherwise every entry is a sub-directory
           holding a ligand .mol2 file and a protein .pdb file.
       exp (float): Weighting exponent.
       cutoff (float): Distance cutoff.
       complex (bool): Use when ligand and protein are in a single pdb file.
       filename (str): Filename of the generated csv file.

    Returns:
       The return value of DataFrame.to_csv(filename). The csv has one row
       per entry, indexed by the entry name.
    """
    data = {}

    entries = sorted(Path(path).iterdir())

    bar = progressbar.ProgressBar(maxval=len(entries)).start()

    for num, entry in enumerate(entries, start=1):

        bar.update(num)

        # Best effort: one broken entry must not abort the whole run. This
        # covers the ligand and the protein file alike.
        try:
            pmol, ppdb = _load_structures(entry, complex)

        except Exception:

            print(f"Error in file structure {str(entry.name)}")

            continue

        # The coordinate selections do not depend on the other loop
        # variables, so compute each of them once per entry.
        ligand_coords_by_element = [
            ligand_specific_element_coordinates(pmol, ligand_element, complex)
            for ligand_element in elements
        ]

        protein_coords_by_element_group = [
            protein_specific_element_coordinates(
                ppdb, protein_element, amino_acid_group
            )
            for protein_element in elements
            for amino_acid_group in amino_acid_groups
        ]

        data[entry.name] = [
            weighting_sum(
                spatial.distance.cdist(ligand_coords, protein_coords).ravel(),
                cutoff,
                exp,
            )
            for ligand_coords in ligand_coords_by_element
            for protein_coords in protein_coords_by_element_group
        ]

    bar.finish()

    columns_name = generate_columns_name(elements, amino_acid_groups)

    df = pd.DataFrame(data).transpose().rename(columns=columns_name)

    return df.to_csv(filename)


def str_to_bool(value):
    """Convert a command-line token such as "True" or "no" to a bool.

    argparse's ``type=bool`` treats every non-empty string (including
    "False") as True, so the text is interpreted explicitly instead.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value

    token = value.strip().lower()

    if token in ("true", "t", "yes", "y", "1"):
        return True

    if token in ("false", "f", "no", "n", "0"):
        return False

    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


if __name__ == "__main__":

    start = time.time()
    print("\n")
    print("Job is started.")
    print("------------------------------")

    parser = argparse.ArgumentParser(
        description="Generating features for a set of given structures."
    )

    parser.add_argument(
        "-d", "--directory", help="Directory of structures files.", required=True
    )
    parser.add_argument(
        "-n",
        "--exp",
        type=float,
        default=2,
        help="Exponent is used in weighting factor.",
    )

    parser.add_argument(
        "-o",
        "--cutoff",
        type=float,
        default=4.5,
        help="Cutoff distance is used during feature engeineering.",
    )

    # "-c True" / "-c False" keep working (and "False" now really means
    # False); a bare "-c" without a value also enables complex mode.
    parser.add_argument(
        "-c",
        "--complex",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Indicating input files are complex or not.",
    )
    parser.add_argument(
        "-f", "--filename", default="file.csv", help="Name of output csv file."
    )

    args = parser.parse_args()

    print("Input parameters:")
    print(f"Exponent: {args.exp}")
    print(f"Complex: {args.complex}")
    print(f"Cutoff: {args.cutoff}")
    print(f"Filename: {args.filename}")
    print("------------------------------")
    print("Please wait until all features are generated...")
    generate_features(
        args.directory, args.exp, args.cutoff, args.complex, args.filename
    )

    end = time.time()

    # divmod on floats yields float quotients; cast the whole hours/minutes
    # to int so the summary reads "0 hours, 1 minutes" instead of "0.0 hours".
    seconds = end - start
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)

    print("------------------------------")
    print(f"Job is done at {int(h)} hours, {int(m)} minutes and {s:.2f} seconds!")
    print(f"{args.filename} is created.")