Skip to content

Commit

Permalink
Support ionic repulsion, parquet output, rename out file, and lots of…
Browse files Browse the repository at this point in the history
… bugfixes (#10)

* feat(contacts): add detection of repulsion between like charges

* fix(contacts): add check for hydrogen bond length upper limit

* refactor(contacts): reduce the amount of else None returns

* fix(hbond): do not skip polar contact id when hbonds are not satisfied

* feat(io): support parquet, json, and ndjson output formats

* perf(io): no need to use f64 and i64 for saving results

* feat(cli): allow renaming output file of contacts and sasa

* feat(cli): modify logging levels to be less verbose by default

* fix(ring): skip ring calculation if atoms are missing

* fix(parser): correctly parse residue insertion codes and conformer altloc positions

* build(cargo): update dependencies

* docs: changelog for v0.4.0 release
  • Loading branch information
y1zhou authored Feb 17, 2025
1 parent 5f555ff commit c192255
Show file tree
Hide file tree
Showing 12 changed files with 366 additions and 167 deletions.
22 changes: 13 additions & 9 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,26 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
## [0.4.0] - 2025-02-17

### Added

- PLACEHOLDER
- Detection of repulsion between like charges
- Support of parquet, json, and ndjson output formats
- Added a `-name` flag in the CLI to rename the output file of `contacts` and `sasa` commands

### Fixed

- PLACEHOLDER
- Checks for polar contacts were skipped when hydrogen bond criteria are not satisfied
- Better error messages when rings have missing atoms for finding the center and normal vector
- Nomenclature mix of residue insertion codes and alternative locations; the two are now stored under separate columns (`*_insertion` and `*_altloc`) in the output files

### Changed

- PLACEHOLDER

### Removed

- PLACEHOLDER
- Distance cutoff for T-shaped Pi-stacking lowered from 6Å to 5Å
- Added hydrogen bond distance check to better differentiate Hbonds and polar contacts
- Performance/memory footprint improvement by switching from 64-bit numbers to 32-bit
- Logging is now less verbose


## [0.3.1]
Expand Down Expand Up @@ -75,7 +78,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Initial release
- Detection of common protein-protein interactions in a PDB or mmCIF file

[unreleased]: https://github.com/y1zhou/arpeggia/compare/v0.3.1...HEAD
[unreleased]: https://github.com/y1zhou/arpeggia/compare/v0.4.0...HEAD
[0.4.0]: https://github.com/y1zhou/arpeggia/releases/tag/v0.4.0
[0.3.1]: https://github.com/y1zhou/arpeggia/releases/tag/v0.3.1
[0.3.0]: https://github.com/y1zhou/arpeggia/releases/tag/v0.3.0
[0.2.0]: https://github.com/y1zhou/arpeggia/releases/tag/v0.2.0
Expand Down
22 changes: 13 additions & 9 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
[package]
name = "arpeggia"
version = "0.3.1"
version = "0.4.0"
description = "A port of the Arpeggio library for Rust"
edition = "2021"
authors = ["Yi Zhou <[email protected]>"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
clap = { version = "4.5.4", features = ["derive"] }
nalgebra = "0.32.5"
pdbtbx = { version = "0.11.0", features = ["rayon", "rstar"], git = "https://github.com/douweschulte/pdbtbx", rev = "1298c60e8573efbc3f7c77a9a2d6f0656fa8c638" }
polars = { version = "0.39.2", features = ["lazy"] }
clap = { version = "4.5.29", features = ["derive"] }
nalgebra = "0.33.2"
# pdbtbx = { version = "0.11.0", features = [
# "rayon",
# "rstar",
# ], git = "https://github.com/douweschulte/pdbtbx", rev = "1298c60e8573efbc3f7c77a9a2d6f0656fa8c638" }
pdbtbx = { version = "0.12.0", features = ["rayon", "rstar"] }
polars = { version = "0.46.0", features = ["lazy", "parquet", "json"] }
rayon = "1.10.0"
rstar = "0.12.0"
rust-sasa = "0.2.2"
tracing = "0.1.40"
tracing-subscriber = "0.3.18"
rstar = "0.12.2"
rust-sasa = "0.2.4"
tracing = "0.1.41"
tracing-subscriber = "0.3.19"
46 changes: 32 additions & 14 deletions src/cli/contacts.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::interactions::{InteractionComplex, Interactions, ResultEntry};
use crate::utils::{load_model, write_df_to_csv};
use crate::utils::{load_model, write_df_to_file, DataFrameFileType};
use clap::Parser;
use pdbtbx::*;
use polars::prelude::*;
Expand All @@ -23,6 +23,14 @@ pub(crate) struct Args {
#[arg(short, long)]
groups: String,

/// Name of the output file
#[arg(short, long, default_value_t = String::from("contacts"))]
name: String,

/// Output file type
#[arg(short = 't', long, default_value_t = DataFrameFileType::Csv)]
output_format: DataFrameFileType,

/// Compensation factor for VdW radii dependent interaction types
#[arg(short = 'c', long = "vdw-comp", default_value_t = 0.1)]
vdw_comp: f64,
Expand Down Expand Up @@ -83,13 +91,12 @@ pub(crate) fn run(args: &Args) {
// Prepare output directory
let output_path = Path::new(&args.output).canonicalize().unwrap();
let _ = std::fs::create_dir_all(output_path.clone());
let output_file = output_path.join("contacts.csv");

let output_file_str = output_file.to_str().unwrap();
debug!("Results will be saved to {output_file_str}");
let output_file = output_path
.join(args.name.clone())
.with_extension(args.output_format.to_string());

// Save results and log the identified interactions
info!(
debug!(
"Found {} atom-atom contacts\n{}",
df_atomic.shape().0,
df_atomic
Expand All @@ -103,7 +110,7 @@ pub(crate) fn run(args: &Args) {
if df_clash.height() > 0 {
warn!("Found {} steric clashes\n{}", df_clash.shape().0, df_clash);
}
info!("Found {} ring contacts\n{}", df_ring.shape().0, df_ring);
debug!("Found {} ring contacts\n{}", df_ring.shape().0, df_ring);

// Concate dataframes for saving to CSV
let mut df_contacts = concat([df_atomic.lazy(), df_ring.lazy()], UnionArgs::default())
Expand All @@ -112,7 +119,9 @@ pub(crate) fn run(args: &Args) {
.unwrap();

// Save res to CSV files
write_df_to_csv(&mut df_contacts, output_file);
write_df_to_file(&mut df_contacts, &output_file, args.output_format);
let output_file_str = output_file.to_str().unwrap();
info!("Results saved to {output_file_str}");
}

pub fn get_contacts<'a>(
Expand All @@ -121,7 +130,12 @@ pub fn get_contacts<'a>(
vdw_comp: f64,
dist_cutoff: f64,
) -> (DataFrame, DataFrame, InteractionComplex<'a>) {
let i_complex = InteractionComplex::new(pdb, groups, vdw_comp, dist_cutoff);
let (i_complex, build_ring_err) =
InteractionComplex::new(pdb, groups, vdw_comp, dist_cutoff).unwrap();

if !build_ring_err.is_empty() {
build_ring_err.iter().for_each(|e| warn!("{e}"));
}

// Find interactions
let atomic_contacts = i_complex.get_atomic_contacts();
Expand All @@ -136,9 +150,11 @@ pub fn get_contacts<'a>(
[
"from_chain",
"from_resi",
"from_insertion",
"from_altloc",
"to_chain",
"to_resi",
"to_insertion",
"to_altloc",
],
Default::default(),
Expand All @@ -151,19 +167,21 @@ pub fn get_contacts<'a>(
fn results_to_df(res: &[ResultEntry]) -> DataFrame {
df!(
"interaction" => res.iter().map(|x| x.interaction.to_string()).collect::<Vec<String>>(),
"distance" => res.iter().map(|x| x.distance).collect::<Vec<f64>>(),
"distance" => res.iter().map(|x| x.distance as f32).collect::<Vec<f32>>(),
"from_chain" => res.iter().map(|x| x.ligand.chain.to_owned()).collect::<Vec<String>>(),
"from_resn" => res.iter().map(|x| x.ligand.resn.to_owned()).collect::<Vec<String>>(),
"from_resi" => res.iter().map(|x| x.ligand.resi as i64).collect::<Vec<i64>>(),
"from_resi" => res.iter().map(|x| x.ligand.resi as i32).collect::<Vec<i32>>(),
"from_insertion" => res.iter().map(|x| x.ligand.insertion.to_owned()).collect::<Vec<String>>(),
"from_altloc" => res.iter().map(|x| x.ligand.altloc.to_owned()).collect::<Vec<String>>(),
"from_atomn" => res.iter().map(|x| x.ligand.atomn.to_owned()).collect::<Vec<String>>(),
"from_atomi" => res.iter().map(|x| x.ligand.atomi as i64).collect::<Vec<i64>>(),
"from_atomi" => res.iter().map(|x| x.ligand.atomi as i32).collect::<Vec<i32>>(),
"to_chain" => res.iter().map(|x| x.receptor.chain.to_owned()).collect::<Vec<String>>(),
"to_resn" => res.iter().map(|x| x.receptor.resn.to_owned()).collect::<Vec<String>>(),
"to_resi" => res.iter().map(|x| x.receptor.resi as i64).collect::<Vec<i64>>(),
"to_resi" => res.iter().map(|x| x.receptor.resi as i32).collect::<Vec<i32>>(),
"to_insertion" => res.iter().map(|x| x.receptor.insertion.to_owned()).collect::<Vec<String>>(),
"to_altloc" => res.iter().map(|x| x.receptor.altloc.to_owned()).collect::<Vec<String>>(),
"to_atomn" => res.iter().map(|x| x.receptor.atomn.to_owned()).collect::<Vec<String>>(),
"to_atomi" => res.iter().map(|x| x.receptor.atomi as i64).collect::<Vec<i64>>(),
"to_atomi" => res.iter().map(|x| x.receptor.atomi as i32).collect::<Vec<i32>>(),
)
.unwrap()
}
40 changes: 27 additions & 13 deletions src/cli/sasa.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::interactions::InteractingEntity;
use crate::residues::ResidueExt;
use crate::utils::{load_model, write_df_to_csv};
use crate::utils::{load_model, write_df_to_file, DataFrameFileType};
use clap::Parser;
use pdbtbx::*;
use polars::prelude::*;
Expand All @@ -16,10 +16,18 @@ pub(crate) struct Args {
#[arg(short, long)]
input: PathBuf,

/// Output CSV file path
/// Output directory
#[arg(short, long)]
output: PathBuf,

/// Name of the output file
#[arg(short, long, default_value_t = String::from("sasa"))]
name: String,

/// Output file type
#[arg(short = 't', long, default_value_t = DataFrameFileType::Csv)]
output_format: DataFrameFileType,

/// Probe radius r (smaller r detects more surface details and reports a larger surface)
#[arg(short = 'r', long = "probe-radius", default_value_t = 1.4)]
probe_radius: f32,
Expand Down Expand Up @@ -63,24 +71,29 @@ pub(crate) fn run(args: &Args) {
let output_path = Path::new(&args.output).canonicalize().unwrap();
let _ = std::fs::create_dir_all(output_path.clone());
let output_file = match output_path.is_dir() {
true => output_path.join("sasa.csv"),
true => output_path.join(args.name.clone()),
false => output_path,
};

let output_file_str = output_file.to_str().unwrap();
debug!("Results will be saved to {output_file_str}");
}
.with_extension(args.output_format.to_string());

// Save results and log the identified SASA
let non_zero_sasa_mask = df_sasa.column("sasa").unwrap().not_equal(0.0).unwrap();
let non_zero_sasa_mask = df_sasa
.column("sasa")
.unwrap()
.as_materialized_series()
.not_equal(0.0)
.unwrap();
let df_sasa_nonzero = df_sasa.filter(&non_zero_sasa_mask).unwrap();
info!(
debug!(
"Found {} atoms with non-zero SASA\n{}",
df_sasa_nonzero.shape().0,
df_sasa_nonzero
);

// Save res to CSV files
write_df_to_csv(&mut df_sasa, output_file);
write_df_to_file(&mut df_sasa, &output_file, args.output_format);
let output_file_str = output_file.to_str().unwrap();
info!("Results saved to {output_file_str}");
}

pub fn get_atom_sasa(pdb: &PDB, probe_radius: f32, n_points: usize) -> DataFrame {
Expand Down Expand Up @@ -118,15 +131,15 @@ pub fn get_atom_sasa(pdb: &PDB, probe_radius: f32, n_points: usize) -> DataFrame
let atom_annot_df = df!(
"chain" => atom_annotations.iter().map(|x| x.chain.to_owned()).collect::<Vec<String>>(),
"resn" => atom_annotations.iter().map(|x| x.resn.to_owned()).collect::<Vec<String>>(),
"resi" => atom_annotations.iter().map(|x| x.resi as i64).collect::<Vec<i64>>(),
"resi" => atom_annotations.iter().map(|x| x.resi as i32).collect::<Vec<i32>>(),
"altloc" => atom_annotations.iter().map(|x| x.altloc.to_owned()).collect::<Vec<String>>(),
"atomn" => atom_annotations.iter().map(|x| x.atomn.to_owned()).collect::<Vec<String>>(),
"atomi" => atom_annotations.iter().map(|x| x.atomi as i64).collect::<Vec<i64>>(),
"atomi" => atom_annotations.iter().map(|x| x.atomi as i32).collect::<Vec<i32>>(),
)
.unwrap();

df!(
"atomi" => atoms.iter().map(|x| x.id as i64).collect::<Vec<i64>>(),
"atomi" => atoms.iter().map(|x| x.id as i32).collect::<Vec<i32>>(),
"sasa" => atom_sasa
)
.unwrap()
Expand All @@ -135,6 +148,7 @@ pub fn get_atom_sasa(pdb: &PDB, probe_radius: f32, n_points: usize) -> DataFrame
["atomi"],
["atomi"],
JoinArgs::new(JoinType::Inner),
None,
)
.unwrap()
.sort(["chain", "resi", "altloc", "atomi"], Default::default())
Expand Down
7 changes: 2 additions & 5 deletions src/interactions/aromatic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,10 @@ pub fn find_cation_pi(ring: &Ring, entity: &AtomConformerResidueChainModel) -> O
let theta = point_ring_angle(ring, &atom_point);

if (theta <= CATION_PI_ANGLE_THRESHOLD) & (dist <= CATION_PI_DIST_THRESHOLD) {
Some(Interaction::CationPi)
} else {
None
return Some(Interaction::CationPi);
}
} else {
None
}
None
}

/// Identify pi-pi interactions using the classification by [Chakrabarti and Bhattacharyya (2007)](https://doi.org/10.1016/j.pbiomolbio.2007.03.016), Fig. 11.
Expand Down
Loading

0 comments on commit c192255

Please sign in to comment.