Support ionic repulsion, parquet output, rename out file, and lots of…

… bugfixes (#10) * feat(contacts): add detection of repulsion between like charges * fix(contacts): add check for hydrogen bond length upper limit * refactor(contacts): reduce the amount of else None returns * fix(hbond): do not skip polar contact id when hbonds are not satisfied * feat(io): support parquet, json, and ndjson output formats * perf(io): no need to use f64 and i64 for saving results * feat(cli): allow renaming output file of contacts and sasa * feat(cli): modify logging levels to be less verbose by default * fix(ring): skip ring calculation if atoms are missing * fix(parser): correctly parse residue insertion codes and conformer altloc positions * build(cargo): update dependencies * docs: changelog for v0.4.0 release
y1zhou · Feb 17, 2025 · c192255 · c192255
1 parent 5f555ff
commit c192255
Show file tree

Hide file tree

Showing 12 changed files with 366 additions and 167 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,23 +5,26 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unreleased]
+## [0.4.0] - 2025-02-17
 
 ### Added
 
-- PLACEHOLDER
+- Detection of repulsion between like charges
+- Support of parquet, json, and ndjson output formats
+- Added a `-name` flag in the CLI to rename the output file of `contacts` and `sasa` commands
 
 ### Fixed
 
-- PLACEHOLDER
+- Checks for polar contacts were skipped when hydrogen bond criteria are not satisfied
+- Better error messages when rings have missing atoms for finding the center and normal vector
+- Nomenclature mix of residue insertion codes and alternative locations; the two are now stored under separate columns (`*_insertion` and `*_altloc`) in the output files
 
 ### Changed
 
-- PLACEHOLDER
-
-### Removed
-
-- PLACEHOLDER
+- Distance cutoff for T-shaped Pi-stacking lowered from 6Å to 5Å
+- Added hydrogen bond distance check to better differentiate Hbonds and polar contacts
+- Performance/memory footprint improvement by switching from 64-bit numbers to 32-bit
+- Logging is now less verbose
 
 
 ## [0.3.1]
@@ -75,7 +78,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Initial release
 - Detection of common protein-protein interactions in a PDB or mmCIF file
 
-[unreleased]: https://github.com/y1zhou/arpeggia/compare/v0.3.1...HEAD
+[unreleased]: https://github.com/y1zhou/arpeggia/compare/v0.4.0...HEAD
+[0.4.0]: https://github.com/y1zhou/arpeggia/releases/tag/v0.4.0
 [0.3.1]: https://github.com/y1zhou/arpeggia/releases/tag/v0.3.1
 [0.3.0]: https://github.com/y1zhou/arpeggia/releases/tag/v0.3.0
 [0.2.0]: https://github.com/y1zhou/arpeggia/releases/tag/v0.2.0

diff --git a/Cargo.toml b/Cargo.toml
@@ -1,19 +1,23 @@
 [package]
 name = "arpeggia"
-version = "0.3.1"
+version = "0.4.0"
 description = "A port of the Arpeggio library for Rust"
 edition = "2021"
 authors = ["Yi Zhou <[email protected]>"]
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-clap = { version = "4.5.4", features = ["derive"] }
-nalgebra = "0.32.5"
-pdbtbx = { version = "0.11.0", features = ["rayon", "rstar"], git = "https://github.com/douweschulte/pdbtbx", rev = "1298c60e8573efbc3f7c77a9a2d6f0656fa8c638" }
-polars = { version = "0.39.2", features = ["lazy"] }
+clap = { version = "4.5.29", features = ["derive"] }
+nalgebra = "0.33.2"
+# pdbtbx = { version = "0.11.0", features = [
+#     "rayon",
+#     "rstar",
+# ], git = "https://github.com/douweschulte/pdbtbx", rev = "1298c60e8573efbc3f7c77a9a2d6f0656fa8c638" }
+pdbtbx = { version = "0.12.0", features = ["rayon", "rstar"] }
+polars = { version = "0.46.0", features = ["lazy", "parquet", "json"] }
 rayon = "1.10.0"
-rstar = "0.12.0"
-rust-sasa = "0.2.2"
-tracing = "0.1.40"
-tracing-subscriber = "0.3.18"
+rstar = "0.12.2"
+rust-sasa = "0.2.4"
+tracing = "0.1.41"
+tracing-subscriber = "0.3.19"
diff --git a/src/cli/contacts.rs b/src/cli/contacts.rs
@@ -1,5 +1,5 @@
 use crate::interactions::{InteractionComplex, Interactions, ResultEntry};
-use crate::utils::{load_model, write_df_to_csv};
+use crate::utils::{load_model, write_df_to_file, DataFrameFileType};
 use clap::Parser;
 use pdbtbx::*;
 use polars::prelude::*;
@@ -23,6 +23,14 @@ pub(crate) struct Args {
     #[arg(short, long)]
     groups: String,
 
+    /// Name of the output file
+    #[arg(short, long, default_value_t = String::from("contacts"))]
+    name: String,
+
+    /// Output file type
+    #[arg(short = 't', long, default_value_t = DataFrameFileType::Csv)]
+    output_format: DataFrameFileType,
+
     /// Compensation factor for VdW radii dependent interaction types
     #[arg(short = 'c', long = "vdw-comp", default_value_t = 0.1)]
     vdw_comp: f64,
@@ -83,13 +91,12 @@ pub(crate) fn run(args: &Args) {
     // Prepare output directory
     let output_path = Path::new(&args.output).canonicalize().unwrap();
     let _ = std::fs::create_dir_all(output_path.clone());
-    let output_file = output_path.join("contacts.csv");
-
-    let output_file_str = output_file.to_str().unwrap();
-    debug!("Results will be saved to {output_file_str}");
+    let output_file = output_path
+        .join(args.name.clone())
+        .with_extension(args.output_format.to_string());
 
     // Save results and log the identified interactions
-    info!(
+    debug!(
         "Found {} atom-atom contacts\n{}",
         df_atomic.shape().0,
         df_atomic
@@ -103,7 +110,7 @@ pub(crate) fn run(args: &Args) {
     if df_clash.height() > 0 {
         warn!("Found {} steric clashes\n{}", df_clash.shape().0, df_clash);
     }
-    info!("Found {} ring contacts\n{}", df_ring.shape().0, df_ring);
+    debug!("Found {} ring contacts\n{}", df_ring.shape().0, df_ring);
 
     // Concate dataframes for saving to CSV
     let mut df_contacts = concat([df_atomic.lazy(), df_ring.lazy()], UnionArgs::default())
@@ -112,7 +119,9 @@ pub(crate) fn run(args: &Args) {
         .unwrap();
 
     // Save res to CSV files
-    write_df_to_csv(&mut df_contacts, output_file);
+    write_df_to_file(&mut df_contacts, &output_file, args.output_format);
+    let output_file_str = output_file.to_str().unwrap();
+    info!("Results saved to {output_file_str}");
 }
 
 pub fn get_contacts<'a>(
@@ -121,7 +130,12 @@ pub fn get_contacts<'a>(
     vdw_comp: f64,
     dist_cutoff: f64,
 ) -> (DataFrame, DataFrame, InteractionComplex<'a>) {
-    let i_complex = InteractionComplex::new(pdb, groups, vdw_comp, dist_cutoff);
+    let (i_complex, build_ring_err) =
+        InteractionComplex::new(pdb, groups, vdw_comp, dist_cutoff).unwrap();
+
+    if !build_ring_err.is_empty() {
+        build_ring_err.iter().for_each(|e| warn!("{e}"));
+    }
 
     // Find interactions
     let atomic_contacts = i_complex.get_atomic_contacts();
@@ -136,9 +150,11 @@ pub fn get_contacts<'a>(
             [
                 "from_chain",
                 "from_resi",
+                "from_insertion",
                 "from_altloc",
                 "to_chain",
                 "to_resi",
+                "to_insertion",
                 "to_altloc",
             ],
             Default::default(),
@@ -151,19 +167,21 @@ pub fn get_contacts<'a>(
 fn results_to_df(res: &[ResultEntry]) -> DataFrame {
     df!(
         "interaction" => res.iter().map(|x| x.interaction.to_string()).collect::<Vec<String>>(),
-        "distance" => res.iter().map(|x| x.distance).collect::<Vec<f64>>(),
+        "distance" => res.iter().map(|x| x.distance as f32).collect::<Vec<f32>>(),
         "from_chain" => res.iter().map(|x| x.ligand.chain.to_owned()).collect::<Vec<String>>(),
         "from_resn" => res.iter().map(|x| x.ligand.resn.to_owned()).collect::<Vec<String>>(),
-        "from_resi" => res.iter().map(|x| x.ligand.resi as i64).collect::<Vec<i64>>(),
+        "from_resi" => res.iter().map(|x| x.ligand.resi as i32).collect::<Vec<i32>>(),
+        "from_insertion" => res.iter().map(|x| x.ligand.insertion.to_owned()).collect::<Vec<String>>(),
         "from_altloc" => res.iter().map(|x| x.ligand.altloc.to_owned()).collect::<Vec<String>>(),
         "from_atomn" => res.iter().map(|x| x.ligand.atomn.to_owned()).collect::<Vec<String>>(),
-        "from_atomi" => res.iter().map(|x| x.ligand.atomi as i64).collect::<Vec<i64>>(),
+        "from_atomi" => res.iter().map(|x| x.ligand.atomi as i32).collect::<Vec<i32>>(),
         "to_chain" => res.iter().map(|x| x.receptor.chain.to_owned()).collect::<Vec<String>>(),
         "to_resn" => res.iter().map(|x| x.receptor.resn.to_owned()).collect::<Vec<String>>(),
-        "to_resi" => res.iter().map(|x| x.receptor.resi as i64).collect::<Vec<i64>>(),
+        "to_resi" => res.iter().map(|x| x.receptor.resi as i32).collect::<Vec<i32>>(),
+        "to_insertion" => res.iter().map(|x| x.receptor.insertion.to_owned()).collect::<Vec<String>>(),
         "to_altloc" => res.iter().map(|x| x.receptor.altloc.to_owned()).collect::<Vec<String>>(),
         "to_atomn" => res.iter().map(|x| x.receptor.atomn.to_owned()).collect::<Vec<String>>(),
-        "to_atomi" => res.iter().map(|x| x.receptor.atomi as i64).collect::<Vec<i64>>(),
+        "to_atomi" => res.iter().map(|x| x.receptor.atomi as i32).collect::<Vec<i32>>(),
     )
     .unwrap()
 }
diff --git a/src/cli/sasa.rs b/src/cli/sasa.rs
@@ -1,6 +1,6 @@
 use crate::interactions::InteractingEntity;
 use crate::residues::ResidueExt;
-use crate::utils::{load_model, write_df_to_csv};
+use crate::utils::{load_model, write_df_to_file, DataFrameFileType};
 use clap::Parser;
 use pdbtbx::*;
 use polars::prelude::*;
@@ -16,10 +16,18 @@ pub(crate) struct Args {
     #[arg(short, long)]
     input: PathBuf,
 
-    /// Output CSV file path
+    /// Output directory
     #[arg(short, long)]
     output: PathBuf,
 
+    /// Name of the output file
+    #[arg(short, long, default_value_t = String::from("sasa"))]
+    name: String,
+
+    /// Output file type
+    #[arg(short = 't', long, default_value_t = DataFrameFileType::Csv)]
+    output_format: DataFrameFileType,
+
     /// Probe radius r (smaller r detects more surface details and reports a larger surface)
     #[arg(short = 'r', long = "probe-radius", default_value_t = 1.4)]
     probe_radius: f32,
@@ -63,24 +71,29 @@ pub(crate) fn run(args: &Args) {
     let output_path = Path::new(&args.output).canonicalize().unwrap();
     let _ = std::fs::create_dir_all(output_path.clone());
     let output_file = match output_path.is_dir() {
-        true => output_path.join("sasa.csv"),
+        true => output_path.join(args.name.clone()),
         false => output_path,
-    };
-
-    let output_file_str = output_file.to_str().unwrap();
-    debug!("Results will be saved to {output_file_str}");
+    }
+    .with_extension(args.output_format.to_string());
 
     // Save results and log the identified SASA
-    let non_zero_sasa_mask = df_sasa.column("sasa").unwrap().not_equal(0.0).unwrap();
+    let non_zero_sasa_mask = df_sasa
+        .column("sasa")
+        .unwrap()
+        .as_materialized_series()
+        .not_equal(0.0)
+        .unwrap();
     let df_sasa_nonzero = df_sasa.filter(&non_zero_sasa_mask).unwrap();
-    info!(
+    debug!(
         "Found {} atoms with non-zero SASA\n{}",
         df_sasa_nonzero.shape().0,
         df_sasa_nonzero
     );
 
     // Save res to CSV files
-    write_df_to_csv(&mut df_sasa, output_file);
+    write_df_to_file(&mut df_sasa, &output_file, args.output_format);
+    let output_file_str = output_file.to_str().unwrap();
+    info!("Results saved to {output_file_str}");
 }
 
 pub fn get_atom_sasa(pdb: &PDB, probe_radius: f32, n_points: usize) -> DataFrame {
@@ -118,15 +131,15 @@ pub fn get_atom_sasa(pdb: &PDB, probe_radius: f32, n_points: usize) -> DataFrame
     let atom_annot_df = df!(
         "chain" => atom_annotations.iter().map(|x| x.chain.to_owned()).collect::<Vec<String>>(),
         "resn" => atom_annotations.iter().map(|x| x.resn.to_owned()).collect::<Vec<String>>(),
-        "resi" => atom_annotations.iter().map(|x| x.resi as i64).collect::<Vec<i64>>(),
+        "resi" => atom_annotations.iter().map(|x| x.resi as i32).collect::<Vec<i32>>(),
         "altloc" => atom_annotations.iter().map(|x| x.altloc.to_owned()).collect::<Vec<String>>(),
         "atomn" => atom_annotations.iter().map(|x| x.atomn.to_owned()).collect::<Vec<String>>(),
-        "atomi" => atom_annotations.iter().map(|x| x.atomi as i64).collect::<Vec<i64>>(),
+        "atomi" => atom_annotations.iter().map(|x| x.atomi as i32).collect::<Vec<i32>>(),
     )
     .unwrap();
 
     df!(
-        "atomi" => atoms.iter().map(|x| x.id as i64).collect::<Vec<i64>>(),
+        "atomi" => atoms.iter().map(|x| x.id as i32).collect::<Vec<i32>>(),
         "sasa" => atom_sasa
     )
     .unwrap()
@@ -135,6 +148,7 @@ pub fn get_atom_sasa(pdb: &PDB, probe_radius: f32, n_points: usize) -> DataFrame
         ["atomi"],
         ["atomi"],
         JoinArgs::new(JoinType::Inner),
+        None,
     )
     .unwrap()
     .sort(["chain", "resi", "altloc", "atomi"], Default::default())

diff --git a/src/interactions/aromatic.rs b/src/interactions/aromatic.rs
@@ -19,13 +19,10 @@ pub fn find_cation_pi(ring: &Ring, entity: &AtomConformerResidueChainModel) -> O
         let theta = point_ring_angle(ring, &atom_point);
 
         if (theta <= CATION_PI_ANGLE_THRESHOLD) & (dist <= CATION_PI_DIST_THRESHOLD) {
-            Some(Interaction::CationPi)
-        } else {
-            None
+            return Some(Interaction::CationPi);
         }
-    } else {
-        None
     }
+    None
 }
 
 /// Identify pi-pi interactions using the classification by [Chakrabarti and Bhattacharyya (2007)](https://doi.org/10.1016/j.pbiomolbio.2007.03.016), Fig. 11.