Commit ea47488

Merge pull request #636 from DeepRank/635_tutorials_dbodor
tutorials: avoid error messages in tutorial

2 parents: 128d5f9 + e0b209a
File tree

10 files changed: +919, -878 lines

.github/workflows/notebooks.yml

Lines changed: 3 additions & 1 deletion

@@ -54,9 +54,11 @@ jobs:
       - name: Download the data for the tutorials
         shell: bash -l {0}
         run: |
-          wget https://zenodo.org/records/8349335/files/data_raw.zip
+          wget https://zenodo.org/records/13709906/files/data_raw.zip
           unzip data_raw.zip -d data_raw
           mv data_raw tutorials
+          echo listing files in data_raw:
+          ls tutorials/data_raw

       - name: Run tutorial notebooks
         run: pytest --nbmake tutorials

deeprank2/dataset.py

Lines changed: 2 additions & 2 deletions

@@ -112,7 +112,7 @@ def _check_and_inherit_train(  # noqa: C901
             for key in data["features_transform"].values():
                 if key["transform"] is None:
                     continue
-                key["transform"] = eval(key["transform"])  # noqa: S307, PGH001
+                key["transform"] = eval(key["transform"])  # noqa: S307
     except pickle.UnpicklingError as e:
         msg = "The path provided to `train_source` is not a valid DeepRank2 pre-trained model."
         raise ValueError(msg) from e
@@ -277,7 +277,7 @@ def _filter_targets(self, grp: h5py.Group) -> bool:
                 for operator_string in [">", "<", "==", "<=", ">=", "!="]:
                     operation = operation.replace(operator_string, f"{target_value}" + operator_string)

-                if not eval(operation):  # noqa: S307, PGH001
+                if not eval(operation):  # noqa: S307
                     return False

             elif target_condition is not None:
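For context on the filter touched here: `_filter_targets` builds a boolean expression by prefixing the stored target value to each comparison operator in the user-supplied condition string, then evaluates it. A minimal standalone sketch of that substitution step, with made-up values (`target_value` and `operation` mirror the local variables above):

# Illustrative values only; mirrors the operator-substitution loop above.
target_value = 0.35    # value stored in the HDF5 group
operation = ">0.2"     # hypothetical user-supplied condition string

for operator_string in [">", "<", "==", "<=", ">=", "!="]:
    operation = operation.replace(operator_string, f"{target_value}" + operator_string)

print(operation)        # "0.35>0.2"
print(eval(operation))  # True -> the entry is kept  # noqa: S307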

deeprank2/query.py

Lines changed: 36 additions & 21 deletions

@@ -22,7 +22,7 @@
 import deeprank2.features
 from deeprank2.domain.aminoacidlist import convert_aa_nomenclature
 from deeprank2.features import components, conservation, contact
-from deeprank2.molstruct.residue import Residue, SingleResidueVariant
+from deeprank2.molstruct.residue import SingleResidueVariant
 from deeprank2.utils.buildgraph import get_contact_atoms, get_structure, get_surrounding_residues
 from deeprank2.utils.graph import Graph
 from deeprank2.utils.grid import Augmentation, GridSettings, MapMethod
@@ -265,12 +265,11 @@ def _build_helper(self) -> Graph:
         structure = self._load_structure()

         # find the variant residue and its surroundings
-        variant_residue: Residue = None
         for residue in structure.get_chain(self.variant_chain_id).residues:
             if residue.number == self.variant_residue_number and residue.insertion_code == self.insertion_code:
                 variant_residue = residue
                 break
-        if variant_residue is None:
+        else:  # if break is not reached
             msg = f"Residue not found in {self.pdb_path}: {self.variant_chain_id} {self.residue_id}"
             raise ValueError(msg)
         self.variant = SingleResidueVariant(variant_residue, self.variant_amino_acid)
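The rewrite above relies on Python's for...else construct: the else suite runs only when the loop completes without hitting break, which makes the None sentinel (and the now-removed Residue import) unnecessary. A standalone sketch of the pattern, with generic names unrelated to the deeprank2 API:

def find_first_even(numbers: list[int]) -> int:
    """Return the first even number, mirroring the loop/else pattern above."""
    for n in numbers:
        if n % 2 == 0:
            break
    else:  # executed only if the loop never hit `break`
        msg = "No even number found."
        raise ValueError(msg)
    return n

print(find_first_even([3, 7, 8, 5]))  # 8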
@@ -354,19 +353,12 @@ def _build_helper(self) -> Graph:
             raise ValueError(msg)

         # build the graph
-        if self.resolution == "atom":
-            graph = Graph.build_graph(
-                contact_atoms,
-                self.get_query_id(),
-                self.max_edge_length,
-            )
-        elif self.resolution == "residue":
-            residues_selected = list({atom.residue for atom in contact_atoms})
-            graph = Graph.build_graph(
-                residues_selected,
-                self.get_query_id(),
-                self.max_edge_length,
-            )
+        nodes = contact_atoms if self.resolution == "atom" else list({atom.residue for atom in contact_atoms})
+        graph = Graph.build_graph(
+            nodes=nodes,
+            graph_id=self.get_query_id(),
+            max_edge_length=self.max_edge_length,
+        )

         graph.center = np.mean([atom.position for atom in contact_atoms], axis=0)
         structure = contact_atoms[0].residue.chain.model
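One behavioral note on the consolidation above: the set comprehension deduplicates residues shared by several contact atoms, so the node list at residue resolution has arbitrary ordering. A toy sketch, with tuples standing in for deeprank2 Atom objects (hypothetical stand-ins, not the real classes):

# (atom_name, residue_id) tuples stand in for Atom objects here.
contact_atoms = [("CA", "ARG1"), ("CB", "ARG1"), ("CA", "GLU2")]
resolution = "residue"

nodes = contact_atoms if resolution == "atom" else list({residue for _, residue in contact_atoms})
print(nodes)  # e.g. ['GLU2', 'ARG1'] -- duplicates removed, set order arbitrary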
@@ -453,7 +445,7 @@ def __iter__(self) -> Iterator[Query]:
     def __len__(self) -> int:
         return len(self._queries)

-    def _process_one_query(self, query: Query) -> None:
+    def _process_one_query(self, query: Query, log_error_traceback: bool = False) -> None:
         """Only one process may access an hdf5 file at a time."""
         try:
             output_path = f"{self._prefix}-{os.getpid()}.hdf5"
@@ -479,10 +471,12 @@ def _process_one_query(self, query: Query) -> None:

         except (ValueError, AttributeError, KeyError, TimeoutError) as e:
             _log.warning(
-                f"\nGraph/Query with ID {query.get_query_id()} ran into an Exception ({e.__class__.__name__}: {e}),"
-                " and it has not been written to the hdf5 file. More details below:",
+                f"Graph/Query with ID {query.get_query_id()} ran into an Exception and was not written to the hdf5 file.\n"
+                f"Exception found: {e.__class__.__name__}: {e}.\n"
+                "You may proceed with your analysis, but this query will be ignored.\n",
             )
-            _log.exception(e)
+            if log_error_traceback:
+                _log.exception(f"----Full error traceback:----\n{e}")

     def process(
         self,
@@ -493,6 +487,7 @@ def process(
         grid_settings: GridSettings | None = None,
         grid_map_method: MapMethod | None = None,
         grid_augmentation_count: int = 0,
+        log_error_traceback: bool = False,
     ) -> list[str]:
         """Render queries into graphs (and optionally grids).

@@ -510,6 +505,8 @@ def process(
             grid_settings: If valid together with `grid_map_method`, the grid data will be stored as well. Defaults to None.
             grid_map_method: If valid together with `grid_settings`, the grid data will be stored as well. Defaults to None.
             grid_augmentation_count: Number of grid data augmentations (must be >= 0). Defaults to 0.
+            log_error_traceback: If True, logs the full error traceback in case a query fails. Otherwise only the error message is logged.
+                Defaults to False.

         Returns:
             The list of paths of the generated HDF5 files.
@@ -536,7 +533,7 @@ def process(
         self._grid_augmentation_count = grid_augmentation_count

         _log.info(f"Creating pool function to process {len(self)} queries...")
-        pool_function = partial(self._process_one_query)
+        pool_function = partial(self._process_one_query, log_error_traceback=log_error_traceback)
         with Pool(self._cpu_count) as pool:
             _log.info("Starting pooling...\n")
             pool.map(pool_function, self.queries)
@@ -551,6 +548,24 @@ def process(
                 os.remove(output_path)
             return glob(f"{prefix}.hdf5")

+        n_processed = 0
+        for hdf5file in output_paths:
+            with h5py.File(hdf5file, "r") as hdf5:
+                # List of all graphs in hdf5, each graph representing
+                # an SRV and its surrounding environment
+                n_processed += len(list(hdf5.keys()))
+
+        if not n_processed:
+            msg = "No queries have been processed."
+            raise ValueError(msg)
+        if n_processed != len(self.queries):
+            _log.warning(
+                f"Not all queries have been processed. You can proceed with the analysis of {n_processed}/{len(self.queries)} queries.\n"
+                "Set `log_error_traceback` to True for advanced troubleshooting.",
+            )
+        else:
+            _log.info(f"{n_processed} queries have been processed.")
+
         return output_paths

     def _set_feature_modules(self, feature_modules: list[ModuleType, str] | ModuleType | str) -> list[str]:
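Taken together, these changes mean a failing query no longer dumps a full traceback by default; callers opt in via the new flag. A hedged usage sketch (the query setup is elided; all other `process` arguments keep their defaults):

from deeprank2.query import QueryCollection

queries = QueryCollection()
# ... queries.add(...) calls elided ...

# Opt in to full tracebacks for failed queries; by default only a short
# warning per failed query is logged, as added in this PR.
output_paths = queries.process(log_error_traceback=True)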

pyproject.toml

Lines changed: 5 additions & 4 deletions

@@ -53,8 +53,8 @@ dependencies = [
     "python-louvain >= 0.16, < 1.0",
     "tqdm >= 4.66.4, < 5.0",
     "freesasa >= 2.1.1, < 3.0",
-    "biopython >= 1.83, < 2.0"
-]
+    "biopython >= 1.83, < 2.0",
+]

 [project.optional-dependencies]
 # development dependency groups
@@ -66,7 +66,7 @@ test = [
     "pytest-cov >= 4.1.0, < 5.0",
     "pytest-runner >= 6.0.0, < 7.0",
     "coveralls >= 3.3.1, < 4.0",
-    "ruff == 0.5.1"
+    "ruff == 0.6.3",
 ]
 publishing = ["build", "twine", "wheel"]
 notebooks = ["nbmake"]
@@ -88,7 +88,7 @@ include = ["deeprank2*"]

 [tool.pytest.ini_options]
 # pytest options: -ra: show summary info for all test outcomes
-addopts = "-ra"
+addopts = "-ra"

 [tool.ruff]
 output-format = "concise"
@@ -148,3 +148,4 @@ isort.known-first-party = ["deeprank2"]
 ]
 "docs/*" = ["ALL"]
 "tests/perf/*" = ["T201"]  # Use of print statements
+"*.ipynb" = ["T201", "E402", "D103"]

tests/data/hdf5/_generate_testdata.ipynb

Lines changed: 13 additions & 12 deletions

@@ -15,11 +15,8 @@
     "PATH_TEST = ROOT / \"tests\"\n",
     "import glob\n",
     "import os\n",
-    "import re\n",
-    "import sys\n",
     "\n",
     "import h5py\n",
-    "import numpy as np\n",
     "import pandas as pd\n",
     "\n",
     "from deeprank2.dataset import save_hdf5_keys\n",
@@ -79,7 +76,7 @@
     "        chain_ids=[chain_id1, chain_id2],\n",
     "        targets=targets,\n",
     "        pssm_paths={chain_id1: pssm_path1, chain_id2: pssm_path2},\n",
-    "    )\n",
+    "    ),\n",
     "    )\n",
     "\n",
     "    # Generate graphs and save them in hdf5 files\n",
@@ -128,8 +125,8 @@
     "csv_data = pd.read_csv(csv_file_path)\n",
     "csv_data.cluster = csv_data.cluster.fillna(-1)\n",
     "pdb_ids_csv = [pdb_file.split(\"/\")[-1].split(\".\")[0].replace(\"-\", \"_\") for pdb_file in pdb_files]\n",
-    "clusters = [csv_data[pdb_id == csv_data.ID].cluster.values[0] for pdb_id in pdb_ids_csv]\n",
-    "bas = [csv_data[pdb_id == csv_data.ID].measurement_value.values[0] for pdb_id in pdb_ids_csv]\n",
+    "clusters = [csv_data[pdb_id == csv_data.ID].cluster.to_numpy()[0] for pdb_id in pdb_ids_csv]\n",
+    "bas = [csv_data[pdb_id == csv_data.ID].measurement_value.to_numpy()[0] for pdb_id in pdb_ids_csv]\n",
     "\n",
     "queries = QueryCollection()\n",
     "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n",
@@ -147,7 +144,7 @@
     "            \"cluster\": clusters[i],\n",
     "        },\n",
     "        pssm_paths={\"M\": pssm_m[i], \"P\": pssm_p[i]},\n",
-    "    )\n",
+    "    ),\n",
     "    )\n",
     "print(\"Queries created and ready to be processed.\\n\")\n",
     "\n",
@@ -183,7 +180,7 @@
     "test_ids = []\n",
     "\n",
     "with h5py.File(hdf5_path, \"r\") as hdf5:\n",
-    "    for key in hdf5.keys():\n",
+    "    for key in hdf5:\n",
     "        feature_value = float(hdf5[key][target][feature][()])\n",
     "        if feature_value in train_clusters:\n",
     "            train_ids.append(key)\n",
@@ -192,7 +189,7 @@
     "        elif feature_value in test_clusters:\n",
     "            test_ids.append(key)\n",
     "\n",
-    "        if feature_value in clusters.keys():\n",
+    "        if feature_value in clusters:\n",
     "            clusters[int(feature_value)] += 1\n",
     "        else:\n",
     "            clusters[int(feature_value)] = 1\n",
@@ -278,8 +275,12 @@
     "    targets = compute_ppi_scores(pdb_path, ref_path)\n",
     "    queries.add(\n",
     "        ProteinProteinInterfaceQuery(\n",
-    "            pdb_path=pdb_path, resolution=\"atom\", chain_ids=[chain_id1, chain_id2], targets=targets, pssm_paths={chain_id1: pssm_path1, chain_id2: pssm_path2}\n",
-    "        )\n",
+    "            pdb_path=pdb_path,\n",
+    "            resolution=\"atom\",\n",
+    "            chain_ids=[chain_id1, chain_id2],\n",
+    "            targets=targets,\n",
+    "            pssm_paths={chain_id1: pssm_path1, chain_id2: pssm_path2},\n",
+    "        ),\n",
     "    )\n",
     "\n",
     "# Generate graphs and save them in hdf5 files\n",
@@ -303,7 +304,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.10.14"
   },
  "orig_nbformat": 4,
  "vscode": {

tests/test_dataset.py

Lines changed: 1 addition & 1 deletion

@@ -1201,7 +1201,7 @@ def test_inherit_info_pretrained_model_graphdataset(self) -> None:
         for key in data["features_transform"].values():
             if key["transform"] is None:
                 continue
-            key["transform"] = eval(key["transform"])  # noqa: S307, PGH001
+            key["transform"] = eval(key["transform"])  # noqa: S307

         dataset_test_vars = vars(dataset_test)
         for param in dataset_test.inherited_params:

tests/utils/test_graph.py

Lines changed: 1 addition & 1 deletion

@@ -27,7 +27,7 @@
 target_value = 1.0


-@pytest.fixture()
+@pytest.fixture
 def graph() -> Graph:
     """Build a simple graph of two nodes and one edge in between them."""
     # load the structure
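Since pytest 5.0 the fixture decorator may be written bare when it takes no arguments, and recent ruff versions flag the empty parentheses (via the flake8-pytest-style rules, if enabled in the project config). A minimal sketch of the bare form:

import pytest


@pytest.fixture  # bare decorator, equivalent to @pytest.fixture()
def numbers() -> list[int]:
    return [1, 2, 3]


def test_sum(numbers: list[int]) -> None:
    assert sum(numbers) == 6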
