(fix): silent progress bar mode
Add an optional silent progress bar mode. Remove systematic warnings: the memory warning now depends on the size of the embeddings.
delfosseaurelien committed May 25, 2021
1 parent 38b5e5e commit 8ae28fb
Showing 2 changed files with 120 additions and 52 deletions.
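For context before the diff, here is a minimal usage sketch of the new flag. The `BioTransformers` constructor and backend name below are assumptions about the public API; the `silent` keyword is what this commit adds to the `compute_*` methods.

```python
# Hypothetical entry point and backend name (assumptions); the silent kwarg is
# what this commit threads down to the tqdm progress bars.
from biotransformers import BioTransformers

bio_trans = BioTransformers(backend="esm1_t6_43M_UR50S")  # hypothetical backend id
embeddings = bio_trans.compute_embeddings(
    ["MKTVRQERLKSIVRILERSKEPVSGAQL", "KALTARQQEVFDLIRD"],
    batch_size=2,
    pool_mode=("cls", "mean"),
    silent=True,  # new: disables the progress bar entirely
)
```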
71 changes: 51 additions & 20 deletions biotransformers/utils/utils.py
@@ -1,37 +1,66 @@
import math
from dataclasses import dataclass
from typing import List
from typing import List, Tuple

from biotransformers.utils.logger import logger

log = logger("utils")


def convert_bytes_size(size_bytes):
def convert_bytes_size(size_bytes: int) -> Tuple[str, bool]:
"""[summary]
Args:
size_bytes: size in bytes
Returns:
Tuple[str, bool]: the size with the appropriate unit, and a flag indicating
whether the memory warning should be displayed.
"""
if size_bytes == 0:
return "0B"
return "0B", False
size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
i = int(math.floor(math.log(size_bytes, 1024)))
p = math.pow(1024, i)
s = int(round(size_bytes / p, 2))
return "%s%s" % (s, size_name[i])
is_warning = True if i >= 3 else False

return "%s%s" % (s, size_name[i]), is_warning

def _check_memory_embeddings(sequences_list, embeddings_size, pool_mode):

def _check_memory_embeddings(
sequences_list: List[str], embeddings_size: int, pool_mode: Tuple[str, ...]
):
"""Function to compute the memory taken by the embeddings with float64 number.
Args:
sequences_list: sequences of proteins
embeddings_size : size of the embeddings vector, depends on the model
pool_mode : aggregation function
"""
num_of_sequences = len(sequences_list)
emb_dict_len = len(pool_mode)
tensor_memory_bits = 64 # double/float64
memory_bits = num_of_sequences * embeddings_size * emb_dict_len * tensor_memory_bits
memory_bytes = int(memory_bits / 8)
memory_convert_bytes = convert_bytes_size(memory_bytes)
log.warning(
"Embeddings will need about %s of memory."
"Please make sure you have enough space",
memory_convert_bytes,
)
memory_convert_bytes, is_warning = convert_bytes_size(memory_bytes)

if is_warning:
log.warning(
"Embeddings will need about %s of memory."
"Please make sure you have enough space",
memory_convert_bytes,
)
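A back-of-the-envelope version of the check above, assuming float64 storage as in `_check_memory_embeddings` (the workload numbers are made up):

```python
from biotransformers.utils.utils import convert_bytes_size

n_sequences, embedding_size, n_pool_modes = 100_000, 768, 2  # hypothetical workload
# bits = n_sequences * embedding_size * n_pool_modes * 64 (double precision)
memory_bytes = n_sequences * embedding_size * n_pool_modes * 64 // 8
size, is_warning = convert_bytes_size(memory_bytes)
print(size, is_warning)  # '1GB' True -> the warning is logged
```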


def _check_memory_logits(sequences_list, vocab_size, pass_mode):
def _check_memory_logits(sequences_list: List[str], vocab_size: int, pass_mode: str):
"""Function to compute the memory taken by the logits with float64 numbers.
Args:
sequences_list : sequences of proteins
vocab_size : size of the token vocabulary, depends on the model
pass_mode : mode of model evaluation ('forward' or 'masked')
"""
num_of_sequences = len(sequences_list)
sum_seq_len = sum([len(seq) for seq in sequences_list])
max_seq_len = max([len(seq) for seq in sequences_list])
@@ -42,20 +71,22 @@ def _check_memory_logits(sequences_list, vocab_size, pass_mode):
memory_bits = num_of_sequences * max_seq_len * vocab_size * tensor_memory_bits

memory_bytes = int(memory_bits / 8)
memory_convert_bytes = convert_bytes_size(memory_bytes)
log.warning(
"%s mode will need about %s of memory. Please make sure you have enough space",
pass_mode,
memory_convert_bytes,
)
memory_convert_bytes, is_warning = convert_bytes_size(memory_bytes)

if is_warning:
log.warning(
"%s mode will need about %s of memory. Please make sure you have enough space",
pass_mode,
memory_convert_bytes,
)


def _check_sequence(sequences_list: List[str], model: str, length: int):
"""Function that control sequence length
Args:
model (str): name of the model
length (int): length limit to consider
model : name of the model
length : length limit to consider
Raises:
ValueError: if model is esm1b_t33_650M_UR50S and sequence length > 1024
"""
101 changes: 69 additions & 32 deletions biotransformers/wrappers/transformers_wrappers.py
@@ -5,7 +5,7 @@
sequences, and displays some properties of the transformer model.
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, Generator, Iterable, List, Tuple
from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple

import numpy as np
import torch
@@ -252,7 +252,15 @@ def _filter_loglikelihoods(
labels: torch.Tensor,
tokens: List[int],
) -> torch.Tensor:
"""Function to compute the loglikelihood of sequences based on logits
Args:
logits : model logits for each position of the input sequences
labels : token ids of the input sequences, used to locate the positions to keep
tokens: list of token ids to consider
Returns:
torch.Tensor: loglikelihood of each sequence
"""
masks = torch.zeros(labels.shape, dtype=torch.bool)
for token_id in tokens:
masks += labels == token_id
@@ -325,9 +333,7 @@ def _filter_and_pool_embeddings(
return embeddings_dict

def _model_evaluation(
self,
model_inputs: Dict[str, torch.tensor],
batch_size: int = 1,
self, model_inputs: Dict[str, torch.tensor], batch_size: int = 1, **kwargs
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Function which computes logits and embeddings based on a list of sequences,
@@ -343,14 +349,15 @@ def _model_evaluation(
* logits [num_seqs, max_len_seqs, vocab_size]
* embeddings [num_seqs, max_len_seqs+1, embedding_size]
"""

silent = kwargs.get("silent", False)
# Initialize logits and embeddings before looping over batches
logits = torch.Tensor() # [num_seqs, max_len_seqs+1, vocab_size]
embeddings = torch.Tensor() # [num_seqs, max_len_seqs+1, embedding_size]

for batch_inputs in tqdm(
self._generate_chunks(model_inputs, batch_size),
total=self._get_num_batch_iter(model_inputs, batch_size),
disable=silent,
):
batch_logits, batch_embeddings = self._model_pass(batch_inputs)

@@ -360,7 +367,11 @@ def _compute_logits(
return logits, embeddings
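The silent flag ends up as tqdm's `disable` argument, as in the loop above. A standalone sketch of the same pattern:

```python
from tqdm import tqdm

def run_batches(batches, silent: bool = False):
    results = []
    # disable=True suppresses the progress bar entirely, which is the point
    # of the new silent mode
    for batch in tqdm(batches, total=len(batches), disable=silent):
        results.append(sum(batch))  # stand-in for self._model_pass(batch_inputs)
    return results

run_batches([[1, 2], [3, 4]], silent=True)  # prints no progress bar
```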

def _compute_logits(
self, model_inputs: Dict[str, torch.Tensor], batch_size: int, pass_mode: str
self,
model_inputs: Dict[str, torch.Tensor],
batch_size: int,
pass_mode: str,
**kwargs
) -> torch.Tensor:
"""Intermediate function to compute logits
@@ -374,10 +385,14 @@ def _compute_logits(
"""
if pass_mode == "masked":
model_inputs, masked_ids_list = self._repeat_and_mask_inputs(model_inputs)
logits, _ = self._model_evaluation(model_inputs, batch_size=batch_size)
logits, _ = self._model_evaluation(
model_inputs, batch_size=batch_size, **kwargs
)
logits = self._gather_masked_outputs(logits, masked_ids_list)
elif pass_mode == "forward":
logits, _ = self._model_evaluation(model_inputs, batch_size=batch_size)
logits, _ = self._model_evaluation(
model_inputs, batch_size=batch_size, **kwargs
)
return logits

def _compute_accuracy(self, logits: torch.Tensor, labels: torch.Tensor) -> float:
@@ -445,12 +460,16 @@ def compute_logits(
batch_size: int = 1,
tokens_list: List[str] = None,
pass_mode: str = "forward",
) -> Tuple[np.ndarray, np.ndarray]:
"""Function that computes the logits from sequences
silent: bool = False,
) -> Tuple[List[np.ndarray]]:
"""Function that computes the logits from sequences.
It returns a list of logits, one per sequence. Each entry in the list
contains only the logits for the amino acids of interest.
Args:
sequences_list: List of sequences
batch_size: Batch size
batch_size: number of sequences to consider for the forward pass
pass_mode: Mode of model evaluation ('forward' or 'masked')
tokens_list: List of tokens to consider
@@ -466,32 +485,40 @@ def compute_logits(
inputs, labels, tokens = self._process_sequences_and_tokens(
sequences_list, tokens_list
)
logits = self._compute_logits(inputs, batch_size, pass_mode)
logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
logits, labels = self._filter_logits(logits, labels, tokens)

return logits.numpy(), labels.numpy()
lengths = [len(sequence) for sequence in sequences_list]
splitted_logits = torch.split(logits, lengths, dim=0)
splitted_logits = [logits.numpy() for logits in splitted_logits]

return splitted_logits
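What the per-sequence split does, on toy shapes: `torch.split` with a list of lengths cuts the stacked per-position logits back into one array per input sequence.

```python
import torch

stacked = torch.arange(12.0).reshape(6, 2)  # 6 filtered positions, toy vocab of size 2
lengths = [2, 4]                            # two sequences of length 2 and 4
per_sequence = [t.numpy() for t in torch.split(stacked, lengths, dim=0)]
print([a.shape for a in per_sequence])      # [(2, 2), (4, 2)]
```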

def compute_probabilities(
self,
sequences_list: List[str],
batch_size: int = 1,
tokens_list: List[str] = None,
pass_mode: str = "forward",
silent: bool = False,
) -> List[Dict[int, Dict[str, float]]]:
"""Function that computes the probabilities over amino-acids from sequences.
It takes as inputs a list of sequences and returns a list of dictionaries.
Each dictionary contains the probabilities over the natural amino-acids for each
position in the sequence. The keys represent the positions (indexed
starting with 0) and the values are dictionaries of probabilities over
the natural amino-acids for this position. In these dictionaries, the keys are
the amino-acids and the value the corresponding probabilities.
the natural amino-acids for this position.
In these dictionaries, the keys are the amino-acids and the values
are the corresponding probabilities.
Args:
sequences_list: List of sequences
batch_size: Batch size
batch_size: number of sequences to consider for the forward pass
pass_mode: Mode of model evaluation ('forward' or 'masked')
tokens_list: List of tokens to consider
silent : whether to display the progress bar
Returns:
List[Dict[int, Dict[str, float]]]: dictionaries of probabilities per seq
"""
@@ -504,7 +531,7 @@ def compute_probabilities(
inputs, labels, tokens = self._process_sequences_and_tokens(
sequences_list, tokens_list
)
logits = self._compute_logits(inputs, batch_size, pass_mode)
logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
logits, _ = self._filter_logits(logits, labels, tokens)

lengths = [len(sequence) for sequence in sequences_list]
@@ -535,6 +562,7 @@ def compute_loglikelihood(
batch_size: int = 1,
tokens_list: List[str] = None,
pass_mode: str = "forward",
silent: bool = False,
) -> np.ndarray:
"""Function that computes loglikelihoods of sequences
@@ -545,7 +573,7 @@ def compute_loglikelihood(
tokens_list: List of tokens to consider
Returns:
torch.Tensor: loglikelihoods in torch.tensor format
np.ndarray: loglikelihoods in numpy format
"""
if tokens_list is None:
tokens_list = NATURAL_AAS_LIST
@@ -556,7 +584,7 @@
inputs, labels, tokens = self._process_sequences_and_tokens(
sequences_list, tokens_list
)
logits = self._compute_logits(inputs, batch_size, pass_mode)
logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
loglikelihoods = self._filter_loglikelihoods(logits, labels, tokens)

return loglikelihoods.numpy()
@@ -567,15 +595,21 @@ def compute_embeddings(
batch_size: int = 1,
pool_mode: Tuple[str, ...] = ("cls", "mean"),
tokens_list: List[str] = None,
silent: bool = False,
) -> Dict[str, np.ndarray]:
"""Function that computes embeddings of sequences
"""Function that computes embeddings of sequences.
The embeddings have shape (n_sequences, num_tokens, embeddings_size), so we use
an aggregation function specified in pool_mode to aggregate the tensor on
the num_tokens dimension. 'mean' signifies that we take the mean over the
num_tokens dimension.
Args:
sequences_list: List of sequences
batch_size: Batch size
pool_mode: Mode of pooling ('cls', 'mean', 'min', 'max')
tokens_list: List of tokens to consider
silent : whether to display the progress bar
Returns:
Dict[str, np.ndarray]: arrays of shape [number_of_sequences, embeddings_size] per pool_mode
"""
@@ -600,6 +634,7 @@ def compute_embeddings(
for batch_inputs in tqdm(
self._generate_chunks(inputs, batch_size),
total=self._get_num_batch_iter(inputs, batch_size),
disable=silent,
):
_, batch_embeddings = self._model_pass(batch_inputs)
batch_labels = batch_inputs["input_ids"]
@@ -621,6 +656,7 @@ def compute_accuracy(
batch_size: int = 1,
pass_mode: str = "forward",
tokens_list: List[str] = None,
silent: bool = False,
) -> float:
"""Compute model accuracy from the input sequences
Expand All @@ -629,7 +665,7 @@ def compute_accuracy(
batch_size: [description]. Defaults to 1.
pass_mode: [description]. Defaults to "forward".
tokens_list: [description]. Defaults to None.
silent: whereas to display or not progress bar
Returns:
float: model accuracy computed on the input sequences
"""
@@ -641,7 +677,7 @@
inputs, labels, tokens = self._process_sequences_and_tokens(
sequences_list, tokens_list
)
logits = self._compute_logits(inputs, batch_size, pass_mode)
logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
logits, labels = self._filter_logits(logits, labels, tokens)
accuracy = self._compute_accuracy(logits, labels)

@@ -652,18 +688,19 @@ def compute_calibration(
sequences_list: List[str],
batch_size: int = 1,
pass_mode: str = "forward",
tokens_list: List[str] = None,
tokens_list: Optional[List[str]] = None,
n_bins: int = 10,
silent: bool = False,
) -> Dict[str, Any]:
"""Compute model calibration from the input sequences
Args:
sequences_list ([type]): [description]
batch_size ([type], optional): [description]. Defaults to 1.
pass_mode ([type], optional): [description]. Defaults to "forward".
tokens_list ([type], optional): [description]. Defaults to None.
n_bins ([type], optional): [description]. Defaults to 10.
sequences_list : List of sequences
batch_size : number of sequences to consider for the forward pass. Defaults to 1.
pass_mode : Mode of model evaluation ('forward' or 'masked'). Defaults to "forward".
tokens_list : List of tokens to consider. Defaults to None.
n_bins : number of bins used for the calibration. Defaults to 10.
silent : whether to display the progress bar
Returns:
Dict[str, Any]: calibration metrics of the model on the input sequences
"""
@@ -675,7 +712,7 @@ def compute_calibration(
inputs, labels, tokens = self._process_sequences_and_tokens(
sequences_list, tokens_list
)
logits = self._compute_logits(inputs, batch_size, pass_mode)
logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
logits, labels = self._filter_logits(logits, labels, tokens)
calibration_dict = self._compute_calibration(logits, labels, n_bins)

