From dd74395aeae60935fd1307d927b304c3c8f188f2 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Wed, 20 Nov 2024 23:03:35 -0500 Subject: [PATCH] Mixed count base batches (#158) --- permutect/architecture/base_model.py | 45 ++++----- permutect/architecture/gated_mlp.py | 97 +++++++++++-------- permutect/data/base_dataset.py | 24 ++--- permutect/data/base_datum.py | 6 +- permutect/parameters.py | 8 +- permutect/test/tools/test_train_base_model.py | 4 +- permutect/test/tools/test_train_model.py | 3 +- permutect/utils.py | 40 ++++++++ 8 files changed, 133 insertions(+), 94 deletions(-) diff --git a/permutect/architecture/base_model.py b/permutect/architecture/base_model.py index a5d20ace..b228f0f7 100644 --- a/permutect/architecture/base_model.py +++ b/permutect/architecture/base_model.py @@ -119,7 +119,6 @@ def __init__(self, params: BaseModelParameters, num_read_features: int, num_info self._dtype = DEFAULT_GPU_FLOAT if device != torch.device("cpu") else DEFAULT_CPU_FLOAT self._ref_sequence_length = ref_sequence_length self._params = params - self.alt_downsample = params.alt_downsample # embeddings of reads, info, and reference sequence prior to the transformer layers self.read_embedding = MLP([num_read_features] + params.read_layers, batch_normalize=params.batch_normalize, dropout_p=params.dropout_p) @@ -130,7 +129,6 @@ def __init__(self, params: BaseModelParameters, num_read_features: int, num_info assert embedding_dim % params.num_transformer_heads == 0 self.ref_alt_reads_encoder = make_gated_ref_alt_mlp_encoder(embedding_dim, params) - self.alt_encoder = make_gated_mlp_encoder(embedding_dim, params) # after encoding alt reads (along with info and ref seq embeddings and with self-attention to ref reads) # pass through another MLP @@ -163,35 +161,31 @@ def forward(self, batch: BaseBatch): # so, for example, "re" means a 2D tensor with all reads in the batch stacked and "vre" means a 3D tensor indexed # first by variant within the batch, then the read def calculate_representations(self, batch: BaseBatch, weight_range: float = 0) -> torch.Tensor: - ref_count, alt_count = batch.ref_count, batch.alt_count - total_ref, total_alt = ref_count * batch.size(), alt_count * batch.size() + ref_counts, alt_counts = batch.ref_counts, batch.alt_counts + total_ref, total_alt = torch.sum(ref_counts).item(), torch.sum(alt_counts).item() read_embeddings_re = self.read_embedding.forward(batch.get_reads_2d().to(dtype=self._dtype)) info_embeddings_ve = self.info_embedding.forward(batch.get_info_2d().to(dtype=self._dtype)) ref_seq_embeddings_ve = self.ref_seq_cnn(batch.get_ref_sequences_2d().to(dtype=self._dtype)) info_and_seq_ve = torch.hstack((info_embeddings_ve, ref_seq_embeddings_ve)) - info_and_seq_re = torch.vstack((torch.repeat_interleave(info_and_seq_ve, ref_count, dim=0), - torch.repeat_interleave(info_and_seq_ve, alt_count, dim=0))) + info_and_seq_re = torch.vstack((torch.repeat_interleave(info_and_seq_ve, repeats=ref_counts, dim=0), + torch.repeat_interleave(info_and_seq_ve, repeats=alt_counts, dim=0))) reads_info_seq_re = torch.hstack((read_embeddings_re, info_and_seq_re)) - ref_reads_info_seq_vre = None if total_ref == 0 else reads_info_seq_re[:total_ref].reshape(batch.size(), ref_count, -1) - alt_reads_info_seq_vre = reads_info_seq_re[total_ref:].reshape(batch.size(), alt_count, -1) - if self.alt_downsample < alt_count: - alt_read_indices = torch.randperm(alt_count)[:self.alt_downsample] - alt_reads_info_seq_vre = alt_reads_info_seq_vre[:, alt_read_indices, :] # downsample only along the middle (read) dimension - alt_count = self.alt_downsample - total_alt = batch.size() * self.alt_downsample + # TODO: might be a bug if every datum in batch has zero ref reads? + ref_reads_info_seq_re = reads_info_seq_re[:total_ref] + alt_reads_info_seq_re = reads_info_seq_re[total_ref:] - # undo some of the above rearrangement + # TODO: make sure it handles ref count = 0 case + transformed_ref_re, transformed_alt_re = self.ref_alt_reads_encoder.forward(ref_reads_info_seq_re, alt_reads_info_seq_re, ref_counts, alt_counts) - transformed_ref_vre, transformed_alt_vre = (None, self.alt_encoder(alt_reads_info_seq_vre)) if total_ref == 0 else \ - self.ref_alt_reads_encoder(ref_reads_info_seq_vre, alt_reads_info_seq_vre) + alt_weights_r = 1 + weight_range * (1 - 2 * torch.rand(total_alt, device=self._device, dtype=self._dtype)) - alt_weights_vr = 1 + weight_range * (1 - 2 * torch.rand(batch.size(), alt_count, device=self._device, dtype=self._dtype)) - alt_wt_sums = torch.sum(alt_weights_vr, dim=1, keepdim=True) - # normalized so read weights within each variant sum to 1 and add dummy e dimension for broadcasting the multiply below - normalized_alt_weights_vr1 = (alt_weights_vr / alt_wt_sums).reshape(batch.size(), alt_count, 1) - alt_means_ve = torch.sum(transformed_alt_vre * normalized_alt_weights_vr1, dim=1) + # normalize so read weights within each variant sum to 1 + alt_wt_sums_v = utils.sums_over_rows(alt_weights_r, alt_counts) + normalized_alt_weights_r = alt_weights_r / torch.repeat_interleave(alt_wt_sums_v, repeats=alt_counts, dim=0) + + alt_means_ve = utils.sums_over_rows(transformed_alt_re * normalized_alt_weights_r[:,None], alt_counts) result_ve = self.aggregation.forward(alt_means_ve) @@ -371,8 +365,10 @@ def loss_function(self, base_model: BaseModel, base_batch: BaseBatch, base_model alt_vre = torch.cat((alt_representations_vre, random_alt_seeds_vre), dim=-1) ref_vre = torch.cat((ref_representations_vre, random_ref_seeds_vre), dim=-1) if ref_count > 0 else None - decoded_alt_vre = self.alt_decoder(alt_vre) - decoded_ref_vre = self.ref_decoder(ref_vre) if ref_count > 0 else None + # TODO: update these to reflect mixed-count batches. Gated MLPs now take inputs flattened over batch dimension + # TODO: and have an extra input of ref and alt read counts + decoded_alt_vre = self.alt_decoder.forward(alt_vre) + decoded_ref_vre = self.ref_decoder.forward(ref_vre) if ref_count > 0 else None decoded_alt_re = torch.reshape(decoded_alt_vre, (var_count * alt_count, -1)) decoded_ref_re = torch.reshape(decoded_ref_vre, (var_count * ref_count, -1)) if ref_count > 0 else None @@ -506,11 +502,10 @@ def learn_base_model(base_model: BaseModel, dataset: BaseDataset, learning_metho .to(device=base_model._device, dtype=base_model._dtype) classifier_bce = torch.nn.BCEWithLogitsLoss(reduction='none') - # TODO: fused = is_cuda? classifier_optimizer = torch.optim.AdamW(classifier_on_top.parameters(), lr=training_params.learning_rate, weight_decay=training_params.weight_decay, - fused=True) + fused=is_cuda) classifier_metrics = LossMetrics() validation_fold_to_use = (dataset.num_folds - 1) if validation_fold is None else validation_fold diff --git a/permutect/architecture/gated_mlp.py b/permutect/architecture/gated_mlp.py index 15fba26a..e862c569 100644 --- a/permutect/architecture/gated_mlp.py +++ b/permutect/architecture/gated_mlp.py @@ -23,6 +23,8 @@ import torch from torch import nn +from permutect import utils + class GatedMLPBlock(nn.Module): """ @@ -66,19 +68,22 @@ def __init__(self, d_model: int, d_ffn: int): # *gMLP* block as a replacement for the [Transformer Layer](../models.html#Encoder). self.size = d_model - def forward(self, x_bre: torch.Tensor): + # X is 2D, counts are the numbers of elements in each consecutive group of rows that form a self-attention group + # that is, is X has 10 rows and counts = [2,3,5], elements 0-1, 2-4, and 5-9 form independent self-attention groups + # In other words, all the reads of a batch are flattened together in X -- the batch information is in counts + def forward(self, x_re: torch.Tensor, counts: torch.IntTensor): """ * `x_bre` is the input read embedding tensor of shape Batch x Reads x Embedding """ # Norm, projection to d_ffn, and activation $Z = \sigma(XU)$ - z_brd = self.activation(self.proj1(self.norm(x_bre))) + z_rd = self.activation(self.proj1(self.norm(x_re))) # Spacial Gating Unit $\tilde{Z} = s(Z)$ - gated_brd = self.sgu(z_brd) + gated_rd = self.sgu.forward(z_rd, counts) # Final projection $Y = \tilde{Z}V$ back to embedding dimension - gated_bre = self.proj2(gated_brd) + gated_re = self.proj2(gated_rd) # Add the shortcut connection - return x_bre + gated_bre + return x_re + gated_re class SpacialGatingUnit(nn.Module): @@ -105,24 +110,23 @@ def __init__(self, d_z: int): # Normalization layer before applying $f_{W,b}(\cdot)$ self.norm = nn.LayerNorm([d_z // 2]) # Weight $W$ in $f_{W,b}(\cdot)$. - # + + # TODO: shouldn't alpha and beta be element-by-element??? self.alpha = nn.Parameter(torch.tensor(0.01)) self.beta = nn.Parameter(torch.tensor(0.01)) - def forward(self, z_brd: torch.Tensor): - """ - * `z_brd` is the input tensor of shape Batch x Reads x Dimension - `[seq_len, batch_size, d_z]` - """ - + # Z is 2D, counts are the numbers of elements in each consecutive group of rows that form a self-attention group + # that is, is X has 10 rows and counts = [2,3,5], elements 0-1, 2-4, and 5-9 form independent self-attention groups + def forward(self, z_rd: torch.Tensor, counts: torch.IntTensor): # Split $Z$ into $Z_1$ and $Z_2$ over the hidden dimension and normalize $Z_2$ before $f_{W,b}(\cdot)$ - z1_brd, z2_brd = torch.chunk(z_brd, 2, dim=-1) - z2_brd = self.norm(z2_brd) + z1_rd, z2_rd = torch.chunk(z_rd, 2, dim=-1) + z2_rd = self.norm(z2_rd) - z2_brd = 1 + self.alpha * z2_brd + torch.mean(z2_brd, dim=1, keepdim=True) + # TODO: self.beta needs to multiply the mean field here!!! + z2_rd = 1 + self.alpha * z2_rd + utils.means_over_rows(z2_rd, counts, keepdim=True) # $Z_1 \odot f_{W,b}(Z_2)$ - return z1_brd * z2_brd + return z1_rd * z2_rd class GatedMLP(nn.Module): @@ -131,9 +135,11 @@ def __init__(self, d_model: int, d_ffn: int, num_blocks: int): self.blocks = nn.ModuleList([GatedMLPBlock(d_model, d_ffn) for _ in range(num_blocks)]) - def forward(self, x): + # X is 2D, counts are the numbers of elements in each consecutive group of rows that form a self-attention group + # that is, is X has 10 rows and counts = [2,3,5], elements 0-1, 2-4, and 5-9 form independent self-attention groups + def forward(self, x, counts): for block in self.blocks: - x = block(x) + x = block.forward(x, counts) return x @@ -166,22 +172,22 @@ def __init__(self, d_model: int, d_ffn: int): # *gMLP* block as a replacement for the [Transformer Layer](../models.html#Encoder). self.size = d_model - def forward(self, ref_bre: torch.Tensor, alt_bre: torch.Tensor): + def forward(self, ref_re: torch.Tensor, alt_re: torch.Tensor, ref_counts: torch.IntTensor, alt_counts: torch.IntTensor): """ * `x_bre` is the input read embedding tensor of shape Batch x Reads x Embedding """ # Norm, projection to d_ffn, and activation $Z = \sigma(XU)$ - zref_brd = self.activation(self.proj1_ref(self.norm(ref_bre))) - zalt_brd = self.activation(self.proj1_alt(self.norm(alt_bre))) + zref_rd = self.activation(self.proj1_ref(self.norm(ref_re))) + zalt_rd = self.activation(self.proj1_alt(self.norm(alt_re))) # Spacial Gating Unit $\tilde{Z} = s(Z)$ - gated_ref_brd, gated_alt_brd = self.sgu(zref_brd, zalt_brd) + gated_ref_rd, gated_alt_rd = self.sgu.forward(zref_rd, zalt_rd, ref_counts, alt_counts) # Final projection $Y = \tilde{Z}V$ back to embedding dimension - gated_ref_bre = self.proj2_ref(gated_ref_brd) - gated_alt_bre = self.proj2_alt(gated_alt_brd) + gated_ref_re = self.proj2_ref(gated_ref_rd) + gated_alt_re = self.proj2_alt(gated_alt_rd) # Add the shortcut connection - return ref_bre + gated_ref_bre, alt_bre + gated_alt_bre + return ref_re + gated_ref_re, alt_re + gated_alt_re class SpacialGatingUnitRefAlt(nn.Module): @@ -196,36 +202,41 @@ def __init__(self, d_z: int): # Normalization layer before applying $f_{W,b}(\cdot)$ self.norm = nn.LayerNorm([d_z // 2]) # Weight $W$ in $f_{W,b}(\cdot)$. - # + + # TODO: maybe let these parameters be element-by-element vectors? self.alpha_ref = nn.Parameter(torch.tensor(0.01)) self.alpha_alt = nn.Parameter(torch.tensor(0.01)) self.beta_ref = nn.Parameter(torch.tensor(0.01)) self.beta_alt = nn.Parameter(torch.tensor(0.01)) - self.gamma = nn.Parameter(torch.tensor(0.01)) - def forward(self, zref_brd: torch.Tensor, zalt_brd: torch.Tensor): - """ - * `z_brd` is the input tensor of shape Batch x Reads x Dimension - `[seq_len, batch_size, d_z]` - """ + # regularizer / sort of imputed value for when there are no ref counts + self.ref_regularizer = nn.Parameter(0.1 * torch.ones(d_z // 2)) + self.regularizer_weight = nn.Parameter(torch.tensor(0.1)) + + def forward(self, zref_rd: torch.Tensor, zalt_rd: torch.Tensor, ref_counts: torch.IntTensor, alt_counts: torch.IntTensor): # Split $Z$ into $Z_1$ and $Z_2$ over the hidden dimension and normalize $Z_2$ before $f_{W,b}(\cdot)$ - z1_ref_brd, z2_ref_brd = torch.chunk(zref_brd, 2, dim=-1) - z1_alt_brd, z2_alt_brd = torch.chunk(zalt_brd, 2, dim=-1) - z2_ref_brd = self.norm(z2_ref_brd) - z2_alt_brd = self.norm(z2_alt_brd) + z1_ref_rd, z2_ref_rd = torch.chunk(zref_rd, 2, dim=-1) + z1_alt_rd, z2_alt_rd = torch.chunk(zalt_rd, 2, dim=-1) + z2_ref_rd = self.norm(z2_ref_rd) + z2_alt_rd = self.norm(z2_alt_rd) + + # these are means by variant -- need repeat_interleave to make them by-read + ref_mean_field_vd = utils.means_over_rows_with_regularizer(z2_ref_rd, ref_counts, self.ref_regularizer, self.regularizer_weight) + alt_mean_field_vd = utils.means_over_rows(z2_alt_rd, alt_counts) - ref_mean_field_brd = torch.mean(z2_ref_brd, dim=1, keepdim=True) - alt_mean_field_brd = torch.mean(z2_alt_brd, dim=1, keepdim=True) + ref_mean_field_on_ref_rd = torch.repeat_interleave(ref_mean_field_vd, dim=0, repeats=ref_counts) + ref_mean_field_on_alt_rd = torch.repeat_interleave(ref_mean_field_vd, dim=0, repeats=alt_counts) + alt_mean_field_on_alt_rd = torch.repeat_interleave(alt_mean_field_vd, dim=0, repeats=alt_counts) # same as above except now there is an additional term for the ref mean field influence on alt # maybe later also let alt mean field influence ref - z2_ref_brd = 1 + self.alpha_ref * z2_ref_brd + self.beta_ref * ref_mean_field_brd - z2_alt_brd = 1 + self.alpha_alt * z2_alt_brd + self.beta_alt * alt_mean_field_brd + self.gamma * ref_mean_field_brd + z2_ref_rd = 1 + self.alpha_ref * z2_ref_rd + self.beta_ref * ref_mean_field_on_ref_rd + z2_alt_rd = 1 + self.alpha_alt * z2_alt_rd + self.beta_alt * alt_mean_field_on_alt_rd + self.gamma * ref_mean_field_on_alt_rd # $Z_1 \odot f_{W,b}(Z_2)$ - return z1_ref_brd * z2_ref_brd, z1_alt_brd * z2_alt_brd + return z1_ref_rd * z2_ref_rd, z1_alt_rd * z2_alt_rd class GatedRefAltMLP(nn.Module): @@ -234,7 +245,7 @@ def __init__(self, d_model: int, d_ffn: int, num_blocks: int): self.blocks = nn.ModuleList([GatedRefAltMLPBlock(d_model, d_ffn) for _ in range(num_blocks)]) - def forward(self, ref, alt): + def forward(self, ref, alt, ref_counts, alt_counts): for block in self.blocks: - ref, alt = block(ref, alt) + ref, alt = block(ref, alt, ref_counts, alt_counts) return ref, alt diff --git a/permutect/data/base_dataset.py b/permutect/data/base_dataset.py index 1e6da29c..299946e6 100644 --- a/permutect/data/base_dataset.py +++ b/permutect/data/base_dataset.py @@ -63,10 +63,8 @@ def __init__(self, data_in_ram: Iterable[BaseDatum] = None, data_tarfile=None, n self._data = RaggedMmap(self._memory_map_dir.name) self._memory_map_mode = True - # keys = (ref read count, alt read count) tuples; values = list of indices # this is used in the batch sampler to make same-shape batches - self.labeled_indices_by_count = [defaultdict(list) for _ in range(num_folds)] - self.unlabeled_indices_by_count = [defaultdict(list) for _ in range(num_folds)] + self.indices_by_fold = [[] for _ in range(num_folds)] # totals by count, then by label -- ARTIFACT, VARIANT, UNLABELED, then by variant type # variant type is done as a 1D np array parallel to the one-hot encoding of variant type @@ -87,7 +85,7 @@ def __init__(self, data_in_ram: Iterable[BaseDatum] = None, data_tarfile=None, n fold = n % num_folds counts = (len(datum.reads_2d) - datum.alt_count, datum.alt_count) - (self.unlabeled_indices_by_count if datum.label == Label.UNLABELED else self.labeled_indices_by_count)[fold][counts].append(n) + self.indices_by_fold[fold].append(n) one_hot = datum.variant_type_one_hot() self.totals[ALL_COUNTS_SENTINEL][datum.label] += one_hot @@ -191,30 +189,24 @@ def chunk(lis, chunk_size): return [lis[i:i + chunk_size] for i in range(0, len(lis), chunk_size)] -# make batches that have a single value for ref, alt counts within batches. Labeled and unlabeled data are mixed. +# Labeled and unlabeled data are mixed. # the artifact model handles weighting the losses to compensate for class imbalance between supervised and unsupervised # thus the sampler is not responsible for balancing the data class SemiSupervisedBatchSampler(Sampler): def __init__(self, dataset: BaseDataset, batch_size, folds_to_use: List[int]): # combine the index maps of all relevant folds - self.indices_by_count = defaultdict(list) + self.indices_to_use = [] for fold in folds_to_use: - new_labeled = dataset.labeled_indices_by_count[fold] - new_unlabeled = dataset.unlabeled_indices_by_count[fold] - for count, indices in new_labeled.items(): - self.indices_by_count[count].extend(indices) - for count, indices in new_unlabeled.items(): - self.indices_by_count[count].extend(indices) + self.indices_to_use.extend(dataset.indices_by_fold[fold]) self.batch_size = batch_size - self.num_batches = sum(math.ceil(len(indices) // self.batch_size) for indices in self.indices_by_count.values()) + self.num_batches = math.ceil(len(self.indices_to_use) // self.batch_size) def __iter__(self): batches = [] # list of lists of indices -- each sublist is a batch - for index_list in self.indices_by_count.values(): - random.shuffle(index_list) - batches.extend(chunk(index_list, self.batch_size)) + random.shuffle(self.indices_to_use) + batches.extend(chunk(self.indices_to_use, self.batch_size)) random.shuffle(batches) return iter(batches) diff --git a/permutect/data/base_datum.py b/permutect/data/base_datum.py index 05d5a0fc..5e1396b1 100644 --- a/permutect/data/base_datum.py +++ b/permutect/data/base_datum.py @@ -550,7 +550,8 @@ def __init__(self, data: List[BaseDatum]): self._original_list = data self.ref_count = len(data[0].reads_2d) - data[0].alt_count self.alt_count = data[0].alt_count - self.alt_counts = IntTensor([data[0].alt_count for _ in data]) + self.alt_counts = IntTensor([datum.alt_count for datum in data]) + self.ref_counts = IntTensor([len(datum.reads_2d) - datum.alt_count for datum in data]) # for datum in data: # assert (datum.label() != Label.UNLABELED) == self.labeled, "Batch may not mix labeled and unlabeled" @@ -582,6 +583,8 @@ def __init__(self, data: List[BaseDatum]): def pin_memory(self): self.ref_sequences_2d = self.ref_sequences_2d.pin_memory() self.reads_2d = self.reads_2d.pin_memory() + self.alt_counts = self.alt_counts.pin_memory() + self.ref_counts = self.ref_counts.pin_memory() self.info_2d = self.info_2d.pin_memory() self.labels = self.labels.pin_memory() self.is_labeled_mask = self.is_labeled_mask.pin_memory() @@ -598,6 +601,7 @@ def copy_to(self, device, non_blocking): new_batch.is_labeled_mask = self.is_labeled_mask.to(device, non_blocking=non_blocking) new_batch.sources = self.sources.to(device, non_blocking=non_blocking) new_batch.alt_counts = self.alt_counts.to(device, non_blocking=non_blocking) + new_batch.ref_counts = self.ref_counts.to(device, non_blocking=non_blocking) return new_batch def original_list(self): diff --git a/permutect/parameters.py b/permutect/parameters.py index 246aa7f7..3ba61024 100644 --- a/permutect/parameters.py +++ b/permutect/parameters.py @@ -14,7 +14,7 @@ class BaseModelParameters: """ def __init__(self, read_layers: List[int], num_transformer_heads: int, transformer_hidden_dimension: int, num_transformer_layers: int, info_layers: List[int], aggregation_layers: List[int], - ref_seq_layers_strings: List[str], dropout_p: float, reweighting_range: float, batch_normalize: bool = False, alt_downsample: int = 100): + ref_seq_layers_strings: List[str], dropout_p: float, reweighting_range: float, batch_normalize: bool = False): self.read_layers = read_layers self.info_layers = info_layers @@ -26,7 +26,6 @@ def __init__(self, read_layers: List[int], num_transformer_heads: int, transform self.dropout_p = dropout_p self.reweighting_range = reweighting_range self.batch_normalize = batch_normalize - self.alt_downsample = alt_downsample def output_dimension(self): return self.aggregation_layers[-1] @@ -43,10 +42,9 @@ def parse_base_model_params(args) -> BaseModelParameters: dropout_p = getattr(args, constants.DROPOUT_P_NAME) reweighting_range = getattr(args, constants.REWEIGHTING_RANGE_NAME) batch_normalize = getattr(args, constants.BATCH_NORMALIZE_NAME) - alt_downsample = getattr(args, constants.ALT_DOWNSAMPLE_NAME) return BaseModelParameters(read_layers, num_transformer_heads, transformer_hidden_dimension, num_transformer_layers, info_layers, aggregation_layers, ref_seq_layer_strings, dropout_p, - reweighting_range, batch_normalize, alt_downsample) + reweighting_range, batch_normalize) def add_base_model_params_to_parser(parser): @@ -75,8 +73,6 @@ def add_base_model_params_to_parser(parser): parser.add_argument('--' + constants.REWEIGHTING_RANGE_NAME, type=float, default=0.3, required=False, help='magnitude of data augmentation by randomly weighted average of read embeddings. ' 'a value of x yields random weights between 1 - x and 1 + x') - parser.add_argument('--' + constants.ALT_DOWNSAMPLE_NAME, type=int, default=100, required=False, - help='max number of alt reads to downsample to inside the model') parser.add_argument('--' + constants.BATCH_NORMALIZE_NAME, action='store_true', help='flag to turn on batch normalization') diff --git a/permutect/test/tools/test_train_base_model.py b/permutect/test/tools/test_train_base_model.py index be3b5e5b..0f19f734 100644 --- a/permutect/test/tools/test_train_base_model.py +++ b/permutect/test/tools/test_train_base_model.py @@ -27,7 +27,6 @@ def test_train_base_model(): 'linear/out_features=10'] setattr(train_model_args, constants.REF_SEQ_LAYER_STRINGS_NAME, cnn_layer_strings) setattr(train_model_args, constants.DROPOUT_P_NAME, 0.0) - setattr(train_model_args, constants.ALT_DOWNSAMPLE_NAME, 20) setattr(train_model_args, constants.BATCH_NORMALIZE_NAME, False) setattr(train_model_args, constants.LEARNING_METHOD_NAME, 'SEMISUPERVISED') @@ -39,7 +38,8 @@ def test_train_base_model(): # training hyperparameters setattr(train_model_args, constants.REWEIGHTING_RANGE_NAME, 0.3) setattr(train_model_args, constants.BATCH_SIZE_NAME, 64) - setattr(train_model_args, constants.NUM_WORKERS_NAME, 2) + setattr(train_model_args, constants.INFERENCE_BATCH_SIZE_NAME, 64) + setattr(train_model_args, constants.NUM_WORKERS_NAME, 0) setattr(train_model_args, constants.NUM_EPOCHS_NAME, 2) setattr(train_model_args, constants.NUM_CALIBRATION_EPOCHS_NAME, 0) setattr(train_model_args, constants.LEARNING_RATE_NAME, 0.001) diff --git a/permutect/test/tools/test_train_model.py b/permutect/test/tools/test_train_model.py index cd56dc2e..906c44f3 100644 --- a/permutect/test/tools/test_train_model.py +++ b/permutect/test/tools/test_train_model.py @@ -32,7 +32,8 @@ def test_train_model(): # training hyperparameters setattr(train_model_args, constants.BATCH_SIZE_NAME, 64) - setattr(train_model_args, constants.NUM_WORKERS_NAME, 2) + setattr(train_model_args, constants.INFERENCE_BATCH_SIZE_NAME, 64) + setattr(train_model_args, constants.NUM_WORKERS_NAME, 0) setattr(train_model_args, constants.NUM_EPOCHS_NAME, 2) setattr(train_model_args, constants.NUM_CALIBRATION_EPOCHS_NAME, 1) setattr(train_model_args, constants.LEARNING_RATE_NAME, 0.001) diff --git a/permutect/utils.py b/permutect/utils.py index 01540861..b7aca8c6 100644 --- a/permutect/utils.py +++ b/permutect/utils.py @@ -183,6 +183,46 @@ def gamma_binomial(n, k, alpha, beta): return exponent_term + gamma_term - torch.log(n + 1) +# for tensor of shape (R, C...) and row counts n1, n2. . nK, return a tensor of shape (K, C...) whose 1st row is the sum of the +# first n1 rows of the input, 2nd row is the sum of the next n2 rows etc +# note that this works for arbitrary C, including empty. That is, it works for 1D, 2D, 3D etc input. +def sums_over_rows(input_tensor: torch.Tensor, counts: torch.IntTensor): + range_ends = torch.cumsum(counts, dim=0) + assert range_ends[-1] == len(input_tensor) # the counts need to add up! + + row_cumsums = torch.cumsum(input_tensor, dim=0) + + # if counts are eg 1, 2, 3 then range ends are 1, 3, 6 and we are interested in cumsums[0, 2, 5] + relevant_cumsums = row_cumsums[(range_ends - 1).long()] + + # if counts are eg 1, 2, 3 we now have, the sum of the first 1, 3, and 6 rows. To get the sums of row 0, rows 1-2, rows 3-5 + # we need the consecutive differences, with a row of zeroes prepended + row_of_zeroes = torch.zeros_like(relevant_cumsums[0])[None] # the [None] makes it (1xC) + relevant_sums = torch.diff(relevant_cumsums, dim=0, prepend=row_of_zeroes) + return relevant_sums + + +# same but divide by the counts to get means +def means_over_rows(input_tensor: torch.Tensor, counts: torch.IntTensor, keepdim: bool = False): + extra_dims = (1,) * (input_tensor.dim() - 1) + result = sums_over_rows(input_tensor, counts) / counts.view(-1, *extra_dims) + + return torch.repeat_interleave(result, dim=0, repeats=counts) if keepdim else result + + +# same but include a regularizer in case of zeros in the counts vector +# regularizer has the dimension of one row of the input tensor +def means_over_rows_with_regularizer(input_tensor: torch.Tensor, counts: torch.IntTensor, regularizer, regularizer_weight, keepdim: bool = False): + # TODO: left off right here + extra_dims = (1,) * (input_tensor.dim() - 1) + + regularized_sums = sums_over_rows(input_tensor, counts) + regularizer[None, :] + regularized_counts = counts + regularizer_weight + result = regularized_sums / regularized_counts.view(-1, *extra_dims) + + return torch.repeat_interleave(result, dim=0, repeats=counts) if keepdim else result + + class StreamingAverage: def __init__(self): self._count = 0.0