From 9b1891384a38e3102fd2d5864d7ed14e5d43c050 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Tue, 21 Jan 2025 11:38:19 -0800 Subject: [PATCH 01/23] wip --- torchao/_models/llama/generate.py | 36 +- .../sparsity/superblock/blocksparse.py | 175 +++++++- .../sparsity/superblock/supermask.py | 401 ++++++++---------- .../prototype/sparsity/superblock/utils.py | 13 +- 4 files changed, 383 insertions(+), 242 deletions(-) diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py index b1d3475601..8c96ec5e2e 100644 --- a/torchao/_models/llama/generate.py +++ b/torchao/_models/llama/generate.py @@ -23,7 +23,12 @@ from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, get_model_size_in_bytes torch.sparse.SparseSemiStructuredTensor._FORCE_CUTLASS = False -torch.backends.cuda.enable_cudnn_sdp(True) +from torch._inductor import config as inductorconfig +inductorconfig.triton.unique_kernel_names = True +# torch.backends.cuda.enable_cudnn_sdp(True) +# torch.backends.cuda.enable_math_sdp(False) +# torch.backends.cuda.enable_flash_sdp(False) +# torch.backends.cuda.enable_mem_efficient_sdp(False) class HostEvent: @@ -322,7 +327,7 @@ def main( torch.manual_seed(1234) def ffn_only(mod, fqn): - return isinstance(mod, torch.nn.Linear) and "feed_forward" in fqn + return isinstance(mod, torch.nn.Linear) and "feed_forward" in fqn def not_ffn_only(mod, fqn): return isinstance(mod, torch.nn.Linear) and not ffn_only(mod, fqn) @@ -797,6 +802,27 @@ def ffn_or_attn_only(mod, fqn): # TODO there is a bug here, need to fix sparsify_(model.to(device), semi_sparse_weight(), filter_fn=ffn_only) + # standalone quantization + if "bsr" in sparsity: + from torchao.prototype.sparsity.superblock.utils import ( + accelerate_with_sparsity, + get_args_parser, + simulate_sparsity, + ) + + superblock_args = get_args_parser(benchmark=True).parse_args([]) + superblock_args.sparsity = "bsr" + superblock_args.sparsity_linear = 0.9 + superblock_args.bsr = 64 + + sparsifier_or_none = simulate_sparsity(model, superblock_args, ffn_only) + if sparsifier_or_none is not None: + sparsifier_or_none.squash_mask() + + + model = model.to(device) + accelerate_with_sparsity(model, superblock_args, ffn_only) + model_size = get_model_size_in_bytes(model, ignore_embeddings=True) / 1e9 if save: @@ -811,7 +837,7 @@ def ffn_or_attn_only(mod, fqn): print("Compiling Model") global decode_one_token, prefill decode_one_token = torch.compile( - decode_one_token, mode="reduce-overhead", fullgraph=True + decode_one_token, mode="reduce-overhead", fullgraph=True, dynamic=True, ) if compile_prefill: @@ -850,7 +876,7 @@ def ffn_or_attn_only(mod, fqn): prompt = f"{B_INST} {prompt.strip()} {E_INST}" encoded = encode_tokens(tokenizer, prompt, bos=True, device=device) - if interactive and i >= 0: + if interactive and i >= 0 and prefill_size is None: buffer = [] period_id = tokenizer.encode(".")[0] done_generating = False @@ -920,7 +946,7 @@ def callback(x): device_sync(device=device) # MKG t = time.perf_counter() - t0 - if not interactive and demo_summarize_prompt is None: + if not interactive and demo_summarize_prompt is None and prefill_size is None: tok_list = y[0].tolist() # truncate text after end of string token tokens = ( diff --git a/torchao/prototype/sparsity/superblock/blocksparse.py b/torchao/prototype/sparsity/superblock/blocksparse.py index b5e8432949..7391173b5f 100644 --- a/torchao/prototype/sparsity/superblock/blocksparse.py +++ b/torchao/prototype/sparsity/superblock/blocksparse.py @@ -1,12 +1,12 @@ from functools import partial -from 
typing import List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple import torch -from torch.sparse._triton_ops import broadcast_batch_dims, bsr_dense_addmm +from torch.sparse._triton_ops import broadcast_batch_dims, bsr_dense_addmm, bsr_dense_mm from torch.utils._python_dispatch import return_and_correct_aliasing - from torchao.quantization.quant_api import _get_linear_subclass_inserter from torchao.utils import TorchAOBaseTensor +# from torchao.prototype.sparsity.blocksparse._triton_ops import bsr_dense_addmm as torchao_bsr_dense_addmm aten = torch.ops.aten @@ -93,6 +93,8 @@ def blocksparse_linear( bias: torch.Tensor, ) -> torch.Tensor: weight_bsr = torch.sparse_bsr_tensor(crow_indices, col_indices, values, size=(M, K)) + # TODO: Change this to call into Triton kernel directly like int_addmm + # This way we know we must be on the hot path return torch.nn.functional.linear(A, weight_bsr, bias) @@ -110,13 +112,59 @@ def blocksparse_linear_abstract( return torch.empty(new_shape, dtype=A.dtype, device=A.device) +# bsr wrapper custom op +@torch.library.custom_op("blocksparse::addmm", mutates_args=()) +def blocksparse_addmm( + x_padded: torch.Tensor, + crow_indices: torch.Tensor, + col_indices: torch.Tensor, + values: torch.Tensor, + M: int, + K: int, + bias: torch.Tensor, +) -> torch.Tensor: + assert bias is None + weight_bsr = torch.sparse_bsr_tensor(crow_indices, col_indices, values, size=(M, K)) + N_padded = x_padded.shape[1] + out = x_padded.new_empty((M, N_padded)) + bsr_dense_addmm( + out, + weight_bsr, + # x, + x_padded, + alpha=1, + beta=0, + out=out, + # left_alpha=left_alpha, + # right_alpha=right_alpha, + ) + return out + + +@torch.library.register_fake("blocksparse::addmm") +def blocksparse_addmm_abstract( + x_padded: torch.Tensor, + crow_indices: torch.Tensor, + col_indices: torch.Tensor, + values: torch.Tensor, + M: int, + K: int, + bias: torch.Tensor, +) -> torch.Tensor: + N_padded = x_padded.shape[1] + return x_padded.new_empty((M, N_padded)) + + # Subclass definition class BlockSparseTensor(TorchAOBaseTensor): + # TODO: Use NJT as a field to store max/min seqlen bsr_crow_indices: Optional[torch.Tensor] bsr_col_indices: Optional[torch.Tensor] bsr_values: Optional[torch.Tensor] + # bsr_nt: Optional[torch.Tensor] __slots__ = ["bsr_crow_indices", "bsr_col_indices", "bsr_values"] + # __slots__ = ["bsr_crow_indices", "bsr_col_indices", "bsr_values", "bsr_nt"] @staticmethod def __new__( # noqa: PYI034 @@ -125,6 +173,7 @@ def __new__( # noqa: PYI034 bsr_crow_indices: Optional[torch.Tensor], bsr_col_indices: Optional[torch.Tensor], bsr_values: Optional[torch.Tensor], + # bsr_nt: Optional[torch.Tensor], requires_grad: bool = False, ): if bsr_values is None: @@ -141,9 +190,10 @@ def __new__( # noqa: PYI034 "requires_grad": requires_grad, } tensor = torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs) # type: ignore[attr-defined] + # tensor.bsr_nt = bsr_nt tensor.bsr_crow_indices = bsr_crow_indices - tensor.bsr_col_indices = bsr_col_indices tensor.bsr_values = bsr_values + tensor.bsr_col_indices = bsr_col_indices return tensor def __repr__(self) -> str: # type: ignore[override] @@ -171,17 +221,21 @@ def __tensor_unflatten__( bsr_crow_indices=inner_tensors.get("bsr_crow_indices", None), bsr_col_indices=inner_tensors.get("bsr_col_indices", None), bsr_values=inner_tensors.get("bsr_values", None), + # bsr_nt=inner_tensors.get("bsr_nt", None), requires_grad=requires_grad, ) @classmethod def from_dense(cls, dense_tensor, blocksize): bsr_tensor = 
dense_tensor.to_sparse_bsr(blocksize) + # print("A") + # bsr_nt = torch.nested.nested_tensor_from_jagged(bsr_tensor.values().detach(), bsr_tensor.crow_indices().detach()).detach() return cls( shape=dense_tensor.shape, bsr_crow_indices=bsr_tensor.crow_indices(), bsr_col_indices=bsr_tensor.col_indices(), bsr_values=bsr_tensor.values(), + # bsr_nt=bsr_nt, requires_grad=False, ) @@ -191,6 +245,7 @@ def apply_fn_to_shard(self, func): bsr_crow_indices=func(self.bsr_crow_indices), bsr_col_indices=func(self.bsr_col_indices), bsr_values=func(self.bsr_values), + # bsr_nt=func(self.bsr_nt), requires_grad=self.requires_grad, ) @@ -206,6 +261,64 @@ def block_sparse_detach(func, types, args, kwargs): ) +# @implements(aten.unsqueeze.default) +# def block_sparse_unsqueeze(func, types, args, kwargs): +# assert len(args) == 2 +# assert len(kwargs) == 0 +# assert args[-1] == 2 +# bsr = args[0] +# assert bsr.dim() == 2 +# assert not bsr.requires_grad +# return BlockSparseTensor(bsr.shape + (1,), +# bsr.crow_indices(), +# bsr.col_indices(), +# bsr.values().unsqueeze(-1)) +# # bsr.bsr_nt) + + +# @implements(aten.mul.Tensor) +# def block_sparse_mul(func, types, args, kwargs): +# assert len(args) == 2 +# assert len(kwargs) == 0 +# bsr, t = args + +# def my_mul(bsr, t): +# assert isinstance(bsr, BlockSparseTensor) +# assert isinstance(t, torch.Tensor) +# assert bsr.dim() == 3 +# assert t.dim() == 3 +# assert not bsr.requires_grad +# # import pdb; pdb.set_trace() +# assert t.size(0) == 1 +# t_blocked = t.view(t.size(0), t.size(1) // 64, 64, 1) +# masked_t = t_blocked.transpose(0, 1).index_select(0, bsr.col_indices()) +# new_values = bsr.values() * masked_t +# # print("C") +# # bsr_nt = torch.nested.nested_tensor_from_jagged(new_values.detach(), bsr.crow_indices().detach()).detach() +# return BlockSparseTensor(bsr.shape, +# bsr.crow_indices(), +# bsr.col_indices(), +# new_values) +# # bsr_nt) + +# if isinstance(bsr, torch.Tensor) and isinstance(t, BlockSparseTensor): +# return my_mul(t, bsr) +# return my_mul(bsr, t) + + +# @implements(aten.sum.dim_IntList) +# def block_sparse_sum(func, types, args, kwargs): +# bsr, dim = args +# assert type(dim) == list +# assert len(dim) == 1 +# dim = dim[0] +# bsr_dim = bsr.dim() +# assert dim == 1 +# # ret = bsr.bsr_nt.detach().sum(dim=1).view(bsr.shape[0], -1).sum(1, keepdim=True).detach() +# assert ret.dim() + 1 == bsr_dim +# return ret + + @implements(aten.values.default) def block_sparse_values(func, types, args, kwargs): return args[0].bsr_values.detach() @@ -226,12 +339,60 @@ def block_sparse__nnz(func, types, args, kwargs): return args[0].bsr_values.shape[0] +def next_power_of_two(n): + assert n > 0 + return 2 ** (n.bit_length()) + + @implements(torch.nn.functional.linear) def block_sparse_linear(func, types, args, kwargs): - x, w, bias = args - return torch.ops.blocksparse.linear( - x, w.crow_indices(), w.col_indices(), w.values(), w.shape[0], w.shape[1], bias + # linear(x, w^t) + # linear(w, x^t)^t + x_orig, w, bias = args + # # TODO: Change this to do padding to make sure blocksparse.linear works + # return torch.ops.blocksparse.linear( + # x, w.crow_indices(), w.col_indices(), w.values(), w.shape[0], w.shape[1], bias + # ) + x = x_orig.reshape(-1, x_orig.size(-1)).t() + M = w.shape[0] + K = w.shape[1] + N = x.shape[1] + # TODO: Replace this with mul + sum for the mv case similar to + # https://github.com/pytorch/pytorch/blob/a9685767773157440c162caaf125856e04e2981f/torch/_inductor/decomposition.py#L292 + # use .to_dense to get a baseline implementation that works and 
then use NJT for .sum and such + # breakpoint() + # if x.size(-1) == 1: + # # print("USING THIS") + # # breakpoint() + # out = (torch.mul(w.unsqueeze(2), x.unsqueeze(0))).sum(dim=1) + # out_orig = out.t().reshape(x_orig.shape[:-1] + (M,)) + # if bias is None: + # special_ret = out_orig + # else: + # special_ret = out_orig + bias + # return special_ret + # else: + N_padded = max(16, next_power_of_two(N)) + x_padded = torch.nn.functional.pad(x, (0, N_padded - N), 'constant', 0) + out = torch.ops.blocksparse.addmm( + x_padded, + w.crow_indices(), + w.col_indices(), + w.values(), + M, + K, + None, ) + # import pdb; pdb.set_trace() + # return out.view(x_orig.size(0), -1, M) + out_orig = out[:, :x.size(-1)].t().reshape(x_orig.shape[:-1] + (M,)) + if bias is None: + # if x.size(-1) == 1: + # assert special_ret.size() == out_orig.size() + return out_orig + # if x.size(-1) == 1: + # assert special_ret.size() == out_orig.size() + return out_orig + bias def block_sparse_weight(blocksize=64): diff --git a/torchao/prototype/sparsity/superblock/supermask.py b/torchao/prototype/sparsity/superblock/supermask.py index abd23c566e..e519589003 100644 --- a/torchao/prototype/sparsity/superblock/supermask.py +++ b/torchao/prototype/sparsity/superblock/supermask.py @@ -1,14 +1,17 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. +import torch.nn as nn import math - import torch -import torch.nn as nn +from torch.autograd import Variable import torch.nn.functional as F +import numpy as np + +from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter # original supermask -scores_min = None -scores_max = 9e9 +scores_min=None +scores_max=9e9 uniform_init_01 = False # adjusted supermask, initialize scores with uniform distribution in [0,1], clamp scores in each step in [0,1] @@ -16,54 +19,51 @@ # scores_max=1. 
# uniform_init_01 = True - def percentile(t, q): """Return the value that is larger than q% of t""" - k = 1 + round(0.01 * float(q) * (t.numel() - 1)) + k = 1 + round(.01 * float(q) * (t.numel() - 1)) return t.view(-1).kthvalue(k).values class GetSubnet(torch.autograd.Function): """Supermask STE function""" - @staticmethod def forward(ctx, scores, zeros, ones, sparsity): - clamped_scores = scores.clamp(min=scores_min, max=scores_max) - k_val = percentile(clamped_scores, sparsity * 100) - return torch.where( - clamped_scores < k_val, zeros.to(scores.device), ones.to(scores.device) - ) - + clamped_scores = scores.clamp(min=scores_min,max=scores_max) + k_val = percentile(clamped_scores, sparsity*100) + return torch.where(clamped_scores < k_val, zeros.to(scores.device), ones.to(scores.device)) @staticmethod def backward(ctx, g): return g, None, None, None +class ApplyMask(torch.autograd.Function): + """Supermask STE function""" + @staticmethod + def forward(ctx, weight, scores): + return weight * scores + @staticmethod + def backward(ctx, grad_output): + grad_weight = grad_scores = None + if ctx.needs_input_grad[0]: + grad_weight = grad_output + if ctx.needs_input_grad[1]: + grad_scores = grad_output + return grad_weight, grad_scores + + class SupermaskLinear(nn.Linear): """Supermask class for Linear layer""" - - def __init__( - self, - sparsity, - fixed_mask, - fixed_weight, - bitwidth, - transform, - fixed_transform, - *args, - **kwargs, - ): + def __init__(self, sparsity, fixed_mask, fixed_weight, bitwidth, transform, fixed_transform, *args, **kwargs): tile_size = kwargs.pop("tile_size", 1) super(SupermaskLinear, self).__init__(*args, **kwargs) # initialize the scores - max_sparsity = 1 - ( - 1 / math.prod([math.ceil(k / tile_size) for k in self.weight.size()]) - ) + max_sparsity = 1 - (1 / math.prod([math.ceil(k / tile_size) for k in self.weight.size()])) self.sparsity = sparsity if self.sparsity > max_sparsity: print( f"reducing sparsity from {self.sparsity} to {max_sparsity}", - f"(maximum sparsity for layer with shape {self.weight.size()} and tile size {tile_size})", + f"(maximum sparsity for layer with shape {self.weight.size()} and tile size {tile_size})" ) self.sparsity = max_sparsity self.tile_size = tile_size @@ -74,60 +74,42 @@ def __init__( ), requires_grad=not fixed_mask, ) - nn.init.uniform_(self.scores) if uniform_init_01 else nn.init.kaiming_uniform_( - self.scores, a=math.sqrt(5) - ) + nn.init.uniform_(self.scores) if uniform_init_01 else nn.init.kaiming_uniform_(self.scores, a=math.sqrt(5)) - # the shift and the scale are transformation parameters + # the shift and the scale are transformation parameters # the actually used weights = self.weight*self.scale+self.shift # the transformation is activated only for quantized weights - self.shift = nn.Parameter(torch.Tensor(1).fill_(0.0), requires_grad=False) - self.scale = nn.Parameter(torch.Tensor(1).fill_(1.0), requires_grad=False) - + self.shift=nn.Parameter(torch.Tensor(1).fill_(0.), requires_grad=False) + self.scale=nn.Parameter(torch.Tensor(1).fill_(1.), requires_grad=False) + with torch.no_grad(): # if bitwidth is None, then use floating point values in self.weight # if bitwidth is not None, then quantize self.weight into k-bit (k=bitwidth) - # quantized values are -2^(k-1), -2^(k-1)+1, ..., 0, 1, ..., 2^(k-1)-1 + # quantized values are -2^(k-1), -2^(k-1)+1, ..., 0, 1, ..., 2^(k-1)-1 # these quantized values are uniformly distributed if bitwidth is not None: weights_max = torch.max(self.weight).item() weights_min = 
torch.min(self.weight).item() - least_step = (weights_max - weights_min) / pow(2, bitwidth) - left_bound = weights_min - 1e-6 - right_bound = weights_min + least_step + 1e-6 + least_step = (weights_max-weights_min)/pow(2,bitwidth) + left_bound = weights_min-1e-6 + right_bound = weights_min+least_step+1e-6 # self.shift=nn.Parameter(torch.Tensor(1).fill_( (weights_min+(pow(2,bitwidth-1)+0.5)*least_step) if transform[0] is None else transform[0] ), requires_grad=not fixed_transform[0]) # self.scale=nn.Parameter(torch.Tensor(1).fill_( least_step if transform[1] is None else transform[1] ), requires_grad=not fixed_transform[1]) # for example, if using binary weights (k=1) with -a, +a, set transform = [a,2a]; if using binary weights (k=1) with a, 0, set transform = [0,-a]; - self.shift = nn.Parameter( - torch.Tensor(1).fill_( - 0.0 if transform[0] is None else transform[0] - ), - requires_grad=not fixed_transform[0], - ) - self.scale = nn.Parameter( - torch.Tensor(1).fill_( - 1.0 if transform[1] is None else transform[1] - ), - requires_grad=not fixed_transform[1], - ) - for i in range(-int(pow(2, bitwidth - 1)), int(pow(2, bitwidth - 1))): - self.weight[ - torch.logical_and( - self.weight > left_bound, self.weight <= right_bound - ) - ] = i + self.shift=nn.Parameter(torch.Tensor(1).fill_( 0. if transform[0] is None else transform[0] ), requires_grad=not fixed_transform[0]) + self.scale=nn.Parameter(torch.Tensor(1).fill_( 1. if transform[1] is None else transform[1] ), requires_grad=not fixed_transform[1]) + for i in range(-int(pow(2,bitwidth-1)),int(pow(2,bitwidth-1))): + self.weight[torch.logical_and(self.weight>left_bound, self.weight<=right_bound)] = i left_bound = right_bound right_bound += least_step self.weight.requires_grad = not fixed_weight def get_mask(self): - subnet = GetSubnet.apply( - self.scores, - torch.zeros_like(self.scores), - torch.ones_like(self.scores), - self.sparsity, - ) + subnet = GetSubnet.apply(self.scores, + torch.zeros_like(self.scores), + torch.ones_like(self.scores), + self.sparsity) if self.tile_size != 1: for i, k in enumerate(self.weight.shape): @@ -135,46 +117,33 @@ def get_mask(self): subnet = torch.narrow(subnet, i, 0, k) return subnet - + def sparsify_offline(self): subnet = self.get_mask() - self.weight.data = (self.weight * self.scale + self.shift) * subnet + self.weight.data = (self.weight*self.scale+self.shift) * subnet self.sparsify_weights = True def forward(self, x): if not self.sparsify_weights: subnet = self.get_mask() - w = (self.weight * self.scale + self.shift) * subnet - else: - w = self.weight - return F.linear(x, w, self.bias) - + w = (self.weight*self.scale+self.shift) + w = ApplyMask.apply(w, subnet) + return F.linear(x, w, self.bias) + return F.linear(x, self.weight, self.bias) + class SupermaskConv2d(nn.Conv2d): """Supermask class for Conv2d layer""" - - def __init__( - self, - sparsity, - fixed_mask, - fixed_weight, - bitwidth, - transform, - fixed_transform, - *args, - **kwargs, - ): + def __init__(self, sparsity, fixed_mask, fixed_weight, bitwidth, transform, fixed_transform, *args, **kwargs): tile_size = kwargs.pop("tile_size", 1) super(SupermaskConv2d, self).__init__(*args, **kwargs) # initialize the scores - max_sparsity = 1 - ( - 1 / math.prod([math.ceil(k / tile_size) for k in self.weight.size()]) - ) + max_sparsity = 1 - (1 / math.prod([math.ceil(k / tile_size) for k in self.weight.size()])) self.sparsity = sparsity if self.sparsity > max_sparsity: print( f"reducing sparsity from {self.sparsity} to {max_sparsity}", - f"(maximum 
sparsity for layer with shape {self.weight.size()} and tile size {tile_size})", + f"(maximum sparsity for layer with shape {self.weight.size()} and tile size {tile_size})" ) self.sparsity = max_sparsity self.tile_size = tile_size @@ -184,72 +153,52 @@ def __init__( ), requires_grad=not fixed_mask, ) - nn.init.uniform_(self.scores) if uniform_init_01 else nn.init.kaiming_uniform_( - self.scores, a=math.sqrt(5) - ) + nn.init.uniform_(self.scores) if uniform_init_01 else nn.init.kaiming_uniform_(self.scores, a=math.sqrt(5)) - # the shift and the scale are transformation parameters + # the shift and the scale are transformation parameters # the actually used weights = self.weight*self.scale+self.shift # the transformation is activated only for quantized weights - self.shift = nn.Parameter(torch.Tensor(1).fill_(0.0), requires_grad=False) - self.scale = nn.Parameter(torch.Tensor(1).fill_(1.0), requires_grad=False) + self.shift=nn.Parameter(torch.Tensor(1).fill_(0.), requires_grad=False) + self.scale=nn.Parameter(torch.Tensor(1).fill_(1.), requires_grad=False) with torch.no_grad(): # if bitwidth is None, then use floating point values in self.weight # if bitwidth is not None, then quantize self.weight into k-bit (k=bitwidth) - # quantized values are -2^(k-1), -2^(k-1)+1, ..., 0, 1, ..., 2^(k-1)-1 + # quantized values are -2^(k-1), -2^(k-1)+1, ..., 0, 1, ..., 2^(k-1)-1 # these quantized values are uniformly distributed if bitwidth is not None: weights_max = torch.max(self.weight).item() weights_min = torch.min(self.weight).item() - least_step = (weights_max - weights_min) / pow(2, bitwidth) - left_bound = weights_min - 1e-6 - right_bound = weights_min + least_step + 1e-6 + least_step = (weights_max-weights_min)/pow(2,bitwidth) + left_bound = weights_min-1e-6 + right_bound = weights_min+least_step+1e-6 # self.shift=nn.Parameter(torch.Tensor(1).fill_( (weights_min+(pow(2,bitwidth-1)+0.5)*least_step) if transform[0] is None else transform[0] ), requires_grad=not fixed_transform[0]) # self.scale=nn.Parameter(torch.Tensor(1).fill_( least_step if transform[1] is None else transform[1]), requires_grad=not fixed_transform[1]) # for example, if using binary weights (k=1) with -a, +a, set transform = [a,2a]; if using binary weights (k=1) with a, 0, set transform = [0,-a]; - self.shift = nn.Parameter( - torch.Tensor(1).fill_( - 0.0 if transform[0] is None else transform[0] - ), - requires_grad=not fixed_transform[0], - ) - self.scale = nn.Parameter( - torch.Tensor(1).fill_( - 1.0 if transform[1] is None else transform[1] - ), - requires_grad=not fixed_transform[1], - ) - for i in range(-int(pow(2, bitwidth - 1)), int(pow(2, bitwidth - 1))): - self.weight[ - torch.logical_and( - self.weight > left_bound, self.weight <= right_bound - ) - ] = i + self.shift=nn.Parameter(torch.Tensor(1).fill_( 0. if transform[0] is None else transform[0] ), requires_grad=not fixed_transform[0]) + self.scale=nn.Parameter(torch.Tensor(1).fill_( 1. 
if transform[1] is None else transform[1] ), requires_grad=not fixed_transform[1]) + for i in range(-int(pow(2,bitwidth-1)),int(pow(2,bitwidth-1))): + self.weight[torch.logical_and(self.weight>left_bound, self.weight<=right_bound)] = i left_bound = right_bound right_bound += least_step self.weight.requires_grad = not fixed_weight def forward(self, x): - subnet = GetSubnet.apply( - self.scores, - torch.zeros_like(self.scores), - torch.ones_like(self.scores), - self.sparsity, - ) - + subnet = GetSubnet.apply(self.scores, + torch.zeros_like(self.scores), + torch.ones_like(self.scores), + self.sparsity) + if self.tile_size != 1: for i, k in enumerate(self.weight.shape): # if k == 1: continue subnet = subnet.repeat_interleave(self.tile_size, dim=i) subnet = torch.narrow(subnet, i, 0, k) - w = (self.weight * self.scale + self.shift) * subnet - return F.conv2d( - x, w, self.bias, self.stride, self.padding, self.dilation, self.groups - ) - + w = (self.weight*self.scale+self.shift) + w = ApplyMask.apply(w, subnet) + return F.conv2d(x, w, self.bias, self.stride, self.padding, self.dilation, self.groups) def apply_supermask( model, @@ -263,103 +212,107 @@ def apply_supermask( skip_first_transformer_sparsity=False, device="cuda", verbose=False, + filter_fn=None, ): - sparsified_modules = {} - - for n, m in model.named_modules(): - # check conditions for skipping sparsity - if skip_last_layer_sparsity and n == "heads.head": - continue - if skip_first_transformer_sparsity and "encoder.layers.encoder_layer_0" in n: - continue - - # convert 1x1 convolutions - if ( - conv1x1_sparsity != 0.0 - and isinstance(m, torch.nn.Conv2d) - and m.kernel_size == (1, 1) - ): - new_m = SupermaskConv2d( - conv1x1_sparsity, - False, - False, - None, - None, - None, - m.in_channels, - m.out_channels, - m.kernel_size, - stride=m.stride, - padding=m.padding, - dilation=m.dilation, - groups=m.groups, - bias=m.bias is not None, - padding_mode=m.padding_mode, - device=device, - tile_size=conv1x1_sp_tilesize, - ) - new_m.weight.data.copy_(m.weight.data) - if m.bias is not None: - new_m.bias.data.copy_(m.bias.data) - sparsified_modules[n] = new_m - continue - - # convert all other convolutions (not tested!) 
- if conv_sparsity != 0.0 and isinstance(m, torch.nn.Conv2d): - new_m = SupermaskConv2d( - conv_sparsity, - False, - False, - None, - None, - None, - m.in_channels, - m.out_channels, - m.kernel_size, - stride=m.stride, - padding=m.padding, - dilation=m.dilation, - groups=m.groups, - bias=m.bias is not None, - padding_mode=m.padding_mode, - device=device, - tile_size=conv_sp_tilesize, - ) - new_m.weight.data.copy_(m.weight.data) - if m.bias is not None: - new_m.bias.data.copy_(m.bias.data) - sparsified_modules[n] = new_m - continue - - if linear_sparsity != 0.0 and isinstance(m, torch.nn.Linear): - new_m = SupermaskLinear( - linear_sparsity, - False, - False, - None, - None, - None, - m.in_features, - m.out_features, - bias=m.bias is not None, - device=device, - tile_size=linear_sp_tilesize, - ) - new_m.weight.data.copy_(m.weight.data) - if m.bias is not None: - new_m.bias.data.copy_(m.bias.data) - sparsified_modules[n] = new_m - continue - - # add modules to model - for k, v in sparsified_modules.items(): - sm_name, ch_name = k.rsplit(".", 1) - sm = model.get_submodule(sm_name) - sm.add_module(ch_name, v) + # create filter function + # TODO: it might be better to move the filtering function to the script calling this function + is_last_layer = lambda module, name: name == "heads.head" + is_first_transformer_layer = lambda module, name: name == "encoder.layers.encoder_layer_0" + # TODO: create condition for ffn, k,v,q,o projections + reject_fn = lambda module, name : (skip_last_layer_sparsity and is_last_layer(module, name)) or (skip_first_transformer_sparsity and is_first_transformer_layer(module, name)) + if filter_fn is None: + filter_fn = lambda module, name : not reject_fn(module, name) and isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)) + + _replace_with_custom_fn_if_matches_filter( + model, + SuperMaskReplacementClass( + linear_sparsity=linear_sparsity, + linear_sp_tilesize=linear_sp_tilesize, + conv1x1_sparsity=conv1x1_sparsity, + conv1x1_sp_tilesize=conv1x1_sp_tilesize, + conv_sparsity=conv_sparsity, + conv_sp_tilesize=conv_sp_tilesize, + device=device, + verbose=verbose, + ), + filter_fn, + ) + +class SuperMaskReplacementClass: + def __init__( + self, + linear_sparsity=0.0, + linear_sp_tilesize=1, + conv1x1_sparsity=0.0, + conv1x1_sp_tilesize=1, + conv_sparsity=0.0, + conv_sp_tilesize=1, + device="cuda", + verbose=False, + ): + self.linear_sparsity = linear_sparsity + self.linear_sp_tilesize = linear_sp_tilesize + self.conv1x1_sparsity = conv1x1_sparsity + self.conv1x1_sp_tilesize = conv1x1_sp_tilesize + self.conv_sparsity = conv_sparsity + self.conv_sp_tilesize = conv_sp_tilesize + self.device = device + self.verbose = verbose + + def __call__(self, module): + module_new = None + + if self.conv1x1_sparsity != 0.0 and isinstance(module, torch.nn.Conv2d) and module.kernel_size == (1, 1): + # convert 1x1 convolutions + module_new = SupermaskConv2d( + self.conv1x1_sparsity, False, False, None, None, None, + module.in_channels, + module.out_channels, + module.kernel_size, + stride=module.stride, + padding=module.padding, + dilation=module.dilation, + groups=module.groups, + bias=module.bias is not None, + padding_mode=module.padding_mode, + tile_size=self.conv1x1_sp_tilesize, + ).to(device=self.device, dtype=module.weight.dtype) + module_new.weight.data.copy_(module.weight.data) + if module.bias is not None: + module_new.bias.data.copy_(module.bias.data) + elif self.conv_sparsity != 0.0 and isinstance(module, torch.nn.Conv2d): + # convert all other convolutions (not tested!) 
+ module_new = SupermaskConv2d( + self.conv_sparsity, False, False, None, None, None, + module.in_channels, + module.out_channels, + module.kernel_size, + stride=module.stride, + padding=module.padding, + dilation=module.dilation, + groups=module.groups, + bias=module.bias is not None, + padding_mode=module.padding_mode, + tile_size=self.conv_sp_tilesize, + ).to(device=self.device, dtype=module.weight.dtype) + module_new.weight.data.copy_(module.weight.data) + if module.bias is not None: + module_new.bias.data.copy_(module.bias.data) + elif self.linear_sparsity != 0.0 and isinstance(module, torch.nn.Linear): + module_new = SupermaskLinear( + self.linear_sparsity, False, False, None, None, None, + module.in_features, + module.out_features, + bias=module.bias is not None, + tile_size=self.linear_sp_tilesize, + ).to(device=self.device, dtype=module.weight.dtype) + module_new.weight.data.copy_(module.weight.data) + if module.bias is not None: + module_new.bias.data.copy_(module.bias.data) + else: + return module - if verbose: - print( - f'sparsified module "{k}" with sparsity={v.sparsity}, tile size={v.tile_size}' - ) + if self.verbose: + print(f'sparsified module "{module}" with sparsity={module_new.sparsity}, tile size={module_new.tile_size}') - return model + return module_new diff --git a/torchao/prototype/sparsity/superblock/utils.py b/torchao/prototype/sparsity/superblock/utils.py index 89a443bdab..f9fb780e7a 100644 --- a/torchao/prototype/sparsity/superblock/utils.py +++ b/torchao/prototype/sparsity/superblock/utils.py @@ -380,11 +380,11 @@ def mlp_only_with_args( ### Custom sparsification utils def apply_sparsity(model): for name, module in model.named_modules(): - if isinstance(module, SupermaskLinear) and "mlp" in name: + if isinstance(module, SupermaskLinear) and "feed_forward" in name: module.sparsify_offline() -def accelerate_with_sparsity(model, args): +def accelerate_with_sparsity(model, args, filter_fn): if args.sparsity == "bsr": apply_sparsity(model) if args.quantization: @@ -393,13 +393,13 @@ def accelerate_with_sparsity(model, args): quantize_( model, int8_dynamic_activation_int8_weight( - _layout=BlockSparseLayout(blocksize=args.bsr) + layout=BlockSparseLayout(blocksize=args.bsr) ), - superblock_only, + filter_fn, ) else: assert args.bsr is not None, "BSR requires a block size" - sparsify_(model, block_sparse_weight(blocksize=args.bsr), superblock_only) + quantize_(model, block_sparse_weight(blocksize=args.bsr), filter_fn) elif args.sparsity == "semi_structured": if args.quantization: from torchao.dtypes import SemiSparseLayout @@ -417,7 +417,7 @@ def accelerate_with_sparsity(model, args): quantize_(model, int8_dynamic_activation_int8_weight(), mlp_only) -def simulate_sparsity(model, args): +def simulate_sparsity(model, args, filter_fn): if args.sparsity == "bsr": apply_supermask( model, @@ -431,6 +431,7 @@ def simulate_sparsity(model, args): skip_first_transformer_sparsity=args.skip_first_transformer_sparsity, device=args.device, verbose=False, + filter_fn=filter_fn, ) elif args.sparsity == "semi_structured": sparse_config = [] From 19aa5d05e504115a1b5dacdab674f94770e7999b Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Tue, 21 Jan 2025 12:41:53 -0800 Subject: [PATCH 02/23] cleaned up supermask --- torchao/_models/llama/generate.py | 28 +-- .../sparsity/superblock/supermask.py | 206 ++---------------- .../prototype/sparsity/superblock/utils.py | 17 +- 3 files changed, 36 insertions(+), 215 deletions(-) diff --git a/torchao/_models/llama/generate.py 
b/torchao/_models/llama/generate.py index 8c96ec5e2e..5cbc62b01b 100644 --- a/torchao/_models/llama/generate.py +++ b/torchao/_models/llama/generate.py @@ -804,24 +804,20 @@ def ffn_or_attn_only(mod, fqn): # standalone quantization if "bsr" in sparsity: - from torchao.prototype.sparsity.superblock.utils import ( - accelerate_with_sparsity, - get_args_parser, - simulate_sparsity, + from torchao.prototype.sparsity.superblock.supermask import SupermaskLinear + sparsify_( + model, + lambda x: SupermaskLinear.from_linear(x, + sparsity_level=0.9, + blocksize=64, + ), + filter_fn=ffn_only, ) - superblock_args = get_args_parser(benchmark=True).parse_args([]) - superblock_args.sparsity = "bsr" - superblock_args.sparsity_linear = 0.9 - superblock_args.bsr = 64 - - sparsifier_or_none = simulate_sparsity(model, superblock_args, ffn_only) - if sparsifier_or_none is not None: - sparsifier_or_none.squash_mask() - - - model = model.to(device) - accelerate_with_sparsity(model, superblock_args, ffn_only) + from torchao.prototype.sparsity.superblock.blocksparse import block_sparse_weight + sparsify_(model, + block_sparse_weight(blocksize=64), + filter_fn=ffn_only) model_size = get_model_size_in_bytes(model, ignore_embeddings=True) / 1e9 diff --git a/torchao/prototype/sparsity/superblock/supermask.py b/torchao/prototype/sparsity/superblock/supermask.py index e519589003..73a05802a8 100644 --- a/torchao/prototype/sparsity/superblock/supermask.py +++ b/torchao/prototype/sparsity/superblock/supermask.py @@ -126,193 +126,31 @@ def sparsify_offline(self): def forward(self, x): if not self.sparsify_weights: subnet = self.get_mask() - w = (self.weight*self.scale+self.shift) - w = ApplyMask.apply(w, subnet) + # w = (self.weight*self.scale+self.shift) + w = ApplyMask.apply(self.weight, subnet) return F.linear(x, w, self.bias) return F.linear(x, self.weight, self.bias) - - -class SupermaskConv2d(nn.Conv2d): - """Supermask class for Conv2d layer""" - def __init__(self, sparsity, fixed_mask, fixed_weight, bitwidth, transform, fixed_transform, *args, **kwargs): - tile_size = kwargs.pop("tile_size", 1) - super(SupermaskConv2d, self).__init__(*args, **kwargs) - # initialize the scores - max_sparsity = 1 - (1 / math.prod([math.ceil(k / tile_size) for k in self.weight.size()])) - self.sparsity = sparsity - if self.sparsity > max_sparsity: - print( - f"reducing sparsity from {self.sparsity} to {max_sparsity}", - f"(maximum sparsity for layer with shape {self.weight.size()} and tile size {tile_size})" - ) - self.sparsity = max_sparsity - self.tile_size = tile_size - self.scores = nn.Parameter( - torch.empty( - [max(1, int(math.ceil(wn / tile_size))) for wn in self.weight.size()] - ), - requires_grad=not fixed_mask, - ) - nn.init.uniform_(self.scores) if uniform_init_01 else nn.init.kaiming_uniform_(self.scores, a=math.sqrt(5)) - - # the shift and the scale are transformation parameters - # the actually used weights = self.weight*self.scale+self.shift - # the transformation is activated only for quantized weights - self.shift=nn.Parameter(torch.Tensor(1).fill_(0.), requires_grad=False) - self.scale=nn.Parameter(torch.Tensor(1).fill_(1.), requires_grad=False) - - with torch.no_grad(): - # if bitwidth is None, then use floating point values in self.weight - # if bitwidth is not None, then quantize self.weight into k-bit (k=bitwidth) - # quantized values are -2^(k-1), -2^(k-1)+1, ..., 0, 1, ..., 2^(k-1)-1 - # these quantized values are uniformly distributed - if bitwidth is not None: - weights_max = torch.max(self.weight).item() - 
weights_min = torch.min(self.weight).item() - least_step = (weights_max-weights_min)/pow(2,bitwidth) - left_bound = weights_min-1e-6 - right_bound = weights_min+least_step+1e-6 - # self.shift=nn.Parameter(torch.Tensor(1).fill_( (weights_min+(pow(2,bitwidth-1)+0.5)*least_step) if transform[0] is None else transform[0] ), requires_grad=not fixed_transform[0]) - # self.scale=nn.Parameter(torch.Tensor(1).fill_( least_step if transform[1] is None else transform[1]), requires_grad=not fixed_transform[1]) - # for example, if using binary weights (k=1) with -a, +a, set transform = [a,2a]; if using binary weights (k=1) with a, 0, set transform = [0,-a]; - self.shift=nn.Parameter(torch.Tensor(1).fill_( 0. if transform[0] is None else transform[0] ), requires_grad=not fixed_transform[0]) - self.scale=nn.Parameter(torch.Tensor(1).fill_( 1. if transform[1] is None else transform[1] ), requires_grad=not fixed_transform[1]) - for i in range(-int(pow(2,bitwidth-1)),int(pow(2,bitwidth-1))): - self.weight[torch.logical_and(self.weight>left_bound, self.weight<=right_bound)] = i - left_bound = right_bound - right_bound += least_step - - self.weight.requires_grad = not fixed_weight - - def forward(self, x): - subnet = GetSubnet.apply(self.scores, - torch.zeros_like(self.scores), - torch.ones_like(self.scores), - self.sparsity) - - if self.tile_size != 1: - for i, k in enumerate(self.weight.shape): - # if k == 1: continue - subnet = subnet.repeat_interleave(self.tile_size, dim=i) - subnet = torch.narrow(subnet, i, 0, k) - w = (self.weight*self.scale+self.shift) - w = ApplyMask.apply(w, subnet) - return F.conv2d(x, w, self.bias, self.stride, self.padding, self.dilation, self.groups) - -def apply_supermask( - model, - linear_sparsity=0.0, - linear_sp_tilesize=1, - conv1x1_sparsity=0.0, - conv1x1_sp_tilesize=1, - conv_sparsity=0.0, - conv_sp_tilesize=1, - skip_last_layer_sparsity=False, - skip_first_transformer_sparsity=False, - device="cuda", - verbose=False, - filter_fn=None, -): - # create filter function - # TODO: it might be better to move the filtering function to the script calling this function - is_last_layer = lambda module, name: name == "heads.head" - is_first_transformer_layer = lambda module, name: name == "encoder.layers.encoder_layer_0" - # TODO: create condition for ffn, k,v,q,o projections - reject_fn = lambda module, name : (skip_last_layer_sparsity and is_last_layer(module, name)) or (skip_first_transformer_sparsity and is_first_transformer_layer(module, name)) - if filter_fn is None: - filter_fn = lambda module, name : not reject_fn(module, name) and isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)) - - _replace_with_custom_fn_if_matches_filter( - model, - SuperMaskReplacementClass( - linear_sparsity=linear_sparsity, - linear_sp_tilesize=linear_sp_tilesize, - conv1x1_sparsity=conv1x1_sparsity, - conv1x1_sp_tilesize=conv1x1_sp_tilesize, - conv_sparsity=conv_sparsity, - conv_sp_tilesize=conv_sp_tilesize, - device=device, - verbose=verbose, - ), - filter_fn, - ) - -class SuperMaskReplacementClass: - def __init__( - self, - linear_sparsity=0.0, - linear_sp_tilesize=1, - conv1x1_sparsity=0.0, - conv1x1_sp_tilesize=1, - conv_sparsity=0.0, - conv_sp_tilesize=1, - device="cuda", - verbose=False, - ): - self.linear_sparsity = linear_sparsity - self.linear_sp_tilesize = linear_sp_tilesize - self.conv1x1_sparsity = conv1x1_sparsity - self.conv1x1_sp_tilesize = conv1x1_sp_tilesize - self.conv_sparsity = conv_sparsity - self.conv_sp_tilesize = conv_sp_tilesize - self.device = device - self.verbose = 
verbose - - def __call__(self, module): + @classmethod + def from_linear(cls, linear, sparsity_level=0.0, blocksize=1, inference=True): module_new = None - if self.conv1x1_sparsity != 0.0 and isinstance(module, torch.nn.Conv2d) and module.kernel_size == (1, 1): - # convert 1x1 convolutions - module_new = SupermaskConv2d( - self.conv1x1_sparsity, False, False, None, None, None, - module.in_channels, - module.out_channels, - module.kernel_size, - stride=module.stride, - padding=module.padding, - dilation=module.dilation, - groups=module.groups, - bias=module.bias is not None, - padding_mode=module.padding_mode, - tile_size=self.conv1x1_sp_tilesize, - ).to(device=self.device, dtype=module.weight.dtype) - module_new.weight.data.copy_(module.weight.data) - if module.bias is not None: - module_new.bias.data.copy_(module.bias.data) - elif self.conv_sparsity != 0.0 and isinstance(module, torch.nn.Conv2d): - # convert all other convolutions (not tested!) - module_new = SupermaskConv2d( - self.conv_sparsity, False, False, None, None, None, - module.in_channels, - module.out_channels, - module.kernel_size, - stride=module.stride, - padding=module.padding, - dilation=module.dilation, - groups=module.groups, - bias=module.bias is not None, - padding_mode=module.padding_mode, - tile_size=self.conv_sp_tilesize, - ).to(device=self.device, dtype=module.weight.dtype) - module_new.weight.data.copy_(module.weight.data) - if module.bias is not None: - module_new.bias.data.copy_(module.bias.data) - elif self.linear_sparsity != 0.0 and isinstance(module, torch.nn.Linear): - module_new = SupermaskLinear( - self.linear_sparsity, False, False, None, None, None, - module.in_features, - module.out_features, - bias=module.bias is not None, - tile_size=self.linear_sp_tilesize, - ).to(device=self.device, dtype=module.weight.dtype) - module_new.weight.data.copy_(module.weight.data) - if module.bias is not None: - module_new.bias.data.copy_(module.bias.data) - else: - return module - - if self.verbose: - print(f'sparsified module "{module}" with sparsity={module_new.sparsity}, tile size={module_new.tile_size}') - + assert isinstance(linear, torch.nn.Linear) + module_new = SupermaskLinear( + sparsity_level, False, False, None, None, None, + linear.in_features, + linear.out_features, + bias=linear.bias is not None, + tile_size=blocksize, + ).to(device=linear.weight.device, dtype=linear.weight.dtype) + module_new.weight.data.copy_(linear.weight.data) + if linear.bias is not None: + module_new.bias.data.copy_(linear.bias.data) + if inference: + module_new.sparsify_offline() return module_new + + @classmethod + def to_linear(cls): + pass + diff --git a/torchao/prototype/sparsity/superblock/utils.py b/torchao/prototype/sparsity/superblock/utils.py index f9fb780e7a..8928db63cb 100644 --- a/torchao/prototype/sparsity/superblock/utils.py +++ b/torchao/prototype/sparsity/superblock/utils.py @@ -22,7 +22,6 @@ from torchao.prototype.sparsity.superblock.blocksparse import block_sparse_weight from torchao.prototype.sparsity.superblock.supermask import ( SupermaskLinear, - apply_supermask, ) from torchao.quantization import int8_dynamic_activation_int8_weight, quantize_ from torchao.sparsity import semi_sparse_weight, sparsify_ @@ -396,6 +395,7 @@ def accelerate_with_sparsity(model, args, filter_fn): layout=BlockSparseLayout(blocksize=args.bsr) ), filter_fn, + ) else: assert args.bsr is not None, "BSR requires a block size" @@ -419,20 +419,7 @@ def accelerate_with_sparsity(model, args, filter_fn): def simulate_sparsity(model, args, 
filter_fn): if args.sparsity == "bsr": - apply_supermask( - model, - linear_sparsity=args.sparsity_linear, - linear_sp_tilesize=args.bsr, - conv1x1_sparsity=args.sparsity_conv1x1, - conv1x1_sp_tilesize=args.bsr, - conv_sparsity=args.sparsity_conv, - conv_sp_tilesize=args.bsr, - skip_last_layer_sparsity=args.skip_last_layer_sparsity, - skip_first_transformer_sparsity=args.skip_first_transformer_sparsity, - device=args.device, - verbose=False, - filter_fn=filter_fn, - ) + pass elif args.sparsity == "semi_structured": sparse_config = [] for name, mod in model.named_modules(): From c1616c4b06b7993c0ee11bd93d811c7db231d777 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Tue, 21 Jan 2025 12:48:50 -0800 Subject: [PATCH 03/23] cleanup --- torchao/_models/llama/generate.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py index 5cbc62b01b..b5b6eccb27 100644 --- a/torchao/_models/llama/generate.py +++ b/torchao/_models/llama/generate.py @@ -23,12 +23,6 @@ from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, get_model_size_in_bytes torch.sparse.SparseSemiStructuredTensor._FORCE_CUTLASS = False -from torch._inductor import config as inductorconfig -inductorconfig.triton.unique_kernel_names = True -# torch.backends.cuda.enable_cudnn_sdp(True) -# torch.backends.cuda.enable_math_sdp(False) -# torch.backends.cuda.enable_flash_sdp(False) -# torch.backends.cuda.enable_mem_efficient_sdp(False) class HostEvent: @@ -799,11 +793,11 @@ def ffn_or_attn_only(mod, fqn): from torchao.sparsity import semi_sparse_weight, sparsify_ if "semi" in sparsity: - # TODO there is a bug here, need to fix + # Fixed sparsity level for 2:4 sparsify_(model.to(device), semi_sparse_weight(), filter_fn=ffn_only) - # standalone quantization if "bsr" in sparsity: + # Apply Supermask to get sparse weights from torchao.prototype.sparsity.superblock.supermask import SupermaskLinear sparsify_( model, From 2e78fc32a81c231819c725c932f8bb908dc29172 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Wed, 22 Jan 2025 13:41:01 -0800 Subject: [PATCH 04/23] update --- .../sparsity/superblock/blocksparse.py | 163 +++++++++--------- 1 file changed, 85 insertions(+), 78 deletions(-) diff --git a/torchao/prototype/sparsity/superblock/blocksparse.py b/torchao/prototype/sparsity/superblock/blocksparse.py index 7391173b5f..d15d959d8f 100644 --- a/torchao/prototype/sparsity/superblock/blocksparse.py +++ b/torchao/prototype/sparsity/superblock/blocksparse.py @@ -12,28 +12,28 @@ # quantization support -@torch.library.custom_op("blocksparse::bsr_to_dense", mutates_args=()) -def bsr_to_dense( - crow_indices: torch.Tensor, - col_indices: torch.Tensor, - values: torch.Tensor, - M: int, - K: int, -) -> torch.Tensor: - return torch.sparse_bsr_tensor( - crow_indices=crow_indices, col_indices=col_indices, values=values, size=(M, K) - ).to_dense() - - -@torch.library.register_fake("blocksparse::bsr_to_dense") -def bsr_to_dense_abstract( - crow_indices: torch.Tensor, - col_indices: torch.Tensor, - values: torch.Tensor, - M: int, - K: int, -) -> torch.Tensor: - return torch.empty((M, K), dtype=values.dtype, device=values.device) +# @torch.library.custom_op("blocksparse::bsr_to_dense", mutates_args=()) +# def bsr_to_dense( +# crow_indices: torch.Tensor, +# col_indices: torch.Tensor, +# values: torch.Tensor, +# M: int, +# K: int, +# ) -> torch.Tensor: +# return torch.sparse_bsr_tensor( +# crow_indices=crow_indices, col_indices=col_indices, values=values, size=(M, K) +# 
).to_dense() + + +# @torch.library.register_fake("blocksparse::bsr_to_dense") +# def bsr_to_dense_abstract( +# crow_indices: torch.Tensor, +# col_indices: torch.Tensor, +# values: torch.Tensor, +# M: int, +# K: int, +# ) -> torch.Tensor: +# return torch.empty((M, K), dtype=values.dtype, device=values.device) @torch.library.custom_op("blocksparse::int_addmm", mutates_args=()) @@ -261,62 +261,60 @@ def block_sparse_detach(func, types, args, kwargs): ) -# @implements(aten.unsqueeze.default) -# def block_sparse_unsqueeze(func, types, args, kwargs): -# assert len(args) == 2 -# assert len(kwargs) == 0 -# assert args[-1] == 2 -# bsr = args[0] -# assert bsr.dim() == 2 -# assert not bsr.requires_grad -# return BlockSparseTensor(bsr.shape + (1,), -# bsr.crow_indices(), -# bsr.col_indices(), -# bsr.values().unsqueeze(-1)) -# # bsr.bsr_nt) - - -# @implements(aten.mul.Tensor) -# def block_sparse_mul(func, types, args, kwargs): -# assert len(args) == 2 -# assert len(kwargs) == 0 -# bsr, t = args - -# def my_mul(bsr, t): -# assert isinstance(bsr, BlockSparseTensor) -# assert isinstance(t, torch.Tensor) -# assert bsr.dim() == 3 -# assert t.dim() == 3 -# assert not bsr.requires_grad -# # import pdb; pdb.set_trace() -# assert t.size(0) == 1 -# t_blocked = t.view(t.size(0), t.size(1) // 64, 64, 1) -# masked_t = t_blocked.transpose(0, 1).index_select(0, bsr.col_indices()) -# new_values = bsr.values() * masked_t -# # print("C") -# # bsr_nt = torch.nested.nested_tensor_from_jagged(new_values.detach(), bsr.crow_indices().detach()).detach() -# return BlockSparseTensor(bsr.shape, -# bsr.crow_indices(), -# bsr.col_indices(), -# new_values) -# # bsr_nt) - -# if isinstance(bsr, torch.Tensor) and isinstance(t, BlockSparseTensor): -# return my_mul(t, bsr) -# return my_mul(bsr, t) - - -# @implements(aten.sum.dim_IntList) -# def block_sparse_sum(func, types, args, kwargs): -# bsr, dim = args -# assert type(dim) == list -# assert len(dim) == 1 -# dim = dim[0] -# bsr_dim = bsr.dim() -# assert dim == 1 -# # ret = bsr.bsr_nt.detach().sum(dim=1).view(bsr.shape[0], -1).sum(1, keepdim=True).detach() -# assert ret.dim() + 1 == bsr_dim -# return ret +@implements(aten.unsqueeze.default) +def block_sparse_unsqueeze(func, types, args, kwargs): + assert len(args) == 2 + assert len(kwargs) == 0 + assert args[-1] == 2 + bsr = args[0] + assert bsr.dim() == 2 + assert not bsr.requires_grad + return BlockSparseTensor(bsr.shape + (1,), + bsr.crow_indices(), + bsr.col_indices(), + bsr.values().unsqueeze(-1)) + # bsr.bsr_nt) + + +@implements(aten.mul.Tensor) +def block_sparse_mul(func, types, args, kwargs): + assert len(args) == 2 + assert len(kwargs) == 0 + bsr, t = args + + def my_mul(bsr, t): + assert isinstance(bsr, BlockSparseTensor) + assert isinstance(t, torch.Tensor) + assert bsr.dim() == 3 + assert t.dim() == 3 + assert not bsr.requires_grad + assert t.size(0) == 1 + t_blocked = t.view(t.size(0), t.size(1) // 64, 64, 1) + masked_t = t_blocked.transpose(0, 1).index_select(0, bsr.col_indices()) + new_values = bsr.values() * masked_t + return BlockSparseTensor(bsr.shape, + bsr.crow_indices(), + bsr.col_indices(), + new_values) + # bsr_nt) + + if isinstance(bsr, torch.Tensor) and isinstance(t, BlockSparseTensor): + return my_mul(t, bsr) + return my_mul(bsr, t) + + +@implements(aten.sum.dim_IntList) +def block_sparse_sum(func, types, args, kwargs): + breakpoint() + bsr, dim = args + assert type(dim) == list + assert len(dim) == 1 + dim = dim[0] + bsr_dim = bsr.dim() + assert dim == 1 + ret = 
bsr.values.detach().sum(dim=1).view(bsr.shape[0], -1).sum(1, keepdim=True).detach() + assert ret.dim() + 1 == bsr_dim + return ret @implements(aten.values.default) @@ -338,6 +336,15 @@ def block_sparse_col_indices(func, types, args, kwargs): def block_sparse__nnz(func, types, args, kwargs): return args[0].bsr_values.shape[0] +@implements(aten.to_dense.default) +def block_sparse_to_dense(func, types, args, kwargs): + return torch.sparse_bsr_tensor( + crow_indices=args[0].crow_indices, + col_indices=args[0].col_indices, + values=args[0].values, + size=args[0].shape, + ).to_dense() + def next_power_of_two(n): assert n > 0 From bd3a3b1100ff06117e55ca75956ecbad8c87ef4f Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Thu, 23 Jan 2025 11:52:35 -0800 Subject: [PATCH 05/23] added padding to triton kernel --- torchao/_models/llama/generate.py | 2 +- .../sparsity/superblock/_triton_ops_meta.py | 7756 +++++++++++++++++ .../sparsity/superblock/blocksparse.py | 110 +- .../sparsity/superblock/bsr_triton_ops.py | 2541 ++++++ 4 files changed, 10339 insertions(+), 70 deletions(-) create mode 100644 torchao/prototype/sparsity/superblock/_triton_ops_meta.py create mode 100644 torchao/prototype/sparsity/superblock/bsr_triton_ops.py diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py index b5b6eccb27..a537ccd6d2 100644 --- a/torchao/_models/llama/generate.py +++ b/torchao/_models/llama/generate.py @@ -321,7 +321,7 @@ def main( torch.manual_seed(1234) def ffn_only(mod, fqn): - return isinstance(mod, torch.nn.Linear) and "feed_forward" in fqn + return isinstance(mod, torch.nn.Linear) and "feed_forward" in fqn def not_ffn_only(mod, fqn): return isinstance(mod, torch.nn.Linear) and not ffn_only(mod, fqn) diff --git a/torchao/prototype/sparsity/superblock/_triton_ops_meta.py b/torchao/prototype/sparsity/superblock/_triton_ops_meta.py new file mode 100644 index 0000000000..08471ac058 --- /dev/null +++ b/torchao/prototype/sparsity/superblock/_triton_ops_meta.py @@ -0,0 +1,7756 @@ +# mypy: allow-untyped-defs +"""Provides optimal triton kernel parameters. + +Aim +--- + +The usage of optimal triton kernel parameters may increase the +performance of operations several times. For example, for large tensor +shapes, the usage of a bsr tensor as mat1 argument in addmm-based +operations typically outperforms the corresponding operation with +strided-only inputs when the blocked representation of a tensor +provides a better alignement with memory access than what the strided +representation would provide. + +Pre-computed kernel parameters +------------------------------ + +This script finds and stores the optimal triton kernel parameters for +a specific set of shape configurations. For instance, the set of shape +configurations of the bsr_dense_addmm kernel is defined as + + input, out: M x N strided tensor + mat1: M x K bsr tensor with blocksize (BM, BK) and given sparsity + mat2: M x N strided tensor + dtype = float16, bfloat16, float32 + sparsity = 0.5 + M = 256, 512, ..., 16384 + K = M + N = 256, 512, ..., 131072 + BM = 16, 32, ..., 128 + BK = BM + alpha = 1 + beta = 0, 1 + GPUs: NVIDIA A100-SXM4-80GB + +Approximations +-------------- + +It is practically infeasible to pre-compute optimal kernel parameter +for all possible shape configurations as well as for all existing +GPUs. 
Therefore, we'll assume that the pre-computed optimal parameters +are good enough approximations when +1) the used GPU is any of NVIDIA A100 Tensor Core GPUs, +2) the actual sparsity of mat1 is different from sparsity value 0.5. + +If a particular shape configuration does not fall in the set of +pre-computed kernel parameters, or it does not match with the listed +approximations above, or the used GPU device is not a NVIDIA A100 GPU, +then a reference set of triton kernel parameters will be used when +executing operations. The reference kernel parameters are defined in +torch/sparse/_triton_ops.py, see bsr_dense_addmm_meta function, for +instance. + +Computing optimal kernel parameters +----------------------------------- + +If the approximations listed above are unacceptable, e.g. when one +seeks a maximal performance possible, the optimal kernel parameters +for a particular GPU can be computed by simply running this script in +the pytorch developement tree:: + + cd /path/to/pytorch + python setup.py develop + python torch/sparse/_triton_ops_meta.py + +This will compute the optimal kernel parameters for the GPU device +available in the host system for all shape configurations listed in +"Pre-computed kernel parameters" above. The results will be stored in +the database of kernel parameters. Currently, this database is defined +as this module (see "BEGIN GENERATED DATA" comment below) that will be +modified when the script is run. Create a pytorch PR with the +corresponding modifications in this file to make the computed optimal +kernel parameters available for other users as pre-computed kernel +parameters. + +Moreover, one can compute the optimal kernel parameters for a specific +set of shape configurations and specific sparsity patterns. For that, +use tuning functions provided by this module: + + tune_bsr_dense_addmm(input, mat1, mat2, beta=1, alpha=1, out=None, verbose=False, store=False) -> meta + +The tuning functions return a dictionary of optimal kernel parameters +that can be passed to the corresponding operation, e.g. + + bsr_dense_addmm(..., meta=meta) + +Or, when store==True, the optimal kernel parameters will be stored in +the database of pre-computed kernel parameters in runtime so that all +addmm-based operations such as torch.addmm, torch.mm, +torch.nn.functional.linear will benefit from using the computed +optimal set of kernel parameters. + +Note that running tune_bsr_dense_addmm can take several minutes. So, +use it wisely, e.g. by implementing persisten storage of optimized +kernel parameters. See the source code of get_meta and +tune_bsr_dense_addmm to learn how to register a custom set of optimal +kernel parameters for addmm-based operations. + +""" +__all__ = ["get_meta", "tune_bsr_dense_addmm", "tune__int_bsr_dense_addmm"] + +import inspect +import itertools +import re +import warnings +from typing import Any + +import torch +from torch.hub import tqdm +from torch.testing import make_tensor + + +def get_meta(op, key, device_name=None, version=(0, torch.float16, 0.5), exact=False): + """Return triton kernel meta parameters of the specified op and its inputs key. + + Parameters + ---------- + op (str): The name of an operation that implementation uses meta parameters. + key (tuple): A tuple of op input parameters, e.g. shapes, etc. + device_name (optional, str): The name of a device for which op + parameters are provided. + version (optional, hashable): Specifies the version of parameters. 
+ exact (optional, bool): When True, the returned data (if + available) corresponds exactly to the specified device_name and + version information. Otherwise, if the corresponding data is not + available but there exists a data set that is computed for a + similar GPU device, then this data set will be returned. + + Returns + ------- + result (dict): The requested mapping of parameter names and + values, or None when no data is available. If the input `key` + contains `"*"`, the result will be a dictionary of keys and + mappings that match with the given `key`. + """ + if device_name is None: + device_name = torch.cuda.get_device_name() + + op_data = _operation_device_version_data.get((op, device_name, version)) + if op_data is None and not exact: + # A lack of op data could be due to using a (slightly) + # different GPU model compared to a model for which optimal + # meta parameters have been computed. In the following we'll + # assume that there is a set of GPU models that all have + # a similar set of optimal meta parameters. + if re.match(r"NVIDIA A100[^\d]", device_name) is not None: + device_name = "NVIDIA A100-SXM4-80GB" + else: + return + op_data = _operation_device_version_data.get((op, device_name, version)) + if op_data is None: + return + + matching_data = {} + if "*" in key: + for op_key in op_data: + if [None for k1, k2 in zip(op_key, key) if k2 != "*" and k1 != k2]: + continue + matching_data[op_key] = op_data[op_key] + else: + values = op_data.get(key) + if values is not None: + matching_data[key] = values + matching_meta = {} + for op_key, values in matching_data.items(): + if op == "scatter_mm": + names = ( + "GROUP_SIZE", + "SPLIT_N", + "TILE_M", + "TILE_N", + "num_stages", + "num_warps", + ) + meta = dict(zip(names, values)) + elif op in {"bsr_dense_addmm", "_int_bsr_dense_addmm"}: + meta = dict( + zip(("GROUP_SIZE_ROW", "SPLIT_N", "num_stages", "num_warps"), values) + ) + else: + raise NotImplementedError(f"names for {op=}") + if "*" not in key: + return meta + + matching_meta[op_key] = meta + + if "*" in key: + return matching_meta + + +def update(op, device_name, version, key, value): + """Update the db of op parameters.""" + # skip storing possible optimization failures: + if not value: + warnings.warn( + f"skipping empty value for {op}: {device_name=} {version=} {key=}" + ) + return + if (op, device_name, version) in _operation_device_version_data: + if _operation_device_version_data[op, device_name, version].get(key) == value: + return + _operation_device_version_data[op, device_name, version][key] = value + else: + _operation_device_version_data[op, device_name, version] = {key: value} + + +def dump(): + """Store the current runtime db state to the module file.""" + current_file = inspect.getfile(dump) + f = open(current_file) + current_content = f.read() + f.close() + begin_data_str = "# BEGIN GENERATED DATA\n" + begin_data_index = current_content.find(begin_data_str) + end_data_index = current_content.find(" # END GENERATED DATA\n") + if begin_data_index == -1 or end_data_index == -1: + warnings.warn( + f"{current_file} cannot be updated:" + " BEGIN/END GENERATED DATA comment blocks appear to be corrupted" + ) + return + + def sort_key(key): + op, device_name, version = key + version = tuple( + (str(item) if isinstance(item, torch.dtype) else item) for item in version + ) + return (op, device_name, version) + + part1 = current_content[: begin_data_index + len(begin_data_str)] + part2 = current_content[end_data_index:] + data_part = [] + for op_key in 
sorted(_operation_device_version_data, key=sort_key): + data_part.append(" " + repr(op_key).replace("'", '"') + ": {") + op_data = _operation_device_version_data[op_key] + data_part.extend(f" {key}: {op_data[key]}," for key in sorted(op_data)) + data_part.append(" },") + new_content = part1 + "\n".join(data_part) + "\n" + part2 + if current_content != new_content: + f = open(current_file, "w") + f.write(new_content) + f.close() + + +def minimize( + target_func, + initial_parameters, + reference_parameters, + step_func, + max_step=2, + verbose=False, + all_values=None, +): + """Find a dict of parameters that minimizes the target function using + the initial dict of parameters and a step function that progresses + a specified parameter in a dict of parameters. + + Parameters + ---------- + target_func (callable): a functional with the signature + ``target_func(parameters: dict) -> float`` + initial_parameters (dict): a set of parameters used as an initial + value to the minimization process. + reference_parameters (dict): a set of parameters used as an + reference value with respect to which the speed up is computed. + step_func (callable): a functional with the signature + ``step_func(parameter_name:str, parameter_value:int, direction:int, parameters:dict) -> int`` + that increments or decrements (when ``direction`` is positive or + negative, respectively) the parameter with given name and value. + When return value is equal to ``parameter_value``, it means that + no step along the given direction can be made. + + Returns + ------- + parameters (dict): a set of parameters that minimizes the target + function. + speedup_incr (float): a speedup change given in percentage. + timing (float): the value of the target function at the parameters. + sensitivity_message (str): a message containing sensitivity. + information of parameters around the target function minimizer. + """ + + def to_key(parameters): + return tuple(parameters[k] for k in sorted(parameters)) + + def from_key(key, parameters): + return dict(zip(sorted(parameters), key)) + + if all_values is None: + all_values = {} + + directions = list(range(-max_step, max_step + 1)) + names = sorted(initial_parameters) + all_directions = [] + for d_tuple in itertools.product(*((directions,) * len(names))): + dist = sum(map(abs, d_tuple)) + if dist > 0 and dist <= max_step: + all_directions.append((dist, d_tuple)) + all_directions.sort() + + try: + reference_target = target_func(reference_parameters) + except Exception as msg: + if verbose and "out of resource" not in str(msg): + print(f"{reference_parameters=} lead to failure: {msg}.") + reference_target = None + + if reference_target is not None: + all_values[to_key(reference_parameters)] = reference_target + + parameters = initial_parameters + try: + initial_target = target_func(parameters) + except Exception as msg: + if reference_target is None: + if verbose: + print( + f"{initial_parameters=} lead to failure: {msg}. Optimization failed!" + ) + return {}, -1, -1, f"{msg}" + if verbose and "out of resource" not in str(msg): + print( + f"{initial_parameters=} lead to failure: {msg}. Using reference parameters instead of initial parameters." 
+ ) + parameters = reference_parameters + initial_target = reference_target + + if reference_target is None: + if verbose: + print("Using initial parameters instead of reference parameters.") + reference_target = initial_target + + initial_key = to_key(parameters) + minimal_target = all_values[initial_key] = initial_target + pbar = tqdm( + total=len(all_directions), + desc="Tuning...", + disable=not verbose, + ncols=75, + ) + while True: + for i, (_, d_tuple) in enumerate(all_directions): + pbar.update(1) + next_parameters = parameters.copy() + for name, direction in zip(names, d_tuple): + value = next_parameters[name] + if direction == 0: + continue + next_value = step_func(name, value, direction, parameters) + if next_value == value: + break + next_parameters[name] = next_value + else: + next_key = to_key(next_parameters) + if next_key in all_values: + continue + try: + next_target = target_func(next_parameters) + except Exception as msg: + all_values[next_key] = str(msg) + if verbose and "out of resource" not in str(msg): + print(f"{next_parameters=} lead to failure: {msg}. Skipping.") + continue + all_values[next_key] = next_target + + if next_target < minimal_target: + minimal_target = next_target + parameters = next_parameters + pbar.total += i + 1 + break + else: + # ensure stable minimizer: + minimizer_keys = { + k + for k, v in all_values.items() + if isinstance(v, float) and abs(1 - v / minimal_target) < 0.001 + } + minimizer_key = ( + initial_key if initial_key in minimizer_keys else min(minimizer_keys) + ) + parameters = from_key(minimizer_key, parameters) + speedup_incr = (1 - minimal_target / reference_target) * 100 + if speedup_incr < 0: + if verbose: + print( + f"{speedup_incr=} is negative. Rerunning minimize with reference parameters as initial parameters." 
+ ) + return minimize( + target_func, + reference_parameters, + reference_parameters, + step_func, + max_step=max_step, + verbose=verbose, + all_values=all_values, + ) + sensitivity = [] + for name in parameters: + value = parameters[name] + rel_diffs = [] + for direction in range(-max_step, max_step + 1): + if direction == 0: + continue + next_value = step_func(name, value, direction, parameters) + if next_value == value: + rel_diffs.append(0) + continue + next_parameters = parameters.copy() + next_parameters[name] = next_value + next_key = to_key(next_parameters) + next_target = all_values.get(next_key) + if next_target is None or isinstance(next_target, str): + rel_diffs.append(0) + continue + rel_diff = (next_target / minimal_target - 1) * 100 + rel_diffs.append(rel_diff) + sensitivity.append((max(rel_diffs), rel_diffs, name)) + + sensitivity_message = [f"timing0={initial_target:.3f}"] + for _, rel_diffs, name in sorted(sensitivity, reverse=True): + left_diffs = "|".join( + [f"{rel_diff:.1f}" for rel_diff in rel_diffs[:max_step]] + ) + right_diffs = "|".join( + [f"{rel_diff:.1f}" for rel_diff in rel_diffs[max_step:]] + ) + sensitivity_message.append( + f"{name}={parameters[name]} ({left_diffs}...{right_diffs} %)" + ) + sensitivity_message = ", ".join(sensitivity_message) + return parameters, speedup_incr, minimal_target, sensitivity_message + + +def create_blocked_tensor(B, M, N, blocksize, sparsity, dtype, device): + assert ( + sparsity <= 1.0 and sparsity >= 0.0 + ), "sparsity should be a value between 0 and 1" + assert M % blocksize[0] == 0 + assert N % blocksize[1] == 0 + shape = (B, M // blocksize[0], N // blocksize[1])[int(B == 0) :] + A = torch.bernoulli( + torch.full(shape, 1 - sparsity, dtype=torch.float32, device=device) + ).to(dtype) + expected_nnz = int((1 - sparsity) * M * N / (blocksize[0] * blocksize[1])) + nonzero_indices = A.flatten().nonzero() + actual_nnz = nonzero_indices.shape[0] + if actual_nnz > expected_nnz: + selected_nonzeros = torch.randperm(actual_nnz)[: actual_nnz - expected_nnz] + A.flatten()[nonzero_indices[selected_nonzeros]] = 0 + elif actual_nnz < expected_nnz: + zero_indices = (A == 0).flatten().nonzero() + selected_zeros = torch.randperm(zero_indices.shape[0])[ + : expected_nnz - actual_nnz + ] + A.flatten()[zero_indices[selected_zeros]] = 1 + A = torch.repeat_interleave(A, blocksize[0], dim=-2) + A = torch.repeat_interleave(A, blocksize[1], dim=-1) + return A + + +def optimize_scatter_mm( + m, k, n, bm, bk, dtype=torch.float16, device="cuda", sparsity=0.5, force=False +): + import triton + + from torch.sparse._triton_ops import bsr_scatter_mm, bsr_scatter_mm_indices_data + + key = (m, k, n, bm, bk) + + version = (0, dtype, sparsity) + device_name = torch.cuda.get_device_name() + + reference_meta = dict( + GROUP_SIZE=1, + TILE_M=16, + TILE_N=16, + SPLIT_N=n // 16, + num_stages=1, + num_warps=1, + ) + + initial_meta = get_meta( + "scatter_mm", key, device_name=device_name, version=version, exact=True + ) + if initial_meta is None: + initial_meta = get_meta( + "bsr_dense_addmm", + key, + device_name=device_name, + version=(0, dtype, 0.5), + exact=True, + ) + if initial_meta is None: + initial_meta = reference_meta + elif not force: + return + + torch.manual_seed(0) + bsr = create_blocked_tensor( + 0, m, k, (bm, bk), sparsity, dtype, device + ).to_sparse_bsr((bm, bk)) + dense = make_tensor(k, n, dtype=dtype, device=device) + + def bench(meta, bsr=bsr, dense=dense): + indices_data = bsr_scatter_mm_indices_data( + bsr, dense, 
indices_format="bsr_strided_mm_compressed", **meta + ) + + def test_func(): + return bsr_scatter_mm(bsr, dense, indices_data=indices_data) + + ms_min = triton.testing.do_bench(test_func, warmup=500, rep=100) + + return ms_min + + def step_meta_parameter(name, value, direction, meta, m=m, n=n, k=k, bm=bm, bk=bk): + # return next value in positive or negative direction, or + # input value if the step will result an invalid + # value. The input value is assumed to be valid. + + is_log = name in {"SPLIT_N", "TILE_M", "TILE_N", "num_warps"} + min_value = dict( + SPLIT_N=1, TILE_M=16, TILE_N=16, num_warps=1, num_stages=1, GROUP_SIZE=1 + )[name] + max_value = dict( + SPLIT_N=n // meta["TILE_N"], TILE_M=bm, TILE_N=n // meta["SPLIT_N"] + ).get(name) + value_step = dict( + SPLIT_N=2, TILE_M=2, TILE_N=2, num_warps=2, num_stages=1, GROUP_SIZE=1 + )[name] + if is_log: + next_value = ( + value * value_step**direction + if direction > 0 + else value // (value_step ** abs(direction)) + ) + else: + next_value = value + value_step * direction + if min_value is not None: + next_value = max(next_value, min_value) + if max_value is not None: + next_value = min(next_value, max_value) + if name == "SPLIT_N" and n % next_value != 0: + return value + # Hard-skip parameter combinations that break CUDA state for pytorch: + if (dtype, name, next_value, m, n, k, bm, bk) in { + (torch.float32, "num_warps", 32, 256, 256, 256, 16, 16), + (torch.float32, "num_warps", 16, 256, 256, 256, 32, 32), + (torch.float32, "num_warps", 16, 256, 256, 256, 64, 64), + (torch.float32, "num_warps", 16, 256, 256, 256, 128, 128), + (torch.float32, "num_warps", 16, 512, 512, 256, 128, 128), + } and re.match(r"NVIDIA A100[^\d]", device_name) is not None: + return value + return next_value + + meta, speedup, timing, _sensitivity_message = minimize( + bench, initial_meta, reference_meta, step_meta_parameter + ) + if initial_meta is not reference_meta and initial_meta == meta and not force: + return + print(f"{meta=} {speedup=:.1f} % {timing=:.3f} ms") + if speedup < 0: + return + device_name = torch.cuda.get_device_name() + + update( + "scatter_mm", device_name, version, key, tuple(meta[k] for k in sorted(meta)) + ) + + +def tune__int_bsr_dense_addmm( + input, + bsr, + dense, + *, + beta=1, + alpha=1, + out=None, + store=False, + verbose=False, + force=False, +): + return tune_bsr_dense_addmm( + input, + bsr, + dense, + beta=beta, + alpha=alpha, + out=out, + store=store, + verbose=verbose, + force=force, + opname="_int_bsr_dense_addmm", + ) + + +def tune_bsr_dense_addmm( + input, + bsr, + dense, + *, + beta=1, + alpha=1, + left_alpha=None, + right_alpha=None, + out=None, + store=False, + verbose=False, + force=False, + opname=None, +): + """Tune bsr_dense_addmm kernel parameters against the given inputs. + + When store is True, the tuning results will be stored in the + database of kernel parameters. 
+ """ + import triton + + if opname is None: + opname = "bsr_dense_addmm" + + if opname == "_int_bsr_dense_addmm": + from torch.sparse._triton_ops import _int_bsr_dense_addmm as bsr_dense_addmm + else: + from torch.sparse._triton_ops import bsr_dense_addmm + + N = dense.shape[-1] + values = bsr.values() + crow_indices = bsr.crow_indices() + batch_ndim = crow_indices.dim() - 1 + M, K = bsr.shape[batch_ndim : batch_ndim + 2] + BM, BK = values.shape[batch_ndim + 1 : batch_ndim + 3] + + # Reference parameters is a set of parameters that leads to a + # successful kernel call and the corresponding timing is used as a + # reference for computing speedups. Avoid changing the reference + # parameters when possible. + reference_meta = dict( + GROUP_SIZE_ROW=1, num_stages=1, num_warps=4, SPLIT_N=max(N // BM, 1) + ) + + # Compute the key of parameters: + sparsity = round(1 - bsr._nnz() * BM * BK / (M * K), 2) + dtype = bsr.dtype + if out is None: + out_dtype = dtype + else: + out_dtype = out.dtype + if out_dtype is dtype: + version_dtype = dtype + else: + version_dtype = (dtype, out_dtype) + version = (0, version_dtype, sparsity) + key = (M, K, N, BM, BK, beta == 0, beta == 1, alpha == 1) + + # For tuning, for an initial state, use parameters from the + # database if available, otherwise, use the reference parameters. + initial_meta = get_meta(opname, key, version=version, exact=True) + if initial_meta is None: + may_skip_update = False + initial_meta = get_meta(opname, key, version=(0, dtype, 0.5), exact=True) + if initial_meta is None: + initial_meta = reference_meta + elif not force: + return initial_meta + else: + may_skip_update = True + + # The target function that is minimized in the tuning process: + def bench(meta, input=input, bsr=bsr, dense=dense, alpha=alpha, out=out): + def test_func(): + return bsr_dense_addmm( + input, + bsr, + dense, + beta=beta, + alpha=alpha, + left_alpha=left_alpha, + right_alpha=right_alpha, + meta=meta, + out=out, + ) + + return triton.testing.do_bench(test_func, warmup=500, rep=100) + + # The step function that increments a specified meta parameter: + def step_meta_parameter(name, value, direction, meta, M=M, N=N, K=K, BM=BM, BK=BK): + # return next value in positive or negative direction, or + # input value if the step will result an invalid + # value. The input value is assumed to be valid. 
+ is_log = name in {"SPLIT_N", "num_warps"} + min_value = dict(SPLIT_N=1, num_warps=1, num_stages=1, GROUP_SIZE_ROW=1)[name] + max_value = dict(SPLIT_N=max(N // BM, 1)).get(name) + value_step = dict(SPLIT_N=2, num_warps=2, num_stages=1, GROUP_SIZE_ROW=1)[name] + if is_log: + next_value = ( + value * value_step**direction + if direction > 0 + else value // (value_step ** abs(direction)) + ) + else: + next_value = value + value_step * direction + if min_value is not None: + next_value = max(next_value, min_value) + if max_value is not None: + next_value = min(next_value, max_value) + if name == "SPLIT_N" and N % next_value != 0: + return value + return next_value + + # Tune: + meta, speedup, timing, sensitivity_message = minimize( + bench, + initial_meta, + reference_meta, + step_meta_parameter, + max_step=2, + verbose=verbose, + ) + if verbose: + print(f"-> {sensitivity_message}, {speedup=:.1f} %, {timing=:.3f} ms") + + if store and not ( + may_skip_update and meta == initial_meta and initial_meta is not reference_meta + ): + device_name = torch.cuda.get_device_name() + update( + opname, + device_name, + version, + key, + tuple(meta[k] for k in sorted(meta)), + ) + + return meta + + +def optimize_bsr_dense_addmm( + m, + k, + n, + bm, + bk, + beta=1, + alpha=1, + use_left_alpha=False, + use_right_alpha=False, + dtype=torch.float16, + out_dtype=None, + device="cuda", + sparsity=0.5, + force=False, + verbose=False, + opname=None, +): + torch.manual_seed(0) + bsr = create_blocked_tensor( + 0, m, k, (bm, bk), sparsity, dtype, device + ).to_sparse_bsr((bm, bk)) + dense = make_tensor(k, n, dtype=dtype, device=device) + input = make_tensor(m, n, dtype=dtype, device=device) + left_alpha = make_tensor(m, dtype=dtype, device=device) if use_left_alpha else None + right_alpha = ( + make_tensor(n, dtype=dtype, device=device) if use_right_alpha else None + ) + if out_dtype is not None: + out = dense.new_empty((m, n), dtype=out_dtype) + else: + out = None + tune_bsr_dense_addmm( + input, + bsr, + dense, + beta=beta, + alpha=alpha, + left_alpha=left_alpha, + right_alpha=right_alpha, + out=out, + store=True, + force=force, + verbose=verbose, + opname=opname, + ) + + +def main(op="scatter_mm", force=False, dtype=torch.float16, verbose=True): + import itertools + + sizes_lst = [ + 256, + 512, + 1024, + 2048, + 4096, + 8192, + 16384, + 32768, + 65536, + 131072, + 50432, + 65792, + ] + sizes3_lst = [3 * sz for sz in [64, 128] + sizes_lst if sz <= 2048] + shapes_lst = [(sz, sz) for sz in sizes_lst[:-4] + sizes3_lst] + shapes_lst.extend([(3072, 768), (768, 3072)]) + shapes_lst.extend([(5120, 1280), (1280, 5120)]) + if dtype is torch.int8: + # triton does not support smaller blocks than 32 + blocksize_lst = [(32, 32), (64, 64), (128, 128), (256, 256)] + else: + blocksize_lst = [(16, 16), (32, 32), (64, 64), (128, 128)] + sparsity_lst = [0.5, 0.7, 0.3][:1] + for sparsity in sparsity_lst: + print(f"{op, dtype, sparsity=}") + try: + for (M, K), N, (BM, BK) in itertools.product( + shapes_lst, sizes_lst, blocksize_lst + ): + if not (BM <= M and BK <= K and M % BM == 0 and K % BK == 0): + continue + if op == "scatter_mm": + optimize_scatter_mm( + M, K, N, BM, BK, force=force, sparsity=sparsity, dtype=dtype + ) + elif op in {"bsr_dense_addmm", "_int_bsr_dense_addmm"}: + if M == K and N == 50432: + continue + print(f"{M, K, N, (BM, BK)=}") + for alpha, beta in [(1, 1), (1, 0)]: + optimize_bsr_dense_addmm( + M, + K, + N, + BM, + BK, + beta=beta, + alpha=alpha, + force=force, + sparsity=sparsity, + dtype=dtype, + 
verbose=verbose, + opname=op, + ) + else: + raise NotImplementedError(op) + except KeyboardInterrupt: + break + except Exception: + dump() + raise + dump() + + if 0: + # Check performance dependence on sparsity and apply + # adjustments when differences are noticable (more than 10%). + # + # When using NVIDIA A100 GPU, the performance dependence on + # sparsity is insignificant (0 % ... 10 %) for majority of + # shapes/blocksizes combinations. However, for a very few + # specific size combinations, the effect of sparsity on + # performance can be up to 20 %. + for (M, K), N, (BM, BK) in itertools.product( + shapes_lst, sizes_lst, blocksize_lst + ): + meta_lst: list = [] + key = (M, K, N, BM, BK) + for sparsity1 in sparsity_lst: + torch.manual_seed(0) + bsr = create_blocked_tensor( + 0, M, K, (BM, BK), sparsity1, dtype, device="cuda" + ).to_sparse_bsr((BM, BK)) + dense = make_tensor(K, N, dtype=dtype, device="cuda") + meta_lst = [] + for sparsity in sparsity_lst: + meta = get_meta(op, key, version=(0, dtype, sparsity), exact=True) + if meta is None: + continue + + def bench(meta, bsr=bsr, dense=dense): + import triton + + if op == "scatter_mm": + from torch.sparse._triton_ops import ( + bsr_scatter_mm, + bsr_scatter_mm_indices_data, + ) + + indices_data = bsr_scatter_mm_indices_data( + bsr, + dense, + indices_format="bsr_strided_mm_compressed", + **meta, + ) + + def test_func(): + return bsr_scatter_mm( + bsr, dense, indices_data=indices_data + ) + + else: + raise NotImplementedError(op) + + ms_min = triton.testing.do_bench(test_func, warmup=500, rep=100) + + return ms_min + + meta_lst.append( + (bench(meta), sparsity, tuple(meta[k] for k in sorted(meta))) + ) + if not meta_lst: + continue + meta_lst = sorted(meta_lst) + index = next( + i for i, item in enumerate(meta_lst) if item[1] == sparsity1 + ) + if meta_lst[0][2] == meta_lst[index][2]: + continue + speeddiff = (1 - meta_lst[index][0] / meta_lst[0][0]) * 100 + if abs(speeddiff) < 10: + continue + + print(sparsity1, index, key, meta_lst, speeddiff) + + if index > 0: + device_name = torch.cuda.get_device_name() + meta = get_meta( + op, key, version=(0, dtype, meta_lst[0][1]), exact=True + ) + update( + op, + device_name, + (0, dtype, sparsity1), + key, + tuple(meta[k] for k in sorted(meta)), + ) + print("update") + dump() + + +_operation_device_version_data: dict[Any, dict] = { + # Warning: the data in between the BEGIN/END DATA comment lines + # below is generated. It can be updated either manually or via + # calling dump function defined above. 
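Besides running main() or calling the tune_* helpers with store=True, an entry can also be registered and persisted by hand through the update() and dump() helpers defined above. A hedged sketch with illustrative, unmeasured numbers; the key and value layouts follow the Legend comment below::

  import torch

  # `update` and `dump` are the helpers defined earlier in this module.
  update(
      "bsr_dense_addmm",
      torch.cuda.get_device_name(),
      (0, torch.float16, 0.5),                        # version: (0, dtype, sparsity)
      (1024, 1024, 4096, 64, 64, False, True, True),  # M, K, N, Ms, Ks, beta==0, beta==1, alpha==1
      (4, 64, 1, 8),                                   # GROUP_SIZE_ROW, SPLIT_N, num_stages, num_warps
  )
  dump()  # rewrites the BEGIN/END GENERATED DATA block of this module in place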
+ # + # Legend [op: key -> data]: + # scatter_mm : M, K, N, Ms, Ks -> GROUP_SIZE, SPLIT_N, TILE_M, TILE_N, num_stages, num_warps + # bsr_dense_addmm : M, K, N, Ms, Ks, beta==0, beta==1, alpha==1 -> GROUP_SIZE_ROW, SPLIT_N, num_stages, num_warps + # + # BEGIN GENERATED DATA + ("_int_bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.int8, 0.5)): { + (192, 192, 256, 32, 32, False, True, True): (2, 8, 1, 4), + (192, 192, 256, 32, 32, True, False, True): (2, 8, 5, 4), + (192, 192, 512, 32, 32, False, True, True): (1, 16, 1, 4), + (192, 192, 512, 32, 32, True, False, True): (1, 16, 5, 4), + (192, 192, 1024, 32, 32, False, True, True): (1, 32, 1, 4), + (192, 192, 1024, 32, 32, True, False, True): (4, 32, 4, 4), + (192, 192, 2048, 32, 32, False, True, True): (2, 64, 1, 4), + (192, 192, 2048, 32, 32, True, False, True): (3, 16, 5, 4), + (192, 192, 4096, 32, 32, False, True, True): (1, 128, 1, 4), + (192, 192, 4096, 32, 32, True, False, True): (1, 128, 1, 4), + (192, 192, 8192, 32, 32, False, True, True): (1, 256, 1, 4), + (192, 192, 8192, 32, 32, True, False, True): (1, 64, 3, 4), + (192, 192, 16384, 32, 32, False, True, True): (2, 512, 1, 4), + (192, 192, 16384, 32, 32, True, False, True): (5, 128, 1, 4), + (192, 192, 32768, 32, 32, False, True, True): (1, 1024, 1, 4), + (192, 192, 32768, 32, 32, True, False, True): (1, 256, 1, 4), + (192, 192, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), + (192, 192, 65536, 32, 32, True, False, True): (1, 512, 1, 4), + (192, 192, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), + (192, 192, 131072, 32, 32, True, False, True): (2, 512, 1, 4), + (256, 256, 256, 32, 32, False, True, True): (4, 8, 1, 4), + (256, 256, 256, 32, 32, True, False, True): (1, 8, 6, 4), + (256, 256, 256, 64, 64, False, True, True): (1, 4, 1, 16), + (256, 256, 256, 64, 64, True, False, True): (1, 4, 4, 4), + (256, 256, 256, 128, 128, False, True, True): (3, 2, 1, 16), + (256, 256, 256, 128, 128, True, False, True): (1, 2, 1, 4), + (256, 256, 512, 32, 32, False, True, True): (2, 16, 1, 4), + (256, 256, 512, 32, 32, True, False, True): (2, 16, 4, 4), + (256, 256, 512, 64, 64, False, True, True): (7, 8, 1, 16), + (256, 256, 512, 64, 64, True, False, True): (3, 8, 3, 4), + (256, 256, 512, 128, 128, False, True, True): (1, 4, 1, 32), + (256, 256, 512, 128, 128, True, False, True): (1, 4, 1, 4), + (256, 256, 1024, 32, 32, False, True, True): (1, 32, 1, 4), + (256, 256, 1024, 32, 32, True, False, True): (1, 8, 6, 4), + (256, 256, 1024, 64, 64, False, True, True): (2, 16, 1, 16), + (256, 256, 1024, 64, 64, True, False, True): (1, 16, 5, 4), + (256, 256, 1024, 128, 128, False, True, True): (4, 8, 1, 32), + (256, 256, 1024, 128, 128, True, False, True): (1, 8, 2, 4), + (256, 256, 2048, 32, 32, False, True, True): (1, 64, 1, 4), + (256, 256, 2048, 32, 32, True, False, True): (2, 32, 3, 2), + (256, 256, 2048, 64, 64, False, True, True): (2, 32, 1, 16), + (256, 256, 2048, 64, 64, True, False, True): (1, 16, 3, 4), + (256, 256, 2048, 128, 128, False, True, True): (1, 16, 1, 32), + (256, 256, 2048, 128, 128, True, False, True): (1, 16, 2, 4), + (256, 256, 4096, 32, 32, False, True, True): (2, 128, 1, 4), + (256, 256, 4096, 32, 32, True, False, True): (1, 32, 3, 2), + (256, 256, 4096, 64, 64, False, True, True): (2, 64, 1, 8), + (256, 256, 4096, 64, 64, True, False, True): (1, 64, 3, 2), + (256, 256, 4096, 128, 128, False, True, True): (2, 32, 1, 32), + (256, 256, 4096, 128, 128, True, False, True): (3, 32, 2, 8), + (256, 256, 8192, 32, 32, False, True, True): (1, 256, 1, 4), + (256, 256, 
8192, 32, 32, True, False, True): (1, 64, 1, 4), + (256, 256, 8192, 64, 64, False, True, True): (1, 128, 1, 8), + (256, 256, 8192, 64, 64, True, False, True): (2, 128, 1, 4), + (256, 256, 8192, 128, 128, False, True, True): (4, 64, 1, 32), + (256, 256, 8192, 128, 128, True, False, True): (3, 64, 1, 4), + (256, 256, 16384, 32, 32, False, True, True): (1, 256, 1, 8), + (256, 256, 16384, 32, 32, True, False, True): (3, 128, 1, 4), + (256, 256, 16384, 64, 64, False, True, True): (2, 256, 1, 8), + (256, 256, 16384, 64, 64, True, False, True): (2, 256, 1, 4), + (256, 256, 16384, 128, 128, False, True, True): (2, 128, 1, 32), + (256, 256, 16384, 128, 128, True, False, True): (4, 128, 2, 4), + (256, 256, 32768, 32, 32, False, True, True): (2, 512, 1, 8), + (256, 256, 32768, 32, 32, True, False, True): (1, 256, 1, 4), + (256, 256, 32768, 64, 64, False, True, True): (1, 512, 1, 8), + (256, 256, 32768, 64, 64, True, False, True): (1, 512, 1, 4), + (256, 256, 32768, 128, 128, False, True, True): (2, 256, 1, 32), + (256, 256, 32768, 128, 128, True, False, True): (1, 256, 2, 4), + (256, 256, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), + (256, 256, 65536, 32, 32, True, False, True): (1, 512, 1, 4), + (256, 256, 65536, 64, 64, False, True, True): (1, 1024, 1, 8), + (256, 256, 65536, 64, 64, True, False, True): (1, 512, 1, 4), + (256, 256, 65536, 128, 128, False, True, True): (2, 512, 1, 16), + (256, 256, 65536, 128, 128, True, False, True): (1, 512, 1, 4), + (256, 256, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), + (256, 256, 65792, 32, 32, True, False, True): (1, 514, 1, 4), + (256, 256, 65792, 64, 64, False, True, True): (1, 1028, 1, 8), + (256, 256, 65792, 64, 64, True, False, True): (4, 257, 1, 4), + (256, 256, 65792, 128, 128, False, True, True): (2, 514, 1, 16), + (256, 256, 65792, 128, 128, True, False, True): (3, 514, 1, 4), + (256, 256, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), + (256, 256, 131072, 32, 32, True, False, True): (2, 1024, 1, 4), + (256, 256, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), + (256, 256, 131072, 64, 64, True, False, True): (2, 512, 1, 4), + (256, 256, 131072, 128, 128, False, True, True): (2, 1024, 1, 16), + (256, 256, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), + (384, 384, 256, 32, 32, False, True, True): (1, 8, 1, 4), + (384, 384, 256, 32, 32, True, False, True): (5, 8, 5, 4), + (384, 384, 256, 64, 64, False, True, True): (2, 4, 1, 16), + (384, 384, 256, 64, 64, True, False, True): (1, 4, 5, 4), + (384, 384, 512, 32, 32, False, True, True): (2, 16, 1, 4), + (384, 384, 512, 32, 32, True, False, True): (1, 16, 4, 4), + (384, 384, 512, 64, 64, False, True, True): (3, 8, 1, 16), + (384, 384, 512, 64, 64, True, False, True): (3, 8, 3, 4), + (384, 384, 1024, 32, 32, False, True, True): (2, 32, 1, 4), + (384, 384, 1024, 32, 32, True, False, True): (1, 8, 6, 4), + (384, 384, 1024, 64, 64, False, True, True): (2, 16, 1, 16), + (384, 384, 1024, 64, 64, True, False, True): (1, 16, 5, 4), + (384, 384, 2048, 32, 32, False, True, True): (1, 64, 1, 4), + (384, 384, 2048, 32, 32, True, False, True): (3, 16, 4, 4), + (384, 384, 2048, 64, 64, False, True, True): (2, 32, 1, 16), + (384, 384, 2048, 64, 64, True, False, True): (1, 16, 4, 4), + (384, 384, 4096, 32, 32, False, True, True): (4, 64, 1, 8), + (384, 384, 4096, 32, 32, True, False, True): (4, 32, 1, 4), + (384, 384, 4096, 64, 64, False, True, True): (1, 64, 1, 8), + (384, 384, 4096, 64, 64, True, False, True): (1, 64, 1, 4), + (384, 384, 8192, 32, 32, False, True, True): (1, 128, 1, 
8), + (384, 384, 8192, 32, 32, True, False, True): (3, 64, 1, 1), + (384, 384, 8192, 64, 64, False, True, True): (2, 128, 1, 8), + (384, 384, 8192, 64, 64, True, False, True): (1, 64, 2, 2), + (384, 384, 16384, 32, 32, False, True, True): (1, 256, 1, 8), + (384, 384, 16384, 32, 32, True, False, True): (1, 128, 1, 4), + (384, 384, 16384, 64, 64, False, True, True): (2, 256, 1, 8), + (384, 384, 16384, 64, 64, True, False, True): (2, 128, 1, 4), + (384, 384, 32768, 32, 32, False, True, True): (1, 512, 1, 8), + (384, 384, 32768, 32, 32, True, False, True): (1, 256, 1, 4), + (384, 384, 32768, 64, 64, False, True, True): (1, 512, 1, 8), + (384, 384, 32768, 64, 64, True, False, True): (1, 256, 3, 2), + (384, 384, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), + (384, 384, 65536, 32, 32, True, False, True): (1, 512, 3, 2), + (384, 384, 65536, 64, 64, False, True, True): (2, 1024, 1, 8), + (384, 384, 65536, 64, 64, True, False, True): (3, 256, 3, 4), + (384, 384, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), + (384, 384, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), + (384, 384, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), + (384, 384, 131072, 64, 64, True, False, True): (2, 512, 3, 4), + (512, 512, 256, 32, 32, False, True, True): (1, 8, 1, 4), + (512, 512, 256, 32, 32, True, False, True): (4, 8, 4, 4), + (512, 512, 256, 64, 64, False, True, True): (3, 4, 1, 16), + (512, 512, 256, 64, 64, True, False, True): (2, 4, 5, 4), + (512, 512, 256, 128, 128, False, True, True): (4, 2, 1, 16), + (512, 512, 256, 128, 128, True, False, True): (1, 2, 3, 4), + (512, 512, 256, 256, 256, False, True, True): (1, 1, 1, 32), + (512, 512, 256, 256, 256, True, False, True): (2, 1, 1, 32), + (512, 512, 512, 32, 32, False, True, True): (3, 16, 1, 4), + (512, 512, 512, 32, 32, True, False, True): (1, 8, 4, 2), + (512, 512, 512, 64, 64, False, True, True): (2, 8, 1, 16), + (512, 512, 512, 64, 64, True, False, True): (2, 8, 5, 4), + (512, 512, 512, 128, 128, False, True, True): (3, 4, 1, 16), + (512, 512, 512, 128, 128, True, False, True): (1, 4, 3, 4), + (512, 512, 512, 256, 256, False, True, True): (1, 2, 1, 32), + (512, 512, 512, 256, 256, True, False, True): (2, 2, 1, 32), + (512, 512, 1024, 32, 32, False, True, True): (2, 32, 1, 4), + (512, 512, 1024, 32, 32, True, False, True): (4, 16, 3, 2), + (512, 512, 1024, 64, 64, False, True, True): (4, 16, 1, 16), + (512, 512, 1024, 64, 64, True, False, True): (1, 8, 4, 4), + (512, 512, 1024, 128, 128, False, True, True): (1, 8, 1, 32), + (512, 512, 1024, 128, 128, True, False, True): (1, 8, 3, 4), + (512, 512, 1024, 256, 256, False, True, True): (4, 4, 1, 32), + (512, 512, 1024, 256, 256, True, False, True): (2, 4, 1, 32), + (512, 512, 2048, 32, 32, False, True, True): (3, 32, 1, 8), + (512, 512, 2048, 32, 32, True, False, True): (1, 16, 3, 4), + (512, 512, 2048, 64, 64, False, True, True): (1, 32, 1, 8), + (512, 512, 2048, 64, 64, True, False, True): (1, 32, 3, 2), + (512, 512, 2048, 128, 128, False, True, True): (4, 16, 1, 32), + (512, 512, 2048, 128, 128, True, False, True): (1, 16, 3, 4), + (512, 512, 2048, 256, 256, False, True, True): (1, 8, 1, 32), + (512, 512, 2048, 256, 256, True, False, True): (3, 8, 1, 32), + (512, 512, 4096, 32, 32, False, True, True): (1, 64, 1, 8), + (512, 512, 4096, 32, 32, True, False, True): (5, 32, 1, 4), + (512, 512, 4096, 64, 64, False, True, True): (1, 64, 1, 8), + (512, 512, 4096, 64, 64, True, False, True): (1, 64, 1, 4), + (512, 512, 4096, 128, 128, False, True, True): (5, 32, 1, 32), + (512, 512, 4096, 128, 
128, True, False, True): (2, 32, 3, 4), + (512, 512, 4096, 256, 256, False, True, True): (1, 16, 1, 32), + (512, 512, 4096, 256, 256, True, False, True): (3, 16, 1, 32), + (512, 512, 8192, 32, 32, False, True, True): (3, 128, 1, 8), + (512, 512, 8192, 32, 32, True, False, True): (3, 64, 1, 4), + (512, 512, 8192, 64, 64, False, True, True): (4, 128, 1, 8), + (512, 512, 8192, 64, 64, True, False, True): (1, 64, 3, 2), + (512, 512, 8192, 128, 128, False, True, True): (5, 64, 1, 32), + (512, 512, 8192, 128, 128, True, False, True): (1, 64, 2, 4), + (512, 512, 8192, 256, 256, False, True, True): (1, 32, 1, 32), + (512, 512, 8192, 256, 256, True, False, True): (1, 32, 1, 32), + (512, 512, 16384, 32, 32, False, True, True): (1, 256, 1, 8), + (512, 512, 16384, 32, 32, True, False, True): (2, 128, 1, 4), + (512, 512, 16384, 64, 64, False, True, True): (2, 256, 1, 8), + (512, 512, 16384, 64, 64, True, False, True): (1, 128, 3, 2), + (512, 512, 16384, 128, 128, False, True, True): (4, 128, 1, 16), + (512, 512, 16384, 128, 128, True, False, True): (2, 128, 1, 4), + (512, 512, 16384, 256, 256, False, True, True): (1, 64, 1, 32), + (512, 512, 16384, 256, 256, True, False, True): (2, 64, 1, 32), + (512, 512, 32768, 32, 32, False, True, True): (1, 512, 1, 8), + (512, 512, 32768, 32, 32, True, False, True): (2, 256, 1, 4), + (512, 512, 32768, 64, 64, False, True, True): (1, 512, 1, 8), + (512, 512, 32768, 64, 64, True, False, True): (1, 256, 3, 2), + (512, 512, 32768, 128, 128, False, True, True): (4, 256, 1, 16), + (512, 512, 32768, 128, 128, True, False, True): (2, 256, 1, 4), + (512, 512, 32768, 256, 256, False, True, True): (1, 128, 1, 32), + (512, 512, 32768, 256, 256, True, False, True): (2, 128, 1, 32), + (512, 512, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), + (512, 512, 65536, 32, 32, True, False, True): (2, 512, 1, 2), + (512, 512, 65536, 64, 64, False, True, True): (1, 1024, 1, 8), + (512, 512, 65536, 64, 64, True, False, True): (1, 512, 3, 2), + (512, 512, 65536, 128, 128, False, True, True): (4, 512, 1, 16), + (512, 512, 65536, 128, 128, True, False, True): (1, 512, 1, 4), + (512, 512, 65536, 256, 256, False, True, True): (1, 256, 1, 32), + (512, 512, 65536, 256, 256, True, False, True): (1, 256, 1, 32), + (512, 512, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), + (512, 512, 65792, 32, 32, True, False, True): (1, 514, 3, 2), + (512, 512, 65792, 64, 64, False, True, True): (1, 1028, 1, 8), + (512, 512, 65792, 64, 64, True, False, True): (2, 257, 3, 4), + (512, 512, 65792, 128, 128, False, True, True): (4, 514, 1, 16), + (512, 512, 65792, 128, 128, True, False, True): (1, 514, 1, 4), + (512, 512, 65792, 256, 256, False, True, True): (1, 257, 1, 32), + (512, 512, 65792, 256, 256, True, False, True): (2, 257, 1, 32), + (512, 512, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), + (512, 512, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), + (512, 512, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), + (512, 512, 131072, 64, 64, True, False, True): (1, 1024, 3, 2), + (512, 512, 131072, 128, 128, False, True, True): (4, 1024, 1, 16), + (512, 512, 131072, 128, 128, True, False, True): (1, 1024, 1, 4), + (512, 512, 131072, 256, 256, False, True, True): (1, 512, 1, 32), + (512, 512, 131072, 256, 256, True, False, True): (2, 512, 1, 32), + (768, 768, 256, 32, 32, False, True, True): (1, 8, 1, 4), + (768, 768, 256, 32, 32, True, False, True): (2, 8, 4, 4), + (768, 768, 256, 64, 64, False, True, True): (3, 4, 1, 16), + (768, 768, 256, 64, 64, True, False, True): (2, 4, 4, 4), + 
(768, 768, 256, 128, 128, False, True, True): (1, 2, 1, 8), + (768, 768, 256, 128, 128, True, False, True): (1, 2, 3, 4), + (768, 768, 512, 32, 32, False, True, True): (1, 16, 1, 4), + (768, 768, 512, 32, 32, True, False, True): (1, 4, 5, 4), + (768, 768, 512, 64, 64, False, True, True): (1, 8, 3, 32), + (768, 768, 512, 64, 64, True, False, True): (4, 8, 4, 4), + (768, 768, 512, 128, 128, False, True, True): (4, 4, 1, 16), + (768, 768, 512, 128, 128, True, False, True): (4, 4, 3, 4), + (768, 768, 1024, 32, 32, False, True, True): (1, 16, 1, 8), + (768, 768, 1024, 32, 32, True, False, True): (1, 8, 3, 4), + (768, 768, 1024, 64, 64, False, True, True): (3, 16, 1, 16), + (768, 768, 1024, 64, 64, True, False, True): (1, 8, 4, 4), + (768, 768, 1024, 128, 128, False, True, True): (3, 8, 1, 32), + (768, 768, 1024, 128, 128, True, False, True): (1, 8, 3, 4), + (768, 768, 2048, 32, 32, False, True, True): (2, 32, 1, 8), + (768, 768, 2048, 32, 32, True, False, True): (3, 16, 1, 4), + (768, 768, 2048, 64, 64, False, True, True): (1, 32, 1, 8), + (768, 768, 2048, 64, 64, True, False, True): (4, 8, 3, 4), + (768, 768, 2048, 128, 128, False, True, True): (1, 16, 1, 32), + (768, 768, 2048, 128, 128, True, False, True): (1, 16, 3, 4), + (768, 768, 4096, 32, 32, False, True, True): (1, 64, 1, 8), + (768, 768, 4096, 32, 32, True, False, True): (1, 32, 1, 1), + (768, 768, 4096, 64, 64, False, True, True): (2, 64, 1, 8), + (768, 768, 4096, 64, 64, True, False, True): (1, 32, 2, 2), + (768, 768, 4096, 128, 128, False, True, True): (1, 32, 1, 32), + (768, 768, 4096, 128, 128, True, False, True): (6, 32, 1, 4), + (768, 768, 8192, 32, 32, False, True, True): (1, 128, 1, 8), + (768, 768, 8192, 32, 32, True, False, True): (1, 64, 1, 4), + (768, 768, 8192, 64, 64, False, True, True): (1, 128, 1, 8), + (768, 768, 8192, 64, 64, True, False, True): (4, 32, 3, 4), + (768, 768, 8192, 128, 128, False, True, True): (2, 64, 1, 16), + (768, 768, 8192, 128, 128, True, False, True): (2, 64, 3, 4), + (768, 768, 16384, 32, 32, False, True, True): (1, 256, 1, 8), + (768, 768, 16384, 32, 32, True, False, True): (1, 128, 1, 4), + (768, 768, 16384, 64, 64, False, True, True): (1, 256, 1, 8), + (768, 768, 16384, 64, 64, True, False, True): (1, 128, 3, 2), + (768, 768, 16384, 128, 128, False, True, True): (2, 128, 1, 16), + (768, 768, 16384, 128, 128, True, False, True): (2, 128, 1, 4), + (768, 768, 32768, 32, 32, False, True, True): (1, 512, 1, 8), + (768, 768, 32768, 32, 32, True, False, True): (1, 256, 3, 2), + (768, 768, 32768, 64, 64, False, True, True): (2, 512, 1, 8), + (768, 768, 32768, 64, 64, True, False, True): (1, 256, 3, 2), + (768, 768, 32768, 128, 128, False, True, True): (2, 256, 1, 16), + (768, 768, 32768, 128, 128, True, False, True): (3, 256, 1, 4), + (768, 768, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), + (768, 768, 65536, 32, 32, True, False, True): (1, 512, 3, 2), + (768, 768, 65536, 64, 64, False, True, True): (2, 512, 1, 4), + (768, 768, 65536, 64, 64, True, False, True): (1, 512, 3, 2), + (768, 768, 65536, 128, 128, False, True, True): (2, 512, 1, 16), + (768, 768, 65536, 128, 128, True, False, True): (2, 512, 1, 4), + (768, 768, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), + (768, 768, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), + (768, 768, 131072, 64, 64, False, True, True): (2, 1024, 1, 4), + (768, 768, 131072, 64, 64, True, False, True): (2, 1024, 3, 2), + (768, 768, 131072, 128, 128, False, True, True): (2, 1024, 1, 16), + (768, 768, 131072, 128, 128, True, False, True): (2, 
1024, 1, 4), + (768, 3072, 256, 32, 32, False, True, True): (3, 8, 4, 8), + (768, 3072, 256, 32, 32, True, False, True): (3, 8, 5, 4), + (768, 3072, 256, 64, 64, False, True, True): (1, 4, 4, 16), + (768, 3072, 256, 64, 64, True, False, True): (1, 4, 4, 4), + (768, 3072, 256, 128, 128, False, True, True): (2, 2, 1, 8), + (768, 3072, 256, 128, 128, True, False, True): (2, 2, 4, 4), + (768, 3072, 256, 256, 256, False, True, True): (1, 1, 1, 32), + (768, 3072, 256, 256, 256, True, False, True): (1, 1, 1, 32), + (768, 3072, 512, 32, 32, False, True, True): (1, 16, 1, 4), + (768, 3072, 512, 32, 32, True, False, True): (2, 4, 4, 4), + (768, 3072, 512, 64, 64, False, True, True): (3, 8, 4, 16), + (768, 3072, 512, 64, 64, True, False, True): (1, 8, 4, 4), + (768, 3072, 512, 128, 128, False, True, True): (2, 4, 1, 8), + (768, 3072, 512, 128, 128, True, False, True): (4, 4, 3, 4), + (768, 3072, 512, 256, 256, False, True, True): (1, 2, 1, 32), + (768, 3072, 512, 256, 256, True, False, True): (1, 2, 1, 32), + (768, 3072, 1024, 32, 32, False, True, True): (1, 16, 1, 8), + (768, 3072, 1024, 32, 32, True, False, True): (3, 8, 3, 4), + (768, 3072, 1024, 64, 64, False, True, True): (2, 16, 1, 16), + (768, 3072, 1024, 64, 64, True, False, True): (1, 8, 3, 4), + (768, 3072, 1024, 128, 128, False, True, True): (1, 8, 1, 8), + (768, 3072, 1024, 128, 128, True, False, True): (3, 8, 4, 4), + (768, 3072, 1024, 256, 256, False, True, True): (1, 4, 1, 32), + (768, 3072, 1024, 256, 256, True, False, True): (4, 4, 1, 32), + (768, 3072, 2048, 32, 32, False, True, True): (3, 32, 1, 8), + (768, 3072, 2048, 32, 32, True, False, True): (4, 8, 3, 4), + (768, 3072, 2048, 64, 64, False, True, True): (5, 16, 1, 16), + (768, 3072, 2048, 64, 64, True, False, True): (6, 8, 3, 4), + (768, 3072, 2048, 128, 128, False, True, True): (2, 16, 1, 16), + (768, 3072, 2048, 128, 128, True, False, True): (1, 16, 4, 4), + (768, 3072, 2048, 256, 256, False, True, True): (1, 8, 1, 32), + (768, 3072, 2048, 256, 256, True, False, True): (1, 8, 1, 32), + (768, 3072, 4096, 32, 32, False, True, True): (1, 64, 1, 8), + (768, 3072, 4096, 32, 32, True, False, True): (1, 32, 3, 4), + (768, 3072, 4096, 64, 64, False, True, True): (1, 64, 1, 8), + (768, 3072, 4096, 64, 64, True, False, True): (2, 16, 3, 4), + (768, 3072, 4096, 128, 128, False, True, True): (1, 32, 1, 8), + (768, 3072, 4096, 128, 128, True, False, True): (2, 32, 2, 4), + (768, 3072, 4096, 256, 256, False, True, True): (1, 16, 1, 32), + (768, 3072, 4096, 256, 256, True, False, True): (1, 16, 1, 32), + (768, 3072, 8192, 32, 32, False, True, True): (1, 128, 1, 8), + (768, 3072, 8192, 32, 32, True, False, True): (1, 64, 1, 4), + (768, 3072, 8192, 64, 64, False, True, True): (1, 128, 1, 8), + (768, 3072, 8192, 64, 64, True, False, True): (2, 32, 3, 4), + (768, 3072, 8192, 128, 128, False, True, True): (2, 64, 1, 16), + (768, 3072, 8192, 128, 128, True, False, True): (2, 64, 3, 4), + (768, 3072, 8192, 256, 256, False, True, True): (1, 32, 1, 32), + (768, 3072, 8192, 256, 256, True, False, True): (1, 32, 1, 32), + (768, 3072, 16384, 32, 32, False, True, True): (1, 256, 1, 8), + (768, 3072, 16384, 32, 32, True, False, True): (1, 128, 1, 4), + (768, 3072, 16384, 64, 64, False, True, True): (1, 256, 1, 8), + (768, 3072, 16384, 64, 64, True, False, True): (2, 64, 3, 4), + (768, 3072, 16384, 128, 128, False, True, True): (2, 128, 1, 16), + (768, 3072, 16384, 128, 128, True, False, True): (2, 128, 3, 4), + (768, 3072, 16384, 256, 256, False, True, True): (1, 64, 1, 32), + (768, 3072, 16384, 256, 
256, True, False, True): (1, 64, 1, 32), + (768, 3072, 32768, 32, 32, False, True, True): (1, 512, 1, 8), + (768, 3072, 32768, 32, 32, True, False, True): (1, 256, 3, 2), + (768, 3072, 32768, 64, 64, False, True, True): (1, 512, 1, 8), + (768, 3072, 32768, 64, 64, True, False, True): (3, 128, 3, 4), + (768, 3072, 32768, 128, 128, False, True, True): (2, 256, 1, 16), + (768, 3072, 32768, 128, 128, True, False, True): (2, 256, 3, 4), + (768, 3072, 32768, 256, 256, False, True, True): (1, 128, 1, 32), + (768, 3072, 32768, 256, 256, True, False, True): (1, 128, 1, 32), + (768, 3072, 50432, 32, 32, False, True, True): (1, 788, 1, 8), + (768, 3072, 50432, 32, 32, True, False, True): (1, 394, 3, 2), + (768, 3072, 50432, 64, 64, False, True, True): (1, 788, 1, 8), + (768, 3072, 50432, 64, 64, True, False, True): (2, 197, 3, 4), + (768, 3072, 50432, 128, 128, False, True, True): (2, 394, 1, 16), + (768, 3072, 50432, 128, 128, True, False, True): (2, 394, 3, 4), + (768, 3072, 50432, 256, 256, False, True, True): (1, 197, 1, 32), + (768, 3072, 50432, 256, 256, True, False, True): (1, 197, 1, 32), + (768, 3072, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), + (768, 3072, 65536, 32, 32, True, False, True): (1, 512, 3, 2), + (768, 3072, 65536, 64, 64, False, True, True): (1, 1024, 1, 8), + (768, 3072, 65536, 64, 64, True, False, True): (2, 256, 3, 4), + (768, 3072, 65536, 128, 128, False, True, True): (2, 512, 1, 16), + (768, 3072, 65536, 128, 128, True, False, True): (2, 512, 3, 4), + (768, 3072, 65536, 256, 256, False, True, True): (1, 256, 1, 32), + (768, 3072, 65536, 256, 256, True, False, True): (1, 256, 1, 32), + (768, 3072, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), + (768, 3072, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), + (768, 3072, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), + (768, 3072, 131072, 64, 64, True, False, True): (2, 512, 3, 4), + (768, 3072, 131072, 128, 128, False, True, True): (2, 1024, 1, 16), + (768, 3072, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), + (768, 3072, 131072, 256, 256, False, True, True): (1, 512, 1, 32), + (768, 3072, 131072, 256, 256, True, False, True): (1, 512, 1, 32), + (1024, 1024, 256, 32, 32, False, True, True): (1, 8, 1, 4), + (1024, 1024, 256, 32, 32, True, False, True): (1, 8, 5, 4), + (1024, 1024, 256, 64, 64, False, True, True): (1, 4, 1, 16), + (1024, 1024, 256, 64, 64, True, False, True): (4, 4, 4, 4), + (1024, 1024, 256, 128, 128, False, True, True): (1, 2, 1, 8), + (1024, 1024, 256, 128, 128, True, False, True): (1, 2, 3, 8), + (1024, 1024, 256, 256, 256, False, True, True): (1, 1, 1, 32), + (1024, 1024, 256, 256, 256, True, False, True): (1, 1, 1, 32), + (1024, 1024, 512, 32, 32, False, True, True): (5, 16, 1, 4), + (1024, 1024, 512, 32, 32, True, False, True): (2, 8, 4, 2), + (1024, 1024, 512, 64, 64, False, True, True): (4, 8, 1, 16), + (1024, 1024, 512, 64, 64, True, False, True): (1, 4, 3, 4), + (1024, 1024, 512, 128, 128, False, True, True): (3, 4, 1, 16), + (1024, 1024, 512, 128, 128, True, False, True): (1, 4, 2, 4), + (1024, 1024, 512, 256, 256, False, True, True): (1, 2, 1, 32), + (1024, 1024, 512, 256, 256, True, False, True): (1, 2, 1, 32), + (1024, 1024, 1024, 32, 32, False, True, True): (1, 16, 1, 8), + (1024, 1024, 1024, 32, 32, True, False, True): (1, 8, 3, 4), + (1024, 1024, 1024, 64, 64, False, True, True): (3, 16, 1, 8), + (1024, 1024, 1024, 64, 64, True, False, True): (1, 16, 3, 2), + (1024, 1024, 1024, 128, 128, False, True, True): (1, 8, 1, 16), + (1024, 1024, 1024, 128, 128, True, 
False, True): (2, 8, 3, 8), + (1024, 1024, 1024, 256, 256, False, True, True): (1, 4, 1, 32), + (1024, 1024, 1024, 256, 256, True, False, True): (2, 4, 1, 32), + (1024, 1024, 2048, 32, 32, False, True, True): (2, 32, 1, 8), + (1024, 1024, 2048, 32, 32, True, False, True): (3, 16, 1, 4), + (1024, 1024, 2048, 64, 64, False, True, True): (1, 32, 1, 8), + (1024, 1024, 2048, 64, 64, True, False, True): (3, 32, 1, 4), + (1024, 1024, 2048, 128, 128, False, True, True): (4, 16, 1, 16), + (1024, 1024, 2048, 128, 128, True, False, True): (1, 16, 3, 4), + (1024, 1024, 2048, 256, 256, False, True, True): (1, 8, 1, 32), + (1024, 1024, 2048, 256, 256, True, False, True): (1, 8, 1, 32), + (1024, 1024, 4096, 32, 32, False, True, True): (4, 64, 1, 8), + (1024, 1024, 4096, 32, 32, True, False, True): (3, 32, 1, 4), + (1024, 1024, 4096, 64, 64, False, True, True): (3, 64, 1, 8), + (1024, 1024, 4096, 64, 64, True, False, True): (1, 32, 3, 2), + (1024, 1024, 4096, 128, 128, False, True, True): (4, 32, 1, 16), + (1024, 1024, 4096, 128, 128, True, False, True): (2, 32, 2, 4), + (1024, 1024, 4096, 256, 256, False, True, True): (1, 16, 1, 32), + (1024, 1024, 4096, 256, 256, True, False, True): (7, 16, 1, 32), + (1024, 1024, 8192, 32, 32, False, True, True): (1, 128, 1, 8), + (1024, 1024, 8192, 32, 32, True, False, True): (4, 64, 1, 4), + (1024, 1024, 8192, 64, 64, False, True, True): (2, 128, 1, 8), + (1024, 1024, 8192, 64, 64, True, False, True): (3, 32, 3, 4), + (1024, 1024, 8192, 128, 128, False, True, True): (4, 64, 1, 16), + (1024, 1024, 8192, 128, 128, True, False, True): (2, 64, 2, 4), + (1024, 1024, 8192, 256, 256, False, True, True): (1, 32, 1, 32), + (1024, 1024, 8192, 256, 256, True, False, True): (1, 32, 1, 32), + (1024, 1024, 16384, 32, 32, False, True, True): (1, 256, 1, 8), + (1024, 1024, 16384, 32, 32, True, False, True): (1, 128, 1, 4), + (1024, 1024, 16384, 64, 64, False, True, True): (1, 256, 1, 8), + (1024, 1024, 16384, 64, 64, True, False, True): (4, 64, 3, 4), + (1024, 1024, 16384, 128, 128, False, True, True): (4, 128, 1, 16), + (1024, 1024, 16384, 128, 128, True, False, True): (1, 128, 3, 4), + (1024, 1024, 16384, 256, 256, False, True, True): (1, 64, 1, 32), + (1024, 1024, 16384, 256, 256, True, False, True): (1, 64, 1, 32), + (1024, 1024, 32768, 32, 32, False, True, True): (1, 512, 1, 8), + (1024, 1024, 32768, 32, 32, True, False, True): (1, 256, 3, 2), + (1024, 1024, 32768, 64, 64, False, True, True): (1, 256, 1, 4), + (1024, 1024, 32768, 64, 64, True, False, True): (4, 128, 3, 4), + (1024, 1024, 32768, 128, 128, False, True, True): (4, 256, 1, 16), + (1024, 1024, 32768, 128, 128, True, False, True): (2, 256, 3, 4), + (1024, 1024, 32768, 256, 256, False, True, True): (1, 128, 1, 32), + (1024, 1024, 32768, 256, 256, True, False, True): (2, 128, 1, 32), + (1024, 1024, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), + (1024, 1024, 65536, 32, 32, True, False, True): (1, 512, 3, 2), + (1024, 1024, 65536, 64, 64, False, True, True): (1, 512, 1, 4), + (1024, 1024, 65536, 64, 64, True, False, True): (2, 256, 3, 4), + (1024, 1024, 65536, 128, 128, False, True, True): (4, 512, 1, 16), + (1024, 1024, 65536, 128, 128, True, False, True): (4, 512, 3, 4), + (1024, 1024, 65536, 256, 256, False, True, True): (1, 256, 1, 32), + (1024, 1024, 65536, 256, 256, True, False, True): (1, 256, 1, 32), + (1024, 1024, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), + (1024, 1024, 65792, 32, 32, True, False, True): (1, 514, 3, 2), + (1024, 1024, 65792, 64, 64, False, True, True): (2, 514, 1, 4), + (1024, 
1024, 65792, 64, 64, True, False, True): (4, 257, 3, 4), + (1024, 1024, 65792, 128, 128, False, True, True): (2, 514, 1, 16), + (1024, 1024, 65792, 128, 128, True, False, True): (2, 514, 2, 4), + (1024, 1024, 65792, 256, 256, False, True, True): (1, 257, 1, 32), + (1024, 1024, 65792, 256, 256, True, False, True): (1, 257, 1, 32), + (1024, 1024, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), + (1024, 1024, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), + (1024, 1024, 131072, 64, 64, False, True, True): (2, 1024, 1, 4), + (1024, 1024, 131072, 64, 64, True, False, True): (2, 512, 3, 4), + (1024, 1024, 131072, 128, 128, False, True, True): (4, 1024, 1, 16), + (1024, 1024, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), + (1024, 1024, 131072, 256, 256, False, True, True): (1, 512, 1, 32), + (1024, 1024, 131072, 256, 256, True, False, True): (1, 512, 1, 32), + (1280, 5120, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), + (1280, 5120, 65792, 32, 32, True, False, True): (1, 514, 3, 2), + (1280, 5120, 65792, 64, 64, False, True, True): (1, 1028, 1, 8), + (1280, 5120, 65792, 64, 64, True, False, True): (2, 257, 3, 4), + (1280, 5120, 65792, 128, 128, False, True, True): (2, 514, 1, 16), + (1280, 5120, 65792, 128, 128, True, False, True): (1, 514, 3, 4), + (1280, 5120, 65792, 256, 256, False, True, True): (1, 257, 1, 32), + (1280, 5120, 65792, 256, 256, True, False, True): (1, 257, 1, 32), + (1536, 1536, 256, 32, 32, False, True, True): (1, 8, 1, 4), + (1536, 1536, 256, 32, 32, True, False, True): (2, 8, 1, 8), + (1536, 1536, 256, 64, 64, False, True, True): (4, 4, 1, 16), + (1536, 1536, 256, 64, 64, True, False, True): (1, 4, 4, 4), + (1536, 1536, 256, 128, 128, False, True, True): (2, 2, 1, 16), + (1536, 1536, 256, 128, 128, True, False, True): (2, 2, 3, 4), + (1536, 1536, 256, 256, 256, False, True, True): (1, 1, 1, 32), + (1536, 1536, 256, 256, 256, True, False, True): (1, 1, 1, 32), + (1536, 1536, 512, 32, 32, False, True, True): (1, 8, 1, 8), + (1536, 1536, 512, 32, 32, True, False, True): (3, 4, 4, 4), + (1536, 1536, 512, 64, 64, False, True, True): (3, 8, 1, 16), + (1536, 1536, 512, 64, 64, True, False, True): (1, 4, 3, 4), + (1536, 1536, 512, 128, 128, False, True, True): (1, 4, 1, 16), + (1536, 1536, 512, 128, 128, True, False, True): (2, 4, 4, 4), + (1536, 1536, 512, 256, 256, False, True, True): (1, 2, 1, 32), + (1536, 1536, 512, 256, 256, True, False, True): (1, 2, 1, 32), + (1536, 1536, 1024, 32, 32, False, True, True): (4, 16, 1, 8), + (1536, 1536, 1024, 32, 32, True, False, True): (2, 8, 1, 4), + (1536, 1536, 1024, 64, 64, False, True, True): (2, 16, 1, 16), + (1536, 1536, 1024, 64, 64, True, False, True): (2, 4, 3, 4), + (1536, 1536, 1024, 128, 128, False, True, True): (3, 8, 1, 32), + (1536, 1536, 1024, 128, 128, True, False, True): (4, 8, 3, 4), + (1536, 1536, 1024, 256, 256, False, True, True): (1, 4, 1, 32), + (1536, 1536, 1024, 256, 256, True, False, True): (1, 4, 1, 32), + (1536, 1536, 2048, 32, 32, False, True, True): (1, 32, 1, 8), + (1536, 1536, 2048, 32, 32, True, False, True): (1, 16, 1, 4), + (1536, 1536, 2048, 64, 64, False, True, True): (1, 32, 1, 8), + (1536, 1536, 2048, 64, 64, True, False, True): (1, 16, 2, 2), + (1536, 1536, 2048, 128, 128, False, True, True): (2, 16, 1, 16), + (1536, 1536, 2048, 128, 128, True, False, True): (4, 16, 2, 4), + (1536, 1536, 2048, 256, 256, False, True, True): (1, 8, 1, 32), + (1536, 1536, 2048, 256, 256, True, False, True): (1, 8, 1, 32), + (1536, 1536, 4096, 32, 32, False, True, True): (1, 64, 1, 8), + (1536, 
1536, 4096, 32, 32, True, False, True): (1, 32, 1, 4), + (1536, 1536, 4096, 64, 64, False, True, True): (3, 64, 1, 8), + (1536, 1536, 4096, 64, 64, True, False, True): (1, 32, 3, 2), + (1536, 1536, 4096, 128, 128, False, True, True): (1, 32, 1, 8), + (1536, 1536, 4096, 128, 128, True, False, True): (2, 32, 2, 4), + (1536, 1536, 4096, 256, 256, False, True, True): (1, 16, 1, 32), + (1536, 1536, 4096, 256, 256, True, False, True): (2, 16, 1, 32), + (1536, 1536, 8192, 32, 32, False, True, True): (1, 128, 1, 8), + (1536, 1536, 8192, 32, 32, True, False, True): (1, 64, 1, 4), + (1536, 1536, 8192, 64, 64, False, True, True): (3, 128, 1, 8), + (1536, 1536, 8192, 64, 64, True, False, True): (1, 64, 3, 2), + (1536, 1536, 8192, 128, 128, False, True, True): (1, 64, 1, 8), + (1536, 1536, 8192, 128, 128, True, False, True): (1, 64, 2, 4), + (1536, 1536, 8192, 256, 256, False, True, True): (1, 32, 1, 32), + (1536, 1536, 8192, 256, 256, True, False, True): (2, 32, 1, 32), + (1536, 1536, 16384, 32, 32, False, True, True): (1, 256, 1, 8), + (1536, 1536, 16384, 32, 32, True, False, True): (1, 128, 3, 2), + (1536, 1536, 16384, 64, 64, False, True, True): (2, 128, 1, 4), + (1536, 1536, 16384, 64, 64, True, False, True): (2, 64, 3, 4), + (1536, 1536, 16384, 128, 128, False, True, True): (1, 128, 1, 8), + (1536, 1536, 16384, 128, 128, True, False, True): (2, 128, 2, 4), + (1536, 1536, 16384, 256, 256, False, True, True): (1, 64, 1, 32), + (1536, 1536, 16384, 256, 256, True, False, True): (2, 64, 1, 32), + (1536, 1536, 32768, 32, 32, False, True, True): (1, 512, 1, 8), + (1536, 1536, 32768, 32, 32, True, False, True): (1, 256, 3, 2), + (1536, 1536, 32768, 64, 64, False, True, True): (1, 256, 1, 4), + (1536, 1536, 32768, 64, 64, True, False, True): (3, 128, 3, 4), + (1536, 1536, 32768, 128, 128, False, True, True): (1, 256, 1, 8), + (1536, 1536, 32768, 128, 128, True, False, True): (1, 256, 2, 4), + (1536, 1536, 32768, 256, 256, False, True, True): (1, 128, 1, 32), + (1536, 1536, 32768, 256, 256, True, False, True): (2, 128, 1, 32), + (1536, 1536, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), + (1536, 1536, 65536, 32, 32, True, False, True): (1, 512, 3, 2), + (1536, 1536, 65536, 64, 64, False, True, True): (1, 512, 1, 4), + (1536, 1536, 65536, 64, 64, True, False, True): (1, 512, 3, 2), + (1536, 1536, 65536, 128, 128, False, True, True): (1, 512, 1, 8), + (1536, 1536, 65536, 128, 128, True, False, True): (1, 512, 3, 4), + (1536, 1536, 65536, 256, 256, False, True, True): (1, 256, 1, 32), + (1536, 1536, 65536, 256, 256, True, False, True): (2, 256, 1, 32), + (1536, 1536, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), + (1536, 1536, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), + (1536, 1536, 131072, 64, 64, False, True, True): (3, 1024, 1, 4), + (1536, 1536, 131072, 64, 64, True, False, True): (3, 512, 3, 4), + (1536, 1536, 131072, 128, 128, False, True, True): (1, 1024, 1, 8), + (1536, 1536, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), + (1536, 1536, 131072, 256, 256, False, True, True): (1, 512, 1, 32), + (1536, 1536, 131072, 256, 256, True, False, True): (2, 512, 1, 32), + (2048, 2048, 256, 32, 32, False, True, True): (3, 8, 1, 4), + (2048, 2048, 256, 32, 32, True, False, True): (1, 4, 4, 2), + (2048, 2048, 256, 64, 64, False, True, True): (2, 4, 1, 16), + (2048, 2048, 256, 64, 64, True, False, True): (1, 2, 3, 4), + (2048, 2048, 256, 128, 128, False, True, True): (1, 2, 1, 8), + (2048, 2048, 256, 128, 128, True, False, True): (1, 2, 4, 4), + (2048, 2048, 256, 256, 256, False, 
True, True): (1, 1, 1, 32), + (2048, 2048, 256, 256, 256, True, False, True): (1, 1, 1, 32), + (2048, 2048, 512, 32, 32, False, True, True): (3, 8, 1, 8), + (2048, 2048, 512, 32, 32, True, False, True): (4, 4, 3, 2), + (2048, 2048, 512, 64, 64, False, True, True): (1, 8, 1, 8), + (2048, 2048, 512, 64, 64, True, False, True): (1, 8, 3, 4), + (2048, 2048, 512, 128, 128, False, True, True): (1, 4, 1, 8), + (2048, 2048, 512, 128, 128, True, False, True): (1, 4, 4, 4), + (2048, 2048, 512, 256, 256, False, True, True): (1, 2, 1, 32), + (2048, 2048, 512, 256, 256, True, False, True): (2, 2, 1, 32), + (2048, 2048, 1024, 32, 32, False, True, True): (1, 16, 1, 8), + (2048, 2048, 1024, 32, 32, True, False, True): (3, 8, 1, 4), + (2048, 2048, 1024, 64, 64, False, True, True): (4, 16, 1, 8), + (2048, 2048, 1024, 64, 64, True, False, True): (1, 8, 3, 2), + (2048, 2048, 1024, 128, 128, False, True, True): (4, 8, 1, 16), + (2048, 2048, 1024, 128, 128, True, False, True): (2, 8, 2, 4), + (2048, 2048, 1024, 256, 256, False, True, True): (1, 4, 1, 32), + (2048, 2048, 1024, 256, 256, True, False, True): (3, 4, 1, 32), + (2048, 2048, 2048, 32, 32, False, True, True): (1, 32, 1, 8), + (2048, 2048, 2048, 32, 32, True, False, True): (1, 16, 1, 4), + (2048, 2048, 2048, 64, 64, False, True, True): (1, 32, 1, 8), + (2048, 2048, 2048, 64, 64, True, False, True): (1, 16, 3, 2), + (2048, 2048, 2048, 128, 128, False, True, True): (4, 16, 1, 16), + (2048, 2048, 2048, 128, 128, True, False, True): (2, 16, 2, 4), + (2048, 2048, 2048, 256, 256, False, True, True): (1, 8, 1, 32), + (2048, 2048, 2048, 256, 256, True, False, True): (1, 8, 1, 32), + (2048, 2048, 4096, 32, 32, False, True, True): (1, 64, 1, 8), + (2048, 2048, 4096, 32, 32, True, False, True): (1, 32, 1, 4), + (2048, 2048, 4096, 64, 64, False, True, True): (4, 64, 1, 8), + (2048, 2048, 4096, 64, 64, True, False, True): (2, 16, 3, 4), + (2048, 2048, 4096, 128, 128, False, True, True): (4, 32, 1, 8), + (2048, 2048, 4096, 128, 128, True, False, True): (1, 32, 2, 4), + (2048, 2048, 4096, 256, 256, False, True, True): (1, 16, 1, 32), + (2048, 2048, 4096, 256, 256, True, False, True): (4, 16, 1, 32), + (2048, 2048, 8192, 32, 32, False, True, True): (1, 128, 1, 8), + (2048, 2048, 8192, 32, 32, True, False, True): (1, 64, 1, 4), + (2048, 2048, 8192, 64, 64, False, True, True): (2, 64, 1, 4), + (2048, 2048, 8192, 64, 64, True, False, True): (2, 32, 3, 4), + (2048, 2048, 8192, 128, 128, False, True, True): (4, 64, 1, 8), + (2048, 2048, 8192, 128, 128, True, False, True): (2, 64, 2, 4), + (2048, 2048, 8192, 256, 256, False, True, True): (1, 32, 1, 32), + (2048, 2048, 8192, 256, 256, True, False, True): (4, 32, 1, 32), + (2048, 2048, 16384, 32, 32, False, True, True): (1, 256, 1, 8), + (2048, 2048, 16384, 32, 32, True, False, True): (1, 128, 3, 2), + (2048, 2048, 16384, 64, 64, False, True, True): (2, 128, 1, 4), + (2048, 2048, 16384, 64, 64, True, False, True): (2, 64, 3, 4), + (2048, 2048, 16384, 128, 128, False, True, True): (1, 128, 1, 8), + (2048, 2048, 16384, 128, 128, True, False, True): (2, 128, 2, 4), + (2048, 2048, 16384, 256, 256, False, True, True): (1, 64, 1, 32), + (2048, 2048, 16384, 256, 256, True, False, True): (4, 64, 1, 32), + (2048, 2048, 32768, 32, 32, False, True, True): (1, 512, 1, 8), + (2048, 2048, 32768, 32, 32, True, False, True): (1, 256, 3, 2), + (2048, 2048, 32768, 64, 64, False, True, True): (2, 256, 1, 4), + (2048, 2048, 32768, 64, 64, True, False, True): (2, 128, 3, 4), + (2048, 2048, 32768, 128, 128, False, True, True): (1, 256, 1, 8), + 
(2048, 2048, 32768, 128, 128, True, False, True): (2, 256, 2, 4), + (2048, 2048, 32768, 256, 256, False, True, True): (1, 128, 1, 32), + (2048, 2048, 32768, 256, 256, True, False, True): (4, 128, 1, 32), + (2048, 2048, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), + (2048, 2048, 65536, 32, 32, True, False, True): (1, 512, 3, 2), + (2048, 2048, 65536, 64, 64, False, True, True): (1, 512, 1, 4), + (2048, 2048, 65536, 64, 64, True, False, True): (2, 256, 3, 4), + (2048, 2048, 65536, 128, 128, False, True, True): (1, 512, 1, 8), + (2048, 2048, 65536, 128, 128, True, False, True): (1, 512, 2, 4), + (2048, 2048, 65536, 256, 256, False, True, True): (1, 256, 1, 32), + (2048, 2048, 65536, 256, 256, True, False, True): (4, 256, 1, 32), + (2048, 2048, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), + (2048, 2048, 65792, 32, 32, True, False, True): (1, 514, 3, 2), + (2048, 2048, 65792, 64, 64, False, True, True): (1, 514, 1, 4), + (2048, 2048, 65792, 64, 64, True, False, True): (2, 257, 3, 4), + (2048, 2048, 65792, 128, 128, False, True, True): (1, 514, 1, 8), + (2048, 2048, 65792, 128, 128, True, False, True): (1, 514, 2, 4), + (2048, 2048, 65792, 256, 256, False, True, True): (1, 257, 1, 32), + (2048, 2048, 65792, 256, 256, True, False, True): (1, 257, 1, 32), + (2048, 2048, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), + (2048, 2048, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), + (2048, 2048, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), + (2048, 2048, 131072, 64, 64, True, False, True): (2, 512, 3, 4), + (2048, 2048, 131072, 128, 128, False, True, True): (1, 1024, 1, 8), + (2048, 2048, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), + (2048, 2048, 131072, 256, 256, False, True, True): (1, 512, 1, 32), + (2048, 2048, 131072, 256, 256, True, False, True): (4, 512, 1, 32), + (3072, 768, 256, 32, 32, False, True, True): (5, 4, 1, 8), + (3072, 768, 256, 32, 32, True, False, True): (2, 2, 4, 4), + (3072, 768, 256, 64, 64, False, True, True): (1, 4, 1, 16), + (3072, 768, 256, 64, 64, True, False, True): (2, 2, 3, 4), + (3072, 768, 256, 128, 128, False, True, True): (5, 2, 1, 16), + (3072, 768, 256, 128, 128, True, False, True): (1, 2, 5, 4), + (3072, 768, 256, 256, 256, False, True, True): (1, 1, 1, 32), + (3072, 768, 256, 256, 256, True, False, True): (1, 1, 1, 32), + (3072, 768, 512, 32, 32, False, True, True): (1, 8, 1, 8), + (3072, 768, 512, 32, 32, True, False, True): (5, 4, 1, 4), + (3072, 768, 512, 64, 64, False, True, True): (1, 8, 1, 8), + (3072, 768, 512, 64, 64, True, False, True): (3, 2, 3, 4), + (3072, 768, 512, 128, 128, False, True, True): (3, 4, 1, 32), + (3072, 768, 512, 128, 128, True, False, True): (2, 4, 3, 4), + (3072, 768, 512, 256, 256, False, True, True): (1, 2, 1, 32), + (3072, 768, 512, 256, 256, True, False, True): (2, 2, 1, 32), + (3072, 768, 1024, 32, 32, False, True, True): (2, 16, 1, 8), + (3072, 768, 1024, 32, 32, True, False, True): (3, 8, 1, 4), + (3072, 768, 1024, 64, 64, False, True, True): (4, 16, 1, 8), + (3072, 768, 1024, 64, 64, True, False, True): (1, 8, 3, 2), + (3072, 768, 1024, 128, 128, False, True, True): (2, 8, 1, 32), + (3072, 768, 1024, 128, 128, True, False, True): (3, 8, 2, 4), + (3072, 768, 1024, 256, 256, False, True, True): (1, 4, 1, 32), + (3072, 768, 1024, 256, 256, True, False, True): (4, 4, 1, 32), + (3072, 768, 2048, 32, 32, False, True, True): (1, 32, 1, 8), + (3072, 768, 2048, 32, 32, True, False, True): (1, 16, 1, 4), + (3072, 768, 2048, 64, 64, False, True, True): (2, 32, 1, 8), + (3072, 768, 2048, 64, 
64, True, False, True): (2, 8, 3, 4), + (3072, 768, 2048, 128, 128, False, True, True): (2, 16, 1, 16), + (3072, 768, 2048, 128, 128, True, False, True): (2, 16, 1, 4), + (3072, 768, 2048, 256, 256, False, True, True): (1, 8, 1, 32), + (3072, 768, 2048, 256, 256, True, False, True): (2, 8, 1, 32), + (3072, 768, 4096, 32, 32, False, True, True): (1, 64, 1, 8), + (3072, 768, 4096, 32, 32, True, False, True): (1, 32, 1, 2), + (3072, 768, 4096, 64, 64, False, True, True): (2, 64, 1, 8), + (3072, 768, 4096, 64, 64, True, False, True): (2, 32, 2, 2), + (3072, 768, 4096, 128, 128, False, True, True): (1, 32, 1, 8), + (3072, 768, 4096, 128, 128, True, False, True): (2, 32, 2, 4), + (3072, 768, 4096, 256, 256, False, True, True): (1, 16, 1, 32), + (3072, 768, 4096, 256, 256, True, False, True): (4, 16, 1, 32), + (3072, 768, 8192, 32, 32, False, True, True): (1, 128, 1, 8), + (3072, 768, 8192, 32, 32, True, False, True): (3, 64, 1, 2), + (3072, 768, 8192, 64, 64, False, True, True): (1, 128, 1, 8), + (3072, 768, 8192, 64, 64, True, False, True): (2, 64, 2, 2), + (3072, 768, 8192, 128, 128, False, True, True): (1, 64, 1, 8), + (3072, 768, 8192, 128, 128, True, False, True): (2, 64, 2, 4), + (3072, 768, 8192, 256, 256, False, True, True): (1, 32, 1, 32), + (3072, 768, 8192, 256, 256, True, False, True): (4, 32, 1, 32), + (3072, 768, 16384, 32, 32, False, True, True): (1, 256, 1, 8), + (3072, 768, 16384, 32, 32, True, False, True): (1, 128, 1, 2), + (3072, 768, 16384, 64, 64, False, True, True): (2, 128, 1, 4), + (3072, 768, 16384, 64, 64, True, False, True): (1, 128, 2, 2), + (3072, 768, 16384, 128, 128, False, True, True): (1, 128, 1, 8), + (3072, 768, 16384, 128, 128, True, False, True): (1, 128, 1, 4), + (3072, 768, 16384, 256, 256, False, True, True): (1, 64, 1, 32), + (3072, 768, 16384, 256, 256, True, False, True): (4, 64, 1, 32), + (3072, 768, 32768, 32, 32, False, True, True): (1, 512, 1, 8), + (3072, 768, 32768, 32, 32, True, False, True): (1, 256, 1, 2), + (3072, 768, 32768, 64, 64, False, True, True): (1, 256, 1, 4), + (3072, 768, 32768, 64, 64, True, False, True): (2, 256, 2, 2), + (3072, 768, 32768, 128, 128, False, True, True): (1, 256, 1, 8), + (3072, 768, 32768, 128, 128, True, False, True): (2, 256, 1, 4), + (3072, 768, 32768, 256, 256, False, True, True): (1, 128, 1, 32), + (3072, 768, 32768, 256, 256, True, False, True): (4, 128, 1, 32), + (3072, 768, 50432, 32, 32, False, True, True): (1, 788, 1, 8), + (3072, 768, 50432, 32, 32, True, False, True): (1, 394, 1, 2), + (3072, 768, 50432, 64, 64, False, True, True): (2, 394, 1, 4), + (3072, 768, 50432, 64, 64, True, False, True): (2, 394, 2, 2), + (3072, 768, 50432, 128, 128, False, True, True): (1, 394, 1, 8), + (3072, 768, 50432, 128, 128, True, False, True): (2, 394, 1, 4), + (3072, 768, 50432, 256, 256, False, True, True): (1, 197, 1, 32), + (3072, 768, 50432, 256, 256, True, False, True): (1, 197, 1, 32), + (3072, 768, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), + (3072, 768, 65536, 32, 32, True, False, True): (1, 512, 1, 2), + (3072, 768, 65536, 64, 64, False, True, True): (1, 512, 1, 4), + (3072, 768, 65536, 64, 64, True, False, True): (2, 512, 2, 2), + (3072, 768, 65536, 128, 128, False, True, True): (1, 512, 1, 8), + (3072, 768, 65536, 128, 128, True, False, True): (2, 512, 1, 4), + (3072, 768, 65536, 256, 256, False, True, True): (1, 256, 1, 32), + (3072, 768, 65536, 256, 256, True, False, True): (4, 256, 1, 32), + (3072, 768, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), + (3072, 768, 131072, 32, 32, True, 
False, True): (1, 1024, 1, 2), + (3072, 768, 131072, 64, 64, False, True, True): (2, 1024, 1, 4), + (3072, 768, 131072, 64, 64, True, False, True): (2, 1024, 2, 2), + (3072, 768, 131072, 128, 128, False, True, True): (1, 1024, 1, 8), + (3072, 768, 131072, 128, 128, True, False, True): (2, 1024, 1, 4), + (3072, 768, 131072, 256, 256, False, True, True): (1, 512, 1, 32), + (3072, 768, 131072, 256, 256, True, False, True): (4, 512, 1, 32), + (3072, 3072, 256, 32, 32, False, True, True): (1, 4, 1, 8), + (3072, 3072, 256, 32, 32, True, False, True): (2, 2, 5, 4), + (3072, 3072, 256, 64, 64, False, True, True): (2, 4, 1, 16), + (3072, 3072, 256, 64, 64, True, False, True): (3, 2, 3, 4), + (3072, 3072, 256, 128, 128, False, True, True): (1, 2, 1, 8), + (3072, 3072, 256, 128, 128, True, False, True): (1, 2, 5, 4), + (3072, 3072, 256, 256, 256, False, True, True): (1, 1, 1, 32), + (3072, 3072, 256, 256, 256, True, False, True): (1, 1, 1, 32), + (3072, 3072, 512, 32, 32, False, True, True): (1, 8, 1, 8), + (3072, 3072, 512, 32, 32, True, False, True): (3, 2, 3, 4), + (3072, 3072, 512, 64, 64, False, True, True): (1, 8, 1, 8), + (3072, 3072, 512, 64, 64, True, False, True): (3, 2, 3, 4), + (3072, 3072, 512, 128, 128, False, True, True): (2, 4, 1, 8), + (3072, 3072, 512, 128, 128, True, False, True): (2, 4, 4, 4), + (3072, 3072, 512, 256, 256, False, True, True): (1, 2, 1, 32), + (3072, 3072, 512, 256, 256, True, False, True): (1, 2, 1, 32), + (3072, 3072, 1024, 32, 32, False, True, True): (1, 16, 1, 8), + (3072, 3072, 1024, 32, 32, True, False, True): (3, 8, 3, 4), + (3072, 3072, 1024, 64, 64, False, True, True): (2, 16, 1, 8), + (3072, 3072, 1024, 64, 64, True, False, True): (2, 4, 3, 4), + (3072, 3072, 1024, 128, 128, False, True, True): (1, 8, 1, 8), + (3072, 3072, 1024, 128, 128, True, False, True): (3, 8, 2, 4), + (3072, 3072, 1024, 256, 256, False, True, True): (1, 4, 1, 32), + (3072, 3072, 1024, 256, 256, True, False, True): (3, 4, 1, 32), + (3072, 3072, 2048, 32, 32, False, True, True): (1, 32, 1, 8), + (3072, 3072, 2048, 32, 32, True, False, True): (1, 16, 1, 4), + (3072, 3072, 2048, 64, 64, False, True, True): (1, 32, 1, 8), + (3072, 3072, 2048, 64, 64, True, False, True): (1, 16, 3, 2), + (3072, 3072, 2048, 128, 128, False, True, True): (1, 16, 1, 8), + (3072, 3072, 2048, 128, 128, True, False, True): (2, 16, 2, 4), + (3072, 3072, 2048, 256, 256, False, True, True): (1, 8, 1, 32), + (3072, 3072, 2048, 256, 256, True, False, True): (3, 8, 1, 32), + (3072, 3072, 4096, 32, 32, False, True, True): (1, 64, 1, 8), + (3072, 3072, 4096, 32, 32, True, False, True): (1, 32, 1, 4), + (3072, 3072, 4096, 64, 64, False, True, True): (1, 64, 1, 8), + (3072, 3072, 4096, 64, 64, True, False, True): (3, 16, 3, 4), + (3072, 3072, 4096, 128, 128, False, True, True): (1, 32, 1, 8), + (3072, 3072, 4096, 128, 128, True, False, True): (2, 32, 2, 4), + (3072, 3072, 4096, 256, 256, False, True, True): (1, 16, 1, 32), + (3072, 3072, 4096, 256, 256, True, False, True): (2, 16, 1, 32), + (3072, 3072, 8192, 32, 32, False, True, True): (1, 128, 1, 8), + (3072, 3072, 8192, 32, 32, True, False, True): (1, 64, 1, 2), + (3072, 3072, 8192, 64, 64, False, True, True): (1, 64, 1, 4), + (3072, 3072, 8192, 64, 64, True, False, True): (1, 64, 3, 2), + (3072, 3072, 8192, 128, 128, False, True, True): (1, 64, 1, 8), + (3072, 3072, 8192, 128, 128, True, False, True): (2, 64, 2, 4), + (3072, 3072, 8192, 256, 256, False, True, True): (1, 32, 1, 32), + (3072, 3072, 8192, 256, 256, True, False, True): (4, 32, 1, 32), + (3072, 3072, 
16384, 32, 32, False, True, True): (1, 256, 1, 8), + (3072, 3072, 16384, 32, 32, True, False, True): (1, 128, 3, 2), + (3072, 3072, 16384, 64, 64, False, True, True): (1, 128, 1, 4), + (3072, 3072, 16384, 64, 64, True, False, True): (2, 64, 3, 4), + (3072, 3072, 16384, 128, 128, False, True, True): (1, 128, 1, 8), + (3072, 3072, 16384, 128, 128, True, False, True): (1, 128, 2, 4), + (3072, 3072, 16384, 256, 256, False, True, True): (1, 64, 1, 32), + (3072, 3072, 16384, 256, 256, True, False, True): (4, 64, 1, 32), + (3072, 3072, 32768, 32, 32, False, True, True): (1, 512, 1, 8), + (3072, 3072, 32768, 32, 32, True, False, True): (1, 256, 3, 2), + (3072, 3072, 32768, 64, 64, False, True, True): (1, 256, 1, 4), + (3072, 3072, 32768, 64, 64, True, False, True): (1, 256, 3, 2), + (3072, 3072, 32768, 128, 128, False, True, True): (1, 256, 1, 8), + (3072, 3072, 32768, 128, 128, True, False, True): (1, 256, 2, 4), + (3072, 3072, 32768, 256, 256, False, True, True): (1, 128, 1, 32), + (3072, 3072, 32768, 256, 256, True, False, True): (4, 128, 1, 32), + (3072, 3072, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), + (3072, 3072, 65536, 32, 32, True, False, True): (1, 512, 3, 2), + (3072, 3072, 65536, 64, 64, False, True, True): (1, 512, 1, 4), + (3072, 3072, 65536, 64, 64, True, False, True): (2, 256, 3, 4), + (3072, 3072, 65536, 128, 128, False, True, True): (1, 512, 1, 8), + (3072, 3072, 65536, 128, 128, True, False, True): (1, 512, 3, 4), + (3072, 3072, 65536, 256, 256, False, True, True): (1, 256, 1, 32), + (3072, 3072, 65536, 256, 256, True, False, True): (4, 256, 1, 32), + (3072, 3072, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), + (3072, 3072, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), + (3072, 3072, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), + (3072, 3072, 131072, 64, 64, True, False, True): (1, 1024, 3, 2), + (3072, 3072, 131072, 128, 128, False, True, True): (1, 1024, 1, 8), + (3072, 3072, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), + (3072, 3072, 131072, 256, 256, False, True, True): (1, 512, 1, 32), + (3072, 3072, 131072, 256, 256, True, False, True): (4, 512, 1, 32), + (4096, 4096, 256, 32, 32, False, True, True): (1, 4, 1, 8), + (4096, 4096, 256, 32, 32, True, False, True): (5, 2, 3, 4), + (4096, 4096, 256, 64, 64, False, True, True): (3, 4, 1, 8), + (4096, 4096, 256, 64, 64, True, False, True): (3, 4, 3, 2), + (4096, 4096, 256, 128, 128, False, True, True): (1, 2, 1, 8), + (4096, 4096, 256, 128, 128, True, False, True): (2, 2, 4, 4), + (4096, 4096, 256, 256, 256, False, True, True): (1, 1, 1, 32), + (4096, 4096, 256, 256, 256, True, False, True): (1, 1, 1, 32), + (4096, 4096, 512, 32, 32, False, True, True): (1, 8, 1, 8), + (4096, 4096, 512, 32, 32, True, False, True): (1, 4, 1, 4), + (4096, 4096, 512, 64, 64, False, True, True): (1, 8, 1, 8), + (4096, 4096, 512, 64, 64, True, False, True): (3, 4, 2, 2), + (4096, 4096, 512, 128, 128, False, True, True): (2, 4, 1, 8), + (4096, 4096, 512, 128, 128, True, False, True): (2, 4, 2, 4), + (4096, 4096, 512, 256, 256, False, True, True): (2, 2, 1, 32), + (4096, 4096, 512, 256, 256, True, False, True): (2, 2, 1, 32), + (4096, 4096, 1024, 32, 32, False, True, True): (4, 16, 1, 8), + (4096, 4096, 1024, 32, 32, True, False, True): (1, 8, 1, 4), + (4096, 4096, 1024, 64, 64, False, True, True): (1, 16, 1, 8), + (4096, 4096, 1024, 64, 64, True, False, True): (4, 4, 3, 4), + (4096, 4096, 1024, 128, 128, False, True, True): (2, 8, 1, 8), + (4096, 4096, 1024, 128, 128, True, False, True): (1, 8, 3, 4), + (4096, 
4096, 1024, 256, 256, False, True, True): (1, 4, 1, 32), + (4096, 4096, 1024, 256, 256, True, False, True): (6, 4, 1, 32), + (4096, 4096, 2048, 32, 32, False, True, True): (1, 32, 1, 8), + (4096, 4096, 2048, 32, 32, True, False, True): (1, 16, 1, 4), + (4096, 4096, 2048, 64, 64, False, True, True): (4, 32, 1, 8), + (4096, 4096, 2048, 64, 64, True, False, True): (4, 8, 3, 4), + (4096, 4096, 2048, 128, 128, False, True, True): (2, 16, 1, 8), + (4096, 4096, 2048, 128, 128, True, False, True): (1, 16, 3, 4), + (4096, 4096, 2048, 256, 256, False, True, True): (1, 8, 1, 32), + (4096, 4096, 2048, 256, 256, True, False, True): (4, 8, 1, 32), + (4096, 4096, 4096, 32, 32, False, True, True): (1, 64, 1, 8), + (4096, 4096, 4096, 32, 32, True, False, True): (1, 32, 1, 4), + (4096, 4096, 4096, 64, 64, False, True, True): (1, 64, 1, 8), + (4096, 4096, 4096, 64, 64, True, False, True): (1, 32, 3, 2), + (4096, 4096, 4096, 128, 128, False, True, True): (1, 32, 1, 8), + (4096, 4096, 4096, 128, 128, True, False, True): (2, 32, 3, 4), + (4096, 4096, 4096, 256, 256, False, True, True): (1, 16, 1, 32), + (4096, 4096, 4096, 256, 256, True, False, True): (4, 16, 1, 32), + (4096, 4096, 8192, 32, 32, False, True, True): (1, 128, 1, 8), + (4096, 4096, 8192, 32, 32, True, False, True): (1, 64, 1, 4), + (4096, 4096, 8192, 64, 64, False, True, True): (1, 128, 1, 8), + (4096, 4096, 8192, 64, 64, True, False, True): (1, 64, 3, 2), + (4096, 4096, 8192, 128, 128, False, True, True): (1, 64, 1, 8), + (4096, 4096, 8192, 128, 128, True, False, True): (1, 64, 3, 4), + (4096, 4096, 8192, 256, 256, False, True, True): (1, 32, 1, 32), + (4096, 4096, 8192, 256, 256, True, False, True): (4, 32, 1, 32), + (4096, 4096, 16384, 32, 32, False, True, True): (1, 256, 1, 8), + (4096, 4096, 16384, 32, 32, True, False, True): (1, 128, 3, 2), + (4096, 4096, 16384, 64, 64, False, True, True): (1, 128, 1, 4), + (4096, 4096, 16384, 64, 64, True, False, True): (4, 64, 3, 4), + (4096, 4096, 16384, 128, 128, False, True, True): (1, 128, 1, 8), + (4096, 4096, 16384, 128, 128, True, False, True): (1, 128, 3, 4), + (4096, 4096, 16384, 256, 256, False, True, True): (1, 64, 1, 32), + (4096, 4096, 16384, 256, 256, True, False, True): (4, 64, 1, 32), + (4096, 4096, 32768, 32, 32, False, True, True): (1, 512, 1, 8), + (4096, 4096, 32768, 32, 32, True, False, True): (1, 256, 3, 2), + (4096, 4096, 32768, 64, 64, False, True, True): (1, 256, 1, 4), + (4096, 4096, 32768, 64, 64, True, False, True): (1, 256, 3, 2), + (4096, 4096, 32768, 128, 128, False, True, True): (1, 256, 1, 8), + (4096, 4096, 32768, 128, 128, True, False, True): (1, 256, 3, 4), + (4096, 4096, 32768, 256, 256, False, True, True): (1, 128, 1, 32), + (4096, 4096, 32768, 256, 256, True, False, True): (4, 128, 1, 32), + (4096, 4096, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), + (4096, 4096, 65536, 32, 32, True, False, True): (1, 512, 3, 2), + (4096, 4096, 65536, 64, 64, False, True, True): (1, 512, 1, 4), + (4096, 4096, 65536, 64, 64, True, False, True): (4, 256, 3, 4), + (4096, 4096, 65536, 128, 128, False, True, True): (1, 512, 1, 8), + (4096, 4096, 65536, 128, 128, True, False, True): (1, 512, 3, 4), + (4096, 4096, 65536, 256, 256, False, True, True): (1, 256, 1, 32), + (4096, 4096, 65536, 256, 256, True, False, True): (4, 256, 1, 32), + (4096, 4096, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), + (4096, 4096, 65792, 32, 32, True, False, True): (1, 514, 3, 2), + (4096, 4096, 65792, 64, 64, False, True, True): (1, 1028, 1, 8), + (4096, 4096, 65792, 64, 64, True, False, True): (1, 
514, 3, 2), + (4096, 4096, 65792, 128, 128, False, True, True): (1, 514, 1, 8), + (4096, 4096, 65792, 128, 128, True, False, True): (1, 514, 2, 4), + (4096, 4096, 65792, 256, 256, False, True, True): (1, 257, 1, 32), + (4096, 4096, 65792, 256, 256, True, False, True): (1, 257, 1, 32), + (4096, 4096, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), + (4096, 4096, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), + (4096, 4096, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), + (4096, 4096, 131072, 64, 64, True, False, True): (1, 1024, 3, 2), + (4096, 4096, 131072, 128, 128, False, True, True): (1, 1024, 1, 8), + (4096, 4096, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), + (4096, 4096, 131072, 256, 256, False, True, True): (1, 512, 1, 32), + (4096, 4096, 131072, 256, 256, True, False, True): (4, 512, 1, 32), + (5120, 1280, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), + (5120, 1280, 65792, 32, 32, True, False, True): (1, 514, 1, 2), + (5120, 1280, 65792, 64, 64, False, True, True): (1, 514, 1, 4), + (5120, 1280, 65792, 64, 64, True, False, True): (1, 514, 2, 2), + (5120, 1280, 65792, 128, 128, False, True, True): (1, 514, 1, 8), + (5120, 1280, 65792, 128, 128, True, False, True): (1, 514, 2, 4), + (5120, 1280, 65792, 256, 256, False, True, True): (1, 257, 1, 32), + (5120, 1280, 65792, 256, 256, True, False, True): (1, 257, 1, 32), + (6144, 6144, 256, 32, 32, False, True, True): (2, 4, 1, 8), + (6144, 6144, 256, 32, 32, True, False, True): (2, 1, 4, 4), + (6144, 6144, 256, 64, 64, False, True, True): (1, 4, 1, 8), + (6144, 6144, 256, 64, 64, True, False, True): (5, 1, 3, 4), + (6144, 6144, 256, 128, 128, False, True, True): (1, 2, 1, 8), + (6144, 6144, 256, 128, 128, True, False, True): (1, 2, 3, 4), + (6144, 6144, 256, 256, 256, False, True, True): (1, 1, 1, 32), + (6144, 6144, 256, 256, 256, True, False, True): (1, 1, 1, 32), + (6144, 6144, 512, 32, 32, False, True, True): (1, 8, 1, 8), + (6144, 6144, 512, 32, 32, True, False, True): (1, 4, 4, 2), + (6144, 6144, 512, 64, 64, False, True, True): (2, 8, 1, 8), + (6144, 6144, 512, 64, 64, True, False, True): (2, 2, 3, 4), + (6144, 6144, 512, 128, 128, False, True, True): (3, 4, 1, 8), + (6144, 6144, 512, 128, 128, True, False, True): (2, 4, 3, 4), + (6144, 6144, 512, 256, 256, False, True, True): (1, 2, 1, 32), + (6144, 6144, 512, 256, 256, True, False, True): (2, 2, 1, 32), + (6144, 6144, 1024, 32, 32, False, True, True): (1, 16, 1, 8), + (6144, 6144, 1024, 32, 32, True, False, True): (1, 8, 1, 4), + (6144, 6144, 1024, 64, 64, False, True, True): (1, 16, 1, 8), + (6144, 6144, 1024, 64, 64, True, False, True): (4, 4, 3, 4), + (6144, 6144, 1024, 128, 128, False, True, True): (1, 8, 1, 8), + (6144, 6144, 1024, 128, 128, True, False, True): (3, 8, 3, 4), + (6144, 6144, 1024, 256, 256, False, True, True): (1, 4, 1, 32), + (6144, 6144, 1024, 256, 256, True, False, True): (1, 4, 1, 32), + (6144, 6144, 2048, 32, 32, False, True, True): (1, 32, 1, 8), + (6144, 6144, 2048, 32, 32, True, False, True): (1, 16, 1, 4), + (6144, 6144, 2048, 64, 64, False, True, True): (1, 32, 1, 8), + (6144, 6144, 2048, 64, 64, True, False, True): (4, 8, 3, 4), + (6144, 6144, 2048, 128, 128, False, True, True): (1, 16, 1, 8), + (6144, 6144, 2048, 128, 128, True, False, True): (3, 16, 3, 4), + (6144, 6144, 2048, 256, 256, False, True, True): (1, 8, 1, 32), + (6144, 6144, 2048, 256, 256, True, False, True): (4, 8, 1, 32), + (6144, 6144, 4096, 32, 32, False, True, True): (1, 64, 1, 8), + (6144, 6144, 4096, 32, 32, True, False, True): (1, 32, 1, 4), + 
(6144, 6144, 4096, 64, 64, False, True, True): (1, 64, 1, 8), + (6144, 6144, 4096, 64, 64, True, False, True): (4, 16, 3, 4), + (6144, 6144, 4096, 128, 128, False, True, True): (1, 32, 1, 8), + (6144, 6144, 4096, 128, 128, True, False, True): (4, 32, 3, 4), + (6144, 6144, 4096, 256, 256, False, True, True): (1, 16, 1, 32), + (6144, 6144, 4096, 256, 256, True, False, True): (4, 16, 1, 32), + (6144, 6144, 8192, 32, 32, False, True, True): (1, 128, 1, 8), + (6144, 6144, 8192, 32, 32, True, False, True): (1, 64, 1, 4), + (6144, 6144, 8192, 64, 64, False, True, True): (1, 128, 1, 8), + (6144, 6144, 8192, 64, 64, True, False, True): (4, 32, 3, 4), + (6144, 6144, 8192, 128, 128, False, True, True): (1, 64, 1, 8), + (6144, 6144, 8192, 128, 128, True, False, True): (1, 64, 3, 4), + (6144, 6144, 8192, 256, 256, False, True, True): (1, 32, 1, 32), + (6144, 6144, 8192, 256, 256, True, False, True): (4, 32, 1, 32), + (6144, 6144, 16384, 32, 32, False, True, True): (1, 256, 1, 8), + (6144, 6144, 16384, 32, 32, True, False, True): (1, 128, 1, 4), + (6144, 6144, 16384, 64, 64, False, True, True): (1, 256, 1, 8), + (6144, 6144, 16384, 64, 64, True, False, True): (4, 64, 3, 4), + (6144, 6144, 16384, 128, 128, False, True, True): (1, 128, 1, 8), + (6144, 6144, 16384, 128, 128, True, False, True): (4, 128, 3, 4), + (6144, 6144, 16384, 256, 256, False, True, True): (1, 64, 1, 32), + (6144, 6144, 16384, 256, 256, True, False, True): (4, 64, 1, 32), + (6144, 6144, 32768, 32, 32, False, True, True): (1, 512, 1, 8), + (6144, 6144, 32768, 32, 32, True, False, True): (1, 256, 1, 4), + (6144, 6144, 32768, 64, 64, False, True, True): (1, 512, 1, 8), + (6144, 6144, 32768, 64, 64, True, False, True): (4, 128, 3, 4), + (6144, 6144, 32768, 128, 128, False, True, True): (1, 256, 1, 8), + (6144, 6144, 32768, 128, 128, True, False, True): (1, 256, 3, 4), + (6144, 6144, 32768, 256, 256, False, True, True): (1, 128, 1, 32), + (6144, 6144, 32768, 256, 256, True, False, True): (4, 128, 1, 32), + (6144, 6144, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), + (6144, 6144, 65536, 32, 32, True, False, True): (1, 512, 1, 4), + (6144, 6144, 65536, 64, 64, False, True, True): (1, 1024, 1, 8), + (6144, 6144, 65536, 64, 64, True, False, True): (4, 256, 3, 4), + (6144, 6144, 65536, 128, 128, False, True, True): (1, 512, 1, 8), + (6144, 6144, 65536, 128, 128, True, False, True): (1, 512, 3, 4), + (6144, 6144, 65536, 256, 256, False, True, True): (1, 256, 1, 32), + (6144, 6144, 65536, 256, 256, True, False, True): (4, 256, 1, 32), + (6144, 6144, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), + (6144, 6144, 131072, 32, 32, True, False, True): (1, 1024, 1, 4), + (6144, 6144, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), + (6144, 6144, 131072, 64, 64, True, False, True): (4, 512, 3, 4), + (6144, 6144, 131072, 128, 128, False, True, True): (1, 1024, 1, 8), + (6144, 6144, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), + (6144, 6144, 131072, 256, 256, False, True, True): (1, 512, 1, 32), + (6144, 6144, 131072, 256, 256, True, False, True): (4, 512, 1, 32), + (8192, 8192, 256, 32, 32, False, True, True): (1, 4, 1, 8), + (8192, 8192, 256, 32, 32, True, False, True): (3, 2, 3, 4), + (8192, 8192, 256, 64, 64, False, True, True): (1, 4, 1, 4), + (8192, 8192, 256, 64, 64, True, False, True): (1, 4, 1, 4), + (8192, 8192, 256, 128, 128, False, True, True): (1, 2, 1, 8), + (8192, 8192, 256, 128, 128, True, False, True): (2, 2, 3, 4), + (8192, 8192, 256, 256, 256, False, True, True): (1, 1, 1, 32), + (8192, 8192, 256, 256, 256, 
True, False, True): (1, 1, 1, 32), + (8192, 8192, 512, 32, 32, False, True, True): (4, 8, 1, 8), + (8192, 8192, 512, 32, 32, True, False, True): (2, 4, 4, 2), + (8192, 8192, 512, 64, 64, False, True, True): (4, 4, 1, 4), + (8192, 8192, 512, 64, 64, True, False, True): (3, 2, 3, 4), + (8192, 8192, 512, 128, 128, False, True, True): (1, 4, 1, 8), + (8192, 8192, 512, 128, 128, True, False, True): (1, 4, 3, 4), + (8192, 8192, 512, 256, 256, False, True, True): (1, 2, 1, 32), + (8192, 8192, 512, 256, 256, True, False, True): (1, 2, 1, 32), + (8192, 8192, 1024, 32, 32, False, True, True): (4, 16, 1, 8), + (8192, 8192, 1024, 32, 32, True, False, True): (1, 8, 3, 2), + (8192, 8192, 1024, 64, 64, False, True, True): (4, 8, 1, 4), + (8192, 8192, 1024, 64, 64, True, False, True): (4, 4, 3, 4), + (8192, 8192, 1024, 128, 128, False, True, True): (1, 8, 1, 8), + (8192, 8192, 1024, 128, 128, True, False, True): (1, 8, 3, 4), + (8192, 8192, 1024, 256, 256, False, True, True): (1, 4, 1, 32), + (8192, 8192, 1024, 256, 256, True, False, True): (4, 4, 1, 32), + (8192, 8192, 2048, 32, 32, False, True, True): (4, 32, 1, 8), + (8192, 8192, 2048, 32, 32, True, False, True): (1, 16, 3, 2), + (8192, 8192, 2048, 64, 64, False, True, True): (4, 32, 1, 8), + (8192, 8192, 2048, 64, 64, True, False, True): (4, 8, 3, 4), + (8192, 8192, 2048, 128, 128, False, True, True): (4, 16, 1, 8), + (8192, 8192, 2048, 128, 128, True, False, True): (4, 16, 3, 4), + (8192, 8192, 2048, 256, 256, False, True, True): (1, 8, 1, 32), + (8192, 8192, 2048, 256, 256, True, False, True): (4, 8, 1, 32), + (8192, 8192, 4096, 32, 32, False, True, True): (4, 64, 1, 8), + (8192, 8192, 4096, 32, 32, True, False, True): (2, 32, 3, 2), + (8192, 8192, 4096, 64, 64, False, True, True): (4, 64, 1, 8), + (8192, 8192, 4096, 64, 64, True, False, True): (4, 16, 3, 4), + (8192, 8192, 4096, 128, 128, False, True, True): (4, 32, 1, 8), + (8192, 8192, 4096, 128, 128, True, False, True): (4, 32, 3, 4), + (8192, 8192, 4096, 256, 256, False, True, True): (1, 16, 1, 32), + (8192, 8192, 4096, 256, 256, True, False, True): (2, 16, 1, 32), + (8192, 8192, 8192, 32, 32, False, True, True): (4, 128, 1, 8), + (8192, 8192, 8192, 32, 32, True, False, True): (1, 64, 3, 2), + (8192, 8192, 8192, 64, 64, False, True, True): (4, 64, 1, 4), + (8192, 8192, 8192, 64, 64, True, False, True): (4, 32, 3, 4), + (8192, 8192, 8192, 128, 128, False, True, True): (4, 64, 1, 16), + (8192, 8192, 8192, 128, 128, True, False, True): (4, 64, 3, 4), + (8192, 8192, 8192, 256, 256, False, True, True): (1, 32, 1, 32), + (8192, 8192, 8192, 256, 256, True, False, True): (4, 32, 1, 32), + (8192, 8192, 16384, 32, 32, False, True, True): (4, 256, 1, 8), + (8192, 8192, 16384, 32, 32, True, False, True): (4, 128, 3, 2), + (8192, 8192, 16384, 64, 64, False, True, True): (4, 128, 1, 4), + (8192, 8192, 16384, 64, 64, True, False, True): (4, 64, 3, 4), + (8192, 8192, 16384, 128, 128, False, True, True): (4, 128, 1, 16), + (8192, 8192, 16384, 128, 128, True, False, True): (4, 128, 3, 4), + (8192, 8192, 16384, 256, 256, False, True, True): (1, 64, 1, 32), + (8192, 8192, 16384, 256, 256, True, False, True): (4, 64, 1, 32), + (8192, 8192, 32768, 32, 32, False, True, True): (4, 512, 1, 8), + (8192, 8192, 32768, 32, 32, True, False, True): (2, 256, 3, 2), + (8192, 8192, 32768, 64, 64, False, True, True): (4, 256, 1, 4), + (8192, 8192, 32768, 64, 64, True, False, True): (4, 128, 3, 4), + (8192, 8192, 32768, 128, 128, False, True, True): (4, 256, 1, 16), + (8192, 8192, 32768, 128, 128, True, False, True): (4, 256, 3, 
4), + (8192, 8192, 32768, 256, 256, False, True, True): (1, 128, 1, 32), + (8192, 8192, 32768, 256, 256, True, False, True): (4, 128, 1, 32), + (8192, 8192, 65536, 32, 32, False, True, True): (4, 1024, 1, 8), + (8192, 8192, 65536, 32, 32, True, False, True): (4, 512, 3, 2), + (8192, 8192, 65536, 64, 64, False, True, True): (4, 512, 1, 4), + (8192, 8192, 65536, 64, 64, True, False, True): (4, 256, 3, 4), + (8192, 8192, 65536, 128, 128, False, True, True): (4, 512, 1, 16), + (8192, 8192, 65536, 128, 128, True, False, True): (4, 512, 3, 4), + (8192, 8192, 65536, 256, 256, False, True, True): (1, 256, 1, 32), + (8192, 8192, 65536, 256, 256, True, False, True): (4, 256, 1, 32), + (8192, 8192, 65792, 32, 32, False, True, True): (4, 1028, 1, 8), + (8192, 8192, 65792, 32, 32, True, False, True): (1, 514, 3, 2), + (8192, 8192, 65792, 64, 64, False, True, True): (4, 1028, 1, 8), + (8192, 8192, 65792, 64, 64, True, False, True): (2, 257, 3, 4), + (8192, 8192, 65792, 128, 128, False, True, True): (4, 514, 1, 16), + (8192, 8192, 65792, 128, 128, True, False, True): (2, 514, 3, 4), + (8192, 8192, 65792, 256, 256, False, True, True): (1, 257, 1, 32), + (8192, 8192, 65792, 256, 256, True, False, True): (1, 257, 1, 32), + (8192, 8192, 131072, 32, 32, False, True, True): (4, 2048, 1, 8), + (8192, 8192, 131072, 32, 32, True, False, True): (4, 1024, 3, 2), + (8192, 8192, 131072, 64, 64, False, True, True): (4, 1024, 1, 4), + (8192, 8192, 131072, 64, 64, True, False, True): (4, 512, 3, 4), + (8192, 8192, 131072, 128, 128, False, True, True): (4, 1024, 1, 16), + (8192, 8192, 131072, 128, 128, True, False, True): (4, 1024, 3, 4), + (8192, 8192, 131072, 256, 256, False, True, True): (1, 512, 1, 32), + (8192, 8192, 131072, 256, 256, True, False, True): (4, 512, 1, 32), + (16384, 16384, 256, 32, 32, False, True, True): (4, 4, 1, 8), + (16384, 16384, 256, 32, 32, True, False, True): (2, 2, 4, 2), + (16384, 16384, 256, 64, 64, False, True, True): (2, 2, 1, 4), + (16384, 16384, 256, 64, 64, True, False, True): (5, 1, 3, 4), + (16384, 16384, 256, 128, 128, False, True, True): (6, 2, 1, 8), + (16384, 16384, 256, 128, 128, True, False, True): (6, 2, 3, 4), + (16384, 16384, 256, 256, 256, False, True, True): (1, 1, 1, 32), + (16384, 16384, 256, 256, 256, True, False, True): (1, 1, 1, 32), + (16384, 16384, 512, 32, 32, False, True, True): (4, 8, 1, 8), + (16384, 16384, 512, 32, 32, True, False, True): (1, 4, 4, 2), + (16384, 16384, 512, 64, 64, False, True, True): (4, 4, 1, 4), + (16384, 16384, 512, 64, 64, True, False, True): (2, 2, 3, 4), + (16384, 16384, 512, 128, 128, False, True, True): (4, 4, 1, 8), + (16384, 16384, 512, 128, 128, True, False, True): (4, 4, 3, 4), + (16384, 16384, 512, 256, 256, False, True, True): (1, 2, 1, 32), + (16384, 16384, 512, 256, 256, True, False, True): (2, 2, 1, 32), + (16384, 16384, 1024, 32, 32, False, True, True): (4, 16, 1, 8), + (16384, 16384, 1024, 32, 32, True, False, True): (1, 8, 3, 2), + (16384, 16384, 1024, 64, 64, False, True, True): (4, 8, 1, 4), + (16384, 16384, 1024, 64, 64, True, False, True): (4, 4, 3, 4), + (16384, 16384, 1024, 128, 128, False, True, True): (4, 4, 1, 8), + (16384, 16384, 1024, 128, 128, True, False, True): (4, 8, 3, 4), + (16384, 16384, 1024, 256, 256, False, True, True): (1, 4, 1, 32), + (16384, 16384, 1024, 256, 256, True, False, True): (4, 4, 1, 32), + (16384, 16384, 2048, 32, 32, False, True, True): (4, 32, 1, 8), + (16384, 16384, 2048, 32, 32, True, False, True): (2, 16, 3, 2), + (16384, 16384, 2048, 64, 64, False, True, True): (4, 16, 1, 4), + 
(16384, 16384, 2048, 64, 64, True, False, True): (4, 8, 3, 4), + (16384, 16384, 2048, 128, 128, False, True, True): (4, 16, 1, 8), + (16384, 16384, 2048, 128, 128, True, False, True): (4, 16, 3, 4), + (16384, 16384, 2048, 256, 256, False, True, True): (1, 8, 1, 32), + (16384, 16384, 2048, 256, 256, True, False, True): (4, 8, 1, 32), + (16384, 16384, 4096, 32, 32, False, True, True): (4, 64, 1, 8), + (16384, 16384, 4096, 32, 32, True, False, True): (2, 32, 3, 2), + (16384, 16384, 4096, 64, 64, False, True, True): (2, 32, 1, 4), + (16384, 16384, 4096, 64, 64, True, False, True): (4, 16, 3, 4), + (16384, 16384, 4096, 128, 128, False, True, True): (4, 32, 1, 8), + (16384, 16384, 4096, 128, 128, True, False, True): (4, 32, 3, 4), + (16384, 16384, 4096, 256, 256, False, True, True): (1, 16, 1, 32), + (16384, 16384, 4096, 256, 256, True, False, True): (4, 16, 1, 32), + (16384, 16384, 8192, 32, 32, False, True, True): (4, 128, 1, 8), + (16384, 16384, 8192, 32, 32, True, False, True): (2, 64, 3, 2), + (16384, 16384, 8192, 64, 64, False, True, True): (4, 64, 1, 4), + (16384, 16384, 8192, 64, 64, True, False, True): (4, 32, 3, 4), + (16384, 16384, 8192, 128, 128, False, True, True): (4, 64, 1, 16), + (16384, 16384, 8192, 128, 128, True, False, True): (4, 64, 3, 4), + (16384, 16384, 8192, 256, 256, False, True, True): (1, 32, 1, 32), + (16384, 16384, 8192, 256, 256, True, False, True): (4, 32, 1, 32), + (16384, 16384, 16384, 32, 32, False, True, True): (4, 256, 1, 8), + (16384, 16384, 16384, 32, 32, True, False, True): (2, 128, 3, 2), + (16384, 16384, 16384, 64, 64, False, True, True): (4, 128, 1, 4), + (16384, 16384, 16384, 64, 64, True, False, True): (4, 64, 3, 4), + (16384, 16384, 16384, 128, 128, False, True, True): (1, 64, 1, 8), + (16384, 16384, 16384, 128, 128, True, False, True): (4, 128, 3, 4), + (16384, 16384, 16384, 256, 256, False, True, True): (1, 64, 1, 32), + (16384, 16384, 16384, 256, 256, True, False, True): (4, 64, 1, 32), + (16384, 16384, 32768, 32, 32, False, True, True): (4, 512, 1, 8), + (16384, 16384, 32768, 32, 32, True, False, True): (1, 256, 3, 2), + (16384, 16384, 32768, 64, 64, False, True, True): (4, 256, 1, 4), + (16384, 16384, 32768, 64, 64, True, False, True): (4, 128, 3, 4), + (16384, 16384, 32768, 128, 128, False, True, True): (4, 256, 1, 16), + (16384, 16384, 32768, 128, 128, True, False, True): (4, 256, 3, 4), + (16384, 16384, 32768, 256, 256, False, True, True): (1, 128, 1, 32), + (16384, 16384, 32768, 256, 256, True, False, True): (4, 128, 1, 32), + (16384, 16384, 65536, 32, 32, False, True, True): (4, 1024, 1, 8), + (16384, 16384, 65536, 32, 32, True, False, True): (1, 512, 3, 2), + (16384, 16384, 65536, 64, 64, False, True, True): (2, 512, 1, 4), + (16384, 16384, 65536, 64, 64, True, False, True): (4, 256, 3, 4), + (16384, 16384, 65536, 128, 128, False, True, True): (4, 512, 1, 16), + (16384, 16384, 65536, 128, 128, True, False, True): (4, 512, 3, 4), + (16384, 16384, 65536, 256, 256, False, True, True): (1, 256, 1, 32), + (16384, 16384, 65536, 256, 256, True, False, True): (4, 256, 1, 32), + (16384, 16384, 65792, 32, 32, False, True, True): (4, 1028, 1, 8), + (16384, 16384, 65792, 32, 32, True, False, True): (1, 514, 3, 2), + (16384, 16384, 65792, 64, 64, False, True, True): (2, 514, 1, 4), + (16384, 16384, 65792, 64, 64, True, False, True): (2, 257, 3, 4), + (16384, 16384, 65792, 128, 128, False, True, True): (2, 514, 1, 16), + (16384, 16384, 65792, 128, 128, True, False, True): (2, 514, 3, 4), + (16384, 16384, 65792, 256, 256, False, True, True): (1, 257, 1, 
32), + (16384, 16384, 65792, 256, 256, True, False, True): (1, 257, 1, 32), + (16384, 16384, 131072, 32, 32, False, True, True): (4, 1024, 1, 8), + (16384, 16384, 131072, 32, 32, True, False, True): (4, 512, 3, 4), + (16384, 16384, 131072, 64, 64, False, True, True): (4, 1024, 1, 4), + (16384, 16384, 131072, 64, 64, True, False, True): (4, 1024, 3, 2), + (16384, 16384, 131072, 128, 128, False, True, True): (2, 1024, 3, 8), + (16384, 16384, 131072, 128, 128, True, False, True): (4, 1024, 3, 4), + (16384, 16384, 131072, 256, 256, False, True, True): (4, 512, 1, 32), + (16384, 16384, 131072, 256, 256, True, False, True): (4, 512, 1, 32), + (32768, 32768, 256, 32, 32, False, True, True): (4, 4, 1, 8), + (32768, 32768, 256, 32, 32, True, False, True): (1, 2, 4, 2), + (32768, 32768, 256, 64, 64, False, True, True): (2, 2, 1, 4), + (32768, 32768, 256, 64, 64, True, False, True): (2, 1, 3, 4), + (32768, 32768, 256, 128, 128, False, True, True): (4, 2, 1, 8), + (32768, 32768, 256, 128, 128, True, False, True): (4, 2, 3, 4), + (32768, 32768, 256, 256, 256, False, True, True): (1, 1, 1, 32), + (32768, 32768, 256, 256, 256, True, False, True): (1, 1, 1, 32), + (32768, 32768, 512, 32, 32, False, True, True): (4, 8, 1, 8), + (32768, 32768, 512, 32, 32, True, False, True): (1, 4, 3, 2), + (32768, 32768, 512, 64, 64, False, True, True): (4, 4, 1, 4), + (32768, 32768, 512, 64, 64, True, False, True): (4, 2, 3, 4), + (32768, 32768, 512, 128, 128, False, True, True): (1, 2, 1, 8), + (32768, 32768, 512, 128, 128, True, False, True): (4, 4, 3, 4), + (32768, 32768, 512, 256, 256, False, True, True): (1, 2, 1, 32), + (32768, 32768, 512, 256, 256, True, False, True): (2, 2, 1, 32), + (32768, 32768, 1024, 32, 32, False, True, True): (4, 16, 1, 8), + (32768, 32768, 1024, 32, 32, True, False, True): (1, 8, 4, 2), + (32768, 32768, 1024, 64, 64, False, True, True): (4, 8, 1, 4), + (32768, 32768, 1024, 64, 64, True, False, True): (4, 4, 3, 4), + (32768, 32768, 1024, 128, 128, False, True, True): (1, 4, 1, 8), + (32768, 32768, 1024, 128, 128, True, False, True): (4, 8, 3, 4), + (32768, 32768, 1024, 256, 256, False, True, True): (1, 4, 1, 32), + (32768, 32768, 1024, 256, 256, True, False, True): (1, 4, 1, 32), + (32768, 32768, 2048, 32, 32, False, True, True): (2, 32, 1, 8), + (32768, 32768, 2048, 32, 32, True, False, True): (1, 16, 4, 2), + (32768, 32768, 2048, 64, 64, False, True, True): (2, 16, 1, 4), + (32768, 32768, 2048, 64, 64, True, False, True): (4, 8, 3, 4), + (32768, 32768, 2048, 128, 128, False, True, True): (1, 8, 1, 8), + (32768, 32768, 2048, 128, 128, True, False, True): (4, 16, 3, 4), + (32768, 32768, 2048, 256, 256, False, True, True): (1, 8, 1, 32), + (32768, 32768, 2048, 256, 256, True, False, True): (4, 8, 1, 32), + (32768, 32768, 4096, 32, 32, False, True, True): (2, 64, 1, 8), + (32768, 32768, 4096, 32, 32, True, False, True): (2, 32, 3, 2), + (32768, 32768, 4096, 64, 64, False, True, True): (2, 32, 1, 4), + (32768, 32768, 4096, 64, 64, True, False, True): (2, 16, 3, 4), + (32768, 32768, 4096, 128, 128, False, True, True): (1, 16, 1, 8), + (32768, 32768, 4096, 128, 128, True, False, True): (2, 32, 3, 4), + (32768, 32768, 4096, 256, 256, False, True, True): (1, 16, 1, 32), + (32768, 32768, 4096, 256, 256, True, False, True): (4, 16, 1, 32), + (32768, 32768, 8192, 32, 32, False, True, True): (2, 128, 1, 8), + (32768, 32768, 8192, 32, 32, True, False, True): (2, 64, 3, 2), + (32768, 32768, 8192, 64, 64, False, True, True): (2, 64, 1, 4), + (32768, 32768, 8192, 64, 64, True, False, True): (2, 32, 3, 4), 
+ (32768, 32768, 8192, 128, 128, False, True, True): (1, 32, 1, 8), + (32768, 32768, 8192, 128, 128, True, False, True): (4, 64, 3, 4), + (32768, 32768, 8192, 256, 256, False, True, True): (1, 32, 1, 32), + (32768, 32768, 8192, 256, 256, True, False, True): (4, 32, 1, 32), + (32768, 32768, 16384, 32, 32, False, True, True): (2, 256, 1, 8), + (32768, 32768, 16384, 32, 32, True, False, True): (2, 128, 4, 2), + (32768, 32768, 16384, 64, 64, False, True, True): (2, 128, 1, 4), + (32768, 32768, 16384, 64, 64, True, False, True): (4, 64, 3, 4), + (32768, 32768, 16384, 128, 128, False, True, True): (1, 64, 1, 8), + (32768, 32768, 16384, 128, 128, True, False, True): (4, 128, 3, 4), + (32768, 32768, 16384, 256, 256, False, True, True): (1, 64, 1, 32), + (32768, 32768, 16384, 256, 256, True, False, True): (2, 64, 1, 32), + (32768, 32768, 32768, 32, 32, False, True, True): (2, 512, 1, 8), + (32768, 32768, 32768, 32, 32, True, False, True): (4, 256, 3, 2), + (32768, 32768, 32768, 64, 64, False, True, True): (1, 256, 1, 4), + (32768, 32768, 32768, 64, 64, True, False, True): (2, 128, 3, 4), + (32768, 32768, 32768, 128, 128, False, True, True): (1, 128, 1, 8), + (32768, 32768, 32768, 128, 128, True, False, True): (2, 256, 3, 4), + (32768, 32768, 32768, 256, 256, False, True, True): (1, 128, 1, 32), + (32768, 32768, 32768, 256, 256, True, False, True): (1, 128, 1, 32), + (32768, 32768, 65536, 32, 32, False, True, True): (2, 512, 1, 8), + (32768, 32768, 65536, 32, 32, True, False, True): (3, 512, 4, 2), + (32768, 32768, 65536, 64, 64, False, True, True): (1, 512, 1, 4), + (32768, 32768, 65536, 64, 64, True, False, True): (2, 512, 3, 2), + (32768, 32768, 65536, 128, 128, False, True, True): (1, 256, 1, 8), + (32768, 32768, 65536, 128, 128, True, False, True): (2, 512, 3, 4), + (32768, 32768, 65536, 256, 256, False, True, True): (1, 256, 1, 32), + (32768, 32768, 65536, 256, 256, True, False, True): (1, 256, 1, 32), + }, + ("_int_bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.int8, 0.56)): { + (192, 192, 256, 64, 64, False, True, True): (3, 4, 3, 32), + (192, 192, 256, 64, 64, True, False, True): (1, 4, 3, 4), + (192, 192, 512, 64, 64, False, True, True): (1, 8, 1, 16), + (192, 192, 512, 64, 64, True, False, True): (1, 8, 5, 4), + (192, 192, 1024, 64, 64, False, True, True): (4, 16, 1, 16), + (192, 192, 1024, 64, 64, True, False, True): (3, 16, 3, 4), + (192, 192, 2048, 64, 64, False, True, True): (5, 32, 1, 8), + (192, 192, 2048, 64, 64, True, False, True): (2, 32, 4, 4), + (192, 192, 4096, 64, 64, False, True, True): (4, 64, 1, 16), + (192, 192, 4096, 64, 64, True, False, True): (1, 32, 4, 4), + (192, 192, 8192, 64, 64, False, True, True): (2, 128, 1, 8), + (192, 192, 8192, 64, 64, True, False, True): (3, 64, 1, 4), + (192, 192, 16384, 64, 64, False, True, True): (2, 256, 1, 8), + (192, 192, 16384, 64, 64, True, False, True): (1, 128, 3, 2), + (192, 192, 32768, 64, 64, False, True, True): (2, 512, 1, 8), + (192, 192, 32768, 64, 64, True, False, True): (3, 128, 1, 4), + (192, 192, 65536, 64, 64, False, True, True): (3, 1024, 1, 8), + (192, 192, 65536, 64, 64, True, False, True): (1, 512, 3, 4), + (192, 192, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), + (192, 192, 131072, 64, 64, True, False, True): (1, 512, 1, 4), + (384, 384, 256, 128, 128, False, True, True): (4, 2, 1, 16), + (384, 384, 256, 128, 128, True, False, True): (1, 2, 3, 4), + (384, 384, 512, 128, 128, False, True, True): (2, 4, 1, 16), + (384, 384, 512, 128, 128, True, False, True): (2, 4, 3, 4), + (384, 384, 1024, 128, 128, 
False, True, True): (3, 8, 1, 32), + (384, 384, 1024, 128, 128, True, False, True): (3, 8, 3, 4), + (384, 384, 2048, 128, 128, False, True, True): (3, 16, 1, 32), + (384, 384, 2048, 128, 128, True, False, True): (2, 16, 3, 4), + (384, 384, 4096, 128, 128, False, True, True): (3, 32, 1, 32), + (384, 384, 4096, 128, 128, True, False, True): (3, 32, 3, 4), + (384, 384, 8192, 128, 128, False, True, True): (2, 64, 1, 32), + (384, 384, 8192, 128, 128, True, False, True): (4, 64, 1, 4), + (384, 384, 16384, 128, 128, False, True, True): (2, 128, 1, 32), + (384, 384, 16384, 128, 128, True, False, True): (2, 128, 1, 4), + (384, 384, 32768, 128, 128, False, True, True): (3, 256, 1, 16), + (384, 384, 32768, 128, 128, True, False, True): (1, 256, 1, 4), + (384, 384, 65536, 128, 128, False, True, True): (4, 512, 1, 16), + (384, 384, 65536, 128, 128, True, False, True): (1, 512, 1, 4), + (384, 384, 131072, 128, 128, False, True, True): (4, 1024, 1, 16), + (384, 384, 131072, 128, 128, True, False, True): (1, 1024, 1, 4), + (768, 768, 256, 256, 256, False, True, True): (1, 1, 1, 32), + (768, 768, 256, 256, 256, True, False, True): (3, 1, 1, 32), + (768, 768, 512, 256, 256, False, True, True): (1, 2, 1, 32), + (768, 768, 512, 256, 256, True, False, True): (1, 2, 1, 32), + (768, 768, 1024, 256, 256, False, True, True): (1, 4, 1, 32), + (768, 768, 1024, 256, 256, True, False, True): (2, 4, 1, 32), + (768, 768, 2048, 256, 256, False, True, True): (1, 8, 1, 32), + (768, 768, 2048, 256, 256, True, False, True): (2, 8, 1, 32), + (768, 768, 4096, 256, 256, False, True, True): (1, 16, 1, 32), + (768, 768, 4096, 256, 256, True, False, True): (1, 16, 1, 32), + (768, 768, 8192, 256, 256, False, True, True): (1, 32, 1, 32), + (768, 768, 8192, 256, 256, True, False, True): (2, 32, 1, 32), + (768, 768, 16384, 256, 256, False, True, True): (1, 64, 1, 32), + (768, 768, 16384, 256, 256, True, False, True): (7, 64, 1, 32), + (768, 768, 32768, 256, 256, False, True, True): (1, 128, 1, 32), + (768, 768, 32768, 256, 256, True, False, True): (1, 128, 1, 32), + (768, 768, 65536, 256, 256, False, True, True): (1, 256, 1, 32), + (768, 768, 65536, 256, 256, True, False, True): (1, 256, 1, 32), + (768, 768, 131072, 256, 256, False, True, True): (1, 512, 1, 32), + (768, 768, 131072, 256, 256, True, False, True): (1, 512, 1, 32), + }, + ("_int_bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.int8, 1.0)): { + (256, 256, 256, 256, 256, False, True, True): (2, 1, 1, 4), + (256, 256, 256, 256, 256, True, False, True): (2, 1, 2, 1), + (256, 256, 512, 256, 256, False, True, True): (2, 1, 1, 2), + (256, 256, 512, 256, 256, True, False, True): (2, 2, 2, 8), + (256, 256, 1024, 256, 256, False, True, True): (1, 4, 1, 4), + (256, 256, 1024, 256, 256, True, False, True): (1, 2, 2, 4), + (256, 256, 2048, 256, 256, False, True, True): (1, 4, 1, 2), + (256, 256, 2048, 256, 256, True, False, True): (1, 8, 1, 2), + (256, 256, 4096, 256, 256, False, True, True): (1, 16, 1, 4), + (256, 256, 4096, 256, 256, True, False, True): (1, 16, 1, 2), + (256, 256, 8192, 256, 256, False, True, True): (1, 16, 3, 4), + (256, 256, 8192, 256, 256, True, False, True): (1, 8, 1, 4), + (256, 256, 16384, 256, 256, False, True, True): (2, 16, 1, 8), + (256, 256, 16384, 256, 256, True, False, True): (1, 32, 1, 2), + (256, 256, 32768, 256, 256, False, True, True): (1, 128, 1, 8), + (256, 256, 32768, 256, 256, True, False, True): (1, 128, 1, 4), + (256, 256, 65536, 256, 256, False, True, True): (1, 4, 1, 1), + (256, 256, 65536, 256, 256, True, False, True): (1, 128, 1, 4), 
+ (256, 256, 65792, 256, 256, False, True, True): (1, 128, 2, 16), + (256, 256, 65792, 256, 256, True, False, True): (1, 16, 3, 4), + (256, 256, 131072, 256, 256, False, True, True): (1, 512, 1, 4), + (256, 256, 131072, 256, 256, True, False, True): (1, 512, 1, 2), + }, + ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.bfloat16, 0.5)): { + (16, 16, 16, 16, 16, False, False, False): (2, 1, 1, 2), + (16, 16, 16, 16, 16, False, False, True): (1, 1, 1, 4), + (16, 16, 16, 16, 16, False, True, False): (1, 1, 3, 16), + (16, 16, 16, 16, 16, False, True, True): (1, 1, 1, 8), + (16, 16, 16, 16, 16, True, False, False): (2, 1, 1, 8), + (16, 16, 16, 16, 16, True, False, True): (1, 1, 1, 8), + (16, 16, 32, 16, 16, False, False, False): (1, 2, 1, 8), + (16, 16, 32, 16, 16, False, False, True): (1, 2, 2, 4), + (16, 16, 32, 16, 16, False, True, False): (1, 1, 2, 4), + (16, 16, 32, 16, 16, False, True, True): (1, 1, 2, 4), + (16, 16, 32, 16, 16, True, False, False): (1, 1, 2, 4), + (16, 16, 32, 16, 16, True, False, True): (2, 2, 1, 2), + (16, 16, 64, 16, 16, False, False, False): (1, 4, 2, 4), + (16, 16, 64, 16, 16, False, False, True): (1, 2, 1, 2), + (16, 16, 64, 16, 16, False, True, False): (2, 1, 1, 2), + (16, 16, 64, 16, 16, False, True, True): (1, 4, 1, 8), + (16, 16, 64, 16, 16, True, False, False): (1, 4, 1, 1), + (16, 16, 64, 16, 16, True, False, True): (1, 4, 2, 4), + (16, 32, 16, 16, 16, False, False, False): (1, 1, 2, 2), + (16, 32, 16, 16, 16, False, False, True): (1, 1, 1, 4), + (16, 32, 16, 16, 16, False, True, False): (1, 1, 1, 2), + (16, 32, 16, 16, 16, False, True, True): (1, 1, 1, 1), + (16, 32, 16, 16, 16, True, False, False): (1, 1, 1, 2), + (16, 32, 16, 16, 16, True, False, True): (2, 1, 1, 2), + (16, 32, 16, 16, 32, False, False, False): (1, 1, 1, 4), + (16, 32, 16, 16, 32, False, False, True): (1, 1, 1, 8), + (16, 32, 16, 16, 32, False, True, False): (1, 1, 1, 8), + (16, 32, 16, 16, 32, False, True, True): (1, 1, 2, 4), + (16, 32, 16, 16, 32, True, False, False): (1, 1, 1, 2), + (16, 32, 16, 16, 32, True, False, True): (1, 1, 1, 1), + (16, 32, 32, 16, 16, False, False, False): (2, 2, 1, 4), + (16, 32, 32, 16, 16, False, False, True): (2, 2, 1, 2), + (16, 32, 32, 16, 16, False, True, False): (1, 1, 2, 8), + (16, 32, 32, 16, 16, False, True, True): (1, 2, 1, 1), + (16, 32, 32, 16, 16, True, False, False): (1, 1, 1, 8), + (16, 32, 32, 16, 16, True, False, True): (1, 2, 1, 4), + (16, 32, 32, 16, 32, False, False, False): (1, 1, 2, 8), + (16, 32, 32, 16, 32, False, False, True): (2, 1, 1, 8), + (16, 32, 32, 16, 32, False, True, False): (1, 1, 1, 4), + (16, 32, 32, 16, 32, False, True, True): (1, 1, 1, 4), + (16, 32, 32, 16, 32, True, False, False): (1, 2, 1, 8), + (16, 32, 32, 16, 32, True, False, True): (1, 1, 1, 4), + (16, 32, 64, 16, 16, False, False, False): (1, 4, 3, 8), + (16, 32, 64, 16, 16, False, False, True): (1, 4, 1, 4), + (16, 32, 64, 16, 16, False, True, False): (1, 4, 1, 4), + (16, 32, 64, 16, 16, False, True, True): (2, 4, 1, 4), + (16, 32, 64, 16, 16, True, False, False): (1, 2, 1, 4), + (16, 32, 64, 16, 16, True, False, True): (1, 2, 1, 4), + (16, 32, 64, 16, 32, False, False, False): (1, 4, 1, 8), + (16, 32, 64, 16, 32, False, False, True): (1, 4, 1, 4), + (16, 32, 64, 16, 32, False, True, False): (1, 4, 1, 2), + (16, 32, 64, 16, 32, False, True, True): (1, 2, 1, 4), + (16, 32, 64, 16, 32, True, False, False): (1, 2, 1, 4), + (16, 32, 64, 16, 32, True, False, True): (1, 2, 1, 2), + (16, 64, 16, 16, 32, False, False, False): (1, 1, 1, 2), + (16, 64, 16, 16, 32, 
False, False, True): (1, 1, 2, 2), + (16, 64, 16, 16, 32, False, True, False): (1, 1, 2, 8), + (16, 64, 16, 16, 32, False, True, True): (1, 1, 1, 4), + (16, 64, 16, 16, 32, True, False, False): (1, 1, 1, 8), + (16, 64, 16, 16, 32, True, False, True): (1, 1, 1, 4), + (16, 64, 32, 16, 32, False, False, False): (1, 2, 1, 2), + (16, 64, 32, 16, 32, False, False, True): (1, 2, 1, 4), + (16, 64, 32, 16, 32, False, True, False): (1, 2, 1, 4), + (16, 64, 32, 16, 32, False, True, True): (2, 2, 1, 4), + (16, 64, 32, 16, 32, True, False, False): (1, 2, 1, 4), + (16, 64, 32, 16, 32, True, False, True): (1, 2, 1, 8), + (16, 64, 64, 16, 32, False, False, False): (1, 2, 1, 4), + (16, 64, 64, 16, 32, False, False, True): (1, 4, 2, 2), + (16, 64, 64, 16, 32, False, True, False): (1, 1, 1, 4), + (16, 64, 64, 16, 32, False, True, True): (1, 4, 1, 2), + (16, 64, 64, 16, 32, True, False, False): (1, 2, 1, 4), + (16, 64, 64, 16, 32, True, False, True): (1, 4, 1, 4), + (32, 16, 16, 16, 16, False, False, False): (1, 1, 1, 8), + (32, 16, 16, 16, 16, False, False, True): (1, 1, 2, 4), + (32, 16, 16, 16, 16, False, True, False): (1, 1, 1, 4), + (32, 16, 16, 16, 16, False, True, True): (1, 1, 2, 4), + (32, 16, 16, 16, 16, True, False, False): (1, 1, 1, 2), + (32, 16, 16, 16, 16, True, False, True): (1, 1, 1, 4), + (32, 16, 32, 16, 16, False, False, False): (1, 1, 1, 4), + (32, 16, 32, 16, 16, False, False, True): (2, 2, 1, 4), + (32, 16, 32, 16, 16, False, True, False): (1, 2, 2, 2), + (32, 16, 32, 16, 16, False, True, True): (2, 2, 1, 4), + (32, 16, 32, 16, 16, True, False, False): (1, 2, 2, 8), + (32, 16, 32, 16, 16, True, False, True): (1, 2, 1, 2), + (32, 16, 64, 16, 16, False, False, False): (1, 4, 1, 4), + (32, 16, 64, 16, 16, False, False, True): (1, 4, 2, 4), + (32, 16, 64, 16, 16, False, True, False): (1, 2, 2, 2), + (32, 16, 64, 16, 16, False, True, True): (3, 4, 1, 4), + (32, 16, 64, 16, 16, True, False, False): (1, 2, 1, 2), + (32, 16, 64, 16, 16, True, False, True): (1, 2, 1, 4), + (32, 32, 16, 16, 16, False, False, False): (1, 1, 3, 4), + (32, 32, 16, 16, 16, False, False, True): (1, 1, 1, 4), + (32, 32, 16, 16, 16, False, True, False): (1, 1, 1, 2), + (32, 32, 16, 16, 16, False, True, True): (1, 1, 1, 4), + (32, 32, 16, 16, 16, True, False, False): (1, 1, 1, 4), + (32, 32, 16, 16, 16, True, False, True): (1, 1, 2, 2), + (32, 32, 16, 16, 32, False, False, False): (2, 1, 1, 4), + (32, 32, 16, 16, 32, False, False, True): (1, 1, 1, 4), + (32, 32, 16, 16, 32, False, True, False): (1, 1, 1, 4), + (32, 32, 16, 16, 32, False, True, True): (3, 1, 2, 4), + (32, 32, 16, 16, 32, True, False, False): (1, 1, 1, 4), + (32, 32, 16, 16, 32, True, False, True): (1, 1, 1, 4), + (32, 32, 16, 32, 32, False, False, False): (1, 1, 1, 8), + (32, 32, 16, 32, 32, False, False, True): (1, 1, 1, 4), + (32, 32, 16, 32, 32, False, True, False): (1, 1, 2, 1), + (32, 32, 16, 32, 32, False, True, True): (2, 1, 2, 2), + (32, 32, 16, 32, 32, True, False, False): (1, 1, 1, 8), + (32, 32, 16, 32, 32, True, False, True): (2, 1, 3, 4), + (32, 32, 32, 16, 16, False, False, False): (1, 2, 1, 4), + (32, 32, 32, 16, 16, False, False, True): (2, 2, 1, 4), + (32, 32, 32, 16, 16, False, True, False): (1, 1, 1, 8), + (32, 32, 32, 16, 16, False, True, True): (2, 2, 1, 4), + (32, 32, 32, 16, 16, True, False, False): (1, 1, 1, 4), + (32, 32, 32, 16, 16, True, False, True): (2, 2, 2, 4), + (32, 32, 32, 16, 32, False, False, False): (2, 2, 1, 8), + (32, 32, 32, 16, 32, False, False, True): (1, 2, 1, 2), + (32, 32, 32, 16, 32, False, True, False): (1, 
2, 1, 4), + (32, 32, 32, 16, 32, False, True, True): (1, 2, 1, 4), + (32, 32, 32, 16, 32, True, False, False): (1, 2, 1, 4), + (32, 32, 32, 16, 32, True, False, True): (1, 2, 1, 2), + (32, 32, 32, 32, 32, False, False, False): (1, 1, 3, 8), + (32, 32, 32, 32, 32, False, False, True): (1, 1, 1, 8), + (32, 32, 32, 32, 32, False, True, False): (2, 1, 3, 4), + (32, 32, 32, 32, 32, False, True, True): (2, 1, 1, 2), + (32, 32, 32, 32, 32, True, False, False): (1, 1, 1, 2), + (32, 32, 32, 32, 32, True, False, True): (4, 1, 1, 1), + (32, 32, 64, 16, 16, False, False, False): (1, 4, 1, 4), + (32, 32, 64, 16, 16, False, False, True): (1, 4, 1, 4), + (32, 32, 64, 16, 16, False, True, False): (1, 2, 1, 8), + (32, 32, 64, 16, 16, False, True, True): (1, 4, 1, 2), + (32, 32, 64, 16, 16, True, False, False): (2, 4, 1, 2), + (32, 32, 64, 16, 16, True, False, True): (1, 4, 1, 2), + (32, 32, 64, 16, 32, False, False, False): (1, 2, 1, 8), + (32, 32, 64, 16, 32, False, False, True): (1, 4, 2, 2), + (32, 32, 64, 16, 32, False, True, False): (1, 2, 1, 4), + (32, 32, 64, 16, 32, False, True, True): (1, 4, 1, 4), + (32, 32, 64, 16, 32, True, False, False): (1, 4, 2, 2), + (32, 32, 64, 16, 32, True, False, True): (3, 4, 2, 2), + (32, 32, 64, 32, 32, False, False, False): (2, 2, 1, 4), + (32, 32, 64, 32, 32, False, False, True): (1, 2, 1, 4), + (32, 32, 64, 32, 32, False, True, False): (1, 1, 1, 8), + (32, 32, 64, 32, 32, False, True, True): (1, 1, 1, 4), + (32, 32, 64, 32, 32, True, False, False): (1, 2, 1, 2), + (32, 32, 64, 32, 32, True, False, True): (3, 2, 1, 8), + (32, 64, 16, 16, 32, False, False, False): (1, 1, 2, 2), + (32, 64, 16, 16, 32, False, False, True): (1, 1, 1, 4), + (32, 64, 16, 16, 32, False, True, False): (1, 1, 2, 4), + (32, 64, 16, 16, 32, False, True, True): (1, 1, 1, 4), + (32, 64, 16, 16, 32, True, False, False): (1, 1, 1, 2), + (32, 64, 16, 16, 32, True, False, True): (2, 1, 2, 2), + (32, 64, 16, 32, 32, False, False, False): (1, 1, 1, 1), + (32, 64, 16, 32, 32, False, False, True): (2, 1, 1, 4), + (32, 64, 16, 32, 32, False, True, False): (1, 1, 1, 1), + (32, 64, 16, 32, 32, False, True, True): (1, 1, 2, 2), + (32, 64, 16, 32, 32, True, False, False): (1, 1, 2, 4), + (32, 64, 16, 32, 32, True, False, True): (1, 1, 1, 4), + (32, 64, 32, 16, 32, False, False, False): (2, 2, 1, 4), + (32, 64, 32, 16, 32, False, False, True): (1, 2, 1, 4), + (32, 64, 32, 16, 32, False, True, False): (1, 1, 1, 4), + (32, 64, 32, 16, 32, False, True, True): (2, 2, 3, 4), + (32, 64, 32, 16, 32, True, False, False): (1, 1, 1, 2), + (32, 64, 32, 16, 32, True, False, True): (1, 2, 1, 2), + (32, 64, 32, 32, 32, False, False, False): (1, 1, 1, 2), + (32, 64, 32, 32, 32, False, False, True): (2, 1, 1, 4), + (32, 64, 32, 32, 32, False, True, False): (1, 1, 1, 8), + (32, 64, 32, 32, 32, False, True, True): (1, 1, 2, 4), + (32, 64, 32, 32, 32, True, False, False): (2, 1, 1, 4), + (32, 64, 32, 32, 32, True, False, True): (1, 1, 2, 4), + (32, 64, 64, 16, 32, False, False, False): (1, 4, 1, 4), + (32, 64, 64, 16, 32, False, False, True): (1, 4, 2, 4), + (32, 64, 64, 16, 32, False, True, False): (1, 4, 2, 2), + (32, 64, 64, 16, 32, False, True, True): (1, 4, 1, 4), + (32, 64, 64, 16, 32, True, False, False): (1, 4, 1, 8), + (32, 64, 64, 16, 32, True, False, True): (1, 4, 2, 1), + (32, 64, 64, 32, 32, False, False, False): (1, 1, 1, 4), + (32, 64, 64, 32, 32, False, False, True): (2, 2, 1, 4), + (32, 64, 64, 32, 32, False, True, False): (1, 1, 1, 4), + (32, 64, 64, 32, 32, False, True, True): (2, 2, 1, 4), + (32, 64, 64, 
32, 32, True, False, False): (1, 2, 2, 4), + (32, 64, 64, 32, 32, True, False, True): (2, 2, 3, 4), + (64, 32, 16, 32, 32, False, False, False): (1, 1, 1, 4), + (64, 32, 16, 32, 32, False, False, True): (1, 1, 1, 4), + (64, 32, 16, 32, 32, False, True, False): (1, 1, 1, 8), + (64, 32, 16, 32, 32, False, True, True): (1, 1, 1, 4), + (64, 32, 16, 32, 32, True, False, False): (1, 1, 1, 16), + (64, 32, 16, 32, 32, True, False, True): (2, 1, 1, 4), + (64, 32, 32, 32, 32, False, False, False): (1, 1, 3, 4), + (64, 32, 32, 32, 32, False, False, True): (2, 1, 1, 4), + (64, 32, 32, 32, 32, False, True, False): (1, 1, 2, 4), + (64, 32, 32, 32, 32, False, True, True): (2, 1, 1, 4), + (64, 32, 32, 32, 32, True, False, False): (2, 1, 1, 16), + (64, 32, 32, 32, 32, True, False, True): (2, 1, 1, 4), + (64, 32, 64, 32, 32, False, False, False): (1, 2, 1, 4), + (64, 32, 64, 32, 32, False, False, True): (2, 2, 1, 4), + (64, 32, 64, 32, 32, False, True, False): (1, 1, 1, 4), + (64, 32, 64, 32, 32, False, True, True): (2, 2, 1, 4), + (64, 32, 64, 32, 32, True, False, False): (1, 2, 1, 8), + (64, 32, 64, 32, 32, True, False, True): (2, 2, 3, 4), + (64, 64, 16, 32, 32, False, False, False): (1, 1, 2, 16), + (64, 64, 16, 32, 32, False, False, True): (1, 1, 3, 4), + (64, 64, 16, 32, 32, False, True, False): (1, 1, 1, 2), + (64, 64, 16, 32, 32, False, True, True): (2, 1, 1, 4), + (64, 64, 16, 32, 32, True, False, False): (2, 1, 3, 2), + (64, 64, 16, 32, 32, True, False, True): (1, 1, 2, 4), + (64, 64, 32, 32, 32, False, False, False): (1, 1, 1, 8), + (64, 64, 32, 32, 32, False, False, True): (2, 1, 2, 4), + (64, 64, 32, 32, 32, False, True, False): (2, 1, 1, 4), + (64, 64, 32, 32, 32, False, True, True): (1, 1, 2, 4), + (64, 64, 32, 32, 32, True, False, False): (2, 1, 1, 4), + (64, 64, 32, 32, 32, True, False, True): (1, 1, 2, 4), + (64, 64, 64, 32, 32, False, False, False): (1, 2, 2, 4), + (64, 64, 64, 32, 32, False, False, True): (1, 2, 2, 2), + (64, 64, 64, 32, 32, False, True, False): (1, 2, 1, 2), + (64, 64, 64, 32, 32, False, True, True): (1, 2, 1, 4), + (64, 64, 64, 32, 32, True, False, False): (1, 2, 1, 4), + (64, 64, 64, 32, 32, True, False, True): (1, 2, 1, 4), + (192, 192, 256, 16, 16, False, True, True): (1, 8, 5, 4), + (192, 192, 256, 16, 16, True, False, True): (2, 8, 5, 2), + (192, 192, 256, 32, 32, False, True, True): (1, 8, 6, 4), + (192, 192, 256, 32, 32, True, False, True): (3, 8, 5, 2), + (192, 192, 512, 16, 16, False, True, True): (1, 16, 5, 2), + (192, 192, 512, 16, 16, True, False, True): (1, 8, 4, 2), + (192, 192, 512, 32, 32, False, True, True): (2, 16, 5, 4), + (192, 192, 512, 32, 32, True, False, True): (2, 8, 5, 2), + (192, 192, 1024, 16, 16, False, True, True): (1, 16, 3, 4), + (192, 192, 1024, 16, 16, True, False, True): (1, 16, 6, 2), + (192, 192, 1024, 32, 32, False, True, True): (1, 32, 3, 4), + (192, 192, 1024, 32, 32, True, False, True): (1, 16, 4, 2), + (192, 192, 2048, 16, 16, False, True, True): (1, 32, 1, 4), + (192, 192, 2048, 16, 16, True, False, True): (4, 32, 4, 2), + (192, 192, 2048, 32, 32, False, True, True): (1, 16, 3, 8), + (192, 192, 2048, 32, 32, True, False, True): (2, 32, 4, 2), + (192, 192, 4096, 16, 16, False, True, True): (2, 64, 1, 4), + (192, 192, 4096, 16, 16, True, False, True): (1, 32, 3, 2), + (192, 192, 4096, 32, 32, False, True, True): (1, 64, 1, 8), + (192, 192, 4096, 32, 32, True, False, True): (2, 32, 4, 4), + (192, 192, 8192, 16, 16, False, True, True): (1, 64, 1, 4), + (192, 192, 8192, 16, 16, True, False, True): (2, 32, 3, 1), + (192, 192, 8192, 
32, 32, False, True, True): (3, 128, 1, 4), + (192, 192, 8192, 32, 32, True, False, True): (1, 64, 3, 4), + (192, 192, 16384, 16, 16, False, True, True): (1, 128, 1, 4), + (192, 192, 16384, 16, 16, True, False, True): (4, 64, 3, 1), + (192, 192, 16384, 32, 32, False, True, True): (1, 128, 1, 4), + (192, 192, 16384, 32, 32, True, False, True): (1, 64, 3, 4), + (192, 192, 32768, 16, 16, False, True, True): (2, 256, 1, 2), + (192, 192, 32768, 16, 16, True, False, True): (1, 128, 3, 4), + (192, 192, 32768, 32, 32, False, True, True): (2, 256, 1, 4), + (192, 192, 32768, 32, 32, True, False, True): (4, 128, 3, 4), + (192, 192, 65536, 16, 16, False, True, True): (2, 512, 1, 2), + (192, 192, 65536, 16, 16, True, False, True): (2, 256, 3, 2), + (192, 192, 65536, 32, 32, False, True, True): (2, 512, 1, 4), + (192, 192, 65536, 32, 32, True, False, True): (1, 256, 3, 4), + (192, 192, 131072, 16, 16, False, True, True): (4, 1024, 1, 2), + (192, 192, 131072, 16, 16, True, False, True): (3, 512, 3, 2), + (192, 192, 131072, 32, 32, False, True, True): (1, 1024, 1, 2), + (192, 192, 131072, 32, 32, True, False, True): (1, 512, 3, 4), + (256, 256, 256, 16, 16, False, True, True): (4, 8, 5, 1), + (256, 256, 256, 16, 16, True, False, True): (2, 8, 4, 2), + (256, 256, 256, 32, 32, False, True, True): (2, 8, 5, 2), + (256, 256, 256, 32, 32, True, False, True): (1, 8, 5, 4), + (256, 256, 256, 64, 64, False, True, True): (2, 4, 4, 4), + (256, 256, 256, 64, 64, True, False, True): (1, 4, 3, 4), + (256, 256, 256, 128, 128, False, True, True): (4, 2, 2, 8), + (256, 256, 256, 128, 128, True, False, True): (1, 2, 2, 8), + (256, 256, 512, 16, 16, False, True, True): (1, 16, 5, 1), + (256, 256, 512, 16, 16, True, False, True): (3, 16, 3, 2), + (256, 256, 512, 32, 32, False, True, True): (2, 8, 5, 2), + (256, 256, 512, 32, 32, True, False, True): (1, 16, 4, 4), + (256, 256, 512, 64, 64, False, True, True): (1, 8, 4, 4), + (256, 256, 512, 64, 64, True, False, True): (3, 8, 3, 4), + (256, 256, 512, 128, 128, False, True, True): (1, 4, 2, 8), + (256, 256, 512, 128, 128, True, False, True): (1, 4, 2, 8), + (256, 256, 1024, 16, 16, False, True, True): (1, 16, 5, 4), + (256, 256, 1024, 16, 16, True, False, True): (5, 16, 4, 2), + (256, 256, 1024, 32, 32, False, True, True): (1, 32, 5, 2), + (256, 256, 1024, 32, 32, True, False, True): (2, 16, 5, 2), + (256, 256, 1024, 64, 64, False, True, True): (1, 16, 4, 4), + (256, 256, 1024, 64, 64, True, False, True): (1, 16, 4, 4), + (256, 256, 1024, 128, 128, False, True, True): (1, 8, 2, 8), + (256, 256, 1024, 128, 128, True, False, True): (1, 8, 2, 8), + (256, 256, 2048, 16, 16, False, True, True): (1, 16, 4, 4), + (256, 256, 2048, 16, 16, True, False, True): (2, 32, 5, 1), + (256, 256, 2048, 32, 32, False, True, True): (1, 64, 4, 1), + (256, 256, 2048, 32, 32, True, False, True): (2, 32, 4, 2), + (256, 256, 2048, 64, 64, False, True, True): (8, 16, 5, 4), + (256, 256, 2048, 64, 64, True, False, True): (1, 16, 4, 4), + (256, 256, 2048, 128, 128, False, True, True): (2, 16, 2, 8), + (256, 256, 2048, 128, 128, True, False, True): (1, 16, 2, 8), + (256, 256, 4096, 16, 16, False, True, True): (1, 64, 1, 4), + (256, 256, 4096, 16, 16, True, False, True): (1, 16, 3, 2), + (256, 256, 4096, 32, 32, False, True, True): (6, 32, 3, 2), + (256, 256, 4096, 32, 32, True, False, True): (4, 32, 4, 2), + (256, 256, 4096, 64, 64, False, True, True): (6, 64, 3, 4), + (256, 256, 4096, 64, 64, True, False, True): (2, 64, 3, 4), + (256, 256, 4096, 128, 128, False, True, True): (1, 32, 2, 8), + (256, 256, 
4096, 128, 128, True, False, True): (1, 32, 2, 8), + (256, 256, 8192, 16, 16, False, True, True): (2, 32, 3, 4), + (256, 256, 8192, 16, 16, True, False, True): (4, 64, 3, 2), + (256, 256, 8192, 32, 32, False, True, True): (1, 64, 3, 4), + (256, 256, 8192, 32, 32, True, False, True): (3, 128, 1, 2), + (256, 256, 8192, 64, 64, False, True, True): (9, 128, 1, 4), + (256, 256, 8192, 64, 64, True, False, True): (8, 128, 1, 4), + (256, 256, 8192, 128, 128, False, True, True): (7, 64, 1, 4), + (256, 256, 8192, 128, 128, True, False, True): (1, 32, 1, 16), + (256, 256, 16384, 16, 16, False, True, True): (3, 128, 3, 2), + (256, 256, 16384, 16, 16, True, False, True): (5, 64, 3, 2), + (256, 256, 16384, 32, 32, False, True, True): (3, 128, 3, 2), + (256, 256, 16384, 32, 32, True, False, True): (1, 128, 3, 2), + (256, 256, 16384, 64, 64, False, True, True): (3, 128, 1, 4), + (256, 256, 16384, 64, 64, True, False, True): (2, 128, 1, 4), + (256, 256, 16384, 128, 128, False, True, True): (7, 128, 1, 4), + (256, 256, 16384, 128, 128, True, False, True): (1, 128, 2, 8), + (256, 256, 32768, 16, 16, False, True, True): (2, 128, 3, 2), + (256, 256, 32768, 16, 16, True, False, True): (1, 128, 3, 2), + (256, 256, 32768, 32, 32, False, True, True): (1, 256, 3, 4), + (256, 256, 32768, 32, 32, True, False, True): (3, 256, 3, 2), + (256, 256, 32768, 64, 64, False, True, True): (1, 256, 1, 4), + (256, 256, 32768, 64, 64, True, False, True): (3, 256, 1, 4), + (256, 256, 32768, 128, 128, False, True, True): (9, 256, 1, 4), + (256, 256, 32768, 128, 128, True, False, True): (2, 256, 1, 4), + (256, 256, 65536, 16, 16, False, True, True): (1, 256, 3, 2), + (256, 256, 65536, 16, 16, True, False, True): (1, 256, 3, 2), + (256, 256, 65536, 32, 32, False, True, True): (2, 512, 3, 2), + (256, 256, 65536, 32, 32, True, False, True): (2, 512, 3, 2), + (256, 256, 65536, 64, 64, False, True, True): (2, 512, 1, 4), + (256, 256, 65536, 64, 64, True, False, True): (1, 512, 1, 4), + (256, 256, 65536, 128, 128, False, True, True): (7, 512, 1, 4), + (256, 256, 65536, 128, 128, True, False, True): (2, 512, 1, 4), + (256, 256, 131072, 16, 16, False, True, True): (1, 512, 3, 2), + (256, 256, 131072, 16, 16, True, False, True): (1, 512, 3, 2), + (256, 256, 131072, 32, 32, False, True, True): (1, 1024, 3, 2), + (256, 256, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), + (256, 256, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), + (256, 256, 131072, 64, 64, True, False, True): (1, 1024, 1, 4), + (256, 256, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), + (256, 256, 131072, 128, 128, True, False, True): (1, 1024, 1, 4), + (384, 384, 256, 16, 16, False, True, True): (1, 8, 5, 2), + (384, 384, 256, 16, 16, True, False, True): (3, 4, 5, 2), + (384, 384, 256, 32, 32, False, True, True): (2, 8, 4, 4), + (384, 384, 256, 32, 32, True, False, True): (1, 4, 6, 2), + (384, 384, 256, 64, 64, False, True, True): (2, 4, 4, 4), + (384, 384, 256, 64, 64, True, False, True): (2, 4, 4, 4), + (384, 384, 512, 16, 16, False, True, True): (1, 8, 4, 2), + (384, 384, 512, 16, 16, True, False, True): (1, 4, 5, 4), + (384, 384, 512, 32, 32, False, True, True): (1, 8, 4, 4), + (384, 384, 512, 32, 32, True, False, True): (3, 8, 5, 2), + (384, 384, 512, 64, 64, False, True, True): (3, 8, 3, 4), + (384, 384, 512, 64, 64, True, False, True): (5, 8, 5, 4), + (384, 384, 1024, 16, 16, False, True, True): (3, 16, 4, 2), + (384, 384, 1024, 16, 16, True, False, True): (1, 8, 4, 4), + (384, 384, 1024, 32, 32, False, True, True): (6, 32, 3, 2), + (384, 384, 
1024, 32, 32, True, False, True): (3, 8, 4, 4), + (384, 384, 1024, 64, 64, False, True, True): (3, 16, 3, 4), + (384, 384, 1024, 64, 64, True, False, True): (2, 16, 4, 4), + (384, 384, 2048, 16, 16, False, True, True): (1, 32, 1, 4), + (384, 384, 2048, 16, 16, True, False, True): (1, 16, 5, 2), + (384, 384, 2048, 32, 32, False, True, True): (1, 32, 1, 8), + (384, 384, 2048, 32, 32, True, False, True): (1, 8, 4, 4), + (384, 384, 2048, 64, 64, False, True, True): (4, 16, 3, 4), + (384, 384, 2048, 64, 64, True, False, True): (1, 16, 3, 8), + (384, 384, 4096, 16, 16, False, True, True): (5, 32, 1, 4), + (384, 384, 4096, 16, 16, True, False, True): (6, 32, 3, 2), + (384, 384, 4096, 32, 32, False, True, True): (1, 32, 1, 8), + (384, 384, 4096, 32, 32, True, False, True): (1, 16, 3, 4), + (384, 384, 4096, 64, 64, False, True, True): (1, 64, 1, 4), + (384, 384, 4096, 64, 64, True, False, True): (2, 32, 3, 4), + (384, 384, 8192, 16, 16, False, True, True): (2, 64, 1, 4), + (384, 384, 8192, 16, 16, True, False, True): (3, 32, 3, 2), + (384, 384, 8192, 32, 32, False, True, True): (5, 64, 1, 8), + (384, 384, 8192, 32, 32, True, False, True): (1, 32, 3, 2), + (384, 384, 8192, 64, 64, False, True, True): (1, 128, 1, 4), + (384, 384, 8192, 64, 64, True, False, True): (3, 64, 3, 4), + (384, 384, 16384, 16, 16, False, True, True): (1, 128, 1, 2), + (384, 384, 16384, 16, 16, True, False, True): (4, 128, 3, 2), + (384, 384, 16384, 32, 32, False, True, True): (3, 128, 1, 4), + (384, 384, 16384, 32, 32, True, False, True): (1, 128, 3, 2), + (384, 384, 16384, 64, 64, False, True, True): (3, 256, 1, 4), + (384, 384, 16384, 64, 64, True, False, True): (2, 128, 3, 4), + (384, 384, 32768, 16, 16, False, True, True): (1, 256, 1, 2), + (384, 384, 32768, 16, 16, True, False, True): (1, 128, 3, 4), + (384, 384, 32768, 32, 32, False, True, True): (1, 256, 1, 2), + (384, 384, 32768, 32, 32, True, False, True): (1, 128, 3, 4), + (384, 384, 32768, 64, 64, False, True, True): (2, 256, 1, 4), + (384, 384, 32768, 64, 64, True, False, True): (1, 256, 3, 4), + (384, 384, 65536, 16, 16, False, True, True): (4, 512, 1, 2), + (384, 384, 65536, 16, 16, True, False, True): (1, 256, 3, 4), + (384, 384, 65536, 32, 32, False, True, True): (1, 512, 1, 2), + (384, 384, 65536, 32, 32, True, False, True): (1, 256, 3, 4), + (384, 384, 65536, 64, 64, False, True, True): (3, 512, 1, 4), + (384, 384, 65536, 64, 64, True, False, True): (3, 256, 3, 4), + (384, 384, 131072, 16, 16, False, True, True): (1, 512, 1, 1), + (384, 384, 131072, 16, 16, True, False, True): (1, 512, 3, 4), + (384, 384, 131072, 32, 32, False, True, True): (1, 512, 1, 4), + (384, 384, 131072, 32, 32, True, False, True): (1, 512, 3, 4), + (384, 384, 131072, 64, 64, False, True, True): (3, 1024, 1, 4), + (384, 384, 131072, 64, 64, True, False, True): (3, 512, 3, 4), + (512, 512, 256, 16, 16, False, True, True): (2, 4, 5, 4), + (512, 512, 256, 16, 16, True, False, True): (3, 4, 5, 4), + (512, 512, 256, 32, 32, False, True, True): (1, 4, 5, 2), + (512, 512, 256, 32, 32, True, False, True): (4, 8, 5, 1), + (512, 512, 256, 64, 64, False, True, True): (4, 4, 5, 4), + (512, 512, 256, 64, 64, True, False, True): (5, 4, 5, 4), + (512, 512, 256, 128, 128, False, True, True): (3, 2, 2, 8), + (512, 512, 256, 128, 128, True, False, True): (2, 2, 2, 8), + (512, 512, 512, 16, 16, False, True, True): (1, 8, 5, 4), + (512, 512, 512, 16, 16, True, False, True): (4, 8, 5, 2), + (512, 512, 512, 32, 32, False, True, True): (1, 16, 4, 1), + (512, 512, 512, 32, 32, True, False, True): (1, 8, 5, 
2), + (512, 512, 512, 64, 64, False, True, True): (4, 8, 5, 4), + (512, 512, 512, 64, 64, True, False, True): (2, 8, 5, 4), + (512, 512, 512, 128, 128, False, True, True): (2, 4, 2, 8), + (512, 512, 512, 128, 128, True, False, True): (1, 4, 2, 8), + (512, 512, 1024, 16, 16, False, True, True): (2, 8, 4, 4), + (512, 512, 1024, 16, 16, True, False, True): (1, 8, 4, 4), + (512, 512, 1024, 32, 32, False, True, True): (3, 16, 4, 2), + (512, 512, 1024, 32, 32, True, False, True): (1, 16, 5, 2), + (512, 512, 1024, 64, 64, False, True, True): (2, 8, 3, 4), + (512, 512, 1024, 64, 64, True, False, True): (2, 16, 3, 4), + (512, 512, 1024, 128, 128, False, True, True): (2, 8, 2, 8), + (512, 512, 1024, 128, 128, True, False, True): (3, 8, 2, 8), + (512, 512, 2048, 16, 16, False, True, True): (4, 16, 3, 2), + (512, 512, 2048, 16, 16, True, False, True): (1, 16, 4, 2), + (512, 512, 2048, 32, 32, False, True, True): (3, 32, 3, 2), + (512, 512, 2048, 32, 32, True, False, True): (2, 32, 3, 2), + (512, 512, 2048, 64, 64, False, True, True): (6, 32, 3, 2), + (512, 512, 2048, 64, 64, True, False, True): (1, 32, 3, 2), + (512, 512, 2048, 128, 128, False, True, True): (4, 16, 2, 8), + (512, 512, 2048, 128, 128, True, False, True): (1, 16, 2, 8), + (512, 512, 4096, 16, 16, False, True, True): (1, 16, 3, 2), + (512, 512, 4096, 16, 16, True, False, True): (4, 32, 3, 2), + (512, 512, 4096, 32, 32, False, True, True): (3, 32, 3, 2), + (512, 512, 4096, 32, 32, True, False, True): (2, 32, 3, 2), + (512, 512, 4096, 64, 64, False, True, True): (1, 32, 3, 4), + (512, 512, 4096, 64, 64, True, False, True): (1, 64, 3, 4), + (512, 512, 4096, 128, 128, False, True, True): (4, 32, 1, 4), + (512, 512, 4096, 128, 128, True, False, True): (4, 32, 2, 8), + (512, 512, 8192, 16, 16, False, True, True): (8, 64, 3, 2), + (512, 512, 8192, 16, 16, True, False, True): (4, 64, 3, 2), + (512, 512, 8192, 32, 32, False, True, True): (3, 64, 3, 2), + (512, 512, 8192, 32, 32, True, False, True): (3, 64, 3, 2), + (512, 512, 8192, 64, 64, False, True, True): (1, 64, 3, 4), + (512, 512, 8192, 64, 64, True, False, True): (7, 64, 3, 4), + (512, 512, 8192, 128, 128, False, True, True): (1, 64, 1, 4), + (512, 512, 8192, 128, 128, True, False, True): (4, 64, 2, 8), + (512, 512, 16384, 16, 16, False, True, True): (1, 64, 3, 2), + (512, 512, 16384, 16, 16, True, False, True): (1, 128, 3, 2), + (512, 512, 16384, 32, 32, False, True, True): (3, 128, 3, 2), + (512, 512, 16384, 32, 32, True, False, True): (1, 128, 3, 2), + (512, 512, 16384, 64, 64, False, True, True): (4, 64, 2, 4), + (512, 512, 16384, 64, 64, True, False, True): (2, 64, 2, 4), + (512, 512, 16384, 128, 128, False, True, True): (4, 128, 1, 4), + (512, 512, 16384, 128, 128, True, False, True): (2, 128, 1, 4), + (512, 512, 32768, 16, 16, False, True, True): (1, 128, 3, 2), + (512, 512, 32768, 16, 16, True, False, True): (1, 128, 3, 2), + (512, 512, 32768, 32, 32, False, True, True): (1, 256, 3, 2), + (512, 512, 32768, 32, 32, True, False, True): (1, 256, 3, 2), + (512, 512, 32768, 64, 64, False, True, True): (1, 256, 3, 4), + (512, 512, 32768, 64, 64, True, False, True): (2, 256, 3, 4), + (512, 512, 32768, 128, 128, False, True, True): (5, 256, 1, 4), + (512, 512, 32768, 128, 128, True, False, True): (4, 256, 1, 4), + (512, 512, 65536, 16, 16, False, True, True): (1, 256, 3, 2), + (512, 512, 65536, 16, 16, True, False, True): (1, 256, 3, 1), + (512, 512, 65536, 32, 32, False, True, True): (1, 512, 3, 2), + (512, 512, 65536, 32, 32, True, False, True): (1, 512, 3, 2), + (512, 512, 65536, 64, 
64, False, True, True): (4, 256, 2, 4), + (512, 512, 65536, 64, 64, True, False, True): (2, 512, 3, 4), + (512, 512, 65536, 128, 128, False, True, True): (6, 512, 1, 4), + (512, 512, 65536, 128, 128, True, False, True): (4, 512, 1, 4), + (512, 512, 131072, 16, 16, False, True, True): (1, 512, 3, 2), + (512, 512, 131072, 16, 16, True, False, True): (1, 512, 3, 1), + (512, 512, 131072, 32, 32, False, True, True): (1, 1024, 3, 2), + (512, 512, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), + (512, 512, 131072, 64, 64, False, True, True): (4, 512, 2, 4), + (512, 512, 131072, 64, 64, True, False, True): (4, 1024, 3, 4), + (512, 512, 131072, 128, 128, False, True, True): (6, 1024, 1, 4), + (512, 512, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), + (768, 768, 256, 16, 16, False, True, True): (1, 8, 4, 1), + (768, 768, 256, 16, 16, True, False, True): (3, 2, 6, 4), + (768, 768, 256, 32, 32, False, True, True): (3, 8, 3, 4), + (768, 768, 256, 32, 32, True, False, True): (1, 4, 4, 2), + (768, 768, 256, 64, 64, False, True, True): (2, 4, 3, 4), + (768, 768, 256, 64, 64, True, False, True): (1, 4, 4, 4), + (768, 768, 256, 128, 128, False, True, True): (2, 2, 3, 8), + (768, 768, 256, 128, 128, True, False, True): (4, 2, 3, 8), + (768, 768, 512, 16, 16, False, True, True): (4, 8, 4, 2), + (768, 768, 512, 16, 16, True, False, True): (4, 8, 6, 2), + (768, 768, 512, 32, 32, False, True, True): (1, 8, 4, 4), + (768, 768, 512, 32, 32, True, False, True): (3, 8, 4, 2), + (768, 768, 512, 64, 64, False, True, True): (1, 8, 3, 4), + (768, 768, 512, 64, 64, True, False, True): (1, 8, 4, 4), + (768, 768, 512, 128, 128, False, True, True): (1, 4, 3, 8), + (768, 768, 512, 128, 128, True, False, True): (4, 4, 3, 8), + (768, 768, 1024, 16, 16, False, True, True): (3, 16, 1, 4), + (768, 768, 1024, 16, 16, True, False, True): (1, 8, 5, 2), + (768, 768, 1024, 32, 32, False, True, True): (3, 16, 1, 8), + (768, 768, 1024, 32, 32, True, False, True): (1, 16, 3, 2), + (768, 768, 1024, 64, 64, False, True, True): (1, 8, 3, 4), + (768, 768, 1024, 64, 64, True, False, True): (2, 8, 3, 8), + (768, 768, 1024, 128, 128, False, True, True): (1, 8, 3, 8), + (768, 768, 1024, 128, 128, True, False, True): (1, 8, 3, 8), + (768, 768, 2048, 16, 16, False, True, True): (2, 16, 1, 2), + (768, 768, 2048, 16, 16, True, False, True): (1, 16, 3, 2), + (768, 768, 2048, 32, 32, False, True, True): (5, 32, 1, 4), + (768, 768, 2048, 32, 32, True, False, True): (3, 8, 3, 4), + (768, 768, 2048, 64, 64, False, True, True): (1, 16, 1, 8), + (768, 768, 2048, 64, 64, True, False, True): (3, 16, 3, 4), + (768, 768, 2048, 128, 128, False, True, True): (2, 16, 3, 8), + (768, 768, 2048, 128, 128, True, False, True): (1, 16, 3, 8), + (768, 768, 4096, 16, 16, False, True, True): (3, 32, 1, 4), + (768, 768, 4096, 16, 16, True, False, True): (2, 32, 3, 1), + (768, 768, 4096, 32, 32, False, True, True): (2, 64, 1, 4), + (768, 768, 4096, 32, 32, True, False, True): (1, 16, 4, 4), + (768, 768, 4096, 64, 64, False, True, True): (3, 64, 3, 4), + (768, 768, 4096, 64, 64, True, False, True): (2, 16, 3, 4), + (768, 768, 4096, 128, 128, False, True, True): (1, 32, 3, 8), + (768, 768, 4096, 128, 128, True, False, True): (4, 32, 3, 8), + (768, 768, 8192, 16, 16, False, True, True): (1, 64, 1, 2), + (768, 768, 8192, 16, 16, True, False, True): (4, 64, 3, 2), + (768, 768, 8192, 32, 32, False, True, True): (1, 64, 1, 8), + (768, 768, 8192, 32, 32, True, False, True): (2, 32, 3, 4), + (768, 768, 8192, 64, 64, False, True, True): (4, 64, 3, 4), + (768, 768, 
8192, 64, 64, True, False, True): (2, 32, 3, 4), + (768, 768, 8192, 128, 128, False, True, True): (2, 64, 3, 8), + (768, 768, 8192, 128, 128, True, False, True): (1, 64, 3, 8), + (768, 768, 16384, 16, 16, False, True, True): (1, 128, 1, 2), + (768, 768, 16384, 16, 16, True, False, True): (1, 64, 4, 4), + (768, 768, 16384, 32, 32, False, True, True): (1, 128, 1, 8), + (768, 768, 16384, 32, 32, True, False, True): (1, 64, 3, 4), + (768, 768, 16384, 64, 64, False, True, True): (4, 128, 3, 4), + (768, 768, 16384, 64, 64, True, False, True): (1, 64, 3, 4), + (768, 768, 16384, 128, 128, False, True, True): (3, 128, 1, 4), + (768, 768, 16384, 128, 128, True, False, True): (3, 128, 2, 4), + (768, 768, 32768, 16, 16, False, True, True): (1, 256, 1, 2), + (768, 768, 32768, 16, 16, True, False, True): (1, 128, 4, 4), + (768, 768, 32768, 32, 32, False, True, True): (1, 128, 1, 2), + (768, 768, 32768, 32, 32, True, False, True): (1, 128, 3, 4), + (768, 768, 32768, 64, 64, False, True, True): (1, 256, 1, 4), + (768, 768, 32768, 64, 64, True, False, True): (2, 128, 3, 4), + (768, 768, 32768, 128, 128, False, True, True): (3, 256, 1, 4), + (768, 768, 32768, 128, 128, True, False, True): (2, 256, 2, 4), + (768, 768, 65536, 16, 16, False, True, True): (4, 512, 1, 2), + (768, 768, 65536, 16, 16, True, False, True): (1, 256, 4, 4), + (768, 768, 65536, 32, 32, False, True, True): (1, 256, 1, 2), + (768, 768, 65536, 32, 32, True, False, True): (1, 256, 3, 4), + (768, 768, 65536, 64, 64, False, True, True): (3, 512, 1, 4), + (768, 768, 65536, 64, 64, True, False, True): (2, 256, 3, 4), + (768, 768, 65536, 128, 128, False, True, True): (3, 512, 1, 4), + (768, 768, 65536, 128, 128, True, False, True): (2, 512, 2, 4), + (768, 768, 131072, 16, 16, False, True, True): (4, 1024, 1, 2), + (768, 768, 131072, 16, 16, True, False, True): (1, 512, 4, 1), + (768, 768, 131072, 32, 32, False, True, True): (1, 512, 1, 2), + (768, 768, 131072, 32, 32, True, False, True): (1, 512, 3, 4), + (768, 768, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), + (768, 768, 131072, 64, 64, True, False, True): (2, 512, 3, 4), + (768, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), + (768, 768, 131072, 128, 128, True, False, True): (1, 1024, 2, 4), + (768, 3072, 256, 16, 16, False, True, True): (3, 8, 6, 1), + (768, 3072, 256, 16, 16, True, False, True): (1, 4, 6, 2), + (768, 3072, 256, 32, 32, False, True, True): (1, 8, 4, 4), + (768, 3072, 256, 32, 32, True, False, True): (3, 4, 6, 4), + (768, 3072, 256, 64, 64, False, True, True): (2, 4, 3, 4), + (768, 3072, 256, 64, 64, True, False, True): (1, 4, 4, 4), + (768, 3072, 256, 128, 128, False, True, True): (2, 2, 3, 8), + (768, 3072, 256, 128, 128, True, False, True): (1, 2, 3, 8), + (768, 3072, 512, 16, 16, False, True, True): (1, 8, 4, 2), + (768, 3072, 512, 16, 16, True, False, True): (1, 8, 5, 2), + (768, 3072, 512, 32, 32, False, True, True): (1, 16, 3, 2), + (768, 3072, 512, 32, 32, True, False, True): (1, 8, 5, 2), + (768, 3072, 512, 64, 64, False, True, True): (1, 8, 3, 4), + (768, 3072, 512, 64, 64, True, False, True): (3, 8, 4, 4), + (768, 3072, 512, 128, 128, False, True, True): (1, 4, 3, 8), + (768, 3072, 512, 128, 128, True, False, True): (2, 4, 3, 8), + (768, 3072, 1024, 16, 16, False, True, True): (1, 16, 1, 4), + (768, 3072, 1024, 16, 16, True, False, True): (5, 4, 4, 4), + (768, 3072, 1024, 32, 32, False, True, True): (3, 8, 3, 4), + (768, 3072, 1024, 32, 32, True, False, True): (1, 8, 4, 4), + (768, 3072, 1024, 64, 64, False, True, True): (2, 16, 3, 4), + (768, 
3072, 1024, 64, 64, True, False, True): (2, 16, 4, 4), + (768, 3072, 1024, 128, 128, False, True, True): (1, 8, 3, 8), + (768, 3072, 1024, 128, 128, True, False, True): (5, 8, 3, 8), + (768, 3072, 2048, 16, 16, False, True, True): (3, 16, 1, 2), + (768, 3072, 2048, 16, 16, True, False, True): (1, 8, 3, 4), + (768, 3072, 2048, 32, 32, False, True, True): (4, 16, 1, 8), + (768, 3072, 2048, 32, 32, True, False, True): (3, 8, 3, 4), + (768, 3072, 2048, 64, 64, False, True, True): (2, 16, 3, 4), + (768, 3072, 2048, 64, 64, True, False, True): (2, 16, 3, 4), + (768, 3072, 2048, 128, 128, False, True, True): (3, 16, 3, 8), + (768, 3072, 2048, 128, 128, True, False, True): (4, 16, 3, 8), + (768, 3072, 4096, 16, 16, False, True, True): (1, 32, 1, 4), + (768, 3072, 4096, 16, 16, True, False, True): (1, 16, 3, 1), + (768, 3072, 4096, 32, 32, False, True, True): (3, 32, 1, 8), + (768, 3072, 4096, 32, 32, True, False, True): (3, 16, 4, 4), + (768, 3072, 4096, 64, 64, False, True, True): (2, 32, 3, 4), + (768, 3072, 4096, 64, 64, True, False, True): (2, 16, 3, 4), + (768, 3072, 4096, 128, 128, False, True, True): (5, 32, 1, 4), + (768, 3072, 4096, 128, 128, True, False, True): (9, 32, 3, 8), + (768, 3072, 8192, 16, 16, False, True, True): (1, 32, 1, 4), + (768, 3072, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (768, 3072, 8192, 32, 32, False, True, True): (1, 64, 1, 8), + (768, 3072, 8192, 32, 32, True, False, True): (2, 64, 4, 2), + (768, 3072, 8192, 64, 64, False, True, True): (1, 64, 3, 4), + (768, 3072, 8192, 64, 64, True, False, True): (2, 32, 3, 4), + (768, 3072, 8192, 128, 128, False, True, True): (2, 64, 3, 8), + (768, 3072, 8192, 128, 128, True, False, True): (2, 64, 3, 8), + (768, 3072, 16384, 16, 16, False, True, True): (1, 64, 1, 4), + (768, 3072, 16384, 16, 16, True, False, True): (1, 64, 4, 1), + (768, 3072, 16384, 32, 32, False, True, True): (1, 128, 1, 8), + (768, 3072, 16384, 32, 32, True, False, True): (1, 64, 3, 4), + (768, 3072, 16384, 64, 64, False, True, True): (1, 128, 3, 4), + (768, 3072, 16384, 64, 64, True, False, True): (4, 64, 3, 4), + (768, 3072, 16384, 128, 128, False, True, True): (2, 128, 3, 8), + (768, 3072, 16384, 128, 128, True, False, True): (2, 128, 3, 8), + (768, 3072, 32768, 16, 16, False, True, True): (1, 128, 1, 4), + (768, 3072, 32768, 16, 16, True, False, True): (1, 128, 4, 1), + (768, 3072, 32768, 32, 32, False, True, True): (1, 256, 1, 8), + (768, 3072, 32768, 32, 32, True, False, True): (1, 128, 3, 4), + (768, 3072, 32768, 64, 64, False, True, True): (1, 256, 3, 4), + (768, 3072, 32768, 64, 64, True, False, True): (1, 128, 3, 4), + (768, 3072, 32768, 128, 128, False, True, True): (3, 256, 1, 4), + (768, 3072, 32768, 128, 128, True, False, True): (2, 256, 3, 8), + (768, 3072, 50432, 16, 16, False, True, True): (1, 197, 1, 4), + (768, 3072, 50432, 16, 16, True, False, True): (4, 197, 4, 4), + (768, 3072, 50432, 32, 32, False, True, True): (1, 197, 1, 4), + (768, 3072, 50432, 32, 32, True, False, True): (4, 197, 3, 4), + (768, 3072, 50432, 64, 64, False, True, True): (1, 394, 3, 4), + (768, 3072, 50432, 64, 64, True, False, True): (3, 197, 3, 4), + (768, 3072, 50432, 128, 128, False, True, True): (3, 394, 1, 4), + (768, 3072, 50432, 128, 128, True, False, True): (1, 394, 3, 8), + (768, 3072, 65536, 16, 16, False, True, True): (1, 256, 1, 4), + (768, 3072, 65536, 16, 16, True, False, True): (5, 256, 4, 1), + (768, 3072, 65536, 32, 32, False, True, True): (1, 256, 1, 4), + (768, 3072, 65536, 32, 32, True, False, True): (3, 256, 3, 4), + (768, 3072, 65536, 
64, 64, False, True, True): (2, 512, 3, 4), + (768, 3072, 65536, 64, 64, True, False, True): (3, 256, 3, 4), + (768, 3072, 65536, 128, 128, False, True, True): (3, 512, 1, 4), + (768, 3072, 65536, 128, 128, True, False, True): (2, 512, 3, 8), + (768, 3072, 131072, 16, 16, False, True, True): (1, 512, 1, 4), + (768, 3072, 131072, 16, 16, True, False, True): (5, 512, 4, 1), + (768, 3072, 131072, 32, 32, False, True, True): (1, 512, 1, 4), + (768, 3072, 131072, 32, 32, True, False, True): (4, 512, 3, 4), + (768, 3072, 131072, 64, 64, False, True, True): (1, 1024, 3, 4), + (768, 3072, 131072, 64, 64, True, False, True): (1, 512, 3, 4), + (768, 3072, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), + (768, 3072, 131072, 128, 128, True, False, True): (1, 1024, 3, 8), + (1024, 1024, 256, 16, 16, False, True, True): (1, 4, 5, 4), + (1024, 1024, 256, 16, 16, True, False, True): (3, 4, 4, 4), + (1024, 1024, 256, 32, 32, False, True, True): (4, 4, 5, 2), + (1024, 1024, 256, 32, 32, True, False, True): (3, 4, 5, 2), + (1024, 1024, 256, 64, 64, False, True, True): (1, 4, 5, 4), + (1024, 1024, 256, 64, 64, True, False, True): (1, 4, 5, 4), + (1024, 1024, 256, 128, 128, False, True, True): (1, 2, 2, 8), + (1024, 1024, 256, 128, 128, True, False, True): (2, 2, 2, 8), + (1024, 1024, 512, 16, 16, False, True, True): (3, 4, 4, 4), + (1024, 1024, 512, 16, 16, True, False, True): (4, 8, 5, 2), + (1024, 1024, 512, 32, 32, False, True, True): (1, 8, 4, 2), + (1024, 1024, 512, 32, 32, True, False, True): (1, 8, 4, 2), + (1024, 1024, 512, 64, 64, False, True, True): (4, 8, 4, 4), + (1024, 1024, 512, 64, 64, True, False, True): (2, 8, 3, 4), + (1024, 1024, 512, 128, 128, False, True, True): (2, 4, 2, 8), + (1024, 1024, 512, 128, 128, True, False, True): (1, 4, 2, 8), + (1024, 1024, 1024, 16, 16, False, True, True): (3, 8, 4, 4), + (1024, 1024, 1024, 16, 16, True, False, True): (4, 8, 4, 2), + (1024, 1024, 1024, 32, 32, False, True, True): (1, 16, 3, 2), + (1024, 1024, 1024, 32, 32, True, False, True): (1, 16, 3, 2), + (1024, 1024, 1024, 64, 64, False, True, True): (1, 16, 3, 4), + (1024, 1024, 1024, 64, 64, True, False, True): (3, 16, 3, 2), + (1024, 1024, 1024, 128, 128, False, True, True): (1, 8, 2, 8), + (1024, 1024, 1024, 128, 128, True, False, True): (2, 8, 2, 8), + (1024, 1024, 2048, 16, 16, False, True, True): (3, 8, 3, 4), + (1024, 1024, 2048, 16, 16, True, False, True): (3, 8, 3, 2), + (1024, 1024, 2048, 32, 32, False, True, True): (5, 16, 3, 4), + (1024, 1024, 2048, 32, 32, True, False, True): (1, 16, 3, 2), + (1024, 1024, 2048, 64, 64, False, True, True): (6, 16, 4, 4), + (1024, 1024, 2048, 64, 64, True, False, True): (5, 16, 3, 4), + (1024, 1024, 2048, 128, 128, False, True, True): (4, 16, 2, 8), + (1024, 1024, 2048, 128, 128, True, False, True): (4, 16, 2, 8), + (1024, 1024, 4096, 16, 16, False, True, True): (8, 32, 3, 2), + (1024, 1024, 4096, 16, 16, True, False, True): (4, 32, 3, 2), + (1024, 1024, 4096, 32, 32, False, True, True): (2, 32, 3, 4), + (1024, 1024, 4096, 32, 32, True, False, True): (3, 32, 3, 2), + (1024, 1024, 4096, 64, 64, False, True, True): (3, 32, 3, 4), + (1024, 1024, 4096, 64, 64, True, False, True): (1, 32, 3, 4), + (1024, 1024, 4096, 128, 128, False, True, True): (4, 32, 2, 8), + (1024, 1024, 4096, 128, 128, True, False, True): (1, 32, 2, 8), + (1024, 1024, 8192, 16, 16, False, True, True): (4, 64, 3, 2), + (1024, 1024, 8192, 16, 16, True, False, True): (4, 64, 3, 2), + (1024, 1024, 8192, 32, 32, False, True, True): (8, 64, 3, 4), + (1024, 1024, 8192, 32, 32, True, False, 
True): (4, 32, 3, 4), + (1024, 1024, 8192, 64, 64, False, True, True): (4, 64, 3, 4), + (1024, 1024, 8192, 64, 64, True, False, True): (2, 64, 3, 4), + (1024, 1024, 8192, 128, 128, False, True, True): (4, 64, 2, 8), + (1024, 1024, 8192, 128, 128, True, False, True): (4, 64, 1, 4), + (1024, 1024, 16384, 16, 16, False, True, True): (1, 64, 3, 2), + (1024, 1024, 16384, 16, 16, True, False, True): (1, 64, 3, 2), + (1024, 1024, 16384, 32, 32, False, True, True): (1, 128, 3, 2), + (1024, 1024, 16384, 32, 32, True, False, True): (1, 64, 3, 4), + (1024, 1024, 16384, 64, 64, False, True, True): (1, 128, 3, 4), + (1024, 1024, 16384, 64, 64, True, False, True): (1, 128, 3, 4), + (1024, 1024, 16384, 128, 128, False, True, True): (2, 128, 1, 4), + (1024, 1024, 16384, 128, 128, True, False, True): (4, 128, 1, 4), + (1024, 1024, 32768, 16, 16, False, True, True): (1, 128, 3, 2), + (1024, 1024, 32768, 16, 16, True, False, True): (1, 128, 3, 2), + (1024, 1024, 32768, 32, 32, False, True, True): (1, 256, 3, 2), + (1024, 1024, 32768, 32, 32, True, False, True): (1, 128, 3, 4), + (1024, 1024, 32768, 64, 64, False, True, True): (2, 128, 2, 4), + (1024, 1024, 32768, 64, 64, True, False, True): (1, 256, 3, 4), + (1024, 1024, 32768, 128, 128, False, True, True): (2, 256, 1, 4), + (1024, 1024, 32768, 128, 128, True, False, True): (4, 256, 1, 4), + (1024, 1024, 65536, 16, 16, False, True, True): (1, 256, 3, 4), + (1024, 1024, 65536, 16, 16, True, False, True): (1, 256, 3, 4), + (1024, 1024, 65536, 32, 32, False, True, True): (9, 256, 3, 4), + (1024, 1024, 65536, 32, 32, True, False, True): (7, 256, 3, 4), + (1024, 1024, 65536, 64, 64, False, True, True): (2, 256, 2, 4), + (1024, 1024, 65536, 64, 64, True, False, True): (2, 512, 3, 4), + (1024, 1024, 65536, 128, 128, False, True, True): (2, 512, 1, 4), + (1024, 1024, 65536, 128, 128, True, False, True): (4, 512, 1, 4), + (1024, 1024, 131072, 16, 16, False, True, True): (11, 512, 3, 2), + (1024, 1024, 131072, 16, 16, True, False, True): (11, 512, 3, 2), + (1024, 1024, 131072, 32, 32, False, True, True): (4, 512, 3, 4), + (1024, 1024, 131072, 32, 32, True, False, True): (6, 512, 3, 4), + (1024, 1024, 131072, 64, 64, False, True, True): (2, 512, 2, 4), + (1024, 1024, 131072, 64, 64, True, False, True): (2, 1024, 3, 4), + (1024, 1024, 131072, 128, 128, False, True, True): (4, 1024, 1, 4), + (1024, 1024, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), + (1280, 5120, 65792, 16, 16, False, True, True): (1, 257, 1, 4), + (1280, 5120, 65792, 16, 16, True, False, True): (5, 257, 4, 1), + (1280, 5120, 65792, 32, 32, False, True, True): (1, 514, 1, 8), + (1280, 5120, 65792, 32, 32, True, False, True): (2, 257, 3, 4), + (1280, 5120, 65792, 64, 64, False, True, True): (1, 514, 3, 4), + (1280, 5120, 65792, 64, 64, True, False, True): (1, 257, 3, 4), + (1280, 5120, 65792, 128, 128, False, True, True): (1, 514, 3, 8), + (1280, 5120, 65792, 128, 128, True, False, True): (2, 514, 3, 8), + (1536, 1536, 256, 16, 16, False, True, True): (1, 4, 6, 2), + (1536, 1536, 256, 16, 16, True, False, True): (3, 4, 5, 2), + (1536, 1536, 256, 32, 32, False, True, True): (2, 4, 3, 4), + (1536, 1536, 256, 32, 32, True, False, True): (1, 4, 5, 2), + (1536, 1536, 256, 64, 64, False, True, True): (2, 4, 3, 4), + (1536, 1536, 256, 64, 64, True, False, True): (1, 4, 4, 4), + (1536, 1536, 256, 128, 128, False, True, True): (3, 2, 3, 8), + (1536, 1536, 256, 128, 128, True, False, True): (6, 2, 3, 8), + (1536, 1536, 512, 16, 16, False, True, True): (1, 8, 1, 4), + (1536, 1536, 512, 16, 16, True, False, 
True): (3, 4, 5, 2), + (1536, 1536, 512, 32, 32, False, True, True): (1, 8, 1, 8), + (1536, 1536, 512, 32, 32, True, False, True): (1, 4, 4, 4), + (1536, 1536, 512, 64, 64, False, True, True): (3, 8, 5, 4), + (1536, 1536, 512, 64, 64, True, False, True): (3, 8, 3, 4), + (1536, 1536, 512, 128, 128, False, True, True): (2, 4, 3, 8), + (1536, 1536, 512, 128, 128, True, False, True): (3, 4, 3, 8), + (1536, 1536, 1024, 16, 16, False, True, True): (1, 8, 1, 2), + (1536, 1536, 1024, 16, 16, True, False, True): (2, 8, 4, 2), + (1536, 1536, 1024, 32, 32, False, True, True): (8, 16, 1, 4), + (1536, 1536, 1024, 32, 32, True, False, True): (3, 8, 4, 2), + (1536, 1536, 1024, 64, 64, False, True, True): (1, 16, 3, 4), + (1536, 1536, 1024, 64, 64, True, False, True): (3, 8, 3, 4), + (1536, 1536, 1024, 128, 128, False, True, True): (3, 8, 3, 8), + (1536, 1536, 1024, 128, 128, True, False, True): (3, 8, 3, 8), + (1536, 1536, 2048, 16, 16, False, True, True): (1, 16, 1, 4), + (1536, 1536, 2048, 16, 16, True, False, True): (1, 8, 3, 1), + (1536, 1536, 2048, 32, 32, False, True, True): (3, 16, 1, 8), + (1536, 1536, 2048, 32, 32, True, False, True): (3, 8, 4, 4), + (1536, 1536, 2048, 64, 64, False, True, True): (1, 16, 3, 4), + (1536, 1536, 2048, 64, 64, True, False, True): (3, 8, 3, 4), + (1536, 1536, 2048, 128, 128, False, True, True): (4, 16, 1, 4), + (1536, 1536, 2048, 128, 128, True, False, True): (6, 16, 3, 8), + (1536, 1536, 4096, 16, 16, False, True, True): (1, 32, 1, 2), + (1536, 1536, 4096, 16, 16, True, False, True): (4, 32, 4, 2), + (1536, 1536, 4096, 32, 32, False, True, True): (1, 32, 1, 8), + (1536, 1536, 4096, 32, 32, True, False, True): (5, 32, 4, 2), + (1536, 1536, 4096, 64, 64, False, True, True): (2, 32, 3, 4), + (1536, 1536, 4096, 64, 64, True, False, True): (2, 16, 3, 4), + (1536, 1536, 4096, 128, 128, False, True, True): (4, 32, 3, 8), + (1536, 1536, 4096, 128, 128, True, False, True): (4, 32, 3, 8), + (1536, 1536, 8192, 16, 16, False, True, True): (1, 64, 1, 2), + (1536, 1536, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (1536, 1536, 8192, 32, 32, False, True, True): (2, 64, 1, 8), + (1536, 1536, 8192, 32, 32, True, False, True): (2, 32, 3, 4), + (1536, 1536, 8192, 64, 64, False, True, True): (1, 64, 3, 4), + (1536, 1536, 8192, 64, 64, True, False, True): (2, 32, 3, 4), + (1536, 1536, 8192, 128, 128, False, True, True): (4, 64, 3, 8), + (1536, 1536, 8192, 128, 128, True, False, True): (1, 64, 3, 8), + (1536, 1536, 16384, 16, 16, False, True, True): (1, 128, 1, 2), + (1536, 1536, 16384, 16, 16, True, False, True): (1, 64, 4, 4), + (1536, 1536, 16384, 32, 32, False, True, True): (1, 64, 1, 2), + (1536, 1536, 16384, 32, 32, True, False, True): (1, 64, 3, 4), + (1536, 1536, 16384, 64, 64, False, True, True): (1, 128, 3, 4), + (1536, 1536, 16384, 64, 64, True, False, True): (1, 64, 3, 4), + (1536, 1536, 16384, 128, 128, False, True, True): (1, 128, 1, 4), + (1536, 1536, 16384, 128, 128, True, False, True): (1, 128, 2, 4), + (1536, 1536, 32768, 16, 16, False, True, True): (1, 256, 1, 2), + (1536, 1536, 32768, 16, 16, True, False, True): (1, 128, 3, 2), + (1536, 1536, 32768, 32, 32, False, True, True): (1, 128, 1, 2), + (1536, 1536, 32768, 32, 32, True, False, True): (1, 128, 3, 4), + (1536, 1536, 32768, 64, 64, False, True, True): (1, 256, 3, 4), + (1536, 1536, 32768, 64, 64, True, False, True): (1, 128, 3, 4), + (1536, 1536, 32768, 128, 128, False, True, True): (1, 256, 1, 4), + (1536, 1536, 32768, 128, 128, True, False, True): (2, 256, 2, 4), + (1536, 1536, 65536, 16, 16, False, True, 
True): (2, 512, 1, 2), + (1536, 1536, 65536, 16, 16, True, False, True): (1, 256, 4, 4), + (1536, 1536, 65536, 32, 32, False, True, True): (1, 256, 1, 2), + (1536, 1536, 65536, 32, 32, True, False, True): (1, 256, 3, 4), + (1536, 1536, 65536, 64, 64, False, True, True): (1, 512, 3, 4), + (1536, 1536, 65536, 64, 64, True, False, True): (3, 256, 3, 4), + (1536, 1536, 65536, 128, 128, False, True, True): (3, 512, 1, 4), + (1536, 1536, 65536, 128, 128, True, False, True): (4, 512, 2, 4), + (1536, 1536, 131072, 16, 16, False, True, True): (2, 1024, 1, 2), + (1536, 1536, 131072, 16, 16, True, False, True): (9, 512, 4, 4), + (1536, 1536, 131072, 32, 32, False, True, True): (1, 512, 1, 2), + (1536, 1536, 131072, 32, 32, True, False, True): (5, 512, 3, 4), + (1536, 1536, 131072, 64, 64, False, True, True): (1, 1024, 3, 4), + (1536, 1536, 131072, 64, 64, True, False, True): (2, 512, 3, 4), + (1536, 1536, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), + (1536, 1536, 131072, 128, 128, True, False, True): (1, 1024, 2, 4), + (2048, 2048, 256, 16, 16, False, True, True): (1, 4, 5, 2), + (2048, 2048, 256, 16, 16, True, False, True): (4, 4, 5, 2), + (2048, 2048, 256, 32, 32, False, True, True): (3, 4, 6, 2), + (2048, 2048, 256, 32, 32, True, False, True): (2, 4, 5, 2), + (2048, 2048, 256, 64, 64, False, True, True): (2, 4, 4, 4), + (2048, 2048, 256, 64, 64, True, False, True): (2, 4, 3, 4), + (2048, 2048, 256, 128, 128, False, True, True): (3, 2, 2, 8), + (2048, 2048, 256, 128, 128, True, False, True): (3, 2, 2, 8), + (2048, 2048, 512, 16, 16, False, True, True): (3, 4, 4, 4), + (2048, 2048, 512, 16, 16, True, False, True): (1, 4, 4, 4), + (2048, 2048, 512, 32, 32, False, True, True): (1, 4, 3, 4), + (2048, 2048, 512, 32, 32, True, False, True): (1, 4, 4, 2), + (2048, 2048, 512, 64, 64, False, True, True): (1, 8, 3, 4), + (2048, 2048, 512, 64, 64, True, False, True): (1, 8, 3, 4), + (2048, 2048, 512, 128, 128, False, True, True): (3, 4, 2, 8), + (2048, 2048, 512, 128, 128, True, False, True): (2, 4, 2, 8), + (2048, 2048, 1024, 16, 16, False, True, True): (3, 4, 3, 4), + (2048, 2048, 1024, 16, 16, True, False, True): (4, 8, 3, 2), + (2048, 2048, 1024, 32, 32, False, True, True): (3, 8, 3, 4), + (2048, 2048, 1024, 32, 32, True, False, True): (1, 8, 3, 2), + (2048, 2048, 1024, 64, 64, False, True, True): (1, 8, 3, 4), + (2048, 2048, 1024, 64, 64, True, False, True): (1, 8, 3, 4), + (2048, 2048, 1024, 128, 128, False, True, True): (4, 8, 1, 4), + (2048, 2048, 1024, 128, 128, True, False, True): (2, 8, 1, 4), + (2048, 2048, 2048, 16, 16, False, True, True): (4, 16, 3, 2), + (2048, 2048, 2048, 16, 16, True, False, True): (4, 16, 3, 2), + (2048, 2048, 2048, 32, 32, False, True, True): (1, 16, 3, 2), + (2048, 2048, 2048, 32, 32, True, False, True): (1, 16, 3, 2), + (2048, 2048, 2048, 64, 64, False, True, True): (4, 16, 3, 4), + (2048, 2048, 2048, 64, 64, True, False, True): (4, 16, 3, 4), + (2048, 2048, 2048, 128, 128, False, True, True): (6, 16, 2, 8), + (2048, 2048, 2048, 128, 128, True, False, True): (3, 16, 1, 4), + (2048, 2048, 4096, 16, 16, False, True, True): (4, 32, 4, 2), + (2048, 2048, 4096, 16, 16, True, False, True): (4, 32, 3, 2), + (2048, 2048, 4096, 32, 32, False, True, True): (4, 16, 3, 8), + (2048, 2048, 4096, 32, 32, True, False, True): (4, 16, 3, 8), + (2048, 2048, 4096, 64, 64, False, True, True): (1, 32, 3, 4), + (2048, 2048, 4096, 64, 64, True, False, True): (3, 32, 3, 4), + (2048, 2048, 4096, 128, 128, False, True, True): (2, 32, 1, 4), + (2048, 2048, 4096, 128, 128, True, False, 
True): (2, 32, 1, 4), + (2048, 2048, 8192, 16, 16, False, True, True): (4, 64, 4, 2), + (2048, 2048, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (2048, 2048, 8192, 32, 32, False, True, True): (4, 32, 4, 8), + (2048, 2048, 8192, 32, 32, True, False, True): (4, 32, 3, 8), + (2048, 2048, 8192, 64, 64, False, True, True): (4, 64, 3, 4), + (2048, 2048, 8192, 64, 64, True, False, True): (4, 64, 3, 4), + (2048, 2048, 8192, 128, 128, False, True, True): (2, 64, 1, 4), + (2048, 2048, 8192, 128, 128, True, False, True): (2, 64, 1, 4), + (2048, 2048, 16384, 16, 16, False, True, True): (4, 64, 3, 2), + (2048, 2048, 16384, 16, 16, True, False, True): (1, 64, 3, 2), + (2048, 2048, 16384, 32, 32, False, True, True): (4, 64, 3, 4), + (2048, 2048, 16384, 32, 32, True, False, True): (4, 64, 3, 4), + (2048, 2048, 16384, 64, 64, False, True, True): (4, 128, 3, 4), + (2048, 2048, 16384, 64, 64, True, False, True): (4, 128, 3, 4), + (2048, 2048, 16384, 128, 128, False, True, True): (2, 128, 1, 4), + (2048, 2048, 16384, 128, 128, True, False, True): (2, 128, 1, 4), + (2048, 2048, 32768, 16, 16, False, True, True): (8, 128, 3, 2), + (2048, 2048, 32768, 16, 16, True, False, True): (8, 128, 3, 4), + (2048, 2048, 32768, 32, 32, False, True, True): (8, 128, 3, 4), + (2048, 2048, 32768, 32, 32, True, False, True): (8, 128, 3, 4), + (2048, 2048, 32768, 64, 64, False, True, True): (1, 128, 2, 4), + (2048, 2048, 32768, 64, 64, True, False, True): (8, 256, 3, 4), + (2048, 2048, 32768, 128, 128, False, True, True): (2, 256, 1, 4), + (2048, 2048, 32768, 128, 128, True, False, True): (2, 256, 1, 4), + (2048, 2048, 65536, 16, 16, False, True, True): (9, 256, 4, 4), + (2048, 2048, 65536, 16, 16, True, False, True): (7, 256, 4, 4), + (2048, 2048, 65536, 32, 32, False, True, True): (7, 256, 3, 4), + (2048, 2048, 65536, 32, 32, True, False, True): (3, 256, 3, 4), + (2048, 2048, 65536, 64, 64, False, True, True): (2, 256, 2, 4), + (2048, 2048, 65536, 64, 64, True, False, True): (6, 512, 3, 4), + (2048, 2048, 65536, 128, 128, False, True, True): (2, 512, 1, 4), + (2048, 2048, 65536, 128, 128, True, False, True): (2, 512, 1, 4), + (2048, 2048, 131072, 16, 16, False, True, True): (9, 512, 4, 4), + (2048, 2048, 131072, 16, 16, True, False, True): (9, 512, 4, 4), + (2048, 2048, 131072, 32, 32, False, True, True): (7, 512, 4, 4), + (2048, 2048, 131072, 32, 32, True, False, True): (3, 512, 3, 4), + (2048, 2048, 131072, 64, 64, False, True, True): (2, 512, 2, 4), + (2048, 2048, 131072, 64, 64, True, False, True): (4, 1024, 3, 4), + (2048, 2048, 131072, 128, 128, False, True, True): (1, 1024, 1, 4), + (2048, 2048, 131072, 128, 128, True, False, True): (2, 1024, 1, 4), + (3072, 768, 256, 16, 16, False, True, True): (6, 4, 1, 4), + (3072, 768, 256, 16, 16, True, False, True): (3, 1, 4, 4), + (3072, 768, 256, 32, 32, False, True, True): (6, 8, 1, 2), + (3072, 768, 256, 32, 32, True, False, True): (1, 2, 4, 4), + (3072, 768, 256, 64, 64, False, True, True): (1, 4, 4, 4), + (3072, 768, 256, 64, 64, True, False, True): (4, 2, 4, 4), + (3072, 768, 256, 128, 128, False, True, True): (1, 2, 3, 8), + (3072, 768, 256, 128, 128, True, False, True): (1, 2, 3, 8), + (3072, 768, 512, 16, 16, False, True, True): (2, 4, 1, 4), + (3072, 768, 512, 16, 16, True, False, True): (1, 4, 4, 1), + (3072, 768, 512, 32, 32, False, True, True): (3, 8, 1, 4), + (3072, 768, 512, 32, 32, True, False, True): (1, 2, 3, 4), + (3072, 768, 512, 64, 64, False, True, True): (1, 8, 1, 4), + (3072, 768, 512, 64, 64, True, False, True): (4, 4, 3, 4), + (3072, 768, 512, 128, 
128, False, True, True): (1, 4, 3, 8), + (3072, 768, 512, 128, 128, True, False, True): (1, 4, 3, 8), + (3072, 768, 1024, 16, 16, False, True, True): (1, 8, 1, 4), + (3072, 768, 1024, 16, 16, True, False, True): (3, 4, 3, 1), + (3072, 768, 1024, 32, 32, False, True, True): (1, 8, 1, 8), + (3072, 768, 1024, 32, 32, True, False, True): (1, 4, 4, 4), + (3072, 768, 1024, 64, 64, False, True, True): (1, 16, 3, 4), + (3072, 768, 1024, 64, 64, True, False, True): (1, 4, 3, 4), + (3072, 768, 1024, 128, 128, False, True, True): (1, 8, 3, 8), + (3072, 768, 1024, 128, 128, True, False, True): (2, 8, 3, 8), + (3072, 768, 2048, 16, 16, False, True, True): (3, 8, 1, 4), + (3072, 768, 2048, 16, 16, True, False, True): (2, 8, 3, 4), + (3072, 768, 2048, 32, 32, False, True, True): (3, 16, 1, 8), + (3072, 768, 2048, 32, 32, True, False, True): (3, 8, 3, 4), + (3072, 768, 2048, 64, 64, False, True, True): (1, 16, 1, 4), + (3072, 768, 2048, 64, 64, True, False, True): (1, 16, 3, 4), + (3072, 768, 2048, 128, 128, False, True, True): (1, 16, 3, 8), + (3072, 768, 2048, 128, 128, True, False, True): (2, 16, 2, 4), + (3072, 768, 4096, 16, 16, False, True, True): (1, 16, 1, 4), + (3072, 768, 4096, 16, 16, True, False, True): (4, 32, 4, 2), + (3072, 768, 4096, 32, 32, False, True, True): (2, 32, 1, 8), + (3072, 768, 4096, 32, 32, True, False, True): (7, 16, 3, 4), + (3072, 768, 4096, 64, 64, False, True, True): (2, 32, 1, 4), + (3072, 768, 4096, 64, 64, True, False, True): (2, 16, 2, 4), + (3072, 768, 4096, 128, 128, False, True, True): (1, 32, 3, 8), + (3072, 768, 4096, 128, 128, True, False, True): (3, 32, 2, 4), + (3072, 768, 8192, 16, 16, False, True, True): (2, 32, 1, 4), + (3072, 768, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (3072, 768, 8192, 32, 32, False, True, True): (4, 32, 1, 4), + (3072, 768, 8192, 32, 32, True, False, True): (4, 32, 3, 4), + (3072, 768, 8192, 64, 64, False, True, True): (2, 64, 1, 4), + (3072, 768, 8192, 64, 64, True, False, True): (4, 32, 2, 4), + (3072, 768, 8192, 128, 128, False, True, True): (3, 64, 1, 4), + (3072, 768, 8192, 128, 128, True, False, True): (6, 64, 2, 4), + (3072, 768, 16384, 16, 16, False, True, True): (1, 64, 1, 4), + (3072, 768, 16384, 16, 16, True, False, True): (1, 64, 1, 1), + (3072, 768, 16384, 32, 32, False, True, True): (1, 64, 1, 4), + (3072, 768, 16384, 32, 32, True, False, True): (4, 64, 3, 4), + (3072, 768, 16384, 64, 64, False, True, True): (4, 128, 1, 4), + (3072, 768, 16384, 64, 64, True, False, True): (4, 64, 2, 4), + (3072, 768, 16384, 128, 128, False, True, True): (3, 128, 1, 4), + (3072, 768, 16384, 128, 128, True, False, True): (4, 128, 2, 4), + (3072, 768, 32768, 16, 16, False, True, True): (1, 128, 1, 4), + (3072, 768, 32768, 16, 16, True, False, True): (8, 128, 4, 1), + (3072, 768, 32768, 32, 32, False, True, True): (1, 128, 1, 4), + (3072, 768, 32768, 32, 32, True, False, True): (8, 128, 3, 4), + (3072, 768, 32768, 64, 64, False, True, True): (1, 256, 1, 4), + (3072, 768, 32768, 64, 64, True, False, True): (1, 128, 2, 4), + (3072, 768, 32768, 128, 128, False, True, True): (3, 256, 1, 4), + (3072, 768, 32768, 128, 128, True, False, True): (8, 256, 2, 4), + (3072, 768, 50432, 16, 16, False, True, True): (1, 197, 1, 4), + (3072, 768, 50432, 16, 16, True, False, True): (7, 197, 4, 1), + (3072, 768, 50432, 32, 32, False, True, True): (1, 197, 1, 4), + (3072, 768, 50432, 32, 32, True, False, True): (4, 197, 3, 4), + (3072, 768, 50432, 64, 64, False, True, True): (1, 394, 1, 4), + (3072, 768, 50432, 64, 64, True, False, True): (3, 197, 2, 
4), + (3072, 768, 50432, 128, 128, False, True, True): (3, 394, 1, 4), + (3072, 768, 50432, 128, 128, True, False, True): (8, 394, 2, 4), + (3072, 768, 65536, 16, 16, False, True, True): (1, 256, 1, 4), + (3072, 768, 65536, 16, 16, True, False, True): (15, 256, 4, 1), + (3072, 768, 65536, 32, 32, False, True, True): (1, 256, 1, 4), + (3072, 768, 65536, 32, 32, True, False, True): (15, 256, 3, 4), + (3072, 768, 65536, 64, 64, False, True, True): (1, 512, 1, 4), + (3072, 768, 65536, 64, 64, True, False, True): (2, 256, 2, 4), + (3072, 768, 65536, 128, 128, False, True, True): (3, 512, 1, 4), + (3072, 768, 65536, 128, 128, True, False, True): (3, 512, 2, 4), + (3072, 768, 131072, 16, 16, False, True, True): (1, 512, 1, 4), + (3072, 768, 131072, 16, 16, True, False, True): (15, 512, 4, 1), + (3072, 768, 131072, 32, 32, False, True, True): (1, 512, 1, 4), + (3072, 768, 131072, 32, 32, True, False, True): (9, 512, 3, 4), + (3072, 768, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), + (3072, 768, 131072, 64, 64, True, False, True): (3, 512, 2, 4), + (3072, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), + (3072, 768, 131072, 128, 128, True, False, True): (1, 1024, 2, 4), + (3072, 3072, 256, 16, 16, False, True, True): (5, 4, 1, 4), + (3072, 3072, 256, 16, 16, True, False, True): (1, 2, 5, 2), + (3072, 3072, 256, 32, 32, False, True, True): (5, 4, 1, 8), + (3072, 3072, 256, 32, 32, True, False, True): (1, 4, 4, 2), + (3072, 3072, 256, 64, 64, False, True, True): (2, 4, 4, 4), + (3072, 3072, 256, 64, 64, True, False, True): (2, 4, 4, 4), + (3072, 3072, 256, 128, 128, False, True, True): (1, 2, 3, 8), + (3072, 3072, 256, 128, 128, True, False, True): (1, 2, 3, 8), + (3072, 3072, 512, 16, 16, False, True, True): (5, 4, 1, 2), + (3072, 3072, 512, 16, 16, True, False, True): (1, 2, 3, 4), + (3072, 3072, 512, 32, 32, False, True, True): (3, 8, 1, 4), + (3072, 3072, 512, 32, 32, True, False, True): (1, 4, 4, 2), + (3072, 3072, 512, 64, 64, False, True, True): (1, 8, 2, 2), + (3072, 3072, 512, 64, 64, True, False, True): (2, 4, 3, 4), + (3072, 3072, 512, 128, 128, False, True, True): (2, 4, 3, 8), + (3072, 3072, 512, 128, 128, True, False, True): (1, 4, 3, 8), + (3072, 3072, 1024, 16, 16, False, True, True): (1, 8, 1, 4), + (3072, 3072, 1024, 16, 16, True, False, True): (2, 8, 3, 1), + (3072, 3072, 1024, 32, 32, False, True, True): (1, 16, 1, 4), + (3072, 3072, 1024, 32, 32, True, False, True): (1, 4, 4, 4), + (3072, 3072, 1024, 64, 64, False, True, True): (1, 8, 3, 4), + (3072, 3072, 1024, 64, 64, True, False, True): (2, 4, 3, 4), + (3072, 3072, 1024, 128, 128, False, True, True): (1, 8, 1, 4), + (3072, 3072, 1024, 128, 128, True, False, True): (2, 8, 3, 8), + (3072, 3072, 2048, 16, 16, False, True, True): (1, 16, 1, 2), + (3072, 3072, 2048, 16, 16, True, False, True): (2, 16, 4, 2), + (3072, 3072, 2048, 32, 32, False, True, True): (1, 16, 1, 8), + (3072, 3072, 2048, 32, 32, True, False, True): (3, 8, 4, 4), + (3072, 3072, 2048, 64, 64, False, True, True): (3, 16, 3, 4), + (3072, 3072, 2048, 64, 64, True, False, True): (3, 8, 3, 4), + (3072, 3072, 2048, 128, 128, False, True, True): (1, 16, 3, 8), + (3072, 3072, 2048, 128, 128, True, False, True): (5, 16, 3, 8), + (3072, 3072, 4096, 16, 16, False, True, True): (1, 32, 1, 2), + (3072, 3072, 4096, 16, 16, True, False, True): (4, 32, 4, 2), + (3072, 3072, 4096, 32, 32, False, True, True): (1, 32, 1, 8), + (3072, 3072, 4096, 32, 32, True, False, True): (3, 16, 3, 4), + (3072, 3072, 4096, 64, 64, False, True, True): (1, 32, 3, 4), + (3072, 
3072, 4096, 64, 64, True, False, True): (3, 16, 3, 4), + (3072, 3072, 4096, 128, 128, False, True, True): (3, 32, 3, 8), + (3072, 3072, 4096, 128, 128, True, False, True): (3, 32, 3, 8), + (3072, 3072, 8192, 16, 16, False, True, True): (1, 64, 1, 2), + (3072, 3072, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (3072, 3072, 8192, 32, 32, False, True, True): (1, 64, 1, 8), + (3072, 3072, 8192, 32, 32, True, False, True): (6, 32, 3, 4), + (3072, 3072, 8192, 64, 64, False, True, True): (1, 64, 3, 4), + (3072, 3072, 8192, 64, 64, True, False, True): (2, 32, 3, 4), + (3072, 3072, 8192, 128, 128, False, True, True): (2, 64, 3, 8), + (3072, 3072, 8192, 128, 128, True, False, True): (1, 64, 3, 8), + (3072, 3072, 16384, 16, 16, False, True, True): (1, 128, 1, 2), + (3072, 3072, 16384, 16, 16, True, False, True): (4, 128, 4, 2), + (3072, 3072, 16384, 32, 32, False, True, True): (1, 64, 1, 2), + (3072, 3072, 16384, 32, 32, True, False, True): (4, 64, 3, 4), + (3072, 3072, 16384, 64, 64, False, True, True): (1, 128, 3, 4), + (3072, 3072, 16384, 64, 64, True, False, True): (4, 64, 3, 4), + (3072, 3072, 16384, 128, 128, False, True, True): (1, 128, 1, 4), + (3072, 3072, 16384, 128, 128, True, False, True): (1, 128, 3, 8), + (3072, 3072, 32768, 16, 16, False, True, True): (1, 256, 1, 2), + (3072, 3072, 32768, 16, 16, True, False, True): (8, 128, 4, 4), + (3072, 3072, 32768, 32, 32, False, True, True): (1, 256, 1, 8), + (3072, 3072, 32768, 32, 32, True, False, True): (5, 128, 3, 4), + (3072, 3072, 32768, 64, 64, False, True, True): (1, 256, 3, 4), + (3072, 3072, 32768, 64, 64, True, False, True): (3, 128, 3, 4), + (3072, 3072, 32768, 128, 128, False, True, True): (1, 256, 1, 4), + (3072, 3072, 32768, 128, 128, True, False, True): (3, 256, 2, 4), + (3072, 3072, 65536, 16, 16, False, True, True): (1, 512, 1, 2), + (3072, 3072, 65536, 16, 16, True, False, True): (7, 256, 4, 4), + (3072, 3072, 65536, 32, 32, False, True, True): (1, 256, 1, 2), + (3072, 3072, 65536, 32, 32, True, False, True): (5, 256, 3, 4), + (3072, 3072, 65536, 64, 64, False, True, True): (1, 512, 3, 4), + (3072, 3072, 65536, 64, 64, True, False, True): (3, 256, 3, 4), + (3072, 3072, 65536, 128, 128, False, True, True): (1, 512, 1, 4), + (3072, 3072, 65536, 128, 128, True, False, True): (3, 512, 2, 4), + (3072, 3072, 131072, 16, 16, False, True, True): (1, 1024, 1, 2), + (3072, 3072, 131072, 16, 16, True, False, True): (5, 512, 4, 4), + (3072, 3072, 131072, 32, 32, False, True, True): (1, 512, 1, 2), + (3072, 3072, 131072, 32, 32, True, False, True): (5, 512, 3, 4), + (3072, 3072, 131072, 64, 64, False, True, True): (1, 1024, 3, 4), + (3072, 3072, 131072, 64, 64, True, False, True): (3, 512, 3, 4), + (3072, 3072, 131072, 128, 128, False, True, True): (1, 1024, 1, 4), + (3072, 3072, 131072, 128, 128, True, False, True): (6, 1024, 2, 4), + (4096, 4096, 256, 16, 16, False, True, True): (2, 2, 5, 4), + (4096, 4096, 256, 16, 16, True, False, True): (2, 2, 4, 2), + (4096, 4096, 256, 32, 32, False, True, True): (1, 2, 4, 4), + (4096, 4096, 256, 32, 32, True, False, True): (3, 2, 4, 2), + (4096, 4096, 256, 64, 64, False, True, True): (3, 4, 3, 4), + (4096, 4096, 256, 64, 64, True, False, True): (1, 4, 3, 2), + (4096, 4096, 256, 128, 128, False, True, True): (1, 2, 2, 8), + (4096, 4096, 256, 128, 128, True, False, True): (1, 2, 2, 8), + (4096, 4096, 512, 16, 16, False, True, True): (4, 2, 3, 4), + (4096, 4096, 512, 16, 16, True, False, True): (1, 2, 3, 4), + (4096, 4096, 512, 32, 32, False, True, True): (1, 4, 3, 4), + (4096, 4096, 512, 32, 32, 
True, False, True): (3, 4, 3, 2), + (4096, 4096, 512, 64, 64, False, True, True): (4, 4, 4, 4), + (4096, 4096, 512, 64, 64, True, False, True): (3, 4, 3, 4), + (4096, 4096, 512, 128, 128, False, True, True): (2, 4, 2, 8), + (4096, 4096, 512, 128, 128, True, False, True): (2, 4, 1, 4), + (4096, 4096, 1024, 16, 16, False, True, True): (2, 8, 3, 2), + (4096, 4096, 1024, 16, 16, True, False, True): (2, 8, 3, 2), + (4096, 4096, 1024, 32, 32, False, True, True): (1, 8, 3, 4), + (4096, 4096, 1024, 32, 32, True, False, True): (1, 8, 3, 2), + (4096, 4096, 1024, 64, 64, False, True, True): (1, 8, 3, 4), + (4096, 4096, 1024, 64, 64, True, False, True): (1, 8, 3, 4), + (4096, 4096, 1024, 128, 128, False, True, True): (4, 8, 1, 4), + (4096, 4096, 1024, 128, 128, True, False, True): (2, 8, 2, 8), + (4096, 4096, 2048, 16, 16, False, True, True): (2, 8, 4, 4), + (4096, 4096, 2048, 16, 16, True, False, True): (2, 8, 4, 4), + (4096, 4096, 2048, 32, 32, False, True, True): (4, 8, 3, 8), + (4096, 4096, 2048, 32, 32, True, False, True): (4, 8, 4, 8), + (4096, 4096, 2048, 64, 64, False, True, True): (4, 16, 3, 4), + (4096, 4096, 2048, 64, 64, True, False, True): (4, 16, 3, 4), + (4096, 4096, 2048, 128, 128, False, True, True): (1, 16, 1, 4), + (4096, 4096, 2048, 128, 128, True, False, True): (4, 16, 1, 4), + (4096, 4096, 4096, 16, 16, False, True, True): (4, 32, 4, 4), + (4096, 4096, 4096, 16, 16, True, False, True): (2, 32, 4, 4), + (4096, 4096, 4096, 32, 32, False, True, True): (4, 16, 4, 8), + (4096, 4096, 4096, 32, 32, True, False, True): (4, 16, 4, 8), + (4096, 4096, 4096, 64, 64, False, True, True): (4, 32, 3, 4), + (4096, 4096, 4096, 64, 64, True, False, True): (2, 32, 3, 4), + (4096, 4096, 4096, 128, 128, False, True, True): (2, 32, 1, 4), + (4096, 4096, 4096, 128, 128, True, False, True): (2, 32, 1, 4), + (4096, 4096, 8192, 16, 16, False, True, True): (4, 64, 4, 2), + (4096, 4096, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (4096, 4096, 8192, 32, 32, False, True, True): (4, 32, 4, 8), + (4096, 4096, 8192, 32, 32, True, False, True): (4, 32, 4, 8), + (4096, 4096, 8192, 64, 64, False, True, True): (4, 64, 3, 4), + (4096, 4096, 8192, 64, 64, True, False, True): (4, 64, 3, 4), + (4096, 4096, 8192, 128, 128, False, True, True): (1, 64, 1, 4), + (4096, 4096, 8192, 128, 128, True, False, True): (1, 64, 1, 4), + (4096, 4096, 16384, 16, 16, False, True, True): (4, 64, 4, 4), + (4096, 4096, 16384, 16, 16, True, False, True): (4, 64, 4, 4), + (4096, 4096, 16384, 32, 32, False, True, True): (4, 64, 4, 8), + (4096, 4096, 16384, 32, 32, True, False, True): (4, 64, 4, 8), + (4096, 4096, 16384, 64, 64, False, True, True): (4, 128, 3, 4), + (4096, 4096, 16384, 64, 64, True, False, True): (4, 128, 3, 4), + (4096, 4096, 16384, 128, 128, False, True, True): (1, 128, 1, 4), + (4096, 4096, 16384, 128, 128, True, False, True): (1, 128, 1, 4), + (4096, 4096, 32768, 16, 16, False, True, True): (8, 128, 4, 4), + (4096, 4096, 32768, 16, 16, True, False, True): (5, 128, 4, 4), + (4096, 4096, 32768, 32, 32, False, True, True): (5, 128, 4, 4), + (4096, 4096, 32768, 32, 32, True, False, True): (3, 128, 4, 8), + (4096, 4096, 32768, 64, 64, False, True, True): (3, 256, 3, 4), + (4096, 4096, 32768, 64, 64, True, False, True): (2, 256, 3, 4), + (4096, 4096, 32768, 128, 128, False, True, True): (1, 256, 1, 4), + (4096, 4096, 32768, 128, 128, True, False, True): (1, 256, 1, 4), + (4096, 4096, 65536, 16, 16, False, True, True): (5, 256, 4, 4), + (4096, 4096, 65536, 16, 16, True, False, True): (5, 256, 4, 4), + (4096, 4096, 65536, 
32, 32, False, True, True): (4, 256, 4, 8), + (4096, 4096, 65536, 32, 32, True, False, True): (4, 256, 4, 8), + (4096, 4096, 65536, 64, 64, False, True, True): (1, 512, 3, 4), + (4096, 4096, 65536, 64, 64, True, False, True): (3, 512, 3, 4), + (4096, 4096, 65536, 128, 128, False, True, True): (1, 512, 1, 4), + (4096, 4096, 65536, 128, 128, True, False, True): (1, 512, 1, 4), + (4096, 4096, 131072, 16, 16, False, True, True): (5, 512, 4, 4), + (4096, 4096, 131072, 16, 16, True, False, True): (5, 512, 4, 4), + (4096, 4096, 131072, 32, 32, False, True, True): (4, 512, 4, 4), + (4096, 4096, 131072, 32, 32, True, False, True): (2, 512, 3, 4), + (4096, 4096, 131072, 64, 64, False, True, True): (1, 1024, 3, 4), + (4096, 4096, 131072, 64, 64, True, False, True): (3, 1024, 3, 4), + (4096, 4096, 131072, 128, 128, False, True, True): (1, 1024, 1, 4), + (4096, 4096, 131072, 128, 128, True, False, True): (1, 1024, 1, 4), + (5120, 1280, 65792, 16, 16, False, True, True): (1, 257, 1, 4), + (5120, 1280, 65792, 16, 16, True, False, True): (11, 257, 4, 1), + (5120, 1280, 65792, 32, 32, False, True, True): (1, 257, 1, 4), + (5120, 1280, 65792, 32, 32, True, False, True): (5, 257, 3, 4), + (5120, 1280, 65792, 64, 64, False, True, True): (1, 514, 1, 4), + (5120, 1280, 65792, 64, 64, True, False, True): (5, 257, 2, 4), + (5120, 1280, 65792, 128, 128, False, True, True): (3, 514, 1, 4), + (5120, 1280, 65792, 128, 128, True, False, True): (7, 514, 2, 4), + (6144, 6144, 256, 16, 16, False, True, True): (1, 2, 1, 4), + (6144, 6144, 256, 16, 16, True, False, True): (3, 1, 4, 4), + (6144, 6144, 256, 32, 32, False, True, True): (3, 2, 1, 8), + (6144, 6144, 256, 32, 32, True, False, True): (1, 1, 4, 4), + (6144, 6144, 256, 64, 64, False, True, True): (4, 2, 3, 4), + (6144, 6144, 256, 64, 64, True, False, True): (3, 2, 4, 4), + (6144, 6144, 256, 128, 128, False, True, True): (2, 2, 3, 8), + (6144, 6144, 256, 128, 128, True, False, True): (1, 2, 3, 8), + (6144, 6144, 512, 16, 16, False, True, True): (4, 4, 1, 4), + (6144, 6144, 512, 16, 16, True, False, True): (3, 2, 3, 1), + (6144, 6144, 512, 32, 32, False, True, True): (1, 8, 1, 4), + (6144, 6144, 512, 32, 32, True, False, True): (1, 2, 3, 2), + (6144, 6144, 512, 64, 64, False, True, True): (2, 4, 3, 4), + (6144, 6144, 512, 64, 64, True, False, True): (2, 2, 3, 4), + (6144, 6144, 512, 128, 128, False, True, True): (1, 4, 3, 8), + (6144, 6144, 512, 128, 128, True, False, True): (1, 4, 3, 8), + (6144, 6144, 1024, 16, 16, False, True, True): (1, 8, 1, 2), + (6144, 6144, 1024, 16, 16, True, False, True): (4, 8, 4, 4), + (6144, 6144, 1024, 32, 32, False, True, True): (1, 8, 4, 2), + (6144, 6144, 1024, 32, 32, True, False, True): (1, 8, 4, 2), + (6144, 6144, 1024, 64, 64, False, True, True): (4, 8, 3, 4), + (6144, 6144, 1024, 64, 64, True, False, True): (1, 4, 3, 4), + (6144, 6144, 1024, 128, 128, False, True, True): (2, 8, 3, 8), + (6144, 6144, 1024, 128, 128, True, False, True): (1, 8, 3, 8), + (6144, 6144, 2048, 16, 16, False, True, True): (4, 4, 1, 4), + (6144, 6144, 2048, 16, 16, True, False, True): (2, 8, 4, 4), + (6144, 6144, 2048, 32, 32, False, True, True): (1, 16, 4, 2), + (6144, 6144, 2048, 32, 32, True, False, True): (4, 8, 4, 8), + (6144, 6144, 2048, 64, 64, False, True, True): (4, 16, 3, 4), + (6144, 6144, 2048, 64, 64, True, False, True): (2, 8, 3, 4), + (6144, 6144, 2048, 128, 128, False, True, True): (1, 16, 3, 8), + (6144, 6144, 2048, 128, 128, True, False, True): (4, 16, 3, 8), + (6144, 6144, 4096, 16, 16, False, True, True): (4, 8, 1, 4), + (6144, 6144, 
4096, 16, 16, True, False, True): (4, 32, 4, 2), + (6144, 6144, 4096, 32, 32, False, True, True): (4, 16, 1, 2), + (6144, 6144, 4096, 32, 32, True, False, True): (2, 8, 3, 8), + (6144, 6144, 4096, 64, 64, False, True, True): (4, 32, 3, 4), + (6144, 6144, 4096, 64, 64, True, False, True): (4, 16, 3, 4), + (6144, 6144, 4096, 128, 128, False, True, True): (4, 32, 3, 8), + (6144, 6144, 4096, 128, 128, True, False, True): (4, 32, 3, 8), + (6144, 6144, 8192, 16, 16, False, True, True): (2, 16, 1, 2), + (6144, 6144, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (6144, 6144, 8192, 32, 32, False, True, True): (4, 32, 1, 2), + (6144, 6144, 8192, 32, 32, True, False, True): (4, 32, 4, 8), + (6144, 6144, 8192, 64, 64, False, True, True): (4, 64, 3, 4), + (6144, 6144, 8192, 64, 64, True, False, True): (4, 32, 3, 4), + (6144, 6144, 8192, 128, 128, False, True, True): (4, 64, 3, 8), + (6144, 6144, 8192, 128, 128, True, False, True): (4, 64, 3, 8), + (6144, 6144, 16384, 16, 16, False, True, True): (2, 32, 1, 2), + (6144, 6144, 16384, 16, 16, True, False, True): (4, 64, 4, 4), + (6144, 6144, 16384, 32, 32, False, True, True): (4, 64, 1, 2), + (6144, 6144, 16384, 32, 32, True, False, True): (4, 64, 3, 2), + (6144, 6144, 16384, 64, 64, False, True, True): (4, 128, 3, 4), + (6144, 6144, 16384, 64, 64, True, False, True): (2, 32, 3, 8), + (6144, 6144, 16384, 128, 128, False, True, True): (4, 128, 3, 8), + (6144, 6144, 16384, 128, 128, True, False, True): (4, 128, 3, 8), + (6144, 6144, 32768, 16, 16, False, True, True): (2, 64, 1, 2), + (6144, 6144, 32768, 16, 16, True, False, True): (3, 128, 4, 4), + (6144, 6144, 32768, 32, 32, False, True, True): (4, 128, 1, 2), + (6144, 6144, 32768, 32, 32, True, False, True): (3, 128, 3, 4), + (6144, 6144, 32768, 64, 64, False, True, True): (4, 256, 3, 4), + (6144, 6144, 32768, 64, 64, True, False, True): (2, 64, 3, 8), + (6144, 6144, 32768, 128, 128, False, True, True): (4, 256, 3, 8), + (6144, 6144, 32768, 128, 128, True, False, True): (4, 256, 3, 8), + (6144, 6144, 65536, 16, 16, False, True, True): (2, 128, 1, 2), + (6144, 6144, 65536, 16, 16, True, False, True): (4, 256, 4, 4), + (6144, 6144, 65536, 32, 32, False, True, True): (4, 256, 1, 2), + (6144, 6144, 65536, 32, 32, True, False, True): (4, 256, 3, 4), + (6144, 6144, 65536, 64, 64, False, True, True): (4, 512, 3, 4), + (6144, 6144, 65536, 64, 64, True, False, True): (2, 128, 3, 8), + (6144, 6144, 65536, 128, 128, False, True, True): (4, 512, 3, 8), + (6144, 6144, 65536, 128, 128, True, False, True): (4, 512, 3, 8), + (6144, 6144, 131072, 16, 16, False, True, True): (2, 256, 1, 2), + (6144, 6144, 131072, 16, 16, True, False, True): (5, 512, 4, 1), + (6144, 6144, 131072, 32, 32, False, True, True): (4, 512, 1, 2), + (6144, 6144, 131072, 32, 32, True, False, True): (4, 512, 3, 2), + (6144, 6144, 131072, 64, 64, False, True, True): (4, 1024, 3, 4), + (6144, 6144, 131072, 64, 64, True, False, True): (2, 256, 3, 8), + (6144, 6144, 131072, 128, 128, False, True, True): (4, 1024, 3, 8), + (6144, 6144, 131072, 128, 128, True, False, True): (4, 1024, 3, 8), + (8192, 8192, 256, 16, 16, False, True, True): (1, 1, 3, 4), + (8192, 8192, 256, 16, 16, True, False, True): (4, 1, 3, 4), + (8192, 8192, 256, 32, 32, False, True, True): (1, 2, 3, 4), + (8192, 8192, 256, 32, 32, True, False, True): (1, 2, 3, 4), + (8192, 8192, 256, 64, 64, False, True, True): (6, 2, 3, 8), + (8192, 8192, 256, 64, 64, True, False, True): (4, 2, 3, 8), + (8192, 8192, 256, 128, 128, False, True, True): (1, 2, 1, 4), + (8192, 8192, 256, 128, 128, 
True, False, True): (1, 2, 1, 4), + (8192, 8192, 512, 16, 16, False, True, True): (4, 4, 3, 2), + (8192, 8192, 512, 16, 16, True, False, True): (4, 4, 3, 4), + (8192, 8192, 512, 32, 32, False, True, True): (1, 4, 3, 4), + (8192, 8192, 512, 32, 32, True, False, True): (3, 4, 3, 2), + (8192, 8192, 512, 64, 64, False, True, True): (1, 4, 3, 4), + (8192, 8192, 512, 64, 64, True, False, True): (1, 4, 3, 4), + (8192, 8192, 512, 128, 128, False, True, True): (4, 4, 2, 8), + (8192, 8192, 512, 128, 128, True, False, True): (4, 4, 2, 8), + (8192, 8192, 1024, 16, 16, False, True, True): (4, 8, 4, 4), + (8192, 8192, 1024, 16, 16, True, False, True): (2, 8, 4, 4), + (8192, 8192, 1024, 32, 32, False, True, True): (2, 4, 4, 8), + (8192, 8192, 1024, 32, 32, True, False, True): (1, 4, 3, 4), + (8192, 8192, 1024, 64, 64, False, True, True): (4, 8, 3, 4), + (8192, 8192, 1024, 64, 64, True, False, True): (2, 8, 3, 4), + (8192, 8192, 1024, 128, 128, False, True, True): (4, 8, 1, 4), + (8192, 8192, 1024, 128, 128, True, False, True): (4, 8, 1, 4), + (8192, 8192, 2048, 16, 16, False, True, True): (2, 8, 4, 4), + (8192, 8192, 2048, 16, 16, True, False, True): (2, 8, 4, 4), + (8192, 8192, 2048, 32, 32, False, True, True): (2, 8, 4, 8), + (8192, 8192, 2048, 32, 32, True, False, True): (2, 8, 4, 8), + (8192, 8192, 2048, 64, 64, False, True, True): (4, 8, 2, 4), + (8192, 8192, 2048, 64, 64, True, False, True): (4, 16, 3, 4), + (8192, 8192, 2048, 128, 128, False, True, True): (4, 16, 1, 4), + (8192, 8192, 2048, 128, 128, True, False, True): (4, 16, 1, 4), + (8192, 8192, 4096, 16, 16, False, True, True): (4, 16, 4, 4), + (8192, 8192, 4096, 16, 16, True, False, True): (4, 32, 4, 2), + (8192, 8192, 4096, 32, 32, False, True, True): (2, 16, 4, 8), + (8192, 8192, 4096, 32, 32, True, False, True): (2, 16, 4, 8), + (8192, 8192, 4096, 64, 64, False, True, True): (4, 32, 3, 4), + (8192, 8192, 4096, 64, 64, True, False, True): (4, 16, 2, 4), + (8192, 8192, 4096, 128, 128, False, True, True): (4, 32, 1, 4), + (8192, 8192, 4096, 128, 128, True, False, True): (4, 32, 1, 4), + (8192, 8192, 8192, 16, 16, False, True, True): (4, 64, 4, 2), + (8192, 8192, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (8192, 8192, 8192, 32, 32, False, True, True): (2, 32, 4, 8), + (8192, 8192, 8192, 32, 32, True, False, True): (2, 32, 4, 8), + (8192, 8192, 8192, 64, 64, False, True, True): (4, 32, 3, 8), + (8192, 8192, 8192, 64, 64, True, False, True): (4, 32, 2, 4), + (8192, 8192, 8192, 128, 128, False, True, True): (4, 64, 1, 4), + (8192, 8192, 8192, 128, 128, True, False, True): (4, 64, 1, 4), + (8192, 8192, 16384, 16, 16, False, True, True): (4, 64, 4, 4), + (8192, 8192, 16384, 16, 16, True, False, True): (4, 64, 4, 4), + (8192, 8192, 16384, 32, 32, False, True, True): (4, 64, 3, 4), + (8192, 8192, 16384, 32, 32, True, False, True): (4, 64, 4, 8), + (8192, 8192, 16384, 64, 64, False, True, True): (4, 64, 2, 4), + (8192, 8192, 16384, 64, 64, True, False, True): (4, 64, 2, 4), + (8192, 8192, 16384, 128, 128, False, True, True): (4, 128, 1, 4), + (8192, 8192, 16384, 128, 128, True, False, True): (4, 128, 1, 4), + (8192, 8192, 32768, 16, 16, False, True, True): (3, 128, 4, 4), + (8192, 8192, 32768, 16, 16, True, False, True): (3, 128, 4, 4), + (8192, 8192, 32768, 32, 32, False, True, True): (2, 128, 4, 8), + (8192, 8192, 32768, 32, 32, True, False, True): (2, 128, 4, 8), + (8192, 8192, 32768, 64, 64, False, True, True): (2, 128, 2, 4), + (8192, 8192, 32768, 64, 64, True, False, True): (2, 128, 2, 4), + (8192, 8192, 32768, 128, 128, False, True, 
True): (4, 256, 1, 4), + (8192, 8192, 32768, 128, 128, True, False, True): (4, 256, 1, 4), + (8192, 8192, 65536, 16, 16, False, True, True): (3, 256, 4, 4), + (8192, 8192, 65536, 16, 16, True, False, True): (3, 256, 4, 4), + (8192, 8192, 65536, 32, 32, False, True, True): (2, 256, 3, 4), + (8192, 8192, 65536, 32, 32, True, False, True): (2, 256, 3, 4), + (8192, 8192, 65536, 64, 64, False, True, True): (2, 256, 2, 4), + (8192, 8192, 65536, 64, 64, True, False, True): (2, 256, 3, 8), + (8192, 8192, 65536, 128, 128, False, True, True): (4, 512, 1, 4), + (8192, 8192, 65536, 128, 128, True, False, True): (4, 512, 1, 4), + (8192, 8192, 131072, 16, 16, False, True, True): (3, 512, 4, 4), + (8192, 8192, 131072, 16, 16, True, False, True): (3, 512, 4, 4), + (8192, 8192, 131072, 32, 32, False, True, True): (2, 512, 4, 4), + (8192, 8192, 131072, 32, 32, True, False, True): (2, 512, 3, 4), + (8192, 8192, 131072, 64, 64, False, True, True): (4, 512, 2, 4), + (8192, 8192, 131072, 64, 64, True, False, True): (2, 512, 2, 4), + (8192, 8192, 131072, 128, 128, False, True, True): (4, 1024, 1, 4), + (8192, 8192, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), + (16384, 16384, 256, 16, 16, False, True, True): (2, 2, 6, 4), + (16384, 16384, 256, 16, 16, True, False, True): (2, 2, 6, 4), + (16384, 16384, 256, 32, 32, False, True, True): (4, 2, 3, 2), + (16384, 16384, 256, 32, 32, True, False, True): (4, 2, 3, 2), + (16384, 16384, 256, 64, 64, False, True, True): (2, 2, 4, 4), + (16384, 16384, 256, 64, 64, True, False, True): (4, 2, 3, 8), + (16384, 16384, 256, 128, 128, False, True, True): (4, 2, 2, 8), + (16384, 16384, 256, 128, 128, True, False, True): (4, 2, 2, 8), + (16384, 16384, 512, 16, 16, False, True, True): (1, 2, 4, 4), + (16384, 16384, 512, 16, 16, True, False, True): (1, 2, 4, 4), + (16384, 16384, 512, 32, 32, False, True, True): (2, 2, 4, 8), + (16384, 16384, 512, 32, 32, True, False, True): (2, 2, 4, 8), + (16384, 16384, 512, 64, 64, False, True, True): (4, 4, 3, 4), + (16384, 16384, 512, 64, 64, True, False, True): (4, 4, 3, 4), + (16384, 16384, 512, 128, 128, False, True, True): (4, 4, 2, 8), + (16384, 16384, 512, 128, 128, True, False, True): (4, 4, 2, 8), + (16384, 16384, 1024, 16, 16, False, True, True): (3, 4, 4, 4), + (16384, 16384, 1024, 16, 16, True, False, True): (2, 8, 4, 4), + (16384, 16384, 1024, 32, 32, False, True, True): (2, 4, 4, 8), + (16384, 16384, 1024, 32, 32, True, False, True): (1, 4, 4, 8), + (16384, 16384, 1024, 64, 64, False, True, True): (2, 8, 3, 4), + (16384, 16384, 1024, 64, 64, True, False, True): (2, 8, 3, 4), + (16384, 16384, 1024, 128, 128, False, True, True): (4, 8, 1, 4), + (16384, 16384, 1024, 128, 128, True, False, True): (4, 8, 1, 4), + (16384, 16384, 2048, 16, 16, False, True, True): (2, 8, 4, 4), + (16384, 16384, 2048, 16, 16, True, False, True): (2, 8, 4, 4), + (16384, 16384, 2048, 32, 32, False, True, True): (1, 8, 4, 8), + (16384, 16384, 2048, 32, 32, True, False, True): (2, 8, 4, 8), + (16384, 16384, 2048, 64, 64, False, True, True): (2, 8, 2, 4), + (16384, 16384, 2048, 64, 64, True, False, True): (2, 8, 2, 4), + (16384, 16384, 2048, 128, 128, False, True, True): (4, 16, 1, 4), + (16384, 16384, 2048, 128, 128, True, False, True): (4, 16, 1, 4), + (16384, 16384, 4096, 16, 16, False, True, True): (2, 16, 4, 4), + (16384, 16384, 4096, 16, 16, True, False, True): (2, 16, 4, 4), + (16384, 16384, 4096, 32, 32, False, True, True): (1, 8, 3, 8), + (16384, 16384, 4096, 32, 32, True, False, True): (2, 16, 3, 4), + (16384, 16384, 4096, 64, 64, False, True, 
True): (2, 16, 2, 4), + (16384, 16384, 4096, 64, 64, True, False, True): (2, 16, 2, 4), + (16384, 16384, 4096, 128, 128, False, True, True): (4, 32, 1, 4), + (16384, 16384, 4096, 128, 128, True, False, True): (4, 32, 1, 4), + (16384, 16384, 8192, 16, 16, False, True, True): (4, 64, 4, 2), + (16384, 16384, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (16384, 16384, 8192, 32, 32, False, True, True): (2, 32, 4, 8), + (16384, 16384, 8192, 32, 32, True, False, True): (2, 32, 3, 4), + (16384, 16384, 8192, 64, 64, False, True, True): (2, 32, 4, 8), + (16384, 16384, 8192, 64, 64, True, False, True): (2, 32, 3, 8), + (16384, 16384, 8192, 128, 128, False, True, True): (4, 64, 1, 4), + (16384, 16384, 8192, 128, 128, True, False, True): (4, 64, 1, 4), + (16384, 16384, 16384, 16, 16, False, True, True): (1, 64, 4, 4), + (16384, 16384, 16384, 16, 16, True, False, True): (1, 64, 4, 4), + (16384, 16384, 16384, 32, 32, False, True, True): (1, 64, 3, 8), + (16384, 16384, 16384, 32, 32, True, False, True): (1, 64, 3, 4), + (16384, 16384, 16384, 64, 64, False, True, True): (1, 64, 2, 4), + (16384, 16384, 16384, 64, 64, True, False, True): (1, 64, 4, 8), + (16384, 16384, 16384, 128, 128, False, True, True): (4, 128, 1, 4), + (16384, 16384, 16384, 128, 128, True, False, True): (4, 128, 1, 4), + (16384, 16384, 32768, 16, 16, False, True, True): (1, 128, 4, 4), + (16384, 16384, 32768, 16, 16, True, False, True): (1, 128, 4, 4), + (16384, 16384, 32768, 32, 32, False, True, True): (1, 128, 4, 2), + (16384, 16384, 32768, 32, 32, True, False, True): (1, 128, 3, 8), + (16384, 16384, 32768, 64, 64, False, True, True): (2, 128, 2, 4), + (16384, 16384, 32768, 64, 64, True, False, True): (1, 128, 3, 8), + (16384, 16384, 32768, 128, 128, False, True, True): (4, 256, 1, 4), + (16384, 16384, 32768, 128, 128, True, False, True): (4, 256, 1, 4), + (16384, 16384, 65536, 16, 16, False, True, True): (1, 256, 4, 4), + (16384, 16384, 65536, 16, 16, True, False, True): (1, 256, 4, 4), + (16384, 16384, 65536, 32, 32, False, True, True): (1, 256, 3, 4), + (16384, 16384, 65536, 32, 32, True, False, True): (1, 256, 3, 4), + (16384, 16384, 65536, 64, 64, False, True, True): (1, 256, 2, 4), + (16384, 16384, 65536, 64, 64, True, False, True): (2, 256, 2, 4), + (16384, 16384, 65536, 128, 128, False, True, True): (4, 512, 1, 4), + (16384, 16384, 65536, 128, 128, True, False, True): (4, 512, 1, 4), + (16384, 16384, 131072, 16, 16, False, True, True): (2, 512, 4, 4), + (16384, 16384, 131072, 16, 16, True, False, True): (1, 512, 4, 4), + (16384, 16384, 131072, 32, 32, False, True, True): (1, 512, 4, 8), + (16384, 16384, 131072, 32, 32, True, False, True): (1, 512, 3, 4), + (16384, 16384, 131072, 64, 64, False, True, True): (2, 512, 2, 4), + (16384, 16384, 131072, 64, 64, True, False, True): (1, 512, 2, 4), + (16384, 16384, 131072, 128, 128, False, True, True): (4, 1024, 1, 4), + (16384, 16384, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), + }, + ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.bfloat16, 0.56)): { + (192, 192, 256, 64, 64, False, True, True): (3, 4, 3, 4), + (192, 192, 256, 64, 64, True, False, True): (1, 4, 4, 4), + (192, 192, 512, 64, 64, False, True, True): (2, 8, 3, 4), + (192, 192, 512, 64, 64, True, False, True): (2, 8, 3, 4), + (192, 192, 1024, 64, 64, False, True, True): (1, 16, 3, 4), + (192, 192, 1024, 64, 64, True, False, True): (1, 16, 5, 4), + (192, 192, 2048, 64, 64, False, True, True): (3, 32, 3, 4), + (192, 192, 2048, 64, 64, True, False, True): (5, 32, 3, 4), + (192, 192, 4096, 64, 64, 
False, True, True): (1, 64, 4, 4), + (192, 192, 4096, 64, 64, True, False, True): (2, 32, 3, 4), + (192, 192, 8192, 64, 64, False, True, True): (1, 128, 2, 4), + (192, 192, 8192, 64, 64, True, False, True): (1, 64, 3, 4), + (192, 192, 16384, 64, 64, False, True, True): (1, 256, 1, 4), + (192, 192, 16384, 64, 64, True, False, True): (1, 64, 3, 4), + (192, 192, 32768, 64, 64, False, True, True): (2, 512, 1, 2), + (192, 192, 32768, 64, 64, True, False, True): (2, 256, 2, 4), + (192, 192, 65536, 64, 64, False, True, True): (3, 512, 1, 4), + (192, 192, 65536, 64, 64, True, False, True): (1, 512, 2, 4), + (192, 192, 131072, 64, 64, False, True, True): (5, 1024, 1, 4), + (192, 192, 131072, 64, 64, True, False, True): (4, 512, 2, 4), + (384, 384, 256, 128, 128, False, True, True): (3, 2, 3, 8), + (384, 384, 256, 128, 128, True, False, True): (1, 2, 3, 8), + (384, 384, 512, 128, 128, False, True, True): (4, 4, 3, 8), + (384, 384, 512, 128, 128, True, False, True): (3, 4, 3, 8), + (384, 384, 1024, 128, 128, False, True, True): (1, 8, 3, 8), + (384, 384, 1024, 128, 128, True, False, True): (2, 8, 3, 8), + (384, 384, 2048, 128, 128, False, True, True): (5, 16, 3, 8), + (384, 384, 2048, 128, 128, True, False, True): (5, 16, 3, 8), + (384, 384, 4096, 128, 128, False, True, True): (3, 32, 3, 8), + (384, 384, 4096, 128, 128, True, False, True): (6, 32, 3, 8), + (384, 384, 8192, 128, 128, False, True, True): (2, 64, 3, 8), + (384, 384, 8192, 128, 128, True, False, True): (4, 32, 2, 8), + (384, 384, 16384, 128, 128, False, True, True): (2, 128, 3, 8), + (384, 384, 16384, 128, 128, True, False, True): (5, 128, 2, 4), + (384, 384, 32768, 128, 128, False, True, True): (2, 256, 3, 8), + (384, 384, 32768, 128, 128, True, False, True): (3, 256, 2, 4), + (384, 384, 65536, 128, 128, False, True, True): (3, 512, 1, 4), + (384, 384, 65536, 128, 128, True, False, True): (1, 512, 2, 4), + (384, 384, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), + (384, 384, 131072, 128, 128, True, False, True): (1, 1024, 2, 4), + }, + ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.float16, 0.5)): { + (16, 16, 16, 16, 16, False, False, False): (1, 1, 1, 1), + (16, 16, 16, 16, 16, False, False, True): (1, 1, 2, 2), + (16, 16, 16, 16, 16, False, True, False): (1, 1, 1, 1), + (16, 16, 16, 16, 16, False, True, True): (1, 1, 1, 8), + (16, 16, 16, 16, 16, True, False, False): (3, 1, 3, 4), + (16, 16, 16, 16, 16, True, False, True): (1, 1, 2, 1), + (16, 16, 32, 16, 16, False, False, False): (1, 2, 1, 8), + (16, 16, 32, 16, 16, False, False, True): (1, 2, 1, 2), + (16, 16, 32, 16, 16, False, True, False): (2, 1, 1, 4), + (16, 16, 32, 16, 16, False, True, True): (1, 2, 1, 4), + (16, 16, 32, 16, 16, True, False, False): (1, 1, 1, 4), + (16, 16, 32, 16, 16, True, False, True): (1, 2, 1, 2), + (16, 16, 64, 16, 16, False, False, False): (1, 4, 1, 1), + (16, 16, 64, 16, 16, False, False, True): (1, 2, 2, 4), + (16, 16, 64, 16, 16, False, True, False): (1, 4, 1, 4), + (16, 16, 64, 16, 16, False, True, True): (1, 2, 1, 4), + (16, 16, 64, 16, 16, True, False, False): (1, 4, 1, 2), + (16, 16, 64, 16, 16, True, False, True): (1, 1, 1, 2), + (16, 32, 16, 16, 16, False, False, False): (1, 1, 2, 4), + (16, 32, 16, 16, 16, False, False, True): (1, 1, 1, 4), + (16, 32, 16, 16, 16, False, True, False): (1, 1, 1, 2), + (16, 32, 16, 16, 16, False, True, True): (1, 1, 1, 2), + (16, 32, 16, 16, 16, True, False, False): (1, 1, 2, 16), + (16, 32, 16, 16, 16, True, False, True): (1, 1, 1, 4), + (16, 32, 16, 16, 32, False, False, False): (2, 1, 1, 8), 
+ (16, 32, 16, 16, 32, False, False, True): (2, 1, 1, 8), + (16, 32, 16, 16, 32, False, True, False): (1, 1, 2, 1), + (16, 32, 16, 16, 32, False, True, True): (1, 1, 1, 4), + (16, 32, 16, 16, 32, True, False, False): (2, 1, 1, 8), + (16, 32, 16, 16, 32, True, False, True): (1, 1, 2, 4), + (16, 32, 32, 16, 16, False, False, False): (1, 1, 1, 16), + (16, 32, 32, 16, 16, False, False, True): (1, 2, 1, 2), + (16, 32, 32, 16, 16, False, True, False): (1, 2, 1, 8), + (16, 32, 32, 16, 16, False, True, True): (3, 2, 1, 4), + (16, 32, 32, 16, 16, True, False, False): (1, 2, 1, 4), + (16, 32, 32, 16, 16, True, False, True): (1, 2, 1, 2), + (16, 32, 32, 16, 32, False, False, False): (1, 2, 1, 2), + (16, 32, 32, 16, 32, False, False, True): (1, 1, 1, 4), + (16, 32, 32, 16, 32, False, True, False): (1, 1, 2, 4), + (16, 32, 32, 16, 32, False, True, True): (1, 2, 1, 2), + (16, 32, 32, 16, 32, True, False, False): (1, 2, 1, 2), + (16, 32, 32, 16, 32, True, False, True): (1, 2, 1, 16), + (16, 32, 64, 16, 16, False, False, False): (1, 4, 1, 4), + (16, 32, 64, 16, 16, False, False, True): (2, 4, 1, 4), + (16, 32, 64, 16, 16, False, True, False): (1, 4, 1, 4), + (16, 32, 64, 16, 16, False, True, True): (1, 4, 1, 4), + (16, 32, 64, 16, 16, True, False, False): (3, 4, 1, 2), + (16, 32, 64, 16, 16, True, False, True): (1, 4, 1, 1), + (16, 32, 64, 16, 32, False, False, False): (1, 4, 1, 16), + (16, 32, 64, 16, 32, False, False, True): (1, 2, 1, 2), + (16, 32, 64, 16, 32, False, True, False): (1, 4, 2, 2), + (16, 32, 64, 16, 32, False, True, True): (1, 4, 1, 8), + (16, 32, 64, 16, 32, True, False, False): (1, 4, 1, 8), + (16, 32, 64, 16, 32, True, False, True): (1, 2, 1, 4), + (16, 64, 16, 16, 32, False, False, False): (1, 1, 1, 2), + (16, 64, 16, 16, 32, False, False, True): (1, 1, 1, 4), + (16, 64, 16, 16, 32, False, True, False): (2, 1, 2, 4), + (16, 64, 16, 16, 32, False, True, True): (1, 1, 1, 4), + (16, 64, 16, 16, 32, True, False, False): (1, 1, 1, 4), + (16, 64, 16, 16, 32, True, False, True): (1, 1, 1, 4), + (16, 64, 32, 16, 32, False, False, False): (1, 2, 1, 2), + (16, 64, 32, 16, 32, False, False, True): (1, 1, 1, 4), + (16, 64, 32, 16, 32, False, True, False): (1, 1, 1, 4), + (16, 64, 32, 16, 32, False, True, True): (1, 2, 3, 2), + (16, 64, 32, 16, 32, True, False, False): (1, 1, 1, 4), + (16, 64, 32, 16, 32, True, False, True): (1, 1, 2, 4), + (16, 64, 64, 16, 32, False, False, False): (1, 4, 1, 8), + (16, 64, 64, 16, 32, False, False, True): (1, 4, 1, 4), + (16, 64, 64, 16, 32, False, True, False): (1, 4, 1, 1), + (16, 64, 64, 16, 32, False, True, True): (2, 4, 1, 4), + (16, 64, 64, 16, 32, True, False, False): (1, 4, 1, 4), + (16, 64, 64, 16, 32, True, False, True): (1, 4, 1, 4), + (32, 16, 16, 16, 16, False, False, False): (2, 1, 2, 4), + (32, 16, 16, 16, 16, False, False, True): (2, 1, 1, 2), + (32, 16, 16, 16, 16, False, True, False): (1, 1, 2, 4), + (32, 16, 16, 16, 16, False, True, True): (1, 1, 1, 2), + (32, 16, 16, 16, 16, True, False, False): (1, 1, 1, 4), + (32, 16, 16, 16, 16, True, False, True): (2, 1, 1, 2), + (32, 16, 32, 16, 16, False, False, False): (1, 1, 1, 4), + (32, 16, 32, 16, 16, False, False, True): (1, 1, 1, 4), + (32, 16, 32, 16, 16, False, True, False): (1, 2, 1, 4), + (32, 16, 32, 16, 16, False, True, True): (2, 2, 1, 4), + (32, 16, 32, 16, 16, True, False, False): (2, 1, 1, 4), + (32, 16, 32, 16, 16, True, False, True): (2, 2, 1, 2), + (32, 16, 64, 16, 16, False, False, False): (1, 4, 1, 2), + (32, 16, 64, 16, 16, False, False, True): (1, 4, 1, 4), + (32, 16, 64, 16, 16, 
False, True, False): (1, 2, 1, 4), + (32, 16, 64, 16, 16, False, True, True): (1, 4, 1, 2), + (32, 16, 64, 16, 16, True, False, False): (1, 4, 2, 8), + (32, 16, 64, 16, 16, True, False, True): (1, 4, 1, 1), + (32, 32, 16, 16, 16, False, False, False): (1, 1, 1, 4), + (32, 32, 16, 16, 16, False, False, True): (2, 1, 1, 4), + (32, 32, 16, 16, 16, False, True, False): (1, 1, 2, 4), + (32, 32, 16, 16, 16, False, True, True): (1, 1, 2, 2), + (32, 32, 16, 16, 16, True, False, False): (1, 1, 1, 8), + (32, 32, 16, 16, 16, True, False, True): (1, 1, 1, 4), + (32, 32, 16, 16, 32, False, False, False): (1, 1, 3, 2), + (32, 32, 16, 16, 32, False, False, True): (2, 1, 1, 4), + (32, 32, 16, 16, 32, False, True, False): (3, 1, 1, 4), + (32, 32, 16, 16, 32, False, True, True): (1, 1, 1, 4), + (32, 32, 16, 16, 32, True, False, False): (2, 1, 1, 8), + (32, 32, 16, 16, 32, True, False, True): (1, 1, 3, 2), + (32, 32, 16, 32, 32, False, False, False): (1, 1, 1, 2), + (32, 32, 16, 32, 32, False, False, True): (2, 1, 1, 8), + (32, 32, 16, 32, 32, False, True, False): (1, 1, 1, 2), + (32, 32, 16, 32, 32, False, True, True): (1, 1, 1, 8), + (32, 32, 16, 32, 32, True, False, False): (1, 1, 2, 4), + (32, 32, 16, 32, 32, True, False, True): (1, 1, 1, 2), + (32, 32, 32, 16, 16, False, False, False): (1, 1, 1, 4), + (32, 32, 32, 16, 16, False, False, True): (1, 2, 1, 4), + (32, 32, 32, 16, 16, False, True, False): (1, 2, 1, 4), + (32, 32, 32, 16, 16, False, True, True): (1, 2, 1, 2), + (32, 32, 32, 16, 16, True, False, False): (1, 2, 1, 4), + (32, 32, 32, 16, 16, True, False, True): (1, 2, 1, 4), + (32, 32, 32, 16, 32, False, False, False): (1, 2, 1, 4), + (32, 32, 32, 16, 32, False, False, True): (1, 2, 1, 2), + (32, 32, 32, 16, 32, False, True, False): (1, 2, 1, 4), + (32, 32, 32, 16, 32, False, True, True): (1, 2, 1, 2), + (32, 32, 32, 16, 32, True, False, False): (1, 2, 1, 1), + (32, 32, 32, 16, 32, True, False, True): (1, 2, 1, 2), + (32, 32, 32, 32, 32, False, False, False): (1, 1, 1, 4), + (32, 32, 32, 32, 32, False, False, True): (2, 1, 1, 4), + (32, 32, 32, 32, 32, False, True, False): (1, 1, 1, 8), + (32, 32, 32, 32, 32, False, True, True): (1, 1, 1, 8), + (32, 32, 32, 32, 32, True, False, False): (1, 1, 3, 4), + (32, 32, 32, 32, 32, True, False, True): (1, 1, 1, 8), + (32, 32, 64, 16, 16, False, False, False): (1, 4, 1, 4), + (32, 32, 64, 16, 16, False, False, True): (1, 4, 1, 2), + (32, 32, 64, 16, 16, False, True, False): (1, 1, 1, 4), + (32, 32, 64, 16, 16, False, True, True): (1, 4, 1, 4), + (32, 32, 64, 16, 16, True, False, False): (1, 4, 1, 8), + (32, 32, 64, 16, 16, True, False, True): (1, 4, 1, 2), + (32, 32, 64, 16, 32, False, False, False): (1, 1, 1, 4), + (32, 32, 64, 16, 32, False, False, True): (1, 4, 1, 4), + (32, 32, 64, 16, 32, False, True, False): (1, 1, 1, 4), + (32, 32, 64, 16, 32, False, True, True): (1, 4, 1, 4), + (32, 32, 64, 16, 32, True, False, False): (2, 2, 1, 8), + (32, 32, 64, 16, 32, True, False, True): (1, 2, 1, 2), + (32, 32, 64, 32, 32, False, False, False): (1, 2, 1, 4), + (32, 32, 64, 32, 32, False, False, True): (1, 2, 1, 1), + (32, 32, 64, 32, 32, False, True, False): (1, 2, 2, 8), + (32, 32, 64, 32, 32, False, True, True): (1, 1, 1, 4), + (32, 32, 64, 32, 32, True, False, False): (1, 2, 1, 4), + (32, 32, 64, 32, 32, True, False, True): (2, 2, 1, 4), + (32, 64, 16, 16, 32, False, False, False): (1, 1, 1, 8), + (32, 64, 16, 16, 32, False, False, True): (1, 1, 1, 4), + (32, 64, 16, 16, 32, False, True, False): (2, 1, 1, 4), + (32, 64, 16, 16, 32, False, True, True): (1, 1, 
1, 4), + (32, 64, 16, 16, 32, True, False, False): (1, 1, 2, 4), + (32, 64, 16, 16, 32, True, False, True): (1, 1, 2, 2), + (32, 64, 16, 32, 32, False, False, False): (1, 1, 1, 8), + (32, 64, 16, 32, 32, False, False, True): (2, 1, 1, 4), + (32, 64, 16, 32, 32, False, True, False): (1, 1, 1, 4), + (32, 64, 16, 32, 32, False, True, True): (1, 1, 2, 2), + (32, 64, 16, 32, 32, True, False, False): (1, 1, 1, 2), + (32, 64, 16, 32, 32, True, False, True): (2, 1, 2, 4), + (32, 64, 32, 16, 32, False, False, False): (1, 1, 1, 4), + (32, 64, 32, 16, 32, False, False, True): (1, 2, 1, 2), + (32, 64, 32, 16, 32, False, True, False): (1, 2, 3, 4), + (32, 64, 32, 16, 32, False, True, True): (2, 2, 1, 4), + (32, 64, 32, 16, 32, True, False, False): (1, 1, 1, 4), + (32, 64, 32, 16, 32, True, False, True): (1, 2, 2, 1), + (32, 64, 32, 32, 32, False, False, False): (1, 1, 1, 8), + (32, 64, 32, 32, 32, False, False, True): (1, 1, 1, 4), + (32, 64, 32, 32, 32, False, True, False): (1, 1, 2, 4), + (32, 64, 32, 32, 32, False, True, True): (1, 1, 1, 4), + (32, 64, 32, 32, 32, True, False, False): (2, 1, 1, 2), + (32, 64, 32, 32, 32, True, False, True): (1, 1, 1, 4), + (32, 64, 64, 16, 32, False, False, False): (1, 4, 2, 1), + (32, 64, 64, 16, 32, False, False, True): (3, 4, 1, 4), + (32, 64, 64, 16, 32, False, True, False): (1, 1, 1, 8), + (32, 64, 64, 16, 32, False, True, True): (1, 4, 1, 4), + (32, 64, 64, 16, 32, True, False, False): (1, 4, 1, 4), + (32, 64, 64, 16, 32, True, False, True): (2, 2, 3, 4), + (32, 64, 64, 32, 32, False, False, False): (1, 2, 1, 4), + (32, 64, 64, 32, 32, False, False, True): (1, 2, 1, 4), + (32, 64, 64, 32, 32, False, True, False): (1, 2, 2, 8), + (32, 64, 64, 32, 32, False, True, True): (1, 2, 1, 4), + (32, 64, 64, 32, 32, True, False, False): (1, 2, 2, 4), + (32, 64, 64, 32, 32, True, False, True): (1, 2, 1, 4), + (64, 32, 16, 32, 32, False, False, False): (1, 1, 1, 1), + (64, 32, 16, 32, 32, False, False, True): (1, 1, 2, 4), + (64, 32, 16, 32, 32, False, True, False): (2, 1, 1, 8), + (64, 32, 16, 32, 32, False, True, True): (1, 1, 1, 4), + (64, 32, 16, 32, 32, True, False, False): (2, 1, 1, 2), + (64, 32, 16, 32, 32, True, False, True): (1, 1, 1, 4), + (64, 32, 32, 32, 32, False, False, False): (3, 1, 1, 4), + (64, 32, 32, 32, 32, False, False, True): (1, 1, 1, 4), + (64, 32, 32, 32, 32, False, True, False): (1, 1, 1, 8), + (64, 32, 32, 32, 32, False, True, True): (1, 1, 1, 2), + (64, 32, 32, 32, 32, True, False, False): (1, 1, 1, 2), + (64, 32, 32, 32, 32, True, False, True): (1, 1, 1, 4), + (64, 32, 64, 32, 32, False, False, False): (1, 2, 1, 2), + (64, 32, 64, 32, 32, False, False, True): (3, 2, 1, 4), + (64, 32, 64, 32, 32, False, True, False): (1, 1, 1, 1), + (64, 32, 64, 32, 32, False, True, True): (1, 2, 1, 4), + (64, 32, 64, 32, 32, True, False, False): (1, 1, 3, 4), + (64, 32, 64, 32, 32, True, False, True): (1, 2, 2, 4), + (64, 64, 16, 32, 32, False, False, False): (1, 1, 2, 2), + (64, 64, 16, 32, 32, False, False, True): (1, 1, 3, 2), + (64, 64, 16, 32, 32, False, True, False): (1, 1, 1, 8), + (64, 64, 16, 32, 32, False, True, True): (1, 1, 2, 4), + (64, 64, 16, 32, 32, True, False, False): (1, 1, 2, 4), + (64, 64, 16, 32, 32, True, False, True): (2, 1, 2, 4), + (64, 64, 32, 32, 32, False, False, False): (1, 1, 2, 8), + (64, 64, 32, 32, 32, False, False, True): (1, 1, 2, 4), + (64, 64, 32, 32, 32, False, True, False): (1, 1, 1, 4), + (64, 64, 32, 32, 32, False, True, True): (1, 1, 1, 4), + (64, 64, 32, 32, 32, True, False, False): (1, 1, 1, 4), + (64, 64, 32, 32, 
32, True, False, True): (2, 1, 2, 4), + (64, 64, 64, 32, 32, False, False, False): (1, 2, 1, 4), + (64, 64, 64, 32, 32, False, False, True): (1, 2, 1, 4), + (64, 64, 64, 32, 32, False, True, False): (1, 2, 1, 4), + (64, 64, 64, 32, 32, False, True, True): (3, 2, 1, 4), + (64, 64, 64, 32, 32, True, False, False): (1, 2, 1, 8), + (64, 64, 64, 32, 32, True, False, True): (1, 2, 3, 4), + (192, 192, 256, 16, 16, False, True, True): (1, 8, 4, 2), + (192, 192, 256, 16, 16, True, False, True): (1, 4, 4, 4), + (192, 192, 256, 32, 32, False, True, True): (2, 8, 5, 4), + (192, 192, 256, 32, 32, True, False, True): (2, 8, 5, 1), + (192, 192, 512, 16, 16, False, True, True): (3, 8, 4, 4), + (192, 192, 512, 16, 16, True, False, True): (5, 8, 5, 4), + (192, 192, 512, 32, 32, False, True, True): (1, 16, 5, 4), + (192, 192, 512, 32, 32, True, False, True): (1, 8, 6, 2), + (192, 192, 1024, 16, 16, False, True, True): (1, 16, 4, 4), + (192, 192, 1024, 16, 16, True, False, True): (3, 16, 5, 2), + (192, 192, 1024, 32, 32, False, True, True): (3, 16, 4, 4), + (192, 192, 1024, 32, 32, True, False, True): (1, 16, 5, 4), + (192, 192, 2048, 16, 16, False, True, True): (2, 16, 3, 4), + (192, 192, 2048, 16, 16, True, False, True): (1, 16, 4, 4), + (192, 192, 2048, 32, 32, False, True, True): (1, 32, 3, 4), + (192, 192, 2048, 32, 32, True, False, True): (3, 16, 4, 4), + (192, 192, 4096, 16, 16, False, True, True): (1, 64, 1, 4), + (192, 192, 4096, 16, 16, True, False, True): (1, 16, 3, 4), + (192, 192, 4096, 32, 32, False, True, True): (1, 128, 1, 4), + (192, 192, 4096, 32, 32, True, False, True): (2, 32, 4, 2), + (192, 192, 8192, 16, 16, False, True, True): (1, 64, 1, 4), + (192, 192, 8192, 16, 16, True, False, True): (2, 64, 3, 2), + (192, 192, 8192, 32, 32, False, True, True): (1, 128, 1, 4), + (192, 192, 8192, 32, 32, True, False, True): (4, 32, 3, 4), + (192, 192, 16384, 16, 16, False, True, True): (1, 128, 1, 4), + (192, 192, 16384, 16, 16, True, False, True): (1, 64, 3, 2), + (192, 192, 16384, 32, 32, False, True, True): (1, 128, 1, 4), + (192, 192, 16384, 32, 32, True, False, True): (1, 64, 3, 4), + (192, 192, 32768, 16, 16, False, True, True): (2, 256, 1, 2), + (192, 192, 32768, 16, 16, True, False, True): (1, 128, 3, 2), + (192, 192, 32768, 32, 32, False, True, True): (2, 256, 1, 4), + (192, 192, 32768, 32, 32, True, False, True): (1, 256, 3, 2), + (192, 192, 65536, 16, 16, False, True, True): (2, 512, 1, 2), + (192, 192, 65536, 16, 16, True, False, True): (1, 256, 3, 2), + (192, 192, 65536, 32, 32, False, True, True): (2, 512, 1, 4), + (192, 192, 65536, 32, 32, True, False, True): (2, 256, 3, 4), + (192, 192, 131072, 16, 16, False, True, True): (4, 1024, 1, 2), + (192, 192, 131072, 16, 16, True, False, True): (3, 512, 3, 2), + (192, 192, 131072, 32, 32, False, True, True): (1, 1024, 1, 4), + (192, 192, 131072, 32, 32, True, False, True): (3, 512, 3, 4), + (256, 256, 256, 16, 16, False, True, True): (4, 8, 6, 2), + (256, 256, 256, 16, 16, True, False, True): (5, 16, 5, 1), + (256, 256, 256, 32, 32, False, True, True): (1, 8, 7, 4), + (256, 256, 256, 32, 32, True, False, True): (1, 8, 5, 4), + (256, 256, 256, 64, 64, False, True, True): (1, 4, 5, 4), + (256, 256, 256, 64, 64, True, False, True): (2, 4, 3, 4), + (256, 256, 256, 128, 128, False, True, True): (1, 2, 2, 8), + (256, 256, 256, 128, 128, True, False, True): (1, 2, 2, 8), + (256, 256, 512, 16, 16, False, True, True): (4, 8, 4, 4), + (256, 256, 512, 16, 16, True, False, True): (4, 8, 6, 2), + (256, 256, 512, 32, 32, False, True, True): (3, 8, 5, 4), + 
(256, 256, 512, 32, 32, True, False, True): (2, 8, 5, 4), + (256, 256, 512, 64, 64, False, True, True): (2, 8, 4, 4), + (256, 256, 512, 64, 64, True, False, True): (1, 8, 7, 4), + (256, 256, 512, 128, 128, False, True, True): (2, 4, 2, 8), + (256, 256, 512, 128, 128, True, False, True): (5, 4, 2, 8), + (256, 256, 1024, 16, 16, False, True, True): (1, 8, 4, 4), + (256, 256, 1024, 16, 16, True, False, True): (1, 16, 4, 2), + (256, 256, 1024, 32, 32, False, True, True): (5, 32, 5, 1), + (256, 256, 1024, 32, 32, True, False, True): (1, 16, 4, 2), + (256, 256, 1024, 64, 64, False, True, True): (1, 16, 4, 4), + (256, 256, 1024, 64, 64, True, False, True): (2, 16, 3, 4), + (256, 256, 1024, 128, 128, False, True, True): (9, 8, 2, 8), + (256, 256, 1024, 128, 128, True, False, True): (1, 8, 2, 8), + (256, 256, 2048, 16, 16, False, True, True): (6, 32, 5, 2), + (256, 256, 2048, 16, 16, True, False, True): (2, 32, 4, 2), + (256, 256, 2048, 32, 32, False, True, True): (1, 32, 3, 2), + (256, 256, 2048, 32, 32, True, False, True): (1, 32, 3, 2), + (256, 256, 2048, 64, 64, False, True, True): (2, 32, 4, 4), + (256, 256, 2048, 64, 64, True, False, True): (2, 16, 4, 4), + (256, 256, 2048, 128, 128, False, True, True): (3, 16, 2, 8), + (256, 256, 2048, 128, 128, True, False, True): (4, 16, 2, 8), + (256, 256, 4096, 16, 16, False, True, True): (1, 32, 3, 4), + (256, 256, 4096, 16, 16, True, False, True): (3, 16, 3, 2), + (256, 256, 4096, 32, 32, False, True, True): (3, 32, 3, 2), + (256, 256, 4096, 32, 32, True, False, True): (1, 32, 3, 2), + (256, 256, 4096, 64, 64, False, True, True): (2, 32, 3, 4), + (256, 256, 4096, 64, 64, True, False, True): (2, 32, 3, 4), + (256, 256, 4096, 128, 128, False, True, True): (5, 32, 2, 8), + (256, 256, 4096, 128, 128, True, False, True): (1, 32, 2, 8), + (256, 256, 8192, 16, 16, False, True, True): (8, 32, 3, 4), + (256, 256, 8192, 16, 16, True, False, True): (1, 32, 3, 2), + (256, 256, 8192, 32, 32, False, True, True): (3, 64, 3, 4), + (256, 256, 8192, 32, 32, True, False, True): (2, 128, 1, 2), + (256, 256, 8192, 64, 64, False, True, True): (7, 128, 1, 4), + (256, 256, 8192, 64, 64, True, False, True): (4, 128, 1, 4), + (256, 256, 8192, 128, 128, False, True, True): (2, 64, 1, 4), + (256, 256, 8192, 128, 128, True, False, True): (4, 64, 1, 4), + (256, 256, 16384, 16, 16, False, True, True): (4, 128, 3, 2), + (256, 256, 16384, 16, 16, True, False, True): (5, 64, 3, 2), + (256, 256, 16384, 32, 32, False, True, True): (5, 128, 3, 2), + (256, 256, 16384, 32, 32, True, False, True): (5, 128, 3, 2), + (256, 256, 16384, 64, 64, False, True, True): (1, 256, 1, 4), + (256, 256, 16384, 64, 64, True, False, True): (5, 128, 3, 4), + (256, 256, 16384, 128, 128, False, True, True): (11, 128, 2, 8), + (256, 256, 16384, 128, 128, True, False, True): (3, 128, 1, 4), + (256, 256, 32768, 16, 16, False, True, True): (1, 128, 3, 4), + (256, 256, 32768, 16, 16, True, False, True): (2, 128, 3, 2), + (256, 256, 32768, 32, 32, False, True, True): (4, 256, 3, 2), + (256, 256, 32768, 32, 32, True, False, True): (1, 256, 3, 2), + (256, 256, 32768, 64, 64, False, True, True): (2, 256, 1, 4), + (256, 256, 32768, 64, 64, True, False, True): (2, 256, 1, 4), + (256, 256, 32768, 128, 128, False, True, True): (3, 256, 1, 4), + (256, 256, 32768, 128, 128, True, False, True): (2, 256, 1, 4), + (256, 256, 50432, 16, 16, False, True, True): (4, 197, 1, 4), + (256, 256, 50432, 16, 16, True, False, True): (4, 197, 3, 2), + (256, 256, 50432, 32, 32, False, True, True): (1, 394, 1, 2), + (256, 256, 50432, 32, 32, 
True, False, True): (4, 197, 3, 4), + (256, 256, 50432, 64, 64, False, True, True): (6, 394, 1, 4), + (256, 256, 50432, 64, 64, True, False, True): (4, 394, 2, 4), + (256, 256, 50432, 128, 128, False, True, True): (3, 394, 1, 4), + (256, 256, 50432, 128, 128, True, False, True): (1, 394, 2, 4), + (256, 256, 65536, 16, 16, False, True, True): (1, 256, 3, 2), + (256, 256, 65536, 16, 16, True, False, True): (1, 256, 3, 2), + (256, 256, 65536, 32, 32, False, True, True): (1, 512, 3, 2), + (256, 256, 65536, 32, 32, True, False, True): (4, 512, 3, 2), + (256, 256, 65536, 64, 64, False, True, True): (2, 512, 1, 4), + (256, 256, 65536, 64, 64, True, False, True): (5, 512, 1, 4), + (256, 256, 65536, 128, 128, False, True, True): (3, 512, 1, 4), + (256, 256, 65536, 128, 128, True, False, True): (1, 512, 1, 4), + (256, 256, 65792, 16, 16, False, True, True): (2, 257, 1, 4), + (256, 256, 65792, 16, 16, True, False, True): (1, 257, 3, 2), + (256, 256, 65792, 32, 32, False, True, True): (2, 257, 1, 4), + (256, 256, 65792, 32, 32, True, False, True): (1, 257, 3, 4), + (256, 256, 65792, 64, 64, False, True, True): (2, 514, 1, 4), + (256, 256, 65792, 64, 64, True, False, True): (2, 514, 2, 4), + (256, 256, 65792, 128, 128, False, True, True): (3, 514, 1, 4), + (256, 256, 65792, 128, 128, True, False, True): (1, 514, 2, 4), + (256, 256, 131072, 16, 16, False, True, True): (1, 512, 3, 1), + (256, 256, 131072, 16, 16, True, False, True): (1, 512, 3, 2), + (256, 256, 131072, 32, 32, False, True, True): (2, 1024, 3, 2), + (256, 256, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), + (256, 256, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), + (256, 256, 131072, 64, 64, True, False, True): (1, 1024, 1, 4), + (256, 256, 131072, 128, 128, False, True, True): (7, 1024, 1, 4), + (256, 256, 131072, 128, 128, True, False, True): (1, 1024, 1, 4), + (384, 384, 256, 16, 16, False, True, True): (3, 16, 4, 1), + (384, 384, 256, 16, 16, True, False, True): (2, 4, 6, 2), + (384, 384, 256, 32, 32, False, True, True): (1, 8, 4, 4), + (384, 384, 256, 32, 32, True, False, True): (1, 4, 5, 2), + (384, 384, 256, 64, 64, False, True, True): (3, 4, 3, 4), + (384, 384, 256, 64, 64, True, False, True): (4, 4, 5, 4), + (384, 384, 512, 16, 16, False, True, True): (1, 16, 4, 1), + (384, 384, 512, 16, 16, True, False, True): (1, 8, 5, 2), + (384, 384, 512, 32, 32, False, True, True): (4, 16, 4, 2), + (384, 384, 512, 32, 32, True, False, True): (1, 8, 5, 2), + (384, 384, 512, 64, 64, False, True, True): (2, 8, 3, 4), + (384, 384, 512, 64, 64, True, False, True): (1, 8, 4, 4), + (384, 384, 1024, 16, 16, False, True, True): (1, 16, 4, 2), + (384, 384, 1024, 16, 16, True, False, True): (7, 8, 5, 2), + (384, 384, 1024, 32, 32, False, True, True): (2, 16, 3, 4), + (384, 384, 1024, 32, 32, True, False, True): (1, 16, 4, 2), + (384, 384, 1024, 64, 64, False, True, True): (6, 16, 3, 4), + (384, 384, 1024, 64, 64, True, False, True): (4, 16, 4, 4), + (384, 384, 2048, 16, 16, False, True, True): (1, 32, 1, 4), + (384, 384, 2048, 16, 16, True, False, True): (1, 16, 3, 2), + (384, 384, 2048, 32, 32, False, True, True): (1, 32, 1, 8), + (384, 384, 2048, 32, 32, True, False, True): (1, 8, 4, 4), + (384, 384, 2048, 64, 64, False, True, True): (2, 32, 1, 8), + (384, 384, 2048, 64, 64, True, False, True): (3, 16, 3, 4), + (384, 384, 4096, 16, 16, False, True, True): (5, 32, 1, 4), + (384, 384, 4096, 16, 16, True, False, True): (1, 32, 3, 2), + (384, 384, 4096, 32, 32, False, True, True): (1, 32, 1, 8), + (384, 384, 4096, 32, 32, True, False, 
True): (2, 16, 4, 4), + (384, 384, 4096, 64, 64, False, True, True): (1, 64, 1, 4), + (384, 384, 4096, 64, 64, True, False, True): (2, 32, 3, 4), + (384, 384, 8192, 16, 16, False, True, True): (2, 64, 1, 4), + (384, 384, 8192, 16, 16, True, False, True): (3, 32, 3, 2), + (384, 384, 8192, 32, 32, False, True, True): (4, 128, 1, 4), + (384, 384, 8192, 32, 32, True, False, True): (1, 32, 3, 2), + (384, 384, 8192, 64, 64, False, True, True): (1, 128, 1, 4), + (384, 384, 8192, 64, 64, True, False, True): (1, 64, 3, 4), + (384, 384, 16384, 16, 16, False, True, True): (1, 128, 1, 2), + (384, 384, 16384, 16, 16, True, False, True): (1, 64, 3, 2), + (384, 384, 16384, 32, 32, False, True, True): (1, 128, 1, 4), + (384, 384, 16384, 32, 32, True, False, True): (1, 64, 3, 4), + (384, 384, 16384, 64, 64, False, True, True): (5, 128, 3, 4), + (384, 384, 16384, 64, 64, True, False, True): (1, 128, 3, 4), + (384, 384, 32768, 16, 16, False, True, True): (2, 256, 1, 2), + (384, 384, 32768, 16, 16, True, False, True): (1, 128, 3, 4), + (384, 384, 32768, 32, 32, False, True, True): (1, 256, 1, 2), + (384, 384, 32768, 32, 32, True, False, True): (2, 128, 3, 4), + (384, 384, 32768, 64, 64, False, True, True): (3, 256, 1, 4), + (384, 384, 32768, 64, 64, True, False, True): (2, 256, 3, 4), + (384, 384, 65536, 16, 16, False, True, True): (2, 128, 1, 4), + (384, 384, 65536, 16, 16, True, False, True): (1, 256, 3, 4), + (384, 384, 65536, 32, 32, False, True, True): (1, 512, 1, 2), + (384, 384, 65536, 32, 32, True, False, True): (1, 256, 3, 4), + (384, 384, 65536, 64, 64, False, True, True): (3, 512, 1, 4), + (384, 384, 65536, 64, 64, True, False, True): (3, 256, 3, 4), + (384, 384, 131072, 16, 16, False, True, True): (2, 256, 1, 2), + (384, 384, 131072, 16, 16, True, False, True): (1, 512, 3, 4), + (384, 384, 131072, 32, 32, False, True, True): (1, 512, 1, 2), + (384, 384, 131072, 32, 32, True, False, True): (1, 512, 3, 4), + (384, 384, 131072, 64, 64, False, True, True): (3, 1024, 1, 4), + (384, 384, 131072, 64, 64, True, False, True): (3, 512, 3, 4), + (512, 512, 256, 16, 16, False, True, True): (1, 8, 5, 1), + (512, 512, 256, 16, 16, True, False, True): (2, 16, 5, 1), + (512, 512, 256, 32, 32, False, True, True): (2, 8, 5, 2), + (512, 512, 256, 32, 32, True, False, True): (4, 4, 5, 2), + (512, 512, 256, 64, 64, False, True, True): (1, 4, 5, 4), + (512, 512, 256, 64, 64, True, False, True): (3, 4, 5, 4), + (512, 512, 256, 128, 128, False, True, True): (1, 2, 2, 8), + (512, 512, 256, 128, 128, True, False, True): (1, 2, 2, 8), + (512, 512, 512, 16, 16, False, True, True): (1, 8, 4, 4), + (512, 512, 512, 16, 16, True, False, True): (4, 16, 5, 1), + (512, 512, 512, 32, 32, False, True, True): (4, 8, 5, 2), + (512, 512, 512, 32, 32, True, False, True): (7, 16, 4, 1), + (512, 512, 512, 64, 64, False, True, True): (3, 8, 5, 4), + (512, 512, 512, 64, 64, True, False, True): (1, 8, 4, 4), + (512, 512, 512, 128, 128, False, True, True): (4, 4, 2, 8), + (512, 512, 512, 128, 128, True, False, True): (4, 4, 2, 8), + (512, 512, 1024, 16, 16, False, True, True): (2, 8, 4, 4), + (512, 512, 1024, 16, 16, True, False, True): (2, 16, 4, 2), + (512, 512, 1024, 32, 32, False, True, True): (3, 16, 4, 2), + (512, 512, 1024, 32, 32, True, False, True): (3, 16, 3, 2), + (512, 512, 1024, 64, 64, False, True, True): (5, 8, 5, 4), + (512, 512, 1024, 64, 64, True, False, True): (4, 16, 3, 4), + (512, 512, 1024, 128, 128, False, True, True): (6, 8, 2, 8), + (512, 512, 1024, 128, 128, True, False, True): (4, 8, 2, 8), + (512, 512, 2048, 16, 16, 
False, True, True): (2, 16, 3, 4), + (512, 512, 2048, 16, 16, True, False, True): (1, 16, 4, 2), + (512, 512, 2048, 32, 32, False, True, True): (2, 32, 3, 2), + (512, 512, 2048, 32, 32, True, False, True): (2, 32, 3, 2), + (512, 512, 2048, 64, 64, False, True, True): (1, 32, 3, 4), + (512, 512, 2048, 64, 64, True, False, True): (1, 32, 3, 2), + (512, 512, 2048, 128, 128, False, True, True): (3, 16, 2, 8), + (512, 512, 2048, 128, 128, True, False, True): (1, 16, 2, 8), + (512, 512, 4096, 16, 16, False, True, True): (4, 32, 3, 2), + (512, 512, 4096, 16, 16, True, False, True): (1, 32, 3, 2), + (512, 512, 4096, 32, 32, False, True, True): (3, 32, 3, 2), + (512, 512, 4096, 32, 32, True, False, True): (3, 32, 3, 2), + (512, 512, 4096, 64, 64, False, True, True): (1, 32, 3, 4), + (512, 512, 4096, 64, 64, True, False, True): (1, 64, 1, 4), + (512, 512, 4096, 128, 128, False, True, True): (7, 32, 2, 8), + (512, 512, 4096, 128, 128, True, False, True): (1, 32, 2, 8), + (512, 512, 8192, 16, 16, False, True, True): (4, 64, 3, 2), + (512, 512, 8192, 16, 16, True, False, True): (1, 64, 3, 2), + (512, 512, 8192, 32, 32, False, True, True): (3, 64, 3, 2), + (512, 512, 8192, 32, 32, True, False, True): (1, 64, 3, 2), + (512, 512, 8192, 64, 64, False, True, True): (1, 64, 3, 4), + (512, 512, 8192, 64, 64, True, False, True): (1, 64, 3, 4), + (512, 512, 8192, 128, 128, False, True, True): (7, 64, 2, 8), + (512, 512, 8192, 128, 128, True, False, True): (1, 64, 1, 4), + (512, 512, 16384, 16, 16, False, True, True): (1, 128, 3, 2), + (512, 512, 16384, 16, 16, True, False, True): (1, 64, 3, 2), + (512, 512, 16384, 32, 32, False, True, True): (1, 128, 3, 2), + (512, 512, 16384, 32, 32, True, False, True): (1, 128, 3, 2), + (512, 512, 16384, 64, 64, False, True, True): (1, 128, 3, 4), + (512, 512, 16384, 64, 64, True, False, True): (4, 128, 3, 4), + (512, 512, 16384, 128, 128, False, True, True): (5, 128, 2, 8), + (512, 512, 16384, 128, 128, True, False, True): (2, 128, 1, 4), + (512, 512, 32768, 16, 16, False, True, True): (1, 128, 3, 4), + (512, 512, 32768, 16, 16, True, False, True): (1, 128, 3, 2), + (512, 512, 32768, 32, 32, False, True, True): (1, 256, 3, 2), + (512, 512, 32768, 32, 32, True, False, True): (1, 256, 3, 2), + (512, 512, 32768, 64, 64, False, True, True): (1, 256, 3, 4), + (512, 512, 32768, 64, 64, True, False, True): (1, 256, 3, 4), + (512, 512, 32768, 128, 128, False, True, True): (5, 256, 1, 4), + (512, 512, 32768, 128, 128, True, False, True): (1, 256, 1, 4), + (512, 512, 50432, 16, 16, False, True, True): (4, 197, 1, 4), + (512, 512, 50432, 16, 16, True, False, True): (4, 197, 3, 2), + (512, 512, 50432, 32, 32, False, True, True): (2, 197, 1, 4), + (512, 512, 50432, 32, 32, True, False, True): (4, 197, 3, 4), + (512, 512, 50432, 64, 64, False, True, True): (2, 394, 1, 4), + (512, 512, 50432, 64, 64, True, False, True): (4, 197, 2, 4), + (512, 512, 50432, 128, 128, False, True, True): (5, 394, 1, 4), + (512, 512, 50432, 128, 128, True, False, True): (6, 394, 2, 4), + (512, 512, 65536, 16, 16, False, True, True): (1, 256, 3, 2), + (512, 512, 65536, 16, 16, True, False, True): (1, 256, 3, 1), + (512, 512, 65536, 32, 32, False, True, True): (1, 512, 3, 2), + (512, 512, 65536, 32, 32, True, False, True): (1, 512, 3, 2), + (512, 512, 65536, 64, 64, False, True, True): (2, 256, 2, 4), + (512, 512, 65536, 64, 64, True, False, True): (1, 512, 3, 4), + (512, 512, 65536, 128, 128, False, True, True): (7, 512, 1, 4), + (512, 512, 65536, 128, 128, True, False, True): (5, 512, 1, 4), + (512, 512, 
65792, 16, 16, False, True, True): (2, 257, 1, 4), + (512, 512, 65792, 16, 16, True, False, True): (1, 257, 3, 4), + (512, 512, 65792, 32, 32, False, True, True): (2, 257, 1, 4), + (512, 512, 65792, 32, 32, True, False, True): (1, 257, 3, 4), + (512, 512, 65792, 64, 64, False, True, True): (4, 514, 1, 4), + (512, 512, 65792, 64, 64, True, False, True): (4, 257, 2, 4), + (512, 512, 65792, 128, 128, False, True, True): (5, 514, 1, 4), + (512, 512, 65792, 128, 128, True, False, True): (4, 514, 2, 4), + (512, 512, 131072, 16, 16, False, True, True): (1, 512, 3, 1), + (512, 512, 131072, 16, 16, True, False, True): (1, 512, 3, 1), + (512, 512, 131072, 32, 32, False, True, True): (1, 1024, 3, 2), + (512, 512, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), + (512, 512, 131072, 64, 64, False, True, True): (4, 512, 2, 4), + (512, 512, 131072, 64, 64, True, False, True): (2, 512, 2, 4), + (512, 512, 131072, 128, 128, False, True, True): (5, 1024, 1, 4), + (512, 512, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), + (768, 768, 256, 16, 16, False, True, True): (1, 8, 4, 1), + (768, 768, 256, 16, 16, True, False, True): (3, 2, 5, 2), + (768, 768, 256, 32, 32, False, True, True): (1, 8, 4, 2), + (768, 768, 256, 32, 32, True, False, True): (2, 4, 6, 2), + (768, 768, 256, 64, 64, False, True, True): (3, 4, 3, 4), + (768, 768, 256, 64, 64, True, False, True): (2, 4, 4, 4), + (768, 768, 256, 128, 128, False, True, True): (1, 2, 3, 8), + (768, 768, 256, 128, 128, True, False, True): (2, 2, 3, 8), + (768, 768, 512, 16, 16, False, True, True): (1, 8, 4, 2), + (768, 768, 512, 16, 16, True, False, True): (2, 8, 5, 2), + (768, 768, 512, 32, 32, False, True, True): (1, 16, 1, 4), + (768, 768, 512, 32, 32, True, False, True): (3, 8, 5, 2), + (768, 768, 512, 64, 64, False, True, True): (4, 8, 3, 4), + (768, 768, 512, 64, 64, True, False, True): (2, 8, 4, 4), + (768, 768, 512, 128, 128, False, True, True): (1, 4, 3, 8), + (768, 768, 512, 128, 128, True, False, True): (3, 4, 3, 8), + (768, 768, 1024, 16, 16, False, True, True): (1, 16, 1, 4), + (768, 768, 1024, 16, 16, True, False, True): (1, 8, 5, 2), + (768, 768, 1024, 32, 32, False, True, True): (1, 16, 1, 8), + (768, 768, 1024, 32, 32, True, False, True): (1, 4, 4, 4), + (768, 768, 1024, 64, 64, False, True, True): (2, 16, 1, 8), + (768, 768, 1024, 64, 64, True, False, True): (1, 8, 3, 8), + (768, 768, 1024, 128, 128, False, True, True): (1, 8, 3, 8), + (768, 768, 1024, 128, 128, True, False, True): (3, 8, 3, 8), + (768, 768, 2048, 16, 16, False, True, True): (6, 16, 1, 2), + (768, 768, 2048, 16, 16, True, False, True): (2, 16, 4, 2), + (768, 768, 2048, 32, 32, False, True, True): (3, 32, 1, 4), + (768, 768, 2048, 32, 32, True, False, True): (6, 8, 3, 4), + (768, 768, 2048, 64, 64, False, True, True): (2, 32, 2, 2), + (768, 768, 2048, 64, 64, True, False, True): (1, 16, 4, 4), + (768, 768, 2048, 128, 128, False, True, True): (2, 16, 3, 8), + (768, 768, 2048, 128, 128, True, False, True): (4, 16, 3, 8), + (768, 768, 4096, 16, 16, False, True, True): (1, 32, 1, 4), + (768, 768, 4096, 16, 16, True, False, True): (2, 16, 3, 2), + (768, 768, 4096, 32, 32, False, True, True): (3, 32, 1, 8), + (768, 768, 4096, 32, 32, True, False, True): (1, 16, 4, 4), + (768, 768, 4096, 64, 64, False, True, True): (1, 64, 2, 4), + (768, 768, 4096, 64, 64, True, False, True): (1, 8, 3, 8), + (768, 768, 4096, 128, 128, False, True, True): (1, 32, 3, 8), + (768, 768, 4096, 128, 128, True, False, True): (2, 32, 3, 8), + (768, 768, 8192, 16, 16, False, True, True): (1, 64, 1, 2), 
+ (768, 768, 8192, 16, 16, True, False, True): (2, 64, 3, 2), + (768, 768, 8192, 32, 32, False, True, True): (2, 64, 1, 8), + (768, 768, 8192, 32, 32, True, False, True): (2, 32, 3, 4), + (768, 768, 8192, 64, 64, False, True, True): (4, 64, 3, 4), + (768, 768, 8192, 64, 64, True, False, True): (1, 64, 3, 4), + (768, 768, 8192, 128, 128, False, True, True): (4, 64, 3, 8), + (768, 768, 8192, 128, 128, True, False, True): (2, 64, 3, 8), + (768, 768, 16384, 16, 16, False, True, True): (4, 128, 1, 2), + (768, 768, 16384, 16, 16, True, False, True): (1, 64, 3, 4), + (768, 768, 16384, 32, 32, False, True, True): (1, 128, 1, 8), + (768, 768, 16384, 32, 32, True, False, True): (1, 64, 3, 4), + (768, 768, 16384, 64, 64, False, True, True): (1, 128, 3, 4), + (768, 768, 16384, 64, 64, True, False, True): (1, 128, 3, 4), + (768, 768, 16384, 128, 128, False, True, True): (3, 128, 1, 4), + (768, 768, 16384, 128, 128, True, False, True): (1, 128, 2, 4), + (768, 768, 32768, 16, 16, False, True, True): (2, 256, 1, 2), + (768, 768, 32768, 16, 16, True, False, True): (1, 128, 4, 4), + (768, 768, 32768, 32, 32, False, True, True): (1, 128, 1, 2), + (768, 768, 32768, 32, 32, True, False, True): (1, 128, 3, 4), + (768, 768, 32768, 64, 64, False, True, True): (1, 256, 1, 4), + (768, 768, 32768, 64, 64, True, False, True): (1, 128, 3, 4), + (768, 768, 32768, 128, 128, False, True, True): (3, 256, 1, 4), + (768, 768, 32768, 128, 128, True, False, True): (3, 256, 2, 4), + (768, 768, 65536, 16, 16, False, True, True): (4, 512, 1, 2), + (768, 768, 65536, 16, 16, True, False, True): (1, 256, 4, 4), + (768, 768, 65536, 32, 32, False, True, True): (1, 256, 1, 2), + (768, 768, 65536, 32, 32, True, False, True): (1, 256, 3, 4), + (768, 768, 65536, 64, 64, False, True, True): (1, 512, 1, 4), + (768, 768, 65536, 64, 64, True, False, True): (1, 256, 3, 4), + (768, 768, 65536, 128, 128, False, True, True): (3, 512, 1, 4), + (768, 768, 65536, 128, 128, True, False, True): (2, 512, 2, 4), + (768, 768, 131072, 16, 16, False, True, True): (1, 512, 1, 1), + (768, 768, 131072, 16, 16, True, False, True): (1, 512, 4, 4), + (768, 768, 131072, 32, 32, False, True, True): (1, 512, 1, 2), + (768, 768, 131072, 32, 32, True, False, True): (1, 512, 3, 4), + (768, 768, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), + (768, 768, 131072, 64, 64, True, False, True): (3, 512, 3, 4), + (768, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), + (768, 768, 131072, 128, 128, True, False, True): (1, 1024, 2, 4), + (768, 3072, 256, 16, 16, False, True, True): (1, 8, 5, 2), + (768, 3072, 256, 16, 16, True, False, True): (3, 4, 7, 2), + (768, 3072, 256, 32, 32, False, True, True): (1, 8, 4, 2), + (768, 3072, 256, 32, 32, True, False, True): (1, 4, 5, 4), + (768, 3072, 256, 64, 64, False, True, True): (1, 4, 3, 4), + (768, 3072, 256, 64, 64, True, False, True): (1, 4, 5, 4), + (768, 3072, 256, 128, 128, False, True, True): (2, 2, 3, 8), + (768, 3072, 256, 128, 128, True, False, True): (2, 2, 3, 8), + (768, 3072, 512, 16, 16, False, True, True): (1, 8, 5, 2), + (768, 3072, 512, 16, 16, True, False, True): (1, 8, 5, 2), + (768, 3072, 512, 32, 32, False, True, True): (3, 8, 3, 4), + (768, 3072, 512, 32, 32, True, False, True): (1, 8, 7, 4), + (768, 3072, 512, 64, 64, False, True, True): (3, 8, 3, 4), + (768, 3072, 512, 64, 64, True, False, True): (3, 8, 5, 4), + (768, 3072, 512, 128, 128, False, True, True): (1, 4, 3, 8), + (768, 3072, 512, 128, 128, True, False, True): (1, 4, 3, 8), + (768, 3072, 1024, 16, 16, False, True, True): (4, 16, 1, 
4), + (768, 3072, 1024, 16, 16, True, False, True): (2, 8, 5, 2), + (768, 3072, 1024, 32, 32, False, True, True): (1, 16, 6, 2), + (768, 3072, 1024, 32, 32, True, False, True): (1, 8, 4, 4), + (768, 3072, 1024, 64, 64, False, True, True): (2, 16, 4, 4), + (768, 3072, 1024, 64, 64, True, False, True): (2, 16, 4, 4), + (768, 3072, 1024, 128, 128, False, True, True): (1, 8, 3, 8), + (768, 3072, 1024, 128, 128, True, False, True): (3, 8, 3, 8), + (768, 3072, 2048, 16, 16, False, True, True): (1, 16, 1, 2), + (768, 3072, 2048, 16, 16, True, False, True): (1, 16, 5, 2), + (768, 3072, 2048, 32, 32, False, True, True): (4, 16, 1, 8), + (768, 3072, 2048, 32, 32, True, False, True): (2, 8, 3, 4), + (768, 3072, 2048, 64, 64, False, True, True): (2, 16, 3, 4), + (768, 3072, 2048, 64, 64, True, False, True): (2, 16, 3, 4), + (768, 3072, 2048, 128, 128, False, True, True): (3, 16, 3, 8), + (768, 3072, 2048, 128, 128, True, False, True): (1, 16, 3, 8), + (768, 3072, 4096, 16, 16, False, True, True): (1, 32, 1, 4), + (768, 3072, 4096, 16, 16, True, False, True): (1, 16, 3, 1), + (768, 3072, 4096, 32, 32, False, True, True): (3, 32, 1, 8), + (768, 3072, 4096, 32, 32, True, False, True): (2, 16, 3, 8), + (768, 3072, 4096, 64, 64, False, True, True): (2, 32, 3, 4), + (768, 3072, 4096, 64, 64, True, False, True): (2, 16, 3, 4), + (768, 3072, 4096, 128, 128, False, True, True): (5, 32, 1, 4), + (768, 3072, 4096, 128, 128, True, False, True): (4, 32, 3, 8), + (768, 3072, 8192, 16, 16, False, True, True): (1, 32, 1, 4), + (768, 3072, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (768, 3072, 8192, 32, 32, False, True, True): (1, 64, 1, 8), + (768, 3072, 8192, 32, 32, True, False, True): (2, 32, 3, 8), + (768, 3072, 8192, 64, 64, False, True, True): (2, 64, 3, 4), + (768, 3072, 8192, 64, 64, True, False, True): (2, 32, 3, 4), + (768, 3072, 8192, 128, 128, False, True, True): (1, 64, 3, 8), + (768, 3072, 8192, 128, 128, True, False, True): (2, 64, 3, 8), + (768, 3072, 16384, 16, 16, False, True, True): (1, 64, 1, 4), + (768, 3072, 16384, 16, 16, True, False, True): (1, 64, 4, 1), + (768, 3072, 16384, 32, 32, False, True, True): (1, 128, 1, 8), + (768, 3072, 16384, 32, 32, True, False, True): (1, 64, 3, 4), + (768, 3072, 16384, 64, 64, False, True, True): (1, 128, 3, 4), + (768, 3072, 16384, 64, 64, True, False, True): (1, 64, 3, 4), + (768, 3072, 16384, 128, 128, False, True, True): (2, 128, 3, 8), + (768, 3072, 16384, 128, 128, True, False, True): (1, 128, 3, 8), + (768, 3072, 32768, 16, 16, False, True, True): (1, 128, 1, 4), + (768, 3072, 32768, 16, 16, True, False, True): (1, 128, 4, 1), + (768, 3072, 32768, 32, 32, False, True, True): (1, 256, 1, 8), + (768, 3072, 32768, 32, 32, True, False, True): (1, 128, 3, 4), + (768, 3072, 32768, 64, 64, False, True, True): (1, 256, 3, 4), + (768, 3072, 32768, 64, 64, True, False, True): (1, 128, 3, 4), + (768, 3072, 32768, 128, 128, False, True, True): (3, 256, 1, 4), + (768, 3072, 32768, 128, 128, True, False, True): (5, 256, 3, 8), + (768, 3072, 50432, 16, 16, False, True, True): (1, 197, 1, 4), + (768, 3072, 50432, 16, 16, True, False, True): (4, 197, 4, 1), + (768, 3072, 50432, 32, 32, False, True, True): (2, 197, 1, 4), + (768, 3072, 50432, 32, 32, True, False, True): (4, 197, 3, 4), + (768, 3072, 50432, 64, 64, False, True, True): (1, 394, 3, 4), + (768, 3072, 50432, 64, 64, True, False, True): (1, 197, 3, 4), + (768, 3072, 50432, 128, 128, False, True, True): (3, 394, 1, 4), + (768, 3072, 50432, 128, 128, True, False, True): (3, 394, 2, 4), + (768, 3072, 
65536, 16, 16, False, True, True): (1, 256, 1, 4), + (768, 3072, 65536, 16, 16, True, False, True): (5, 256, 4, 1), + (768, 3072, 65536, 32, 32, False, True, True): (2, 256, 1, 4), + (768, 3072, 65536, 32, 32, True, False, True): (3, 256, 3, 4), + (768, 3072, 65536, 64, 64, False, True, True): (1, 512, 3, 4), + (768, 3072, 65536, 64, 64, True, False, True): (1, 256, 3, 4), + (768, 3072, 65536, 128, 128, False, True, True): (3, 512, 1, 4), + (768, 3072, 65536, 128, 128, True, False, True): (2, 512, 3, 8), + (768, 3072, 131072, 16, 16, False, True, True): (1, 512, 1, 4), + (768, 3072, 131072, 16, 16, True, False, True): (5, 512, 4, 1), + (768, 3072, 131072, 32, 32, False, True, True): (2, 512, 1, 4), + (768, 3072, 131072, 32, 32, True, False, True): (2, 512, 3, 4), + (768, 3072, 131072, 64, 64, False, True, True): (1, 1024, 3, 4), + (768, 3072, 131072, 64, 64, True, False, True): (2, 512, 3, 4), + (768, 3072, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), + (768, 3072, 131072, 128, 128, True, False, True): (2, 1024, 3, 8), + (1024, 1024, 256, 16, 16, False, True, True): (3, 4, 5, 4), + (1024, 1024, 256, 16, 16, True, False, True): (3, 4, 5, 4), + (1024, 1024, 256, 32, 32, False, True, True): (2, 4, 6, 2), + (1024, 1024, 256, 32, 32, True, False, True): (2, 4, 6, 2), + (1024, 1024, 256, 64, 64, False, True, True): (1, 4, 4, 4), + (1024, 1024, 256, 64, 64, True, False, True): (2, 4, 6, 4), + (1024, 1024, 256, 128, 128, False, True, True): (1, 2, 2, 8), + (1024, 1024, 256, 128, 128, True, False, True): (1, 2, 2, 8), + (1024, 1024, 512, 16, 16, False, True, True): (3, 4, 5, 4), + (1024, 1024, 512, 16, 16, True, False, True): (3, 8, 4, 2), + (1024, 1024, 512, 32, 32, False, True, True): (1, 8, 4, 2), + (1024, 1024, 512, 32, 32, True, False, True): (1, 8, 4, 2), + (1024, 1024, 512, 64, 64, False, True, True): (2, 8, 3, 4), + (1024, 1024, 512, 64, 64, True, False, True): (1, 4, 4, 4), + (1024, 1024, 512, 128, 128, False, True, True): (7, 4, 2, 8), + (1024, 1024, 512, 128, 128, True, False, True): (1, 4, 2, 8), + (1024, 1024, 1024, 16, 16, False, True, True): (4, 8, 4, 2), + (1024, 1024, 1024, 16, 16, True, False, True): (3, 8, 5, 2), + (1024, 1024, 1024, 32, 32, False, True, True): (1, 8, 4, 4), + (1024, 1024, 1024, 32, 32, True, False, True): (1, 8, 4, 2), + (1024, 1024, 1024, 64, 64, False, True, True): (1, 16, 3, 4), + (1024, 1024, 1024, 64, 64, True, False, True): (3, 16, 3, 4), + (1024, 1024, 1024, 128, 128, False, True, True): (6, 8, 2, 8), + (1024, 1024, 1024, 128, 128, True, False, True): (4, 8, 2, 8), + (1024, 1024, 2048, 16, 16, False, True, True): (3, 8, 3, 4), + (1024, 1024, 2048, 16, 16, True, False, True): (3, 8, 3, 4), + (1024, 1024, 2048, 32, 32, False, True, True): (1, 16, 3, 4), + (1024, 1024, 2048, 32, 32, True, False, True): (1, 16, 3, 2), + (1024, 1024, 2048, 64, 64, False, True, True): (5, 16, 3, 4), + (1024, 1024, 2048, 64, 64, True, False, True): (5, 16, 3, 4), + (1024, 1024, 2048, 128, 128, False, True, True): (3, 16, 2, 8), + (1024, 1024, 2048, 128, 128, True, False, True): (4, 16, 2, 16), + (1024, 1024, 4096, 16, 16, False, True, True): (4, 32, 3, 2), + (1024, 1024, 4096, 16, 16, True, False, True): (8, 32, 3, 2), + (1024, 1024, 4096, 32, 32, False, True, True): (9, 32, 3, 2), + (1024, 1024, 4096, 32, 32, True, False, True): (1, 32, 3, 2), + (1024, 1024, 4096, 64, 64, False, True, True): (6, 32, 3, 4), + (1024, 1024, 4096, 64, 64, True, False, True): (1, 32, 3, 4), + (1024, 1024, 4096, 128, 128, False, True, True): (4, 32, 2, 8), + (1024, 1024, 4096, 128, 128, 
True, False, True): (4, 32, 1, 4), + (1024, 1024, 8192, 16, 16, False, True, True): (4, 64, 3, 2), + (1024, 1024, 8192, 16, 16, True, False, True): (4, 64, 3, 2), + (1024, 1024, 8192, 32, 32, False, True, True): (8, 64, 3, 2), + (1024, 1024, 8192, 32, 32, True, False, True): (6, 64, 3, 2), + (1024, 1024, 8192, 64, 64, False, True, True): (2, 64, 3, 4), + (1024, 1024, 8192, 64, 64, True, False, True): (2, 64, 3, 4), + (1024, 1024, 8192, 128, 128, False, True, True): (3, 64, 1, 4), + (1024, 1024, 8192, 128, 128, True, False, True): (2, 64, 1, 4), + (1024, 1024, 16384, 16, 16, False, True, True): (1, 64, 3, 4), + (1024, 1024, 16384, 16, 16, True, False, True): (1, 64, 3, 2), + (1024, 1024, 16384, 32, 32, False, True, True): (1, 128, 3, 4), + (1024, 1024, 16384, 32, 32, True, False, True): (1, 64, 3, 4), + (1024, 1024, 16384, 64, 64, False, True, True): (1, 128, 3, 4), + (1024, 1024, 16384, 64, 64, True, False, True): (1, 128, 3, 4), + (1024, 1024, 16384, 128, 128, False, True, True): (11, 128, 1, 4), + (1024, 1024, 16384, 128, 128, True, False, True): (4, 128, 1, 4), + (1024, 1024, 32768, 16, 16, False, True, True): (1, 128, 3, 4), + (1024, 1024, 32768, 16, 16, True, False, True): (1, 128, 3, 1), + (1024, 1024, 32768, 32, 32, False, True, True): (1, 256, 3, 2), + (1024, 1024, 32768, 32, 32, True, False, True): (1, 128, 3, 4), + (1024, 1024, 32768, 64, 64, False, True, True): (2, 128, 2, 4), + (1024, 1024, 32768, 64, 64, True, False, True): (1, 256, 3, 4), + (1024, 1024, 32768, 128, 128, False, True, True): (7, 256, 1, 4), + (1024, 1024, 32768, 128, 128, True, False, True): (4, 256, 1, 4), + (1024, 1024, 50432, 16, 16, False, True, True): (1, 197, 1, 4), + (1024, 1024, 50432, 16, 16, True, False, True): (4, 197, 3, 4), + (1024, 1024, 50432, 32, 32, False, True, True): (2, 197, 1, 4), + (1024, 1024, 50432, 32, 32, True, False, True): (1, 197, 3, 4), + (1024, 1024, 50432, 64, 64, False, True, True): (2, 394, 1, 4), + (1024, 1024, 50432, 64, 64, True, False, True): (1, 197, 2, 4), + (1024, 1024, 50432, 128, 128, False, True, True): (3, 394, 1, 4), + (1024, 1024, 50432, 128, 128, True, False, True): (2, 394, 2, 4), + (1024, 1024, 65536, 16, 16, False, True, True): (1, 256, 3, 4), + (1024, 1024, 65536, 16, 16, True, False, True): (1, 256, 3, 1), + (1024, 1024, 65536, 32, 32, False, True, True): (1, 512, 3, 2), + (1024, 1024, 65536, 32, 32, True, False, True): (1, 256, 3, 4), + (1024, 1024, 65536, 64, 64, False, True, True): (2, 256, 2, 4), + (1024, 1024, 65536, 64, 64, True, False, True): (1, 512, 3, 4), + (1024, 1024, 65536, 128, 128, False, True, True): (10, 512, 1, 4), + (1024, 1024, 65536, 128, 128, True, False, True): (4, 512, 1, 4), + (1024, 1024, 65792, 16, 16, False, True, True): (1, 257, 1, 4), + (1024, 1024, 65792, 16, 16, True, False, True): (10, 257, 4, 1), + (1024, 1024, 65792, 32, 32, False, True, True): (2, 257, 1, 4), + (1024, 1024, 65792, 32, 32, True, False, True): (1, 257, 3, 4), + (1024, 1024, 65792, 64, 64, False, True, True): (2, 514, 1, 4), + (1024, 1024, 65792, 64, 64, True, False, True): (2, 257, 2, 4), + (1024, 1024, 65792, 128, 128, False, True, True): (6, 514, 1, 4), + (1024, 1024, 65792, 128, 128, True, False, True): (2, 514, 2, 4), + (1024, 1024, 131072, 16, 16, False, True, True): (11, 512, 3, 2), + (1024, 1024, 131072, 16, 16, True, False, True): (11, 512, 3, 2), + (1024, 1024, 131072, 32, 32, False, True, True): (7, 1024, 3, 2), + (1024, 1024, 131072, 32, 32, True, False, True): (6, 512, 3, 4), + (1024, 1024, 131072, 64, 64, False, True, True): (1, 512, 2, 4), + 
(1024, 1024, 131072, 64, 64, True, False, True): (4, 1024, 3, 4), + (1024, 1024, 131072, 128, 128, False, True, True): (12, 1024, 1, 4), + (1024, 1024, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), + (1280, 5120, 65792, 16, 16, False, True, True): (1, 257, 1, 4), + (1280, 5120, 65792, 16, 16, True, False, True): (5, 257, 4, 1), + (1280, 5120, 65792, 32, 32, False, True, True): (2, 257, 1, 4), + (1280, 5120, 65792, 32, 32, True, False, True): (2, 257, 3, 4), + (1280, 5120, 65792, 64, 64, False, True, True): (1, 514, 3, 4), + (1280, 5120, 65792, 64, 64, True, False, True): (2, 257, 3, 4), + (1280, 5120, 65792, 128, 128, False, True, True): (1, 514, 3, 8), + (1280, 5120, 65792, 128, 128, True, False, True): (1, 514, 3, 8), + (1536, 1536, 256, 16, 16, False, True, True): (5, 4, 4, 2), + (1536, 1536, 256, 16, 16, True, False, True): (3, 4, 5, 2), + (1536, 1536, 256, 32, 32, False, True, True): (2, 4, 4, 4), + (1536, 1536, 256, 32, 32, True, False, True): (1, 4, 6, 2), + (1536, 1536, 256, 64, 64, False, True, True): (5, 4, 4, 4), + (1536, 1536, 256, 64, 64, True, False, True): (2, 4, 4, 4), + (1536, 1536, 256, 128, 128, False, True, True): (1, 2, 3, 8), + (1536, 1536, 256, 128, 128, True, False, True): (2, 2, 3, 8), + (1536, 1536, 512, 16, 16, False, True, True): (1, 8, 1, 4), + (1536, 1536, 512, 16, 16, True, False, True): (3, 4, 4, 2), + (1536, 1536, 512, 32, 32, False, True, True): (1, 8, 1, 8), + (1536, 1536, 512, 32, 32, True, False, True): (1, 4, 4, 4), + (1536, 1536, 512, 64, 64, False, True, True): (3, 8, 3, 4), + (1536, 1536, 512, 64, 64, True, False, True): (5, 8, 3, 4), + (1536, 1536, 512, 128, 128, False, True, True): (3, 4, 3, 8), + (1536, 1536, 512, 128, 128, True, False, True): (1, 4, 3, 8), + (1536, 1536, 1024, 16, 16, False, True, True): (6, 8, 1, 2), + (1536, 1536, 1024, 16, 16, True, False, True): (2, 8, 5, 2), + (1536, 1536, 1024, 32, 32, False, True, True): (6, 8, 1, 8), + (1536, 1536, 1024, 32, 32, True, False, True): (2, 4, 3, 4), + (1536, 1536, 1024, 64, 64, False, True, True): (1, 16, 3, 4), + (1536, 1536, 1024, 64, 64, True, False, True): (3, 8, 3, 4), + (1536, 1536, 1024, 128, 128, False, True, True): (3, 8, 3, 8), + (1536, 1536, 1024, 128, 128, True, False, True): (3, 8, 3, 8), + (1536, 1536, 2048, 16, 16, False, True, True): (1, 16, 1, 4), + (1536, 1536, 2048, 16, 16, True, False, True): (1, 8, 3, 1), + (1536, 1536, 2048, 32, 32, False, True, True): (1, 16, 1, 8), + (1536, 1536, 2048, 32, 32, True, False, True): (4, 8, 3, 2), + (1536, 1536, 2048, 64, 64, False, True, True): (1, 16, 3, 4), + (1536, 1536, 2048, 64, 64, True, False, True): (3, 8, 3, 4), + (1536, 1536, 2048, 128, 128, False, True, True): (6, 16, 1, 4), + (1536, 1536, 2048, 128, 128, True, False, True): (4, 16, 3, 8), + (1536, 1536, 4096, 16, 16, False, True, True): (1, 32, 1, 2), + (1536, 1536, 4096, 16, 16, True, False, True): (4, 32, 4, 2), + (1536, 1536, 4096, 32, 32, False, True, True): (1, 32, 1, 8), + (1536, 1536, 4096, 32, 32, True, False, True): (3, 16, 3, 4), + (1536, 1536, 4096, 64, 64, False, True, True): (1, 32, 3, 4), + (1536, 1536, 4096, 64, 64, True, False, True): (1, 16, 3, 4), + (1536, 1536, 4096, 128, 128, False, True, True): (4, 32, 3, 8), + (1536, 1536, 4096, 128, 128, True, False, True): (2, 32, 3, 8), + (1536, 1536, 8192, 16, 16, False, True, True): (2, 64, 1, 2), + (1536, 1536, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (1536, 1536, 8192, 32, 32, False, True, True): (1, 64, 1, 8), + (1536, 1536, 8192, 32, 32, True, False, True): (12, 32, 3, 4), + (1536, 1536, 8192, 
64, 64, False, True, True): (2, 64, 3, 4), + (1536, 1536, 8192, 64, 64, True, False, True): (2, 32, 3, 4), + (1536, 1536, 8192, 128, 128, False, True, True): (3, 64, 1, 4), + (1536, 1536, 8192, 128, 128, True, False, True): (4, 64, 3, 8), + (1536, 1536, 16384, 16, 16, False, True, True): (1, 128, 1, 2), + (1536, 1536, 16384, 16, 16, True, False, True): (1, 64, 4, 4), + (1536, 1536, 16384, 32, 32, False, True, True): (1, 64, 1, 2), + (1536, 1536, 16384, 32, 32, True, False, True): (1, 64, 3, 4), + (1536, 1536, 16384, 64, 64, False, True, True): (1, 128, 3, 4), + (1536, 1536, 16384, 64, 64, True, False, True): (1, 64, 3, 4), + (1536, 1536, 16384, 128, 128, False, True, True): (3, 128, 1, 4), + (1536, 1536, 16384, 128, 128, True, False, True): (1, 128, 2, 4), + (1536, 1536, 32768, 16, 16, False, True, True): (1, 256, 1, 2), + (1536, 1536, 32768, 16, 16, True, False, True): (1, 128, 3, 2), + (1536, 1536, 32768, 32, 32, False, True, True): (1, 128, 1, 2), + (1536, 1536, 32768, 32, 32, True, False, True): (1, 128, 3, 4), + (1536, 1536, 32768, 64, 64, False, True, True): (3, 256, 3, 4), + (1536, 1536, 32768, 64, 64, True, False, True): (1, 128, 3, 4), + (1536, 1536, 32768, 128, 128, False, True, True): (3, 256, 1, 4), + (1536, 1536, 32768, 128, 128, True, False, True): (1, 256, 2, 4), + (1536, 1536, 65536, 16, 16, False, True, True): (4, 512, 1, 2), + (1536, 1536, 65536, 16, 16, True, False, True): (1, 256, 4, 4), + (1536, 1536, 65536, 32, 32, False, True, True): (1, 256, 1, 2), + (1536, 1536, 65536, 32, 32, True, False, True): (1, 256, 3, 4), + (1536, 1536, 65536, 64, 64, False, True, True): (2, 512, 3, 4), + (1536, 1536, 65536, 64, 64, True, False, True): (1, 256, 3, 4), + (1536, 1536, 65536, 128, 128, False, True, True): (3, 512, 1, 4), + (1536, 1536, 65536, 128, 128, True, False, True): (2, 512, 2, 4), + (1536, 1536, 131072, 16, 16, False, True, True): (2, 1024, 1, 2), + (1536, 1536, 131072, 16, 16, True, False, True): (9, 512, 4, 4), + (1536, 1536, 131072, 32, 32, False, True, True): (1, 512, 1, 2), + (1536, 1536, 131072, 32, 32, True, False, True): (9, 512, 3, 4), + (1536, 1536, 131072, 64, 64, False, True, True): (1, 1024, 3, 4), + (1536, 1536, 131072, 64, 64, True, False, True): (1, 512, 3, 4), + (1536, 1536, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), + (1536, 1536, 131072, 128, 128, True, False, True): (1, 1024, 2, 4), + (2048, 2048, 256, 16, 16, False, True, True): (4, 4, 6, 2), + (2048, 2048, 256, 16, 16, True, False, True): (2, 8, 4, 1), + (2048, 2048, 256, 32, 32, False, True, True): (3, 4, 4, 2), + (2048, 2048, 256, 32, 32, True, False, True): (1, 4, 5, 2), + (2048, 2048, 256, 64, 64, False, True, True): (2, 4, 4, 4), + (2048, 2048, 256, 64, 64, True, False, True): (2, 4, 4, 4), + (2048, 2048, 256, 128, 128, False, True, True): (3, 2, 2, 8), + (2048, 2048, 256, 128, 128, True, False, True): (5, 2, 2, 8), + (2048, 2048, 512, 16, 16, False, True, True): (5, 4, 4, 4), + (2048, 2048, 512, 16, 16, True, False, True): (2, 4, 4, 2), + (2048, 2048, 512, 32, 32, False, True, True): (1, 4, 3, 4), + (2048, 2048, 512, 32, 32, True, False, True): (3, 4, 4, 2), + (2048, 2048, 512, 64, 64, False, True, True): (1, 8, 3, 4), + (2048, 2048, 512, 64, 64, True, False, True): (1, 8, 3, 2), + (2048, 2048, 512, 128, 128, False, True, True): (3, 4, 2, 8), + (2048, 2048, 512, 128, 128, True, False, True): (2, 4, 2, 8), + (2048, 2048, 1024, 16, 16, False, True, True): (3, 4, 3, 4), + (2048, 2048, 1024, 16, 16, True, False, True): (2, 8, 3, 2), + (2048, 2048, 1024, 32, 32, False, True, True): (3, 
8, 3, 4), + (2048, 2048, 1024, 32, 32, True, False, True): (1, 8, 3, 2), + (2048, 2048, 1024, 64, 64, False, True, True): (1, 8, 3, 4), + (2048, 2048, 1024, 64, 64, True, False, True): (1, 8, 3, 4), + (2048, 2048, 1024, 128, 128, False, True, True): (4, 8, 2, 8), + (2048, 2048, 1024, 128, 128, True, False, True): (4, 8, 1, 4), + (2048, 2048, 2048, 16, 16, False, True, True): (4, 16, 3, 2), + (2048, 2048, 2048, 16, 16, True, False, True): (2, 16, 3, 2), + (2048, 2048, 2048, 32, 32, False, True, True): (1, 16, 3, 4), + (2048, 2048, 2048, 32, 32, True, False, True): (1, 16, 3, 2), + (2048, 2048, 2048, 64, 64, False, True, True): (1, 16, 3, 4), + (2048, 2048, 2048, 64, 64, True, False, True): (1, 16, 3, 4), + (2048, 2048, 2048, 128, 128, False, True, True): (6, 16, 2, 8), + (2048, 2048, 2048, 128, 128, True, False, True): (5, 16, 1, 4), + (2048, 2048, 4096, 16, 16, False, True, True): (4, 32, 4, 2), + (2048, 2048, 4096, 16, 16, True, False, True): (4, 32, 3, 2), + (2048, 2048, 4096, 32, 32, False, True, True): (4, 16, 3, 8), + (2048, 2048, 4096, 32, 32, True, False, True): (4, 16, 3, 4), + (2048, 2048, 4096, 64, 64, False, True, True): (4, 32, 3, 4), + (2048, 2048, 4096, 64, 64, True, False, True): (4, 32, 3, 4), + (2048, 2048, 4096, 128, 128, False, True, True): (4, 32, 2, 8), + (2048, 2048, 4096, 128, 128, True, False, True): (2, 32, 1, 4), + (2048, 2048, 8192, 16, 16, False, True, True): (4, 64, 4, 2), + (2048, 2048, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (2048, 2048, 8192, 32, 32, False, True, True): (4, 32, 3, 8), + (2048, 2048, 8192, 32, 32, True, False, True): (4, 32, 4, 8), + (2048, 2048, 8192, 64, 64, False, True, True): (2, 64, 3, 4), + (2048, 2048, 8192, 64, 64, True, False, True): (4, 64, 3, 4), + (2048, 2048, 8192, 128, 128, False, True, True): (3, 64, 1, 4), + (2048, 2048, 8192, 128, 128, True, False, True): (2, 64, 1, 4), + (2048, 2048, 16384, 16, 16, False, True, True): (4, 64, 3, 4), + (2048, 2048, 16384, 16, 16, True, False, True): (1, 64, 3, 4), + (2048, 2048, 16384, 32, 32, False, True, True): (4, 64, 3, 4), + (2048, 2048, 16384, 32, 32, True, False, True): (4, 64, 3, 4), + (2048, 2048, 16384, 64, 64, False, True, True): (4, 128, 3, 4), + (2048, 2048, 16384, 64, 64, True, False, True): (4, 128, 3, 4), + (2048, 2048, 16384, 128, 128, False, True, True): (3, 128, 1, 4), + (2048, 2048, 16384, 128, 128, True, False, True): (2, 128, 1, 4), + (2048, 2048, 32768, 16, 16, False, True, True): (8, 128, 3, 2), + (2048, 2048, 32768, 16, 16, True, False, True): (8, 128, 3, 4), + (2048, 2048, 32768, 32, 32, False, True, True): (8, 128, 3, 4), + (2048, 2048, 32768, 32, 32, True, False, True): (8, 128, 3, 4), + (2048, 2048, 32768, 64, 64, False, True, True): (8, 256, 3, 4), + (2048, 2048, 32768, 64, 64, True, False, True): (8, 256, 3, 4), + (2048, 2048, 32768, 128, 128, False, True, True): (3, 256, 1, 4), + (2048, 2048, 32768, 128, 128, True, False, True): (1, 256, 1, 4), + (2048, 2048, 50432, 16, 16, False, True, True): (1, 197, 1, 4), + (2048, 2048, 50432, 16, 16, True, False, True): (4, 197, 4, 1), + (2048, 2048, 50432, 32, 32, False, True, True): (2, 197, 1, 4), + (2048, 2048, 50432, 32, 32, True, False, True): (4, 197, 3, 4), + (2048, 2048, 50432, 64, 64, False, True, True): (2, 394, 3, 4), + (2048, 2048, 50432, 64, 64, True, False, True): (4, 197, 2, 4), + (2048, 2048, 50432, 128, 128, False, True, True): (3, 394, 1, 4), + (2048, 2048, 50432, 128, 128, True, False, True): (4, 394, 2, 4), + (2048, 2048, 65536, 16, 16, False, True, True): (9, 256, 3, 2), + (2048, 2048, 
65536, 16, 16, True, False, True): (9, 256, 4, 4), + (2048, 2048, 65536, 32, 32, False, True, True): (7, 256, 3, 4), + (2048, 2048, 65536, 32, 32, True, False, True): (7, 256, 3, 4), + (2048, 2048, 65536, 64, 64, False, True, True): (2, 256, 2, 4), + (2048, 2048, 65536, 64, 64, True, False, True): (9, 512, 3, 4), + (2048, 2048, 65536, 128, 128, False, True, True): (5, 512, 1, 4), + (2048, 2048, 65536, 128, 128, True, False, True): (1, 512, 1, 4), + (2048, 2048, 65792, 16, 16, False, True, True): (1, 257, 1, 4), + (2048, 2048, 65792, 16, 16, True, False, True): (7, 257, 4, 1), + (2048, 2048, 65792, 32, 32, False, True, True): (2, 257, 1, 4), + (2048, 2048, 65792, 32, 32, True, False, True): (7, 257, 3, 4), + (2048, 2048, 65792, 64, 64, False, True, True): (1, 514, 3, 4), + (2048, 2048, 65792, 64, 64, True, False, True): (1, 257, 2, 4), + (2048, 2048, 65792, 128, 128, False, True, True): (3, 514, 1, 4), + (2048, 2048, 65792, 128, 128, True, False, True): (1, 514, 2, 4), + (2048, 2048, 131072, 16, 16, False, True, True): (9, 512, 3, 2), + (2048, 2048, 131072, 16, 16, True, False, True): (9, 512, 4, 4), + (2048, 2048, 131072, 32, 32, False, True, True): (7, 512, 3, 4), + (2048, 2048, 131072, 32, 32, True, False, True): (3, 512, 3, 4), + (2048, 2048, 131072, 64, 64, False, True, True): (1, 512, 2, 4), + (2048, 2048, 131072, 64, 64, True, False, True): (2, 1024, 3, 4), + (2048, 2048, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), + (2048, 2048, 131072, 128, 128, True, False, True): (1, 1024, 1, 4), + (3072, 768, 256, 16, 16, False, True, True): (6, 4, 1, 4), + (3072, 768, 256, 16, 16, True, False, True): (2, 1, 5, 2), + (3072, 768, 256, 32, 32, False, True, True): (1, 4, 1, 8), + (3072, 768, 256, 32, 32, True, False, True): (4, 2, 4, 4), + (3072, 768, 256, 64, 64, False, True, True): (1, 2, 3, 4), + (3072, 768, 256, 64, 64, True, False, True): (3, 4, 3, 4), + (3072, 768, 256, 128, 128, False, True, True): (1, 2, 3, 8), + (3072, 768, 256, 128, 128, True, False, True): (3, 2, 3, 8), + (3072, 768, 512, 16, 16, False, True, True): (1, 4, 1, 4), + (3072, 768, 512, 16, 16, True, False, True): (3, 4, 4, 1), + (3072, 768, 512, 32, 32, False, True, True): (5, 8, 1, 4), + (3072, 768, 512, 32, 32, True, False, True): (3, 4, 4, 2), + (3072, 768, 512, 64, 64, False, True, True): (1, 8, 1, 4), + (3072, 768, 512, 64, 64, True, False, True): (1, 4, 3, 4), + (3072, 768, 512, 128, 128, False, True, True): (3, 4, 3, 8), + (3072, 768, 512, 128, 128, True, False, True): (1, 4, 3, 8), + (3072, 768, 1024, 16, 16, False, True, True): (1, 8, 1, 4), + (3072, 768, 1024, 16, 16, True, False, True): (3, 4, 3, 1), + (3072, 768, 1024, 32, 32, False, True, True): (1, 16, 1, 4), + (3072, 768, 1024, 32, 32, True, False, True): (1, 4, 3, 8), + (3072, 768, 1024, 64, 64, False, True, True): (8, 16, 3, 2), + (3072, 768, 1024, 64, 64, True, False, True): (1, 4, 3, 4), + (3072, 768, 1024, 128, 128, False, True, True): (2, 8, 3, 8), + (3072, 768, 1024, 128, 128, True, False, True): (3, 8, 2, 4), + (3072, 768, 2048, 16, 16, False, True, True): (1, 8, 1, 4), + (3072, 768, 2048, 16, 16, True, False, True): (6, 8, 4, 4), + (3072, 768, 2048, 32, 32, False, True, True): (1, 16, 1, 8), + (3072, 768, 2048, 32, 32, True, False, True): (6, 8, 3, 4), + (3072, 768, 2048, 64, 64, False, True, True): (8, 16, 3, 4), + (3072, 768, 2048, 64, 64, True, False, True): (3, 16, 3, 4), + (3072, 768, 2048, 128, 128, False, True, True): (1, 16, 3, 8), + (3072, 768, 2048, 128, 128, True, False, True): (2, 16, 2, 4), + (3072, 768, 4096, 16, 16, False, 
True, True): (1, 16, 1, 4), + (3072, 768, 4096, 16, 16, True, False, True): (4, 32, 4, 2), + (3072, 768, 4096, 32, 32, False, True, True): (1, 32, 1, 8), + (3072, 768, 4096, 32, 32, True, False, True): (4, 16, 3, 4), + (3072, 768, 4096, 64, 64, False, True, True): (2, 32, 1, 4), + (3072, 768, 4096, 64, 64, True, False, True): (2, 16, 2, 4), + (3072, 768, 4096, 128, 128, False, True, True): (2, 32, 1, 16), + (3072, 768, 4096, 128, 128, True, False, True): (3, 32, 2, 4), + (3072, 768, 8192, 16, 16, False, True, True): (2, 32, 1, 4), + (3072, 768, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (3072, 768, 8192, 32, 32, False, True, True): (2, 32, 1, 4), + (3072, 768, 8192, 32, 32, True, False, True): (6, 32, 3, 4), + (3072, 768, 8192, 64, 64, False, True, True): (2, 64, 1, 4), + (3072, 768, 8192, 64, 64, True, False, True): (2, 32, 2, 4), + (3072, 768, 8192, 128, 128, False, True, True): (3, 64, 1, 4), + (3072, 768, 8192, 128, 128, True, False, True): (2, 64, 2, 4), + (3072, 768, 16384, 16, 16, False, True, True): (1, 64, 1, 4), + (3072, 768, 16384, 16, 16, True, False, True): (1, 64, 1, 1), + (3072, 768, 16384, 32, 32, False, True, True): (2, 64, 1, 4), + (3072, 768, 16384, 32, 32, True, False, True): (4, 64, 3, 4), + (3072, 768, 16384, 64, 64, False, True, True): (2, 128, 1, 4), + (3072, 768, 16384, 64, 64, True, False, True): (4, 64, 2, 4), + (3072, 768, 16384, 128, 128, False, True, True): (3, 128, 1, 4), + (3072, 768, 16384, 128, 128, True, False, True): (1, 128, 2, 4), + (3072, 768, 32768, 16, 16, False, True, True): (1, 128, 1, 4), + (3072, 768, 32768, 16, 16, True, False, True): (8, 256, 3, 2), + (3072, 768, 32768, 32, 32, False, True, True): (2, 128, 1, 4), + (3072, 768, 32768, 32, 32, True, False, True): (8, 128, 3, 4), + (3072, 768, 32768, 64, 64, False, True, True): (1, 256, 1, 4), + (3072, 768, 32768, 64, 64, True, False, True): (8, 128, 2, 4), + (3072, 768, 32768, 128, 128, False, True, True): (3, 256, 1, 4), + (3072, 768, 32768, 128, 128, True, False, True): (3, 256, 2, 4), + (3072, 768, 50432, 16, 16, False, True, True): (1, 197, 1, 4), + (3072, 768, 50432, 16, 16, True, False, True): (7, 197, 4, 1), + (3072, 768, 50432, 32, 32, False, True, True): (2, 197, 1, 4), + (3072, 768, 50432, 32, 32, True, False, True): (10, 197, 3, 4), + (3072, 768, 50432, 64, 64, False, True, True): (1, 394, 1, 4), + (3072, 768, 50432, 64, 64, True, False, True): (3, 197, 2, 4), + (3072, 768, 50432, 128, 128, False, True, True): (3, 394, 1, 4), + (3072, 768, 50432, 128, 128, True, False, True): (2, 394, 2, 4), + (3072, 768, 65536, 16, 16, False, True, True): (1, 256, 1, 4), + (3072, 768, 65536, 16, 16, True, False, True): (15, 256, 4, 1), + (3072, 768, 65536, 32, 32, False, True, True): (2, 256, 1, 4), + (3072, 768, 65536, 32, 32, True, False, True): (10, 256, 3, 4), + (3072, 768, 65536, 64, 64, False, True, True): (1, 512, 1, 4), + (3072, 768, 65536, 64, 64, True, False, True): (3, 256, 2, 4), + (3072, 768, 65536, 128, 128, False, True, True): (3, 512, 1, 4), + (3072, 768, 65536, 128, 128, True, False, True): (3, 512, 2, 4), + (3072, 768, 131072, 16, 16, False, True, True): (1, 512, 1, 4), + (3072, 768, 131072, 16, 16, True, False, True): (15, 512, 4, 1), + (3072, 768, 131072, 32, 32, False, True, True): (2, 512, 1, 4), + (3072, 768, 131072, 32, 32, True, False, True): (9, 512, 3, 4), + (3072, 768, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), + (3072, 768, 131072, 64, 64, True, False, True): (3, 512, 2, 4), + (3072, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), + (3072, 
768, 131072, 128, 128, True, False, True): (3, 1024, 2, 4), + (3072, 3072, 256, 16, 16, False, True, True): (5, 4, 1, 4), + (3072, 3072, 256, 16, 16, True, False, True): (1, 2, 5, 2), + (3072, 3072, 256, 32, 32, False, True, True): (1, 4, 1, 8), + (3072, 3072, 256, 32, 32, True, False, True): (3, 4, 4, 2), + (3072, 3072, 256, 64, 64, False, True, True): (2, 4, 3, 4), + (3072, 3072, 256, 64, 64, True, False, True): (3, 4, 4, 4), + (3072, 3072, 256, 128, 128, False, True, True): (1, 2, 3, 8), + (3072, 3072, 256, 128, 128, True, False, True): (1, 2, 3, 8), + (3072, 3072, 512, 16, 16, False, True, True): (5, 4, 1, 2), + (3072, 3072, 512, 16, 16, True, False, True): (1, 2, 4, 4), + (3072, 3072, 512, 32, 32, False, True, True): (3, 8, 1, 4), + (3072, 3072, 512, 32, 32, True, False, True): (4, 2, 3, 4), + (3072, 3072, 512, 64, 64, False, True, True): (1, 8, 2, 2), + (3072, 3072, 512, 64, 64, True, False, True): (2, 4, 3, 4), + (3072, 3072, 512, 128, 128, False, True, True): (1, 4, 3, 8), + (3072, 3072, 512, 128, 128, True, False, True): (4, 4, 3, 8), + (3072, 3072, 1024, 16, 16, False, True, True): (1, 8, 1, 4), + (3072, 3072, 1024, 16, 16, True, False, True): (4, 8, 5, 2), + (3072, 3072, 1024, 32, 32, False, True, True): (1, 8, 1, 8), + (3072, 3072, 1024, 32, 32, True, False, True): (1, 4, 4, 4), + (3072, 3072, 1024, 64, 64, False, True, True): (3, 8, 3, 4), + (3072, 3072, 1024, 64, 64, True, False, True): (2, 4, 3, 4), + (3072, 3072, 1024, 128, 128, False, True, True): (3, 8, 1, 4), + (3072, 3072, 1024, 128, 128, True, False, True): (1, 8, 3, 8), + (3072, 3072, 2048, 16, 16, False, True, True): (1, 16, 1, 2), + (3072, 3072, 2048, 16, 16, True, False, True): (4, 16, 4, 2), + (3072, 3072, 2048, 32, 32, False, True, True): (1, 16, 1, 8), + (3072, 3072, 2048, 32, 32, True, False, True): (3, 8, 4, 4), + (3072, 3072, 2048, 64, 64, False, True, True): (3, 16, 3, 4), + (3072, 3072, 2048, 64, 64, True, False, True): (3, 8, 3, 4), + (3072, 3072, 2048, 128, 128, False, True, True): (4, 16, 3, 8), + (3072, 3072, 2048, 128, 128, True, False, True): (3, 16, 3, 8), + (3072, 3072, 4096, 16, 16, False, True, True): (1, 32, 1, 2), + (3072, 3072, 4096, 16, 16, True, False, True): (4, 32, 4, 2), + (3072, 3072, 4096, 32, 32, False, True, True): (1, 32, 1, 8), + (3072, 3072, 4096, 32, 32, True, False, True): (3, 16, 3, 4), + (3072, 3072, 4096, 64, 64, False, True, True): (1, 32, 3, 4), + (3072, 3072, 4096, 64, 64, True, False, True): (3, 16, 3, 4), + (3072, 3072, 4096, 128, 128, False, True, True): (1, 32, 3, 8), + (3072, 3072, 4096, 128, 128, True, False, True): (3, 32, 3, 8), + (3072, 3072, 8192, 16, 16, False, True, True): (1, 64, 1, 2), + (3072, 3072, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (3072, 3072, 8192, 32, 32, False, True, True): (1, 64, 1, 8), + (3072, 3072, 8192, 32, 32, True, False, True): (8, 32, 3, 4), + (3072, 3072, 8192, 64, 64, False, True, True): (3, 64, 3, 4), + (3072, 3072, 8192, 64, 64, True, False, True): (2, 32, 3, 4), + (3072, 3072, 8192, 128, 128, False, True, True): (2, 64, 3, 8), + (3072, 3072, 8192, 128, 128, True, False, True): (1, 64, 3, 8), + (3072, 3072, 16384, 16, 16, False, True, True): (1, 128, 1, 2), + (3072, 3072, 16384, 16, 16, True, False, True): (4, 128, 4, 2), + (3072, 3072, 16384, 32, 32, False, True, True): (1, 64, 1, 2), + (3072, 3072, 16384, 32, 32, True, False, True): (4, 64, 3, 4), + (3072, 3072, 16384, 64, 64, False, True, True): (1, 128, 3, 4), + (3072, 3072, 16384, 64, 64, True, False, True): (4, 64, 3, 4), + (3072, 3072, 16384, 128, 128, False, True, 
True): (3, 128, 1, 4), + (3072, 3072, 16384, 128, 128, True, False, True): (1, 128, 3, 8), + (3072, 3072, 32768, 16, 16, False, True, True): (1, 256, 1, 2), + (3072, 3072, 32768, 16, 16, True, False, True): (8, 128, 4, 4), + (3072, 3072, 32768, 32, 32, False, True, True): (1, 256, 1, 8), + (3072, 3072, 32768, 32, 32, True, False, True): (5, 128, 3, 4), + (3072, 3072, 32768, 64, 64, False, True, True): (1, 256, 3, 4), + (3072, 3072, 32768, 64, 64, True, False, True): (1, 128, 3, 4), + (3072, 3072, 32768, 128, 128, False, True, True): (3, 256, 1, 4), + (3072, 3072, 32768, 128, 128, True, False, True): (3, 256, 2, 4), + (3072, 3072, 65536, 16, 16, False, True, True): (1, 512, 1, 2), + (3072, 3072, 65536, 16, 16, True, False, True): (7, 256, 4, 4), + (3072, 3072, 65536, 32, 32, False, True, True): (1, 256, 1, 2), + (3072, 3072, 65536, 32, 32, True, False, True): (5, 256, 3, 4), + (3072, 3072, 65536, 64, 64, False, True, True): (1, 512, 3, 4), + (3072, 3072, 65536, 64, 64, True, False, True): (3, 256, 3, 4), + (3072, 3072, 65536, 128, 128, False, True, True): (3, 512, 1, 4), + (3072, 3072, 65536, 128, 128, True, False, True): (3, 512, 2, 4), + (3072, 3072, 131072, 16, 16, False, True, True): (1, 1024, 1, 2), + (3072, 3072, 131072, 16, 16, True, False, True): (5, 512, 4, 4), + (3072, 3072, 131072, 32, 32, False, True, True): (1, 512, 1, 2), + (3072, 3072, 131072, 32, 32, True, False, True): (3, 512, 3, 4), + (3072, 3072, 131072, 64, 64, False, True, True): (1, 1024, 3, 4), + (3072, 3072, 131072, 64, 64, True, False, True): (3, 512, 3, 4), + (3072, 3072, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), + (3072, 3072, 131072, 128, 128, True, False, True): (1, 1024, 2, 4), + (4096, 4096, 256, 16, 16, False, True, True): (2, 2, 6, 4), + (4096, 4096, 256, 16, 16, True, False, True): (2, 2, 5, 4), + (4096, 4096, 256, 32, 32, False, True, True): (7, 2, 4, 4), + (4096, 4096, 256, 32, 32, True, False, True): (1, 2, 4, 4), + (4096, 4096, 256, 64, 64, False, True, True): (3, 4, 3, 4), + (4096, 4096, 256, 64, 64, True, False, True): (3, 4, 3, 4), + (4096, 4096, 256, 128, 128, False, True, True): (1, 2, 2, 8), + (4096, 4096, 256, 128, 128, True, False, True): (1, 2, 2, 8), + (4096, 4096, 512, 16, 16, False, True, True): (4, 2, 3, 4), + (4096, 4096, 512, 16, 16, True, False, True): (2, 4, 3, 2), + (4096, 4096, 512, 32, 32, False, True, True): (3, 4, 3, 4), + (4096, 4096, 512, 32, 32, True, False, True): (3, 4, 3, 2), + (4096, 4096, 512, 64, 64, False, True, True): (3, 4, 3, 4), + (4096, 4096, 512, 64, 64, True, False, True): (3, 4, 3, 4), + (4096, 4096, 512, 128, 128, False, True, True): (2, 4, 2, 8), + (4096, 4096, 512, 128, 128, True, False, True): (2, 4, 1, 4), + (4096, 4096, 1024, 16, 16, False, True, True): (2, 8, 3, 2), + (4096, 4096, 1024, 16, 16, True, False, True): (2, 8, 3, 2), + (4096, 4096, 1024, 32, 32, False, True, True): (3, 8, 3, 4), + (4096, 4096, 1024, 32, 32, True, False, True): (1, 8, 3, 2), + (4096, 4096, 1024, 64, 64, False, True, True): (1, 8, 3, 4), + (4096, 4096, 1024, 64, 64, True, False, True): (1, 8, 3, 4), + (4096, 4096, 1024, 128, 128, False, True, True): (2, 8, 2, 8), + (4096, 4096, 1024, 128, 128, True, False, True): (2, 8, 2, 8), + (4096, 4096, 2048, 16, 16, False, True, True): (2, 8, 4, 4), + (4096, 4096, 2048, 16, 16, True, False, True): (2, 8, 4, 4), + (4096, 4096, 2048, 32, 32, False, True, True): (4, 8, 4, 8), + (4096, 4096, 2048, 32, 32, True, False, True): (4, 8, 4, 8), + (4096, 4096, 2048, 64, 64, False, True, True): (1, 16, 3, 4), + (4096, 4096, 2048, 64, 64, 
True, False, True): (4, 16, 3, 4), + (4096, 4096, 2048, 128, 128, False, True, True): (2, 16, 2, 8), + (4096, 4096, 2048, 128, 128, True, False, True): (4, 16, 1, 4), + (4096, 4096, 4096, 16, 16, False, True, True): (4, 32, 4, 4), + (4096, 4096, 4096, 16, 16, True, False, True): (4, 32, 4, 2), + (4096, 4096, 4096, 32, 32, False, True, True): (4, 16, 4, 8), + (4096, 4096, 4096, 32, 32, True, False, True): (4, 16, 3, 8), + (4096, 4096, 4096, 64, 64, False, True, True): (1, 32, 3, 4), + (4096, 4096, 4096, 64, 64, True, False, True): (1, 32, 3, 4), + (4096, 4096, 4096, 128, 128, False, True, True): (3, 32, 1, 4), + (4096, 4096, 4096, 128, 128, True, False, True): (2, 32, 1, 4), + (4096, 4096, 8192, 16, 16, False, True, True): (4, 64, 4, 2), + (4096, 4096, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (4096, 4096, 8192, 32, 32, False, True, True): (4, 32, 4, 8), + (4096, 4096, 8192, 32, 32, True, False, True): (4, 32, 4, 8), + (4096, 4096, 8192, 64, 64, False, True, True): (2, 64, 3, 4), + (4096, 4096, 8192, 64, 64, True, False, True): (2, 64, 3, 4), + (4096, 4096, 8192, 128, 128, False, True, True): (3, 64, 1, 4), + (4096, 4096, 8192, 128, 128, True, False, True): (1, 64, 1, 4), + (4096, 4096, 16384, 16, 16, False, True, True): (4, 64, 3, 4), + (4096, 4096, 16384, 16, 16, True, False, True): (4, 64, 4, 4), + (4096, 4096, 16384, 32, 32, False, True, True): (4, 64, 4, 8), + (4096, 4096, 16384, 32, 32, True, False, True): (4, 64, 4, 8), + (4096, 4096, 16384, 64, 64, False, True, True): (1, 64, 2, 4), + (4096, 4096, 16384, 64, 64, True, False, True): (1, 64, 3, 8), + (4096, 4096, 16384, 128, 128, False, True, True): (3, 128, 1, 4), + (4096, 4096, 16384, 128, 128, True, False, True): (1, 128, 1, 4), + (4096, 4096, 32768, 16, 16, False, True, True): (8, 128, 3, 2), + (4096, 4096, 32768, 16, 16, True, False, True): (5, 128, 4, 4), + (4096, 4096, 32768, 32, 32, False, True, True): (3, 128, 4, 4), + (4096, 4096, 32768, 32, 32, True, False, True): (3, 128, 4, 8), + (4096, 4096, 32768, 64, 64, False, True, True): (1, 128, 2, 4), + (4096, 4096, 32768, 64, 64, True, False, True): (3, 256, 3, 4), + (4096, 4096, 32768, 128, 128, False, True, True): (3, 256, 1, 4), + (4096, 4096, 32768, 128, 128, True, False, True): (1, 256, 1, 4), + (4096, 4096, 50432, 16, 16, False, True, True): (1, 197, 1, 4), + (4096, 4096, 50432, 16, 16, True, False, True): (4, 197, 4, 1), + (4096, 4096, 50432, 32, 32, False, True, True): (1, 197, 1, 4), + (4096, 4096, 50432, 32, 32, True, False, True): (2, 197, 3, 4), + (4096, 4096, 50432, 64, 64, False, True, True): (1, 394, 3, 4), + (4096, 4096, 50432, 64, 64, True, False, True): (1, 197, 2, 4), + (4096, 4096, 50432, 128, 128, False, True, True): (3, 394, 1, 4), + (4096, 4096, 50432, 128, 128, True, False, True): (1, 394, 2, 4), + (4096, 4096, 65536, 16, 16, False, True, True): (5, 256, 4, 4), + (4096, 4096, 65536, 16, 16, True, False, True): (5, 256, 4, 4), + (4096, 4096, 65536, 32, 32, False, True, True): (4, 256, 4, 8), + (4096, 4096, 65536, 32, 32, True, False, True): (4, 256, 3, 8), + (4096, 4096, 65536, 64, 64, False, True, True): (1, 256, 2, 4), + (4096, 4096, 65536, 64, 64, True, False, True): (1, 512, 3, 4), + (4096, 4096, 65536, 128, 128, False, True, True): (3, 512, 1, 4), + (4096, 4096, 65536, 128, 128, True, False, True): (1, 512, 1, 4), + (4096, 4096, 65792, 16, 16, False, True, True): (1, 257, 1, 4), + (4096, 4096, 65792, 16, 16, True, False, True): (5, 257, 4, 1), + (4096, 4096, 65792, 32, 32, False, True, True): (1, 257, 1, 4), + (4096, 4096, 65792, 32, 32, True, 
False, True): (1, 257, 3, 4), + (4096, 4096, 65792, 64, 64, False, True, True): (1, 514, 3, 4), + (4096, 4096, 65792, 64, 64, True, False, True): (1, 257, 2, 4), + (4096, 4096, 65792, 128, 128, False, True, True): (3, 514, 1, 4), + (4096, 4096, 65792, 128, 128, True, False, True): (1, 514, 2, 4), + (4096, 4096, 131072, 16, 16, False, True, True): (4, 512, 3, 4), + (4096, 4096, 131072, 16, 16, True, False, True): (5, 512, 4, 4), + (4096, 4096, 131072, 32, 32, False, True, True): (1, 512, 4, 8), + (4096, 4096, 131072, 32, 32, True, False, True): (4, 512, 4, 8), + (4096, 4096, 131072, 64, 64, False, True, True): (1, 512, 2, 4), + (4096, 4096, 131072, 64, 64, True, False, True): (1, 512, 2, 4), + (4096, 4096, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), + (4096, 4096, 131072, 128, 128, True, False, True): (1, 1024, 1, 4), + (5120, 1280, 65792, 16, 16, False, True, True): (1, 257, 1, 4), + (5120, 1280, 65792, 16, 16, True, False, True): (7, 257, 4, 1), + (5120, 1280, 65792, 32, 32, False, True, True): (2, 257, 1, 4), + (5120, 1280, 65792, 32, 32, True, False, True): (5, 257, 3, 4), + (5120, 1280, 65792, 64, 64, False, True, True): (1, 514, 1, 4), + (5120, 1280, 65792, 64, 64, True, False, True): (5, 257, 2, 4), + (5120, 1280, 65792, 128, 128, False, True, True): (3, 514, 1, 4), + (5120, 1280, 65792, 128, 128, True, False, True): (4, 514, 2, 4), + (6144, 6144, 256, 16, 16, False, True, True): (1, 2, 1, 4), + (6144, 6144, 256, 16, 16, True, False, True): (1, 1, 4, 4), + (6144, 6144, 256, 32, 32, False, True, True): (3, 2, 1, 8), + (6144, 6144, 256, 32, 32, True, False, True): (2, 1, 3, 4), + (6144, 6144, 256, 64, 64, False, True, True): (2, 2, 3, 4), + (6144, 6144, 256, 64, 64, True, False, True): (6, 2, 4, 4), + (6144, 6144, 256, 128, 128, False, True, True): (2, 2, 3, 8), + (6144, 6144, 256, 128, 128, True, False, True): (1, 2, 3, 8), + (6144, 6144, 512, 16, 16, False, True, True): (4, 4, 1, 4), + (6144, 6144, 512, 16, 16, True, False, True): (3, 2, 3, 1), + (6144, 6144, 512, 32, 32, False, True, True): (1, 8, 1, 4), + (6144, 6144, 512, 32, 32, True, False, True): (2, 2, 3, 8), + (6144, 6144, 512, 64, 64, False, True, True): (4, 4, 3, 4), + (6144, 6144, 512, 64, 64, True, False, True): (6, 2, 3, 4), + (6144, 6144, 512, 128, 128, False, True, True): (3, 4, 1, 4), + (6144, 6144, 512, 128, 128, True, False, True): (4, 4, 3, 8), + (6144, 6144, 1024, 16, 16, False, True, True): (1, 8, 1, 2), + (6144, 6144, 1024, 16, 16, True, False, True): (4, 8, 4, 2), + (6144, 6144, 1024, 32, 32, False, True, True): (1, 8, 4, 2), + (6144, 6144, 1024, 32, 32, True, False, True): (1, 8, 4, 2), + (6144, 6144, 1024, 64, 64, False, True, True): (4, 8, 3, 4), + (6144, 6144, 1024, 64, 64, True, False, True): (1, 4, 3, 4), + (6144, 6144, 1024, 128, 128, False, True, True): (3, 8, 1, 4), + (6144, 6144, 1024, 128, 128, True, False, True): (1, 8, 3, 8), + (6144, 6144, 2048, 16, 16, False, True, True): (4, 4, 1, 4), + (6144, 6144, 2048, 16, 16, True, False, True): (2, 8, 4, 4), + (6144, 6144, 2048, 32, 32, False, True, True): (4, 8, 3, 4), + (6144, 6144, 2048, 32, 32, True, False, True): (2, 8, 3, 4), + (6144, 6144, 2048, 64, 64, False, True, True): (4, 16, 3, 4), + (6144, 6144, 2048, 64, 64, True, False, True): (2, 8, 3, 4), + (6144, 6144, 2048, 128, 128, False, True, True): (3, 16, 1, 4), + (6144, 6144, 2048, 128, 128, True, False, True): (4, 16, 3, 8), + (6144, 6144, 4096, 16, 16, False, True, True): (4, 8, 1, 4), + (6144, 6144, 4096, 16, 16, True, False, True): (4, 32, 4, 2), + (6144, 6144, 4096, 32, 32, False, 
True, True): (4, 16, 1, 2), + (6144, 6144, 4096, 32, 32, True, False, True): (2, 8, 3, 8), + (6144, 6144, 4096, 64, 64, False, True, True): (4, 32, 3, 4), + (6144, 6144, 4096, 64, 64, True, False, True): (4, 16, 3, 4), + (6144, 6144, 4096, 128, 128, False, True, True): (6, 32, 1, 4), + (6144, 6144, 4096, 128, 128, True, False, True): (4, 32, 3, 8), + (6144, 6144, 8192, 16, 16, False, True, True): (2, 16, 1, 2), + (6144, 6144, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (6144, 6144, 8192, 32, 32, False, True, True): (4, 32, 1, 2), + (6144, 6144, 8192, 32, 32, True, False, True): (4, 32, 3, 4), + (6144, 6144, 8192, 64, 64, False, True, True): (4, 64, 3, 4), + (6144, 6144, 8192, 64, 64, True, False, True): (4, 32, 3, 4), + (6144, 6144, 8192, 128, 128, False, True, True): (6, 64, 1, 4), + (6144, 6144, 8192, 128, 128, True, False, True): (4, 64, 3, 8), + (6144, 6144, 16384, 16, 16, False, True, True): (2, 32, 1, 2), + (6144, 6144, 16384, 16, 16, True, False, True): (4, 64, 4, 4), + (6144, 6144, 16384, 32, 32, False, True, True): (4, 64, 1, 2), + (6144, 6144, 16384, 32, 32, True, False, True): (4, 64, 3, 4), + (6144, 6144, 16384, 64, 64, False, True, True): (4, 128, 3, 4), + (6144, 6144, 16384, 64, 64, True, False, True): (1, 32, 3, 8), + (6144, 6144, 16384, 128, 128, False, True, True): (4, 128, 1, 4), + (6144, 6144, 16384, 128, 128, True, False, True): (4, 128, 3, 8), + (6144, 6144, 32768, 16, 16, False, True, True): (2, 64, 1, 2), + (6144, 6144, 32768, 16, 16, True, False, True): (5, 128, 4, 1), + (6144, 6144, 32768, 32, 32, False, True, True): (4, 128, 1, 2), + (6144, 6144, 32768, 32, 32, True, False, True): (3, 128, 3, 4), + (6144, 6144, 32768, 64, 64, False, True, True): (4, 256, 3, 4), + (6144, 6144, 32768, 64, 64, True, False, True): (2, 64, 3, 8), + (6144, 6144, 32768, 128, 128, False, True, True): (8, 256, 1, 4), + (6144, 6144, 32768, 128, 128, True, False, True): (4, 256, 3, 8), + (6144, 6144, 65536, 16, 16, False, True, True): (2, 128, 1, 2), + (6144, 6144, 65536, 16, 16, True, False, True): (5, 256, 4, 1), + (6144, 6144, 65536, 32, 32, False, True, True): (4, 256, 1, 2), + (6144, 6144, 65536, 32, 32, True, False, True): (2, 256, 3, 4), + (6144, 6144, 65536, 64, 64, False, True, True): (4, 512, 3, 4), + (6144, 6144, 65536, 64, 64, True, False, True): (1, 128, 3, 8), + (6144, 6144, 65536, 128, 128, False, True, True): (4, 512, 1, 4), + (6144, 6144, 65536, 128, 128, True, False, True): (4, 512, 3, 8), + (6144, 6144, 131072, 16, 16, False, True, True): (2, 256, 1, 2), + (6144, 6144, 131072, 16, 16, True, False, True): (3, 512, 4, 4), + (6144, 6144, 131072, 32, 32, False, True, True): (4, 512, 1, 2), + (6144, 6144, 131072, 32, 32, True, False, True): (4, 512, 3, 4), + (6144, 6144, 131072, 64, 64, False, True, True): (4, 1024, 3, 4), + (6144, 6144, 131072, 64, 64, True, False, True): (2, 256, 3, 8), + (6144, 6144, 131072, 128, 128, False, True, True): (4, 1024, 1, 4), + (6144, 6144, 131072, 128, 128, True, False, True): (4, 1024, 3, 8), + (8192, 8192, 256, 16, 16, False, True, True): (2, 2, 6, 4), + (8192, 8192, 256, 16, 16, True, False, True): (2, 4, 2, 2), + (8192, 8192, 256, 32, 32, False, True, True): (4, 2, 3, 4), + (8192, 8192, 256, 32, 32, True, False, True): (4, 2, 3, 4), + (8192, 8192, 256, 64, 64, False, True, True): (2, 2, 3, 8), + (8192, 8192, 256, 64, 64, True, False, True): (6, 2, 3, 8), + (8192, 8192, 256, 128, 128, False, True, True): (3, 2, 1, 4), + (8192, 8192, 256, 128, 128, True, False, True): (1, 2, 1, 4), + (8192, 8192, 512, 16, 16, False, True, True): (4, 4, 
3, 2), + (8192, 8192, 512, 16, 16, True, False, True): (4, 4, 3, 4), + (8192, 8192, 512, 32, 32, False, True, True): (1, 4, 3, 4), + (8192, 8192, 512, 32, 32, True, False, True): (5, 4, 3, 2), + (8192, 8192, 512, 64, 64, False, True, True): (1, 4, 3, 4), + (8192, 8192, 512, 64, 64, True, False, True): (2, 2, 3, 8), + (8192, 8192, 512, 128, 128, False, True, True): (4, 4, 2, 8), + (8192, 8192, 512, 128, 128, True, False, True): (4, 4, 2, 8), + (8192, 8192, 1024, 16, 16, False, True, True): (4, 8, 4, 4), + (8192, 8192, 1024, 16, 16, True, False, True): (4, 8, 4, 4), + (8192, 8192, 1024, 32, 32, False, True, True): (2, 4, 4, 8), + (8192, 8192, 1024, 32, 32, True, False, True): (1, 4, 3, 4), + (8192, 8192, 1024, 64, 64, False, True, True): (4, 8, 3, 4), + (8192, 8192, 1024, 64, 64, True, False, True): (2, 8, 3, 4), + (8192, 8192, 1024, 128, 128, False, True, True): (4, 8, 2, 8), + (8192, 8192, 1024, 128, 128, True, False, True): (4, 8, 1, 4), + (8192, 8192, 2048, 16, 16, False, True, True): (2, 8, 4, 4), + (8192, 8192, 2048, 16, 16, True, False, True): (2, 8, 4, 4), + (8192, 8192, 2048, 32, 32, False, True, True): (2, 8, 4, 8), + (8192, 8192, 2048, 32, 32, True, False, True): (2, 8, 4, 8), + (8192, 8192, 2048, 64, 64, False, True, True): (4, 8, 2, 4), + (8192, 8192, 2048, 64, 64, True, False, True): (4, 16, 3, 4), + (8192, 8192, 2048, 128, 128, False, True, True): (6, 16, 1, 4), + (8192, 8192, 2048, 128, 128, True, False, True): (4, 16, 1, 4), + (8192, 8192, 4096, 16, 16, False, True, True): (4, 32, 4, 2), + (8192, 8192, 4096, 16, 16, True, False, True): (4, 32, 4, 2), + (8192, 8192, 4096, 32, 32, False, True, True): (2, 16, 4, 8), + (8192, 8192, 4096, 32, 32, True, False, True): (4, 16, 4, 8), + (8192, 8192, 4096, 64, 64, False, True, True): (4, 16, 2, 4), + (8192, 8192, 4096, 64, 64, True, False, True): (4, 16, 2, 4), + (8192, 8192, 4096, 128, 128, False, True, True): (6, 32, 1, 4), + (8192, 8192, 4096, 128, 128, True, False, True): (4, 32, 1, 4), + (8192, 8192, 8192, 16, 16, False, True, True): (4, 64, 4, 2), + (8192, 8192, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (8192, 8192, 8192, 32, 32, False, True, True): (2, 32, 4, 8), + (8192, 8192, 8192, 32, 32, True, False, True): (2, 32, 4, 8), + (8192, 8192, 8192, 64, 64, False, True, True): (2, 32, 2, 4), + (8192, 8192, 8192, 64, 64, True, False, True): (4, 32, 2, 4), + (8192, 8192, 8192, 128, 128, False, True, True): (6, 64, 1, 4), + (8192, 8192, 8192, 128, 128, True, False, True): (4, 64, 1, 4), + (8192, 8192, 16384, 16, 16, False, True, True): (4, 64, 3, 4), + (8192, 8192, 16384, 16, 16, True, False, True): (4, 64, 4, 4), + (8192, 8192, 16384, 32, 32, False, True, True): (4, 64, 4, 8), + (8192, 8192, 16384, 32, 32, True, False, True): (4, 64, 4, 8), + (8192, 8192, 16384, 64, 64, False, True, True): (4, 64, 2, 4), + (8192, 8192, 16384, 64, 64, True, False, True): (4, 64, 3, 8), + (8192, 8192, 16384, 128, 128, False, True, True): (6, 128, 1, 4), + (8192, 8192, 16384, 128, 128, True, False, True): (4, 128, 1, 4), + (8192, 8192, 32768, 16, 16, False, True, True): (3, 128, 4, 4), + (8192, 8192, 32768, 16, 16, True, False, True): (3, 128, 4, 4), + (8192, 8192, 32768, 32, 32, False, True, True): (2, 128, 4, 8), + (8192, 8192, 32768, 32, 32, True, False, True): (2, 128, 4, 8), + (8192, 8192, 32768, 64, 64, False, True, True): (2, 128, 2, 4), + (8192, 8192, 32768, 64, 64, True, False, True): (2, 128, 3, 8), + (8192, 8192, 32768, 128, 128, False, True, True): (6, 256, 1, 4), + (8192, 8192, 32768, 128, 128, True, False, True): (4, 256, 1, 4), 
+ (8192, 8192, 50432, 16, 16, False, True, True): (1, 197, 1, 1), + (8192, 8192, 50432, 16, 16, True, False, True): (3, 197, 4, 1), + (8192, 8192, 50432, 32, 32, False, True, True): (2, 197, 1, 4), + (8192, 8192, 50432, 32, 32, True, False, True): (2, 197, 3, 4), + (8192, 8192, 50432, 64, 64, False, True, True): (2, 394, 3, 4), + (8192, 8192, 65536, 16, 16, False, True, True): (3, 256, 4, 4), + (8192, 8192, 65536, 16, 16, True, False, True): (4, 256, 4, 4), + (8192, 8192, 65536, 32, 32, False, True, True): (2, 256, 4, 8), + (8192, 8192, 65536, 32, 32, True, False, True): (2, 256, 3, 8), + (8192, 8192, 65536, 64, 64, False, True, True): (2, 256, 2, 4), + (8192, 8192, 65536, 64, 64, True, False, True): (4, 256, 3, 8), + (8192, 8192, 65536, 128, 128, False, True, True): (6, 512, 1, 4), + (8192, 8192, 65536, 128, 128, True, False, True): (4, 512, 1, 4), + (8192, 8192, 65792, 16, 16, False, True, True): (1, 257, 1, 1), + (8192, 8192, 65792, 16, 16, True, False, True): (3, 257, 4, 1), + (8192, 8192, 65792, 32, 32, False, True, True): (2, 257, 1, 4), + (8192, 8192, 65792, 32, 32, True, False, True): (1, 257, 3, 4), + (8192, 8192, 65792, 64, 64, False, True, True): (2, 514, 3, 4), + (8192, 8192, 65792, 64, 64, True, False, True): (1, 257, 3, 4), + (8192, 8192, 65792, 128, 128, False, True, True): (2, 514, 1, 4), + (8192, 8192, 65792, 128, 128, True, False, True): (2, 514, 3, 8), + (8192, 8192, 131072, 16, 16, False, True, True): (4, 512, 4, 4), + (8192, 8192, 131072, 16, 16, True, False, True): (3, 512, 4, 4), + (8192, 8192, 131072, 32, 32, False, True, True): (2, 512, 4, 8), + (8192, 8192, 131072, 32, 32, True, False, True): (2, 512, 4, 8), + (8192, 8192, 131072, 64, 64, False, True, True): (2, 512, 2, 4), + (8192, 8192, 131072, 64, 64, True, False, True): (2, 512, 2, 4), + (8192, 8192, 131072, 128, 128, False, True, True): (4, 1024, 1, 4), + (8192, 8192, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), + (12288, 12288, 256, 16, 16, False, True, True): (4, 2, 1, 4), + (12288, 12288, 256, 16, 16, True, False, True): (1, 1, 3, 1), + (12288, 12288, 256, 32, 32, False, True, True): (4, 4, 1, 4), + (12288, 12288, 256, 32, 32, True, False, True): (2, 1, 3, 2), + (12288, 12288, 256, 64, 64, False, True, True): (4, 2, 3, 4), + (12288, 12288, 256, 64, 64, True, False, True): (3, 1, 3, 4), + (12288, 12288, 256, 128, 128, False, True, True): (6, 2, 1, 4), + (12288, 12288, 256, 128, 128, True, False, True): (4, 2, 3, 8), + (12288, 12288, 512, 16, 16, False, True, True): (4, 4, 1, 2), + (12288, 12288, 512, 16, 16, True, False, True): (4, 4, 4, 2), + (12288, 12288, 512, 32, 32, False, True, True): (4, 4, 4, 2), + (12288, 12288, 512, 32, 32, True, False, True): (2, 2, 3, 8), + (12288, 12288, 512, 64, 64, False, True, True): (4, 4, 3, 4), + (12288, 12288, 512, 64, 64, True, False, True): (8, 2, 3, 4), + (12288, 12288, 512, 128, 128, False, True, True): (4, 4, 3, 8), + (12288, 12288, 512, 128, 128, True, False, True): (4, 4, 3, 8), + (12288, 12288, 1024, 16, 16, False, True, True): (4, 8, 1, 2), + (12288, 12288, 1024, 16, 16, True, False, True): (2, 4, 4, 4), + (12288, 12288, 1024, 32, 32, False, True, True): (4, 4, 3, 4), + (12288, 12288, 1024, 32, 32, True, False, True): (1, 4, 3, 4), + (12288, 12288, 1024, 64, 64, False, True, True): (4, 8, 3, 4), + (12288, 12288, 1024, 64, 64, True, False, True): (2, 4, 3, 4), + (12288, 12288, 1024, 128, 128, False, True, True): (4, 8, 3, 8), + (12288, 12288, 1024, 128, 128, True, False, True): (4, 8, 3, 8), + (12288, 12288, 2048, 16, 16, False, True, True): (2, 4, 1, 
4), + (12288, 12288, 2048, 16, 16, True, False, True): (2, 8, 4, 4), + (12288, 12288, 2048, 32, 32, False, True, True): (4, 8, 1, 2), + (12288, 12288, 2048, 32, 32, True, False, True): (2, 8, 4, 8), + (12288, 12288, 2048, 64, 64, False, True, True): (4, 16, 3, 4), + (12288, 12288, 2048, 64, 64, True, False, True): (2, 8, 3, 4), + (12288, 12288, 2048, 128, 128, False, True, True): (4, 16, 3, 8), + (12288, 12288, 2048, 128, 128, True, False, True): (4, 16, 3, 8), + (12288, 12288, 4096, 16, 16, False, True, True): (2, 8, 1, 4), + (12288, 12288, 4096, 16, 16, True, False, True): (2, 16, 4, 4), + (12288, 12288, 4096, 32, 32, False, True, True): (2, 16, 1, 2), + (12288, 12288, 4096, 32, 32, True, False, True): (2, 16, 3, 4), + (12288, 12288, 4096, 64, 64, False, True, True): (4, 32, 3, 4), + (12288, 12288, 4096, 64, 64, True, False, True): (2, 16, 3, 4), + (12288, 12288, 4096, 128, 128, False, True, True): (4, 32, 1, 4), + (12288, 12288, 4096, 128, 128, True, False, True): (4, 32, 3, 8), + (12288, 12288, 8192, 16, 16, False, True, True): (2, 32, 1, 1), + (12288, 12288, 8192, 16, 16, True, False, True): (4, 64, 4, 2), + (12288, 12288, 8192, 32, 32, False, True, True): (2, 32, 1, 2), + (12288, 12288, 8192, 32, 32, True, False, True): (2, 32, 3, 2), + (12288, 12288, 8192, 64, 64, False, True, True): (4, 64, 3, 4), + (12288, 12288, 8192, 64, 64, True, False, True): (2, 32, 3, 4), + (12288, 12288, 8192, 128, 128, False, True, True): (4, 64, 3, 8), + (12288, 12288, 8192, 128, 128, True, False, True): (2, 64, 3, 8), + (12288, 12288, 16384, 16, 16, False, True, True): (4, 128, 1, 2), + (12288, 12288, 16384, 16, 16, True, False, True): (4, 128, 4, 2), + (12288, 12288, 16384, 32, 32, False, True, True): (2, 64, 1, 2), + (12288, 12288, 16384, 32, 32, True, False, True): (2, 64, 3, 4), + (12288, 12288, 16384, 64, 64, False, True, True): (4, 128, 3, 4), + (12288, 12288, 16384, 64, 64, True, False, True): (2, 64, 3, 4), + (12288, 12288, 16384, 128, 128, False, True, True): (4, 128, 1, 4), + (12288, 12288, 16384, 128, 128, True, False, True): (4, 128, 3, 8), + (12288, 12288, 32768, 16, 16, False, True, True): (2, 128, 1, 1), + (12288, 12288, 32768, 16, 16, True, False, True): (3, 128, 4, 1), + (12288, 12288, 32768, 32, 32, False, True, True): (2, 128, 1, 2), + (12288, 12288, 32768, 32, 32, True, False, True): (2, 128, 3, 2), + (12288, 12288, 32768, 64, 64, False, True, True): (4, 256, 3, 4), + (12288, 12288, 32768, 64, 64, True, False, True): (1, 64, 3, 8), + (12288, 12288, 32768, 128, 128, False, True, True): (4, 256, 3, 8), + (12288, 12288, 32768, 128, 128, True, False, True): (4, 256, 3, 8), + (12288, 12288, 65536, 16, 16, False, True, True): (4, 512, 1, 2), + (12288, 12288, 65536, 16, 16, True, False, True): (3, 256, 4, 1), + (12288, 12288, 65536, 32, 32, False, True, True): (2, 256, 1, 2), + (12288, 12288, 65536, 32, 32, True, False, True): (2, 256, 3, 2), + (12288, 12288, 65536, 64, 64, False, True, True): (4, 512, 3, 4), + (12288, 12288, 65536, 64, 64, True, False, True): (2, 256, 3, 4), + (12288, 12288, 65536, 128, 128, False, True, True): (4, 512, 1, 4), + (12288, 12288, 65536, 128, 128, True, False, True): (4, 512, 3, 8), + (12288, 12288, 131072, 16, 16, False, True, True): (2, 512, 1, 1), + (12288, 12288, 131072, 16, 16, True, False, True): (2, 512, 4, 4), + (12288, 12288, 131072, 32, 32, False, True, True): (2, 512, 1, 2), + (12288, 12288, 131072, 32, 32, True, False, True): (2, 512, 3, 4), + (12288, 12288, 131072, 64, 64, False, True, True): (4, 1024, 3, 4), + (12288, 12288, 131072, 64, 64, True, 
False, True): (2, 512, 3, 4), + (12288, 12288, 131072, 128, 128, False, True, True): (4, 1024, 3, 8), + (12288, 12288, 131072, 128, 128, True, False, True): (4, 1024, 3, 8), + (16384, 16384, 256, 16, 16, False, True, True): (2, 2, 3, 2), + (16384, 16384, 256, 16, 16, True, False, True): (2, 2, 6, 4), + (16384, 16384, 256, 32, 32, False, True, True): (4, 2, 3, 4), + (16384, 16384, 256, 32, 32, True, False, True): (4, 2, 3, 2), + (16384, 16384, 256, 64, 64, False, True, True): (2, 2, 5, 4), + (16384, 16384, 256, 64, 64, True, False, True): (2, 2, 3, 8), + (16384, 16384, 256, 128, 128, False, True, True): (4, 2, 2, 8), + (16384, 16384, 256, 128, 128, True, False, True): (2, 2, 1, 4), + (16384, 16384, 512, 16, 16, False, True, True): (1, 2, 4, 4), + (16384, 16384, 512, 16, 16, True, False, True): (1, 2, 4, 4), + (16384, 16384, 512, 32, 32, False, True, True): (2, 2, 3, 8), + (16384, 16384, 512, 32, 32, True, False, True): (2, 2, 4, 8), + (16384, 16384, 512, 64, 64, False, True, True): (4, 4, 3, 4), + (16384, 16384, 512, 64, 64, True, False, True): (2, 4, 3, 4), + (16384, 16384, 512, 128, 128, False, True, True): (4, 4, 2, 8), + (16384, 16384, 512, 128, 128, True, False, True): (4, 4, 2, 8), + (16384, 16384, 1024, 16, 16, False, True, True): (4, 8, 4, 4), + (16384, 16384, 1024, 16, 16, True, False, True): (2, 4, 4, 4), + (16384, 16384, 1024, 32, 32, False, True, True): (2, 4, 4, 8), + (16384, 16384, 1024, 32, 32, True, False, True): (2, 4, 4, 8), + (16384, 16384, 1024, 64, 64, False, True, True): (4, 4, 2, 4), + (16384, 16384, 1024, 64, 64, True, False, True): (2, 4, 2, 4), + (16384, 16384, 1024, 128, 128, False, True, True): (6, 8, 1, 4), + (16384, 16384, 1024, 128, 128, True, False, True): (4, 8, 1, 4), + (16384, 16384, 2048, 16, 16, False, True, True): (2, 8, 4, 4), + (16384, 16384, 2048, 16, 16, True, False, True): (2, 8, 4, 4), + (16384, 16384, 2048, 32, 32, False, True, True): (2, 8, 4, 8), + (16384, 16384, 2048, 32, 32, True, False, True): (2, 8, 4, 8), + (16384, 16384, 2048, 64, 64, False, True, True): (2, 8, 2, 4), + (16384, 16384, 2048, 64, 64, True, False, True): (2, 8, 2, 4), + (16384, 16384, 2048, 128, 128, False, True, True): (4, 16, 2, 8), + (16384, 16384, 2048, 128, 128, True, False, True): (4, 16, 1, 4), + (16384, 16384, 4096, 16, 16, False, True, True): (2, 16, 4, 4), + (16384, 16384, 4096, 16, 16, True, False, True): (2, 16, 4, 4), + (16384, 16384, 4096, 32, 32, False, True, True): (1, 16, 4, 8), + (16384, 16384, 4096, 32, 32, True, False, True): (2, 16, 3, 4), + (16384, 16384, 4096, 64, 64, False, True, True): (1, 16, 2, 4), + (16384, 16384, 4096, 64, 64, True, False, True): (2, 16, 2, 4), + (16384, 16384, 4096, 128, 128, False, True, True): (4, 32, 2, 8), + (16384, 16384, 4096, 128, 128, True, False, True): (4, 32, 1, 4), + (16384, 16384, 8192, 16, 16, False, True, True): (2, 64, 4, 2), + (16384, 16384, 8192, 16, 16, True, False, True): (2, 64, 4, 2), + (16384, 16384, 8192, 32, 32, False, True, True): (2, 32, 4, 8), + (16384, 16384, 8192, 32, 32, True, False, True): (2, 32, 4, 8), + (16384, 16384, 8192, 64, 64, False, True, True): (2, 32, 2, 4), + (16384, 16384, 8192, 64, 64, True, False, True): (2, 32, 4, 8), + (16384, 16384, 8192, 128, 128, False, True, True): (4, 64, 2, 8), + (16384, 16384, 8192, 128, 128, True, False, True): (4, 64, 1, 4), + (16384, 16384, 16384, 16, 16, False, True, True): (1, 64, 4, 4), + (16384, 16384, 16384, 16, 16, True, False, True): (1, 64, 4, 4), + (16384, 16384, 16384, 32, 32, False, True, True): (1, 64, 4, 8), + (16384, 16384, 16384, 32, 32, 
True, False, True): (1, 64, 4, 8), + (16384, 16384, 16384, 64, 64, False, True, True): (1, 64, 2, 4), + (16384, 16384, 16384, 64, 64, True, False, True): (1, 64, 3, 8), + (16384, 16384, 16384, 128, 128, False, True, True): (4, 128, 1, 4), + (16384, 16384, 16384, 128, 128, True, False, True): (4, 128, 1, 4), + (16384, 16384, 32768, 16, 16, False, True, True): (1, 128, 4, 4), + (16384, 16384, 32768, 16, 16, True, False, True): (1, 128, 4, 4), + (16384, 16384, 32768, 32, 32, False, True, True): (1, 128, 3, 4), + (16384, 16384, 32768, 32, 32, True, False, True): (1, 128, 3, 8), + (16384, 16384, 32768, 64, 64, False, True, True): (2, 128, 2, 4), + (16384, 16384, 32768, 64, 64, True, False, True): (1, 128, 4, 8), + (16384, 16384, 32768, 128, 128, False, True, True): (4, 256, 2, 8), + (16384, 16384, 32768, 128, 128, True, False, True): (4, 256, 1, 4), + (16384, 16384, 65536, 16, 16, False, True, True): (1, 256, 3, 4), + (16384, 16384, 65536, 16, 16, True, False, True): (1, 256, 4, 4), + (16384, 16384, 65536, 32, 32, False, True, True): (1, 256, 4, 8), + (16384, 16384, 65536, 32, 32, True, False, True): (1, 256, 3, 4), + (16384, 16384, 65536, 64, 64, False, True, True): (2, 256, 2, 4), + (16384, 16384, 65536, 64, 64, True, False, True): (1, 256, 3, 8), + (16384, 16384, 65536, 128, 128, False, True, True): (4, 512, 2, 8), + (16384, 16384, 65536, 128, 128, True, False, True): (4, 512, 1, 4), + (16384, 16384, 65792, 16, 16, False, True, True): (1, 257, 1, 1), + (16384, 16384, 65792, 16, 16, True, False, True): (1, 257, 4, 1), + (16384, 16384, 65792, 32, 32, False, True, True): (1, 257, 1, 4), + (16384, 16384, 65792, 32, 32, True, False, True): (1, 257, 3, 4), + (16384, 16384, 65792, 64, 64, False, True, True): (2, 514, 3, 4), + (16384, 16384, 65792, 64, 64, True, False, True): (1, 257, 3, 4), + (16384, 16384, 65792, 128, 128, False, True, True): (2, 514, 3, 8), + (16384, 16384, 65792, 128, 128, True, False, True): (2, 514, 3, 8), + (16384, 16384, 131072, 16, 16, False, True, True): (1, 512, 4, 4), + (16384, 16384, 131072, 16, 16, True, False, True): (1, 512, 3, 2), + (16384, 16384, 131072, 32, 32, False, True, True): (1, 512, 4, 8), + (16384, 16384, 131072, 32, 32, True, False, True): (1, 512, 3, 2), + (16384, 16384, 131072, 64, 64, False, True, True): (1, 512, 2, 4), + (16384, 16384, 131072, 64, 64, True, False, True): (1, 512, 2, 4), + (16384, 16384, 131072, 128, 128, False, True, True): (4, 1024, 1, 4), + (16384, 16384, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), + (24576, 24576, 256, 16, 16, False, True, True): (6, 2, 1, 2), + (24576, 24576, 256, 16, 16, True, False, True): (2, 2, 5, 4), + (24576, 24576, 256, 32, 32, False, True, True): (4, 4, 1, 4), + (24576, 24576, 256, 32, 32, True, False, True): (2, 2, 4, 2), + (24576, 24576, 256, 64, 64, False, True, True): (2, 2, 3, 4), + (24576, 24576, 256, 64, 64, True, False, True): (1, 1, 3, 4), + (24576, 24576, 256, 128, 128, False, True, True): (6, 2, 1, 4), + (24576, 24576, 256, 128, 128, True, False, True): (2, 2, 3, 8), + (24576, 24576, 512, 16, 16, False, True, True): (4, 4, 1, 2), + (24576, 24576, 512, 16, 16, True, False, True): (2, 2, 4, 4), + (24576, 24576, 512, 32, 32, False, True, True): (1, 2, 3, 4), + (24576, 24576, 512, 32, 32, True, False, True): (1, 2, 3, 4), + (24576, 24576, 512, 64, 64, False, True, True): (4, 4, 3, 4), + (24576, 24576, 512, 64, 64, True, False, True): (1, 2, 3, 4), + (24576, 24576, 512, 128, 128, False, True, True): (4, 4, 3, 8), + (24576, 24576, 512, 128, 128, True, False, True): (4, 4, 3, 8), + (24576, 
24576, 1024, 16, 16, False, True, True): (2, 8, 1, 2), + (24576, 24576, 1024, 16, 16, True, False, True): (2, 4, 4, 4), + (24576, 24576, 1024, 32, 32, False, True, True): (2, 4, 1, 2), + (24576, 24576, 1024, 32, 32, True, False, True): (1, 4, 3, 4), + (24576, 24576, 1024, 64, 64, False, True, True): (4, 8, 3, 4), + (24576, 24576, 1024, 64, 64, True, False, True): (1, 4, 3, 4), + (24576, 24576, 1024, 128, 128, False, True, True): (4, 8, 3, 8), + (24576, 24576, 1024, 128, 128, True, False, True): (4, 8, 3, 8), + (24576, 24576, 2048, 16, 16, False, True, True): (1, 4, 1, 4), + (24576, 24576, 2048, 16, 16, True, False, True): (1, 8, 4, 4), + (24576, 24576, 2048, 32, 32, False, True, True): (2, 8, 1, 2), + (24576, 24576, 2048, 32, 32, True, False, True): (1, 8, 3, 4), + (24576, 24576, 2048, 64, 64, False, True, True): (4, 16, 3, 4), + (24576, 24576, 2048, 64, 64, True, False, True): (1, 4, 3, 8), + (24576, 24576, 2048, 128, 128, False, True, True): (4, 16, 3, 8), + (24576, 24576, 2048, 128, 128, True, False, True): (2, 16, 3, 8), + (24576, 24576, 4096, 16, 16, False, True, True): (2, 32, 1, 2), + (24576, 24576, 4096, 16, 16, True, False, True): (1, 16, 4, 4), + (24576, 24576, 4096, 32, 32, False, True, True): (1, 16, 1, 2), + (24576, 24576, 4096, 32, 32, True, False, True): (1, 16, 3, 4), + (24576, 24576, 4096, 64, 64, False, True, True): (4, 32, 3, 4), + (24576, 24576, 4096, 64, 64, True, False, True): (1, 8, 3, 8), + (24576, 24576, 4096, 128, 128, False, True, True): (4, 32, 3, 8), + (24576, 24576, 4096, 128, 128, True, False, True): (2, 32, 3, 8), + (24576, 24576, 8192, 16, 16, False, True, True): (1, 32, 1, 1), + (24576, 24576, 8192, 16, 16, True, False, True): (2, 64, 4, 2), + (24576, 24576, 8192, 32, 32, False, True, True): (1, 32, 1, 2), + (24576, 24576, 8192, 32, 32, True, False, True): (1, 32, 3, 4), + (24576, 24576, 8192, 64, 64, False, True, True): (4, 64, 3, 4), + (24576, 24576, 8192, 64, 64, True, False, True): (1, 32, 3, 4), + (24576, 24576, 8192, 128, 128, False, True, True): (4, 64, 3, 8), + (24576, 24576, 8192, 128, 128, True, False, True): (4, 64, 3, 8), + (24576, 24576, 16384, 16, 16, False, True, True): (2, 128, 1, 2), + (24576, 24576, 16384, 16, 16, True, False, True): (1, 64, 4, 4), + (24576, 24576, 16384, 32, 32, False, True, True): (1, 64, 1, 2), + (24576, 24576, 16384, 32, 32, True, False, True): (1, 64, 3, 2), + (24576, 24576, 16384, 64, 64, False, True, True): (2, 128, 3, 4), + (24576, 24576, 16384, 64, 64, True, False, True): (1, 32, 3, 8), + (24576, 24576, 16384, 128, 128, False, True, True): (4, 128, 3, 8), + (24576, 24576, 16384, 128, 128, True, False, True): (4, 128, 3, 8), + (24576, 24576, 32768, 16, 16, False, True, True): (1, 128, 1, 1), + (24576, 24576, 32768, 16, 16, True, False, True): (1, 128, 4, 4), + (24576, 24576, 32768, 32, 32, False, True, True): (1, 128, 1, 2), + (24576, 24576, 32768, 32, 32, True, False, True): (1, 128, 3, 4), + (24576, 24576, 32768, 64, 64, False, True, True): (2, 256, 3, 4), + (24576, 24576, 32768, 64, 64, True, False, True): (1, 128, 3, 4), + (24576, 24576, 32768, 128, 128, False, True, True): (4, 256, 3, 8), + (24576, 24576, 32768, 128, 128, True, False, True): (2, 256, 3, 8), + (24576, 24576, 65536, 16, 16, False, True, True): (2, 512, 1, 2), + (24576, 24576, 65536, 16, 16, True, False, True): (1, 256, 4, 4), + (32768, 32768, 256, 16, 16, False, True, True): (4, 2, 1, 2), + (32768, 32768, 256, 16, 16, True, False, True): (2, 2, 5, 4), + (32768, 32768, 256, 32, 32, False, True, True): (4, 2, 4, 2), + (32768, 32768, 256, 32, 32, 
True, False, True): (1, 1, 4, 8), + (32768, 32768, 256, 64, 64, False, True, True): (2, 2, 3, 4), + (32768, 32768, 256, 64, 64, True, False, True): (1, 1, 3, 8), + (32768, 32768, 256, 128, 128, False, True, True): (2, 2, 3, 8), + (32768, 32768, 256, 128, 128, True, False, True): (2, 2, 3, 8), + (32768, 32768, 512, 16, 16, False, True, True): (2, 2, 1, 4), + (32768, 32768, 512, 16, 16, True, False, True): (2, 2, 4, 2), + (32768, 32768, 512, 32, 32, False, True, True): (1, 2, 3, 4), + (32768, 32768, 512, 32, 32, True, False, True): (1, 2, 4, 8), + (32768, 32768, 512, 64, 64, False, True, True): (4, 4, 3, 4), + (32768, 32768, 512, 64, 64, True, False, True): (1, 2, 3, 4), + (32768, 32768, 512, 128, 128, False, True, True): (4, 4, 3, 8), + (32768, 32768, 512, 128, 128, True, False, True): (4, 4, 3, 8), + (32768, 32768, 1024, 16, 16, False, True, True): (2, 4, 1, 1), + (32768, 32768, 1024, 16, 16, True, False, True): (1, 4, 4, 2), + (32768, 32768, 1024, 32, 32, False, True, True): (2, 4, 1, 4), + (32768, 32768, 1024, 32, 32, True, False, True): (1, 4, 3, 4), + (32768, 32768, 1024, 64, 64, False, True, True): (4, 8, 3, 4), + (32768, 32768, 1024, 64, 64, True, False, True): (1, 4, 3, 4), + (32768, 32768, 1024, 128, 128, False, True, True): (4, 8, 3, 8), + (32768, 32768, 1024, 128, 128, True, False, True): (4, 8, 3, 8), + (32768, 32768, 2048, 16, 16, False, True, True): (1, 8, 1, 4), + (32768, 32768, 2048, 16, 16, True, False, True): (1, 8, 4, 4), + (32768, 32768, 2048, 32, 32, False, True, True): (2, 8, 1, 4), + (32768, 32768, 2048, 32, 32, True, False, True): (1, 8, 3, 4), + (32768, 32768, 2048, 64, 64, False, True, True): (4, 16, 3, 4), + (32768, 32768, 2048, 64, 64, True, False, True): (1, 8, 3, 4), + (32768, 32768, 2048, 128, 128, False, True, True): (4, 16, 3, 8), + (32768, 32768, 2048, 128, 128, True, False, True): (2, 16, 3, 8), + (32768, 32768, 4096, 16, 16, False, True, True): (1, 16, 1, 4), + (32768, 32768, 4096, 16, 16, True, False, True): (1, 16, 4, 4), + (32768, 32768, 4096, 32, 32, False, True, True): (2, 16, 1, 4), + (32768, 32768, 4096, 32, 32, True, False, True): (1, 16, 3, 4), + (32768, 32768, 4096, 64, 64, False, True, True): (2, 32, 3, 4), + (32768, 32768, 4096, 64, 64, True, False, True): (1, 16, 3, 4), + (32768, 32768, 4096, 128, 128, False, True, True): (4, 32, 3, 8), + (32768, 32768, 4096, 128, 128, True, False, True): (4, 32, 3, 8), + (32768, 32768, 8192, 16, 16, False, True, True): (1, 32, 1, 4), + (32768, 32768, 8192, 16, 16, True, False, True): (2, 64, 4, 1), + (32768, 32768, 8192, 32, 32, False, True, True): (2, 32, 1, 4), + (32768, 32768, 8192, 32, 32, True, False, True): (1, 32, 3, 4), + (32768, 32768, 8192, 64, 64, False, True, True): (2, 64, 3, 4), + (32768, 32768, 8192, 64, 64, True, False, True): (1, 32, 3, 4), + (32768, 32768, 8192, 128, 128, False, True, True): (4, 64, 3, 8), + (32768, 32768, 8192, 128, 128, True, False, True): (2, 64, 3, 8), + (32768, 32768, 16384, 16, 16, False, True, True): (1, 64, 1, 4), + (32768, 32768, 16384, 16, 16, True, False, True): (1, 64, 4, 1), + (32768, 32768, 16384, 32, 32, False, True, True): (2, 64, 1, 4), + (32768, 32768, 16384, 32, 32, True, False, True): (1, 64, 3, 4), + (32768, 32768, 16384, 64, 64, False, True, True): (2, 128, 3, 4), + (32768, 32768, 16384, 64, 64, True, False, True): (1, 64, 3, 4), + (32768, 32768, 16384, 128, 128, False, True, True): (4, 128, 3, 8), + (32768, 32768, 16384, 128, 128, True, False, True): (2, 128, 3, 8), + (32768, 32768, 32768, 16, 16, False, True, True): (1, 128, 1, 4), + (32768, 32768, 
32768, 16, 16, True, False, True): (1, 128, 4, 1), + (32768, 32768, 32768, 32, 32, False, True, True): (2, 128, 1, 4), + (32768, 32768, 32768, 32, 32, True, False, True): (1, 128, 3, 4), + (32768, 32768, 32768, 64, 64, False, True, True): (2, 256, 3, 4), + (32768, 32768, 32768, 64, 64, True, False, True): (1, 128, 3, 4), + (32768, 32768, 32768, 128, 128, False, True, True): (2, 256, 3, 8), + (32768, 32768, 32768, 128, 128, True, False, True): (4, 256, 3, 8), + (32768, 32768, 65536, 16, 16, False, True, True): (1, 256, 1, 4), + (32768, 32768, 65536, 16, 16, True, False, True): (1, 256, 4, 1), + (32768, 32768, 65536, 32, 32, False, True, True): (1, 256, 3, 4), + (32768, 32768, 65536, 32, 32, True, False, True): (1, 256, 3, 4), + (32768, 32768, 65536, 64, 64, False, True, True): (1, 512, 3, 4), + (32768, 32768, 65536, 64, 64, True, False, True): (1, 256, 3, 4), + (32768, 32768, 65536, 128, 128, False, True, True): (4, 512, 1, 4), + (32768, 32768, 65536, 128, 128, True, False, True): (2, 512, 3, 8), + }, + ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.float16, 0.56)): { + (192, 192, 256, 64, 64, False, True, True): (1, 4, 3, 4), + (192, 192, 256, 64, 64, True, False, True): (1, 4, 3, 4), + (192, 192, 512, 64, 64, False, True, True): (1, 8, 5, 4), + (192, 192, 512, 64, 64, True, False, True): (1, 8, 3, 4), + (192, 192, 1024, 64, 64, False, True, True): (1, 16, 3, 2), + (192, 192, 1024, 64, 64, True, False, True): (1, 16, 3, 4), + (192, 192, 2048, 64, 64, False, True, True): (1, 32, 5, 4), + (192, 192, 2048, 64, 64, True, False, True): (4, 32, 5, 4), + (192, 192, 4096, 64, 64, False, True, True): (1, 64, 1, 8), + (192, 192, 4096, 64, 64, True, False, True): (1, 32, 3, 4), + (192, 192, 8192, 64, 64, False, True, True): (4, 128, 1, 4), + (192, 192, 8192, 64, 64, True, False, True): (3, 64, 3, 4), + (192, 192, 16384, 64, 64, False, True, True): (1, 256, 1, 4), + (192, 192, 16384, 64, 64, True, False, True): (3, 64, 2, 4), + (192, 192, 32768, 64, 64, False, True, True): (1, 512, 1, 2), + (192, 192, 32768, 64, 64, True, False, True): (2, 256, 2, 4), + (192, 192, 65536, 64, 64, False, True, True): (1, 512, 1, 4), + (192, 192, 65536, 64, 64, True, False, True): (2, 512, 2, 4), + (192, 192, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), + (192, 192, 131072, 64, 64, True, False, True): (1, 512, 3, 4), + (384, 384, 256, 128, 128, False, True, True): (3, 2, 3, 8), + (384, 384, 256, 128, 128, True, False, True): (5, 2, 3, 8), + (384, 384, 512, 128, 128, False, True, True): (4, 4, 3, 8), + (384, 384, 512, 128, 128, True, False, True): (1, 4, 3, 8), + (384, 384, 1024, 128, 128, False, True, True): (1, 8, 3, 8), + (384, 384, 1024, 128, 128, True, False, True): (1, 8, 2, 8), + (384, 384, 2048, 128, 128, False, True, True): (3, 16, 3, 8), + (384, 384, 2048, 128, 128, True, False, True): (1, 16, 3, 8), + (384, 384, 4096, 128, 128, False, True, True): (3, 32, 3, 8), + (384, 384, 4096, 128, 128, True, False, True): (3, 32, 3, 8), + (384, 384, 8192, 128, 128, False, True, True): (2, 64, 3, 8), + (384, 384, 8192, 128, 128, True, False, True): (2, 64, 2, 4), + (384, 384, 16384, 128, 128, False, True, True): (1, 128, 2, 8), + (384, 384, 16384, 128, 128, True, False, True): (3, 128, 2, 4), + (384, 384, 32768, 128, 128, False, True, True): (2, 256, 3, 8), + (384, 384, 32768, 128, 128, True, False, True): (1, 256, 2, 4), + (384, 384, 65536, 128, 128, False, True, True): (7, 512, 1, 4), + (384, 384, 65536, 128, 128, True, False, True): (3, 512, 2, 4), + (384, 384, 131072, 128, 128, False, True, True): (5, 
1024, 1, 4), + (384, 384, 131072, 128, 128, True, False, True): (1, 1024, 2, 4), + }, + ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.float32, 0.5)): { + (16, 16, 16, 16, 16, False, False, False): (2, 1, 1, 16), + (16, 16, 16, 16, 16, False, False, True): (1, 1, 2, 4), + (16, 16, 16, 16, 16, False, True, False): (1, 1, 2, 16), + (16, 16, 16, 16, 16, False, True, True): (2, 1, 2, 8), + (16, 16, 16, 16, 16, True, False, False): (1, 1, 1, 2), + (16, 16, 16, 16, 16, True, False, True): (2, 1, 1, 4), + (16, 16, 32, 16, 16, False, False, False): (1, 1, 1, 2), + (16, 16, 32, 16, 16, False, False, True): (1, 1, 2, 8), + (16, 16, 32, 16, 16, False, True, False): (1, 2, 1, 4), + (16, 16, 32, 16, 16, False, True, True): (1, 2, 2, 4), + (16, 16, 32, 16, 16, True, False, False): (1, 1, 2, 4), + (16, 16, 32, 16, 16, True, False, True): (1, 2, 2, 4), + (16, 16, 64, 16, 16, False, False, False): (1, 4, 1, 4), + (16, 16, 64, 16, 16, False, False, True): (2, 2, 1, 4), + (16, 16, 64, 16, 16, False, True, False): (1, 4, 1, 4), + (16, 16, 64, 16, 16, False, True, True): (1, 4, 1, 8), + (16, 16, 64, 16, 16, True, False, False): (1, 2, 1, 4), + (16, 16, 64, 16, 16, True, False, True): (1, 4, 2, 8), + (16, 32, 16, 16, 16, False, False, False): (1, 1, 2, 8), + (16, 32, 16, 16, 16, False, False, True): (2, 1, 1, 4), + (16, 32, 16, 16, 16, False, True, False): (1, 1, 1, 4), + (16, 32, 16, 16, 16, False, True, True): (1, 1, 1, 4), + (16, 32, 16, 16, 16, True, False, False): (1, 1, 1, 4), + (16, 32, 16, 16, 16, True, False, True): (1, 1, 2, 8), + (16, 32, 16, 16, 32, False, False, False): (1, 1, 2, 4), + (16, 32, 16, 16, 32, False, False, True): (2, 1, 2, 2), + (16, 32, 16, 16, 32, False, True, False): (1, 1, 1, 8), + (16, 32, 16, 16, 32, False, True, True): (1, 1, 1, 2), + (16, 32, 16, 16, 32, True, False, False): (3, 1, 1, 4), + (16, 32, 16, 16, 32, True, False, True): (1, 1, 1, 4), + (16, 32, 32, 16, 16, False, False, False): (1, 2, 1, 4), + (16, 32, 32, 16, 16, False, False, True): (2, 2, 1, 4), + (16, 32, 32, 16, 16, False, True, False): (1, 2, 1, 2), + (16, 32, 32, 16, 16, False, True, True): (1, 2, 1, 4), + (16, 32, 32, 16, 16, True, False, False): (1, 2, 1, 4), + (16, 32, 32, 16, 16, True, False, True): (1, 2, 1, 4), + (16, 32, 32, 16, 32, False, False, False): (1, 1, 2, 4), + (16, 32, 32, 16, 32, False, False, True): (1, 2, 1, 4), + (16, 32, 32, 16, 32, False, True, False): (1, 2, 2, 8), + (16, 32, 32, 16, 32, False, True, True): (1, 2, 1, 1), + (16, 32, 32, 16, 32, True, False, False): (1, 2, 1, 2), + (16, 32, 32, 16, 32, True, False, True): (1, 2, 1, 4), + (16, 32, 64, 16, 16, False, False, False): (1, 2, 1, 4), + (16, 32, 64, 16, 16, False, False, True): (2, 4, 1, 4), + (16, 32, 64, 16, 16, False, True, False): (1, 4, 2, 4), + (16, 32, 64, 16, 16, False, True, True): (1, 4, 1, 4), + (16, 32, 64, 16, 16, True, False, False): (1, 2, 2, 8), + (16, 32, 64, 16, 16, True, False, True): (1, 4, 1, 2), + (16, 32, 64, 16, 32, False, False, False): (1, 4, 1, 4), + (16, 32, 64, 16, 32, False, False, True): (1, 4, 3, 4), + (16, 32, 64, 16, 32, False, True, False): (1, 2, 1, 4), + (16, 32, 64, 16, 32, False, True, True): (1, 4, 1, 4), + (16, 32, 64, 16, 32, True, False, False): (1, 2, 1, 8), + (16, 32, 64, 16, 32, True, False, True): (1, 2, 1, 4), + (16, 64, 16, 16, 32, False, False, False): (1, 1, 1, 2), + (16, 64, 16, 16, 32, False, False, True): (1, 1, 1, 8), + (16, 64, 16, 16, 32, False, True, False): (1, 1, 1, 8), + (16, 64, 16, 16, 32, False, True, True): (1, 1, 1, 4), + (16, 64, 16, 16, 32, True, False, 
False): (1, 1, 1, 8), + (16, 64, 16, 16, 32, True, False, True): (1, 1, 1, 4), + (16, 64, 32, 16, 32, False, False, False): (1, 2, 1, 4), + (16, 64, 32, 16, 32, False, False, True): (1, 1, 1, 4), + (16, 64, 32, 16, 32, False, True, False): (1, 2, 1, 1), + (16, 64, 32, 16, 32, False, True, True): (1, 2, 1, 8), + (16, 64, 32, 16, 32, True, False, False): (2, 2, 1, 4), + (16, 64, 32, 16, 32, True, False, True): (2, 2, 1, 4), + (16, 64, 64, 16, 32, False, False, False): (1, 2, 1, 4), + (16, 64, 64, 16, 32, False, False, True): (1, 4, 1, 4), + (16, 64, 64, 16, 32, False, True, False): (1, 4, 1, 4), + (16, 64, 64, 16, 32, False, True, True): (1, 4, 1, 4), + (16, 64, 64, 16, 32, True, False, False): (1, 4, 1, 2), + (16, 64, 64, 16, 32, True, False, True): (3, 4, 1, 4), + (32, 16, 16, 16, 16, False, False, False): (1, 1, 2, 4), + (32, 16, 16, 16, 16, False, False, True): (1, 1, 1, 2), + (32, 16, 16, 16, 16, False, True, False): (1, 1, 2, 4), + (32, 16, 16, 16, 16, False, True, True): (1, 1, 2, 4), + (32, 16, 16, 16, 16, True, False, False): (1, 1, 3, 8), + (32, 16, 16, 16, 16, True, False, True): (1, 1, 2, 4), + (32, 16, 32, 16, 16, False, False, False): (1, 2, 1, 4), + (32, 16, 32, 16, 16, False, False, True): (1, 2, 3, 4), + (32, 16, 32, 16, 16, False, True, False): (1, 1, 1, 8), + (32, 16, 32, 16, 16, False, True, True): (1, 2, 1, 4), + (32, 16, 32, 16, 16, True, False, False): (1, 1, 1, 2), + (32, 16, 32, 16, 16, True, False, True): (1, 1, 1, 4), + (32, 16, 64, 16, 16, False, False, False): (1, 4, 1, 4), + (32, 16, 64, 16, 16, False, False, True): (3, 4, 1, 4), + (32, 16, 64, 16, 16, False, True, False): (1, 4, 1, 1), + (32, 16, 64, 16, 16, False, True, True): (1, 4, 1, 4), + (32, 16, 64, 16, 16, True, False, False): (1, 4, 1, 4), + (32, 16, 64, 16, 16, True, False, True): (1, 4, 1, 4), + (32, 32, 16, 16, 16, False, False, False): (1, 1, 1, 2), + (32, 32, 16, 16, 16, False, False, True): (2, 1, 1, 4), + (32, 32, 16, 16, 16, False, True, False): (1, 1, 1, 2), + (32, 32, 16, 16, 16, False, True, True): (2, 1, 1, 4), + (32, 32, 16, 16, 16, True, False, False): (3, 1, 2, 4), + (32, 32, 16, 16, 16, True, False, True): (1, 1, 2, 4), + (32, 32, 16, 16, 32, False, False, False): (2, 1, 1, 2), + (32, 32, 16, 16, 32, False, False, True): (1, 1, 1, 4), + (32, 32, 16, 16, 32, False, True, False): (1, 1, 1, 4), + (32, 32, 16, 16, 32, False, True, True): (1, 1, 1, 8), + (32, 32, 16, 16, 32, True, False, False): (1, 1, 1, 8), + (32, 32, 16, 16, 32, True, False, True): (1, 1, 1, 4), + (32, 32, 16, 32, 32, False, False, False): (2, 1, 1, 4), + (32, 32, 16, 32, 32, False, False, True): (1, 1, 2, 4), + (32, 32, 16, 32, 32, False, True, False): (2, 1, 1, 1), + (32, 32, 16, 32, 32, False, True, True): (2, 1, 2, 4), + (32, 32, 16, 32, 32, True, False, False): (1, 1, 1, 8), + (32, 32, 16, 32, 32, True, False, True): (1, 1, 1, 4), + (32, 32, 32, 16, 16, False, False, False): (1, 1, 1, 4), + (32, 32, 32, 16, 16, False, False, True): (1, 2, 1, 2), + (32, 32, 32, 16, 16, False, True, False): (2, 2, 1, 4), + (32, 32, 32, 16, 16, False, True, True): (1, 2, 2, 4), + (32, 32, 32, 16, 16, True, False, False): (1, 2, 1, 4), + (32, 32, 32, 16, 16, True, False, True): (2, 2, 1, 4), + (32, 32, 32, 16, 32, False, False, False): (1, 2, 1, 4), + (32, 32, 32, 16, 32, False, False, True): (1, 2, 1, 4), + (32, 32, 32, 16, 32, False, True, False): (1, 2, 1, 4), + (32, 32, 32, 16, 32, False, True, True): (1, 2, 1, 4), + (32, 32, 32, 16, 32, True, False, False): (2, 1, 1, 2), + (32, 32, 32, 16, 32, True, False, True): (2, 2, 2, 4), + 
(32, 32, 32, 32, 32, False, False, False): (1, 1, 1, 4), + (32, 32, 32, 32, 32, False, False, True): (1, 1, 1, 2), + (32, 32, 32, 32, 32, False, True, False): (1, 1, 1, 4), + (32, 32, 32, 32, 32, False, True, True): (1, 1, 2, 2), + (32, 32, 32, 32, 32, True, False, False): (1, 1, 1, 2), + (32, 32, 32, 32, 32, True, False, True): (1, 1, 2, 1), + (32, 32, 64, 16, 16, False, False, False): (2, 4, 1, 4), + (32, 32, 64, 16, 16, False, False, True): (1, 4, 2, 4), + (32, 32, 64, 16, 16, False, True, False): (1, 4, 1, 4), + (32, 32, 64, 16, 16, False, True, True): (1, 4, 1, 4), + (32, 32, 64, 16, 16, True, False, False): (1, 2, 1, 4), + (32, 32, 64, 16, 16, True, False, True): (2, 4, 1, 4), + (32, 32, 64, 16, 32, False, False, False): (1, 4, 1, 8), + (32, 32, 64, 16, 32, False, False, True): (1, 4, 1, 4), + (32, 32, 64, 16, 32, False, True, False): (1, 4, 1, 4), + (32, 32, 64, 16, 32, False, True, True): (2, 4, 1, 4), + (32, 32, 64, 16, 32, True, False, False): (1, 2, 2, 4), + (32, 32, 64, 16, 32, True, False, True): (2, 4, 1, 4), + (32, 32, 64, 32, 32, False, False, False): (2, 2, 1, 4), + (32, 32, 64, 32, 32, False, False, True): (1, 1, 1, 4), + (32, 32, 64, 32, 32, False, True, False): (1, 1, 1, 8), + (32, 32, 64, 32, 32, False, True, True): (2, 1, 1, 4), + (32, 32, 64, 32, 32, True, False, False): (1, 1, 1, 4), + (32, 32, 64, 32, 32, True, False, True): (1, 2, 1, 1), + (32, 64, 16, 16, 32, False, False, False): (1, 1, 2, 2), + (32, 64, 16, 16, 32, False, False, True): (2, 1, 1, 4), + (32, 64, 16, 16, 32, False, True, False): (1, 1, 1, 8), + (32, 64, 16, 16, 32, False, True, True): (1, 1, 3, 4), + (32, 64, 16, 16, 32, True, False, False): (1, 1, 1, 2), + (32, 64, 16, 16, 32, True, False, True): (1, 1, 2, 4), + (32, 64, 16, 32, 32, False, False, False): (1, 1, 1, 2), + (32, 64, 16, 32, 32, False, False, True): (1, 1, 3, 4), + (32, 64, 16, 32, 32, False, True, False): (1, 1, 2, 4), + (32, 64, 16, 32, 32, False, True, True): (1, 1, 1, 8), + (32, 64, 16, 32, 32, True, False, False): (1, 1, 2, 4), + (32, 64, 16, 32, 32, True, False, True): (1, 1, 1, 8), + (32, 64, 32, 16, 32, False, False, False): (1, 2, 1, 4), + (32, 64, 32, 16, 32, False, False, True): (1, 2, 3, 4), + (32, 64, 32, 16, 32, False, True, False): (1, 2, 1, 8), + (32, 64, 32, 16, 32, False, True, True): (3, 2, 1, 4), + (32, 64, 32, 16, 32, True, False, False): (1, 1, 1, 8), + (32, 64, 32, 16, 32, True, False, True): (1, 2, 1, 4), + (32, 64, 32, 32, 32, False, False, False): (1, 1, 1, 1), + (32, 64, 32, 32, 32, False, False, True): (1, 1, 1, 4), + (32, 64, 32, 32, 32, False, True, False): (1, 1, 1, 4), + (32, 64, 32, 32, 32, False, True, True): (1, 1, 1, 4), + (32, 64, 32, 32, 32, True, False, False): (1, 1, 1, 4), + (32, 64, 32, 32, 32, True, False, True): (1, 1, 2, 8), + (32, 64, 64, 16, 32, False, False, False): (2, 4, 1, 4), + (32, 64, 64, 16, 32, False, False, True): (1, 4, 1, 4), + (32, 64, 64, 16, 32, False, True, False): (1, 4, 1, 4), + (32, 64, 64, 16, 32, False, True, True): (2, 4, 1, 4), + (32, 64, 64, 16, 32, True, False, False): (1, 4, 1, 4), + (32, 64, 64, 16, 32, True, False, True): (1, 4, 1, 4), + (32, 64, 64, 32, 32, False, False, False): (2, 2, 1, 4), + (32, 64, 64, 32, 32, False, False, True): (1, 2, 1, 8), + (32, 64, 64, 32, 32, False, True, False): (1, 2, 1, 4), + (32, 64, 64, 32, 32, False, True, True): (1, 2, 1, 4), + (32, 64, 64, 32, 32, True, False, False): (2, 2, 1, 4), + (32, 64, 64, 32, 32, True, False, True): (1, 2, 3, 8), + (64, 32, 16, 32, 32, False, False, False): (1, 1, 1, 4), + (64, 32, 16, 32, 32, 
False, False, True): (3, 1, 2, 4), + (64, 32, 16, 32, 32, False, True, False): (2, 1, 1, 2), + (64, 32, 16, 32, 32, False, True, True): (1, 1, 1, 8), + (64, 32, 16, 32, 32, True, False, False): (1, 1, 1, 2), + (64, 32, 16, 32, 32, True, False, True): (1, 1, 1, 4), + (64, 32, 32, 32, 32, False, False, False): (1, 1, 1, 4), + (64, 32, 32, 32, 32, False, False, True): (1, 1, 2, 8), + (64, 32, 32, 32, 32, False, True, False): (1, 1, 1, 8), + (64, 32, 32, 32, 32, False, True, True): (1, 1, 1, 4), + (64, 32, 32, 32, 32, True, False, False): (1, 1, 2, 4), + (64, 32, 32, 32, 32, True, False, True): (1, 1, 3, 8), + (64, 32, 64, 32, 32, False, False, False): (1, 2, 1, 4), + (64, 32, 64, 32, 32, False, False, True): (2, 2, 1, 4), + (64, 32, 64, 32, 32, False, True, False): (1, 1, 1, 4), + (64, 32, 64, 32, 32, False, True, True): (1, 2, 1, 8), + (64, 32, 64, 32, 32, True, False, False): (2, 2, 1, 4), + (64, 32, 64, 32, 32, True, False, True): (1, 2, 1, 8), + (64, 64, 16, 32, 32, False, False, False): (1, 1, 2, 8), + (64, 64, 16, 32, 32, False, False, True): (2, 1, 2, 4), + (64, 64, 16, 32, 32, False, True, False): (1, 1, 1, 2), + (64, 64, 16, 32, 32, False, True, True): (1, 1, 2, 4), + (64, 64, 16, 32, 32, True, False, False): (1, 1, 1, 2), + (64, 64, 16, 32, 32, True, False, True): (1, 1, 2, 4), + (64, 64, 32, 32, 32, False, False, False): (1, 1, 1, 4), + (64, 64, 32, 32, 32, False, False, True): (2, 1, 1, 4), + (64, 64, 32, 32, 32, False, True, False): (1, 1, 1, 8), + (64, 64, 32, 32, 32, False, True, True): (2, 1, 1, 4), + (64, 64, 32, 32, 32, True, False, False): (1, 1, 1, 4), + (64, 64, 32, 32, 32, True, False, True): (1, 1, 1, 8), + (64, 64, 64, 32, 32, False, False, False): (2, 2, 1, 4), + (64, 64, 64, 32, 32, False, False, True): (1, 2, 1, 4), + (64, 64, 64, 32, 32, False, True, False): (1, 2, 1, 4), + (64, 64, 64, 32, 32, False, True, True): (2, 2, 1, 4), + (64, 64, 64, 32, 32, True, False, False): (1, 1, 1, 8), + (64, 64, 64, 32, 32, True, False, True): (1, 2, 2, 4), + (192, 192, 256, 16, 16, False, True, True): (1, 16, 3, 2), + (192, 192, 256, 16, 16, True, False, True): (1, 8, 5, 4), + (192, 192, 256, 32, 32, False, True, True): (2, 8, 4, 4), + (192, 192, 256, 32, 32, True, False, True): (1, 8, 5, 4), + (192, 192, 512, 16, 16, False, True, True): (2, 16, 3, 4), + (192, 192, 512, 16, 16, True, False, True): (1, 16, 5, 4), + (192, 192, 512, 32, 32, False, True, True): (1, 16, 3, 4), + (192, 192, 512, 32, 32, True, False, True): (2, 16, 3, 4), + (192, 192, 1024, 16, 16, False, True, True): (3, 16, 3, 4), + (192, 192, 1024, 16, 16, True, False, True): (2, 8, 3, 4), + (192, 192, 1024, 32, 32, False, True, True): (3, 32, 1, 4), + (192, 192, 1024, 32, 32, True, False, True): (3, 16, 3, 4), + (192, 192, 2048, 16, 16, False, True, True): (1, 32, 3, 4), + (192, 192, 2048, 16, 16, True, False, True): (2, 16, 3, 4), + (192, 192, 2048, 32, 32, False, True, True): (1, 64, 1, 4), + (192, 192, 2048, 32, 32, True, False, True): (1, 64, 2, 4), + (192, 192, 4096, 16, 16, False, True, True): (1, 64, 2, 4), + (192, 192, 4096, 16, 16, True, False, True): (1, 32, 3, 4), + (192, 192, 4096, 32, 32, False, True, True): (3, 128, 2, 4), + (192, 192, 4096, 32, 32, True, False, True): (1, 128, 2, 4), + (192, 192, 8192, 16, 16, False, True, True): (2, 64, 3, 4), + (192, 192, 8192, 16, 16, True, False, True): (1, 64, 3, 4), + (192, 192, 8192, 32, 32, False, True, True): (3, 128, 3, 4), + (192, 192, 8192, 32, 32, True, False, True): (1, 128, 2, 4), + (192, 192, 16384, 16, 16, False, True, True): (1, 256, 3, 2), + (192, 
192, 16384, 16, 16, True, False, True): (1, 256, 3, 2), + (192, 192, 16384, 32, 32, False, True, True): (2, 256, 3, 4), + (192, 192, 16384, 32, 32, True, False, True): (2, 256, 3, 4), + (192, 192, 32768, 16, 16, False, True, True): (2, 512, 3, 2), + (192, 192, 32768, 16, 16, True, False, True): (1, 128, 3, 4), + (192, 192, 32768, 32, 32, False, True, True): (2, 512, 3, 4), + (192, 192, 32768, 32, 32, True, False, True): (2, 512, 3, 4), + (192, 192, 65536, 16, 16, False, True, True): (2, 1024, 3, 2), + (192, 192, 65536, 16, 16, True, False, True): (1, 256, 3, 4), + (192, 192, 65536, 32, 32, False, True, True): (2, 1024, 3, 4), + (192, 192, 65536, 32, 32, True, False, True): (2, 1024, 3, 4), + (192, 192, 131072, 16, 16, False, True, True): (2, 512, 3, 4), + (192, 192, 131072, 16, 16, True, False, True): (1, 512, 3, 4), + (192, 192, 131072, 32, 32, False, True, True): (2, 1024, 3, 4), + (192, 192, 131072, 32, 32, True, False, True): (2, 1024, 3, 4), + (256, 256, 256, 16, 16, False, True, True): (1, 16, 3, 4), + (256, 256, 256, 16, 16, True, False, True): (2, 16, 1, 4), + (256, 256, 256, 32, 32, False, True, True): (1, 8, 4, 8), + (256, 256, 256, 32, 32, True, False, True): (4, 8, 4, 4), + (256, 256, 256, 64, 64, False, True, True): (1, 4, 4, 8), + (256, 256, 256, 64, 64, True, False, True): (1, 4, 3, 8), + (256, 256, 256, 128, 128, False, True, True): (7, 2, 1, 32), + (256, 256, 256, 128, 128, True, False, True): (3, 2, 1, 32), + (256, 256, 512, 16, 16, False, True, True): (1, 16, 5, 4), + (256, 256, 512, 16, 16, True, False, True): (1, 16, 3, 2), + (256, 256, 512, 32, 32, False, True, True): (4, 16, 4, 4), + (256, 256, 512, 32, 32, True, False, True): (4, 16, 3, 4), + (256, 256, 512, 64, 64, False, True, True): (1, 8, 3, 8), + (256, 256, 512, 64, 64, True, False, True): (1, 8, 3, 8), + (256, 256, 512, 128, 128, False, True, True): (1, 4, 1, 32), + (256, 256, 512, 128, 128, True, False, True): (3, 4, 1, 32), + (256, 256, 1024, 16, 16, False, True, True): (3, 32, 5, 2), + (256, 256, 1024, 16, 16, True, False, True): (2, 32, 5, 2), + (256, 256, 1024, 32, 32, False, True, True): (1, 32, 4, 4), + (256, 256, 1024, 32, 32, True, False, True): (1, 32, 5, 4), + (256, 256, 1024, 64, 64, False, True, True): (4, 16, 3, 8), + (256, 256, 1024, 64, 64, True, False, True): (1, 16, 3, 8), + (256, 256, 1024, 128, 128, False, True, True): (1, 8, 1, 32), + (256, 256, 1024, 128, 128, True, False, True): (3, 8, 1, 32), + (256, 256, 2048, 16, 16, False, True, True): (3, 32, 3, 4), + (256, 256, 2048, 16, 16, True, False, True): (1, 64, 3, 2), + (256, 256, 2048, 32, 32, False, True, True): (1, 64, 3, 4), + (256, 256, 2048, 32, 32, True, False, True): (1, 64, 3, 4), + (256, 256, 2048, 64, 64, False, True, True): (2, 32, 1, 8), + (256, 256, 2048, 64, 64, True, False, True): (2, 32, 1, 8), + (256, 256, 2048, 128, 128, False, True, True): (4, 16, 1, 32), + (256, 256, 2048, 128, 128, True, False, True): (4, 16, 1, 32), + (256, 256, 4096, 16, 16, False, True, True): (1, 32, 2, 4), + (256, 256, 4096, 16, 16, True, False, True): (1, 32, 3, 4), + (256, 256, 4096, 32, 32, False, True, True): (1, 128, 2, 4), + (256, 256, 4096, 32, 32, True, False, True): (1, 128, 2, 4), + (256, 256, 4096, 64, 64, False, True, True): (2, 64, 4, 8), + (256, 256, 4096, 64, 64, True, False, True): (3, 64, 2, 8), + (256, 256, 4096, 128, 128, False, True, True): (3, 32, 1, 32), + (256, 256, 4096, 128, 128, True, False, True): (2, 32, 1, 32), + (256, 256, 8192, 16, 16, False, True, True): (1, 64, 3, 4), + (256, 256, 8192, 16, 16, True, False, True): 
(2, 128, 3, 2), + (256, 256, 8192, 32, 32, False, True, True): (3, 128, 3, 4), + (256, 256, 8192, 32, 32, True, False, True): (1, 128, 3, 4), + (256, 256, 8192, 64, 64, False, True, True): (3, 128, 1, 4), + (256, 256, 8192, 64, 64, True, False, True): (4, 128, 2, 8), + (256, 256, 8192, 128, 128, False, True, True): (6, 64, 1, 32), + (256, 256, 8192, 128, 128, True, False, True): (2, 64, 1, 32), + (256, 256, 16384, 16, 16, False, True, True): (4, 128, 3, 4), + (256, 256, 16384, 16, 16, True, False, True): (3, 128, 3, 4), + (256, 256, 16384, 32, 32, False, True, True): (4, 256, 3, 4), + (256, 256, 16384, 32, 32, True, False, True): (2, 256, 3, 4), + (256, 256, 16384, 64, 64, False, True, True): (3, 256, 1, 4), + (256, 256, 16384, 64, 64, True, False, True): (2, 256, 2, 4), + (256, 256, 16384, 128, 128, False, True, True): (1, 128, 1, 32), + (256, 256, 16384, 128, 128, True, False, True): (3, 128, 1, 32), + (256, 256, 32768, 16, 16, False, True, True): (1, 256, 3, 4), + (256, 256, 32768, 16, 16, True, False, True): (2, 128, 3, 4), + (256, 256, 32768, 32, 32, False, True, True): (2, 512, 3, 4), + (256, 256, 32768, 32, 32, True, False, True): (4, 512, 3, 4), + (256, 256, 32768, 64, 64, False, True, True): (1, 512, 1, 8), + (256, 256, 32768, 64, 64, True, False, True): (1, 512, 2, 4), + (256, 256, 32768, 128, 128, False, True, True): (1, 256, 1, 32), + (256, 256, 32768, 128, 128, True, False, True): (1, 256, 1, 32), + (256, 256, 65536, 16, 16, False, True, True): (2, 512, 3, 4), + (256, 256, 65536, 16, 16, True, False, True): (1, 256, 3, 4), + (256, 256, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), + (256, 256, 65536, 32, 32, True, False, True): (2, 1024, 3, 4), + (256, 256, 65536, 64, 64, False, True, True): (1, 1024, 2, 4), + (256, 256, 65536, 64, 64, True, False, True): (1, 1024, 2, 4), + (256, 256, 65536, 128, 128, False, True, True): (1, 512, 1, 32), + (256, 256, 65536, 128, 128, True, False, True): (2, 512, 1, 32), + (256, 256, 131072, 16, 16, False, True, True): (1, 1024, 3, 4), + (256, 256, 131072, 16, 16, True, False, True): (1, 512, 3, 4), + (256, 256, 131072, 32, 32, False, True, True): (1, 2048, 3, 4), + (256, 256, 131072, 32, 32, True, False, True): (1, 2048, 3, 4), + (256, 256, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), + (256, 256, 131072, 64, 64, True, False, True): (1, 2048, 2, 4), + (256, 256, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), + (256, 256, 131072, 128, 128, True, False, True): (4, 1024, 1, 32), + (384, 384, 256, 16, 16, False, True, True): (1, 8, 3, 4), + (384, 384, 256, 16, 16, True, False, True): (1, 8, 3, 4), + (384, 384, 256, 32, 32, False, True, True): (2, 8, 3, 8), + (384, 384, 256, 32, 32, True, False, True): (1, 8, 3, 4), + (384, 384, 256, 64, 64, False, True, True): (1, 4, 4, 8), + (384, 384, 256, 64, 64, True, False, True): (2, 4, 3, 8), + (384, 384, 512, 16, 16, False, True, True): (3, 16, 3, 2), + (384, 384, 512, 16, 16, True, False, True): (3, 16, 3, 2), + (384, 384, 512, 32, 32, False, True, True): (2, 8, 3, 4), + (384, 384, 512, 32, 32, True, False, True): (1, 8, 3, 4), + (384, 384, 512, 64, 64, False, True, True): (2, 8, 3, 8), + (384, 384, 512, 64, 64, True, False, True): (2, 8, 4, 8), + (384, 384, 1024, 16, 16, False, True, True): (3, 16, 3, 2), + (384, 384, 1024, 16, 16, True, False, True): (4, 32, 3, 2), + (384, 384, 1024, 32, 32, False, True, True): (1, 32, 3, 4), + (384, 384, 1024, 32, 32, True, False, True): (2, 16, 3, 4), + (384, 384, 1024, 64, 64, False, True, True): (2, 16, 3, 8), + (384, 384, 1024, 64, 64, True, 
False, True): (4, 16, 4, 8), + (384, 384, 2048, 16, 16, False, True, True): (3, 16, 3, 4), + (384, 384, 2048, 16, 16, True, False, True): (1, 32, 3, 4), + (384, 384, 2048, 32, 32, False, True, True): (3, 64, 2, 4), + (384, 384, 2048, 32, 32, True, False, True): (1, 64, 3, 4), + (384, 384, 2048, 64, 64, False, True, True): (4, 32, 4, 8), + (384, 384, 2048, 64, 64, True, False, True): (5, 32, 4, 8), + (384, 384, 4096, 16, 16, False, True, True): (1, 32, 3, 4), + (384, 384, 4096, 16, 16, True, False, True): (3, 32, 3, 4), + (384, 384, 4096, 32, 32, False, True, True): (2, 64, 3, 4), + (384, 384, 4096, 32, 32, True, False, True): (2, 64, 3, 4), + (384, 384, 4096, 64, 64, False, True, True): (2, 64, 3, 8), + (384, 384, 4096, 64, 64, True, False, True): (2, 64, 3, 8), + (384, 384, 8192, 16, 16, False, True, True): (1, 128, 3, 2), + (384, 384, 8192, 16, 16, True, False, True): (1, 128, 3, 2), + (384, 384, 8192, 32, 32, False, True, True): (1, 128, 3, 4), + (384, 384, 8192, 32, 32, True, False, True): (1, 128, 3, 4), + (384, 384, 8192, 64, 64, False, True, True): (3, 128, 3, 4), + (384, 384, 8192, 64, 64, True, False, True): (2, 128, 3, 4), + (384, 384, 16384, 16, 16, False, True, True): (1, 256, 3, 2), + (384, 384, 16384, 16, 16, True, False, True): (1, 64, 3, 4), + (384, 384, 16384, 32, 32, False, True, True): (2, 256, 3, 4), + (384, 384, 16384, 32, 32, True, False, True): (4, 256, 3, 4), + (384, 384, 16384, 64, 64, False, True, True): (2, 256, 3, 4), + (384, 384, 16384, 64, 64, True, False, True): (1, 256, 3, 4), + (384, 384, 32768, 16, 16, False, True, True): (1, 128, 3, 4), + (384, 384, 32768, 16, 16, True, False, True): (1, 128, 3, 4), + (384, 384, 32768, 32, 32, False, True, True): (1, 512, 3, 4), + (384, 384, 32768, 32, 32, True, False, True): (1, 512, 2, 4), + (384, 384, 32768, 64, 64, False, True, True): (1, 512, 3, 4), + (384, 384, 32768, 64, 64, True, False, True): (1, 512, 3, 4), + (384, 384, 65536, 16, 16, False, True, True): (1, 256, 3, 4), + (384, 384, 65536, 16, 16, True, False, True): (1, 256, 3, 4), + (384, 384, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), + (384, 384, 65536, 32, 32, True, False, True): (1, 512, 3, 4), + (384, 384, 65536, 64, 64, False, True, True): (1, 1024, 3, 4), + (384, 384, 65536, 64, 64, True, False, True): (1, 1024, 3, 4), + (384, 384, 131072, 16, 16, False, True, True): (1, 512, 3, 4), + (384, 384, 131072, 16, 16, True, False, True): (1, 512, 3, 4), + (384, 384, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), + (384, 384, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), + (384, 384, 131072, 64, 64, False, True, True): (1, 2048, 3, 4), + (384, 384, 131072, 64, 64, True, False, True): (1, 2048, 3, 4), + (512, 512, 256, 16, 16, False, True, True): (1, 8, 4, 4), + (512, 512, 256, 16, 16, True, False, True): (1, 8, 3, 2), + (512, 512, 256, 32, 32, False, True, True): (4, 8, 3, 4), + (512, 512, 256, 32, 32, True, False, True): (4, 8, 3, 4), + (512, 512, 256, 64, 64, False, True, True): (3, 4, 3, 8), + (512, 512, 256, 64, 64, True, False, True): (5, 4, 3, 8), + (512, 512, 256, 128, 128, False, True, True): (1, 2, 1, 32), + (512, 512, 256, 128, 128, True, False, True): (3, 2, 1, 32), + (512, 512, 512, 16, 16, False, True, True): (2, 16, 3, 2), + (512, 512, 512, 16, 16, True, False, True): (1, 8, 4, 4), + (512, 512, 512, 32, 32, False, True, True): (3, 16, 3, 4), + (512, 512, 512, 32, 32, True, False, True): (5, 16, 2, 4), + (512, 512, 512, 64, 64, False, True, True): (1, 8, 3, 8), + (512, 512, 512, 64, 64, True, False, True): (3, 8, 3, 8), + 
(512, 512, 512, 128, 128, False, True, True): (1, 4, 1, 32), + (512, 512, 512, 128, 128, True, False, True): (3, 4, 1, 16), + (512, 512, 1024, 16, 16, False, True, True): (1, 16, 3, 4), + (512, 512, 1024, 16, 16, True, False, True): (3, 16, 3, 4), + (512, 512, 1024, 32, 32, False, True, True): (3, 32, 3, 4), + (512, 512, 1024, 32, 32, True, False, True): (3, 32, 2, 4), + (512, 512, 1024, 64, 64, False, True, True): (1, 16, 3, 8), + (512, 512, 1024, 64, 64, True, False, True): (4, 16, 3, 8), + (512, 512, 1024, 128, 128, False, True, True): (4, 8, 1, 32), + (512, 512, 1024, 128, 128, True, False, True): (4, 8, 1, 32), + (512, 512, 2048, 16, 16, False, True, True): (5, 16, 3, 4), + (512, 512, 2048, 16, 16, True, False, True): (5, 16, 3, 4), + (512, 512, 2048, 32, 32, False, True, True): (1, 32, 3, 4), + (512, 512, 2048, 32, 32, True, False, True): (1, 32, 4, 4), + (512, 512, 2048, 64, 64, False, True, True): (4, 32, 3, 8), + (512, 512, 2048, 64, 64, True, False, True): (4, 32, 3, 8), + (512, 512, 2048, 128, 128, False, True, True): (3, 16, 1, 32), + (512, 512, 2048, 128, 128, True, False, True): (3, 16, 1, 32), + (512, 512, 4096, 16, 16, False, True, True): (4, 32, 3, 4), + (512, 512, 4096, 16, 16, True, False, True): (4, 64, 3, 2), + (512, 512, 4096, 32, 32, False, True, True): (3, 64, 3, 4), + (512, 512, 4096, 32, 32, True, False, True): (3, 64, 3, 4), + (512, 512, 4096, 64, 64, False, True, True): (4, 64, 2, 4), + (512, 512, 4096, 64, 64, True, False, True): (1, 64, 2, 4), + (512, 512, 4096, 128, 128, False, True, True): (1, 32, 1, 32), + (512, 512, 4096, 128, 128, True, False, True): (1, 32, 1, 32), + (512, 512, 8192, 16, 16, False, True, True): (1, 64, 3, 4), + (512, 512, 8192, 16, 16, True, False, True): (4, 64, 3, 4), + (512, 512, 8192, 32, 32, False, True, True): (2, 128, 3, 4), + (512, 512, 8192, 32, 32, True, False, True): (3, 128, 3, 4), + (512, 512, 8192, 64, 64, False, True, True): (1, 128, 2, 4), + (512, 512, 8192, 64, 64, True, False, True): (1, 128, 2, 4), + (512, 512, 8192, 128, 128, False, True, True): (6, 64, 1, 32), + (512, 512, 8192, 128, 128, True, False, True): (4, 64, 1, 32), + (512, 512, 16384, 16, 16, False, True, True): (1, 128, 3, 4), + (512, 512, 16384, 16, 16, True, False, True): (1, 64, 3, 4), + (512, 512, 16384, 32, 32, False, True, True): (1, 256, 3, 4), + (512, 512, 16384, 32, 32, True, False, True): (4, 256, 3, 4), + (512, 512, 16384, 64, 64, False, True, True): (1, 256, 2, 4), + (512, 512, 16384, 64, 64, True, False, True): (1, 256, 2, 4), + (512, 512, 16384, 128, 128, False, True, True): (1, 128, 1, 32), + (512, 512, 16384, 128, 128, True, False, True): (2, 128, 1, 32), + (512, 512, 32768, 16, 16, False, True, True): (1, 256, 3, 4), + (512, 512, 32768, 16, 16, True, False, True): (1, 128, 3, 4), + (512, 512, 32768, 32, 32, False, True, True): (1, 512, 3, 4), + (512, 512, 32768, 32, 32, True, False, True): (1, 512, 3, 4), + (512, 512, 32768, 64, 64, False, True, True): (1, 512, 2, 4), + (512, 512, 32768, 64, 64, True, False, True): (2, 512, 2, 4), + (512, 512, 32768, 128, 128, False, True, True): (1, 256, 1, 32), + (512, 512, 32768, 128, 128, True, False, True): (2, 256, 1, 32), + (512, 512, 65536, 16, 16, False, True, True): (1, 512, 3, 4), + (512, 512, 65536, 16, 16, True, False, True): (1, 256, 3, 4), + (512, 512, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), + (512, 512, 65536, 32, 32, True, False, True): (1, 1024, 3, 4), + (512, 512, 65536, 64, 64, False, True, True): (1, 1024, 2, 4), + (512, 512, 65536, 64, 64, True, False, True): (1, 1024, 2, 
4), + (512, 512, 65536, 128, 128, False, True, True): (1, 512, 1, 32), + (512, 512, 65536, 128, 128, True, False, True): (4, 512, 1, 32), + (512, 512, 131072, 16, 16, False, True, True): (1, 512, 3, 4), + (512, 512, 131072, 16, 16, True, False, True): (1, 512, 3, 4), + (512, 512, 131072, 32, 32, False, True, True): (1, 2048, 3, 4), + (512, 512, 131072, 32, 32, True, False, True): (1, 2048, 3, 4), + (512, 512, 131072, 64, 64, False, True, True): (1, 2048, 2, 4), + (512, 512, 131072, 64, 64, True, False, True): (1, 2048, 2, 4), + (512, 512, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), + (512, 512, 131072, 128, 128, True, False, True): (2, 1024, 1, 32), + (768, 768, 256, 16, 16, False, True, True): (1, 4, 5, 4), + (768, 768, 256, 16, 16, True, False, True): (3, 8, 3, 2), + (768, 768, 256, 32, 32, False, True, True): (2, 4, 3, 4), + (768, 768, 256, 32, 32, True, False, True): (3, 8, 4, 4), + (768, 768, 256, 64, 64, False, True, True): (1, 4, 4, 8), + (768, 768, 256, 64, 64, True, False, True): (3, 4, 3, 8), + (768, 768, 256, 128, 128, False, True, True): (3, 2, 1, 32), + (768, 768, 256, 128, 128, True, False, True): (2, 2, 2, 32), + (768, 768, 512, 16, 16, False, True, True): (2, 4, 5, 4), + (768, 768, 512, 16, 16, True, False, True): (2, 4, 4, 4), + (768, 768, 512, 32, 32, False, True, True): (1, 8, 3, 4), + (768, 768, 512, 32, 32, True, False, True): (3, 8, 4, 4), + (768, 768, 512, 64, 64, False, True, True): (2, 8, 3, 8), + (768, 768, 512, 64, 64, True, False, True): (5, 8, 3, 8), + (768, 768, 512, 128, 128, False, True, True): (2, 4, 1, 32), + (768, 768, 512, 128, 128, True, False, True): (2, 4, 2, 32), + (768, 768, 1024, 16, 16, False, True, True): (2, 16, 4, 2), + (768, 768, 1024, 16, 16, True, False, True): (4, 32, 3, 1), + (768, 768, 1024, 32, 32, False, True, True): (1, 32, 2, 4), + (768, 768, 1024, 32, 32, True, False, True): (1, 16, 5, 4), + (768, 768, 1024, 64, 64, False, True, True): (2, 16, 3, 8), + (768, 768, 1024, 64, 64, True, False, True): (2, 16, 3, 8), + (768, 768, 1024, 128, 128, False, True, True): (1, 8, 2, 32), + (768, 768, 1024, 128, 128, True, False, True): (1, 8, 1, 32), + (768, 768, 2048, 16, 16, False, True, True): (1, 16, 3, 4), + (768, 768, 2048, 16, 16, True, False, True): (1, 16, 3, 4), + (768, 768, 2048, 32, 32, False, True, True): (1, 32, 3, 4), + (768, 768, 2048, 32, 32, True, False, True): (5, 32, 3, 4), + (768, 768, 2048, 64, 64, False, True, True): (1, 32, 3, 8), + (768, 768, 2048, 64, 64, True, False, True): (1, 32, 3, 4), + (768, 768, 2048, 128, 128, False, True, True): (3, 16, 1, 32), + (768, 768, 2048, 128, 128, True, False, True): (4, 16, 1, 32), + (768, 768, 4096, 16, 16, False, True, True): (1, 64, 3, 2), + (768, 768, 4096, 16, 16, True, False, True): (3, 64, 3, 2), + (768, 768, 4096, 32, 32, False, True, True): (1, 64, 3, 4), + (768, 768, 4096, 32, 32, True, False, True): (1, 64, 3, 4), + (768, 768, 4096, 64, 64, False, True, True): (4, 64, 3, 4), + (768, 768, 4096, 64, 64, True, False, True): (4, 64, 3, 4), + (768, 768, 4096, 128, 128, False, True, True): (1, 32, 1, 32), + (768, 768, 4096, 128, 128, True, False, True): (1, 32, 2, 32), + (768, 768, 8192, 16, 16, False, True, True): (1, 128, 3, 2), + (768, 768, 8192, 16, 16, True, False, True): (2, 32, 3, 4), + (768, 768, 8192, 32, 32, False, True, True): (2, 128, 3, 4), + (768, 768, 8192, 32, 32, True, False, True): (1, 128, 2, 4), + (768, 768, 8192, 64, 64, False, True, True): (1, 128, 3, 4), + (768, 768, 8192, 64, 64, True, False, True): (2, 128, 3, 4), + (768, 768, 8192, 128, 128, 
False, True, True): (1, 64, 1, 32), + (768, 768, 8192, 128, 128, True, False, True): (2, 64, 1, 32), + (768, 768, 16384, 16, 16, False, True, True): (3, 64, 3, 4), + (768, 768, 16384, 16, 16, True, False, True): (1, 64, 3, 4), + (768, 768, 16384, 32, 32, False, True, True): (2, 256, 3, 4), + (768, 768, 16384, 32, 32, True, False, True): (4, 256, 2, 4), + (768, 768, 16384, 64, 64, False, True, True): (1, 256, 3, 4), + (768, 768, 16384, 64, 64, True, False, True): (1, 256, 3, 4), + (768, 768, 16384, 128, 128, False, True, True): (1, 128, 1, 32), + (768, 768, 16384, 128, 128, True, False, True): (2, 128, 1, 32), + (768, 768, 32768, 16, 16, False, True, True): (1, 128, 3, 4), + (768, 768, 32768, 16, 16, True, False, True): (2, 128, 3, 4), + (768, 768, 32768, 32, 32, False, True, True): (2, 256, 3, 4), + (768, 768, 32768, 32, 32, True, False, True): (1, 256, 3, 4), + (768, 768, 32768, 64, 64, False, True, True): (1, 512, 3, 4), + (768, 768, 32768, 64, 64, True, False, True): (1, 512, 3, 4), + (768, 768, 32768, 128, 128, False, True, True): (1, 256, 1, 32), + (768, 768, 32768, 128, 128, True, False, True): (1, 256, 1, 32), + (768, 768, 50432, 16, 16, False, True, True): (1, 197, 3, 4), + (768, 768, 50432, 32, 32, False, True, True): (1, 394, 3, 4), + (768, 768, 50432, 64, 64, False, True, True): (1, 788, 3, 4), + (768, 768, 50432, 128, 128, False, True, True): (3, 394, 1, 32), + (768, 768, 65536, 16, 16, False, True, True): (1, 256, 3, 4), + (768, 768, 65536, 16, 16, True, False, True): (1, 256, 3, 4), + (768, 768, 65536, 32, 32, False, True, True): (1, 512, 3, 4), + (768, 768, 65536, 32, 32, True, False, True): (1, 512, 3, 4), + (768, 768, 65536, 64, 64, False, True, True): (1, 1024, 3, 4), + (768, 768, 65536, 64, 64, True, False, True): (1, 1024, 3, 4), + (768, 768, 65536, 128, 128, False, True, True): (1, 512, 1, 32), + (768, 768, 65536, 128, 128, True, False, True): (1, 512, 1, 32), + (768, 768, 131072, 16, 16, False, True, True): (1, 512, 3, 4), + (768, 768, 131072, 16, 16, True, False, True): (1, 512, 3, 4), + (768, 768, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), + (768, 768, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), + (768, 768, 131072, 64, 64, False, True, True): (1, 2048, 3, 4), + (768, 768, 131072, 64, 64, True, False, True): (1, 2048, 3, 4), + (768, 768, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), + (768, 768, 131072, 128, 128, True, False, True): (1, 1024, 1, 32), + (768, 3072, 256, 16, 16, False, True, True): (1, 2, 4, 4), + (768, 3072, 256, 16, 16, True, False, True): (1, 4, 3, 4), + (768, 3072, 256, 32, 32, False, True, True): (1, 4, 3, 4), + (768, 3072, 256, 32, 32, True, False, True): (3, 4, 3, 4), + (768, 3072, 256, 64, 64, False, True, True): (1, 4, 3, 8), + (768, 3072, 256, 64, 64, True, False, True): (1, 4, 3, 8), + (768, 3072, 256, 128, 128, False, True, True): (2, 2, 2, 32), + (768, 3072, 256, 128, 128, True, False, True): (2, 2, 1, 32), + (768, 3072, 512, 16, 16, False, True, True): (2, 4, 3, 4), + (768, 3072, 512, 16, 16, True, False, True): (1, 8, 3, 2), + (768, 3072, 512, 32, 32, False, True, True): (3, 8, 4, 4), + (768, 3072, 512, 32, 32, True, False, True): (3, 8, 3, 4), + (768, 3072, 512, 64, 64, False, True, True): (1, 8, 4, 8), + (768, 3072, 512, 64, 64, True, False, True): (1, 8, 3, 8), + (768, 3072, 512, 128, 128, False, True, True): (1, 4, 2, 32), + (768, 3072, 512, 128, 128, True, False, True): (1, 4, 1, 32), + (768, 3072, 1024, 16, 16, False, True, True): (4, 16, 3, 2), + (768, 3072, 1024, 16, 16, True, False, True): (4, 16, 
3, 2), + (768, 3072, 1024, 32, 32, False, True, True): (4, 16, 5, 4), + (768, 3072, 1024, 32, 32, True, False, True): (4, 16, 5, 4), + (768, 3072, 1024, 64, 64, False, True, True): (2, 16, 3, 8), + (768, 3072, 1024, 64, 64, True, False, True): (2, 16, 3, 8), + (768, 3072, 1024, 128, 128, False, True, True): (1, 8, 1, 32), + (768, 3072, 1024, 128, 128, True, False, True): (1, 8, 1, 32), + (768, 3072, 2048, 16, 16, False, True, True): (2, 16, 3, 4), + (768, 3072, 2048, 16, 16, True, False, True): (2, 16, 3, 4), + (768, 3072, 2048, 32, 32, False, True, True): (4, 32, 5, 4), + (768, 3072, 2048, 32, 32, True, False, True): (2, 32, 3, 4), + (768, 3072, 2048, 64, 64, False, True, True): (2, 32, 3, 8), + (768, 3072, 2048, 64, 64, True, False, True): (2, 32, 3, 8), + (768, 3072, 2048, 128, 128, False, True, True): (1, 16, 1, 32), + (768, 3072, 2048, 128, 128, True, False, True): (2, 16, 1, 32), + (768, 3072, 4096, 16, 16, False, True, True): (1, 32, 5, 4), + (768, 3072, 4096, 16, 16, True, False, True): (3, 64, 3, 2), + (768, 3072, 4096, 32, 32, False, True, True): (5, 64, 3, 4), + (768, 3072, 4096, 32, 32, True, False, True): (5, 64, 3, 4), + (768, 3072, 4096, 64, 64, False, True, True): (1, 64, 3, 8), + (768, 3072, 4096, 64, 64, True, False, True): (5, 64, 3, 4), + (768, 3072, 4096, 128, 128, False, True, True): (1, 32, 1, 32), + (768, 3072, 4096, 128, 128, True, False, True): (1, 32, 1, 32), + (768, 3072, 8192, 16, 16, False, True, True): (1, 128, 3, 2), + (768, 3072, 8192, 16, 16, True, False, True): (1, 128, 3, 2), + (768, 3072, 8192, 32, 32, False, True, True): (1, 128, 3, 4), + (768, 3072, 8192, 32, 32, True, False, True): (1, 64, 3, 4), + (768, 3072, 8192, 64, 64, False, True, True): (3, 128, 3, 4), + (768, 3072, 8192, 64, 64, True, False, True): (3, 128, 3, 4), + (768, 3072, 8192, 128, 128, False, True, True): (4, 64, 2, 32), + (768, 3072, 8192, 128, 128, True, False, True): (2, 64, 1, 32), + (768, 3072, 16384, 16, 16, False, True, True): (1, 256, 2, 2), + (768, 3072, 16384, 16, 16, True, False, True): (1, 64, 3, 4), + (768, 3072, 16384, 32, 32, False, True, True): (8, 128, 3, 4), + (768, 3072, 16384, 32, 32, True, False, True): (1, 128, 3, 4), + (768, 3072, 16384, 64, 64, False, True, True): (1, 256, 3, 4), + (768, 3072, 16384, 64, 64, True, False, True): (3, 256, 3, 4), + (768, 3072, 16384, 128, 128, False, True, True): (3, 128, 1, 32), + (768, 3072, 16384, 128, 128, True, False, True): (2, 128, 2, 32), + (768, 3072, 32768, 16, 16, False, True, True): (1, 512, 3, 1), + (768, 3072, 32768, 16, 16, True, False, True): (1, 128, 3, 4), + (768, 3072, 32768, 32, 32, False, True, True): (1, 256, 3, 4), + (768, 3072, 32768, 32, 32, True, False, True): (1, 256, 3, 4), + (768, 3072, 32768, 64, 64, False, True, True): (2, 512, 3, 4), + (768, 3072, 32768, 64, 64, True, False, True): (1, 512, 3, 4), + (768, 3072, 32768, 128, 128, False, True, True): (1, 256, 1, 32), + (768, 3072, 32768, 128, 128, True, False, True): (2, 256, 2, 32), + (768, 3072, 50432, 16, 16, False, True, True): (1, 197, 3, 4), + (768, 3072, 50432, 16, 16, True, False, True): (1, 197, 3, 4), + (768, 3072, 50432, 32, 32, False, True, True): (1, 788, 2, 4), + (768, 3072, 50432, 32, 32, True, False, True): (1, 394, 3, 4), + (768, 3072, 50432, 64, 64, False, True, True): (1, 788, 3, 4), + (768, 3072, 50432, 64, 64, True, False, True): (2, 788, 3, 4), + (768, 3072, 50432, 128, 128, False, True, True): (1, 394, 1, 32), + (768, 3072, 50432, 128, 128, True, False, True): (2, 394, 2, 32), + (768, 3072, 65536, 16, 16, False, True, True): (1, 
1024, 3, 1), + (768, 3072, 65536, 16, 16, True, False, True): (1, 256, 3, 4), + (768, 3072, 65536, 32, 32, False, True, True): (1, 512, 3, 4), + (768, 3072, 65536, 32, 32, True, False, True): (1, 512, 3, 4), + (768, 3072, 65536, 64, 64, False, True, True): (2, 1024, 3, 4), + (768, 3072, 65536, 64, 64, True, False, True): (5, 1024, 3, 4), + (768, 3072, 65536, 128, 128, False, True, True): (1, 512, 1, 32), + (768, 3072, 65536, 128, 128, True, False, True): (2, 512, 2, 32), + (768, 3072, 131072, 16, 16, False, True, True): (1, 2048, 3, 1), + (768, 3072, 131072, 16, 16, True, False, True): (1, 512, 3, 4), + (768, 3072, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), + (768, 3072, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), + (768, 3072, 131072, 64, 64, False, True, True): (1, 2048, 3, 4), + (768, 3072, 131072, 64, 64, True, False, True): (2, 2048, 3, 4), + (768, 3072, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), + (768, 3072, 131072, 128, 128, True, False, True): (1, 1024, 2, 32), + (1024, 1024, 256, 16, 16, False, True, True): (4, 8, 3, 2), + (1024, 1024, 256, 16, 16, True, False, True): (2, 8, 3, 2), + (1024, 1024, 256, 32, 32, False, True, True): (1, 8, 3, 4), + (1024, 1024, 256, 32, 32, True, False, True): (1, 8, 3, 4), + (1024, 1024, 256, 64, 64, False, True, True): (1, 4, 3, 8), + (1024, 1024, 256, 64, 64, True, False, True): (2, 4, 3, 8), + (1024, 1024, 256, 128, 128, False, True, True): (3, 2, 1, 32), + (1024, 1024, 256, 128, 128, True, False, True): (5, 2, 1, 32), + (1024, 1024, 512, 16, 16, False, True, True): (3, 8, 3, 4), + (1024, 1024, 512, 16, 16, True, False, True): (3, 8, 3, 4), + (1024, 1024, 512, 32, 32, False, True, True): (1, 16, 3, 4), + (1024, 1024, 512, 32, 32, True, False, True): (3, 16, 3, 4), + (1024, 1024, 512, 64, 64, False, True, True): (6, 8, 3, 8), + (1024, 1024, 512, 64, 64, True, False, True): (8, 8, 3, 8), + (1024, 1024, 512, 128, 128, False, True, True): (1, 4, 1, 32), + (1024, 1024, 512, 128, 128, True, False, True): (1, 4, 1, 32), + (1024, 1024, 1024, 16, 16, False, True, True): (4, 8, 3, 4), + (1024, 1024, 1024, 16, 16, True, False, True): (1, 8, 3, 4), + (1024, 1024, 1024, 32, 32, False, True, True): (4, 16, 4, 4), + (1024, 1024, 1024, 32, 32, True, False, True): (5, 16, 3, 4), + (1024, 1024, 1024, 64, 64, False, True, True): (6, 16, 3, 8), + (1024, 1024, 1024, 64, 64, True, False, True): (3, 16, 2, 4), + (1024, 1024, 1024, 128, 128, False, True, True): (1, 8, 1, 32), + (1024, 1024, 1024, 128, 128, True, False, True): (2, 8, 1, 32), + (1024, 1024, 2048, 16, 16, False, True, True): (4, 16, 3, 4), + (1024, 1024, 2048, 16, 16, True, False, True): (1, 16, 3, 4), + (1024, 1024, 2048, 32, 32, False, True, True): (1, 32, 3, 4), + (1024, 1024, 2048, 32, 32, True, False, True): (2, 32, 3, 4), + (1024, 1024, 2048, 64, 64, False, True, True): (4, 32, 2, 4), + (1024, 1024, 2048, 64, 64, True, False, True): (8, 32, 2, 4), + (1024, 1024, 2048, 128, 128, False, True, True): (1, 16, 1, 32), + (1024, 1024, 2048, 128, 128, True, False, True): (1, 16, 1, 32), + (1024, 1024, 4096, 16, 16, False, True, True): (4, 32, 3, 4), + (1024, 1024, 4096, 16, 16, True, False, True): (1, 64, 3, 2), + (1024, 1024, 4096, 32, 32, False, True, True): (1, 64, 3, 4), + (1024, 1024, 4096, 32, 32, True, False, True): (1, 64, 3, 4), + (1024, 1024, 4096, 64, 64, False, True, True): (2, 64, 2, 4), + (1024, 1024, 4096, 64, 64, True, False, True): (2, 64, 2, 4), + (1024, 1024, 4096, 128, 128, False, True, True): (1, 32, 1, 32), + (1024, 1024, 4096, 128, 128, True, False, 
True): (4, 32, 1, 32), + (1024, 1024, 8192, 16, 16, False, True, True): (1, 128, 3, 1), + (1024, 1024, 8192, 16, 16, True, False, True): (1, 128, 3, 1), + (1024, 1024, 8192, 32, 32, False, True, True): (1, 128, 3, 4), + (1024, 1024, 8192, 32, 32, True, False, True): (1, 128, 3, 4), + (1024, 1024, 8192, 64, 64, False, True, True): (2, 128, 2, 4), + (1024, 1024, 8192, 64, 64, True, False, True): (2, 128, 2, 4), + (1024, 1024, 8192, 128, 128, False, True, True): (1, 64, 1, 32), + (1024, 1024, 8192, 128, 128, True, False, True): (4, 64, 1, 32), + (1024, 1024, 16384, 16, 16, False, True, True): (1, 128, 2, 4), + (1024, 1024, 16384, 16, 16, True, False, True): (4, 256, 3, 1), + (1024, 1024, 16384, 32, 32, False, True, True): (1, 256, 3, 4), + (1024, 1024, 16384, 32, 32, True, False, True): (1, 256, 3, 4), + (1024, 1024, 16384, 64, 64, False, True, True): (1, 256, 2, 4), + (1024, 1024, 16384, 64, 64, True, False, True): (1, 256, 2, 4), + (1024, 1024, 16384, 128, 128, False, True, True): (1, 128, 1, 32), + (1024, 1024, 16384, 128, 128, True, False, True): (4, 128, 1, 32), + (1024, 1024, 32768, 16, 16, False, True, True): (1, 256, 2, 4), + (1024, 1024, 32768, 16, 16, True, False, True): (4, 512, 3, 1), + (1024, 1024, 32768, 32, 32, False, True, True): (1, 512, 3, 4), + (1024, 1024, 32768, 32, 32, True, False, True): (1, 512, 3, 4), + (1024, 1024, 32768, 64, 64, False, True, True): (1, 512, 2, 4), + (1024, 1024, 32768, 64, 64, True, False, True): (1, 512, 2, 4), + (1024, 1024, 32768, 128, 128, False, True, True): (1, 256, 1, 32), + (1024, 1024, 32768, 128, 128, True, False, True): (1, 256, 1, 32), + (1024, 1024, 65536, 16, 16, False, True, True): (1, 512, 2, 4), + (1024, 1024, 65536, 16, 16, True, False, True): (1, 1024, 3, 1), + (1024, 1024, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), + (1024, 1024, 65536, 32, 32, True, False, True): (1, 512, 3, 4), + (1024, 1024, 65536, 64, 64, False, True, True): (1, 1024, 2, 4), + (1024, 1024, 65536, 64, 64, True, False, True): (1, 1024, 2, 4), + (1024, 1024, 65536, 128, 128, False, True, True): (1, 512, 1, 32), + (1024, 1024, 65536, 128, 128, True, False, True): (1, 512, 1, 32), + (1024, 1024, 131072, 16, 16, False, True, True): (4, 2048, 3, 1), + (1024, 1024, 131072, 16, 16, True, False, True): (4, 2048, 3, 1), + (1024, 1024, 131072, 32, 32, False, True, True): (1, 2048, 3, 4), + (1024, 1024, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), + (1024, 1024, 131072, 64, 64, False, True, True): (1, 2048, 2, 4), + (1024, 1024, 131072, 64, 64, True, False, True): (1, 2048, 2, 4), + (1024, 1024, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), + (1024, 1024, 131072, 128, 128, True, False, True): (1, 1024, 1, 32), + (1280, 5120, 65792, 16, 16, False, True, True): (1, 1028, 3, 1), + (1280, 5120, 65792, 16, 16, True, False, True): (1, 257, 3, 4), + (1280, 5120, 65792, 32, 32, False, True, True): (1, 514, 3, 4), + (1280, 5120, 65792, 32, 32, True, False, True): (1, 514, 3, 4), + (1280, 5120, 65792, 64, 64, False, True, True): (2, 1028, 3, 4), + (1280, 5120, 65792, 64, 64, True, False, True): (1, 1028, 3, 4), + (1280, 5120, 65792, 128, 128, False, True, True): (2, 514, 2, 32), + (1280, 5120, 65792, 128, 128, True, False, True): (1, 514, 2, 32), + (1536, 1536, 256, 16, 16, False, True, True): (5, 4, 3, 2), + (1536, 1536, 256, 16, 16, True, False, True): (2, 2, 3, 4), + (1536, 1536, 256, 32, 32, False, True, True): (1, 8, 2, 4), + (1536, 1536, 256, 32, 32, True, False, True): (2, 4, 3, 4), + (1536, 1536, 256, 64, 64, False, True, True): (1, 4, 3, 8), + 
(1536, 1536, 256, 64, 64, True, False, True): (2, 4, 3, 8), + (1536, 1536, 256, 128, 128, False, True, True): (1, 2, 1, 32), + (1536, 1536, 256, 128, 128, True, False, True): (2, 2, 2, 32), + (1536, 1536, 512, 16, 16, False, True, True): (1, 8, 3, 2), + (1536, 1536, 512, 16, 16, True, False, True): (1, 8, 3, 2), + (1536, 1536, 512, 32, 32, False, True, True): (1, 16, 3, 4), + (1536, 1536, 512, 32, 32, True, False, True): (1, 16, 3, 4), + (1536, 1536, 512, 64, 64, False, True, True): (3, 8, 3, 8), + (1536, 1536, 512, 64, 64, True, False, True): (3, 8, 3, 8), + (1536, 1536, 512, 128, 128, False, True, True): (1, 4, 1, 32), + (1536, 1536, 512, 128, 128, True, False, True): (2, 4, 2, 32), + (1536, 1536, 1024, 16, 16, False, True, True): (2, 8, 3, 4), + (1536, 1536, 1024, 16, 16, True, False, True): (2, 8, 3, 4), + (1536, 1536, 1024, 32, 32, False, True, True): (1, 16, 3, 4), + (1536, 1536, 1024, 32, 32, True, False, True): (1, 16, 3, 4), + (1536, 1536, 1024, 64, 64, False, True, True): (2, 16, 3, 8), + (1536, 1536, 1024, 64, 64, True, False, True): (2, 16, 3, 8), + (1536, 1536, 1024, 128, 128, False, True, True): (3, 8, 1, 32), + (1536, 1536, 1024, 128, 128, True, False, True): (1, 8, 2, 32), + (1536, 1536, 2048, 16, 16, False, True, True): (1, 32, 3, 2), + (1536, 1536, 2048, 16, 16, True, False, True): (1, 32, 3, 2), + (1536, 1536, 2048, 32, 32, False, True, True): (3, 32, 2, 4), + (1536, 1536, 2048, 32, 32, True, False, True): (4, 32, 3, 4), + (1536, 1536, 2048, 64, 64, False, True, True): (1, 32, 3, 4), + (1536, 1536, 2048, 64, 64, True, False, True): (1, 32, 3, 4), + (1536, 1536, 2048, 128, 128, False, True, True): (1, 16, 1, 32), + (1536, 1536, 2048, 128, 128, True, False, True): (2, 16, 1, 32), + (1536, 1536, 4096, 16, 16, False, True, True): (1, 64, 3, 2), + (1536, 1536, 4096, 16, 16, True, False, True): (1, 16, 3, 4), + (1536, 1536, 4096, 32, 32, False, True, True): (1, 64, 2, 4), + (1536, 1536, 4096, 32, 32, True, False, True): (1, 64, 2, 4), + (1536, 1536, 4096, 64, 64, False, True, True): (1, 64, 3, 4), + (1536, 1536, 4096, 64, 64, True, False, True): (1, 64, 3, 4), + (1536, 1536, 4096, 128, 128, False, True, True): (1, 32, 1, 32), + (1536, 1536, 4096, 128, 128, True, False, True): (4, 32, 2, 32), + (1536, 1536, 8192, 16, 16, False, True, True): (1, 32, 3, 4), + (1536, 1536, 8192, 16, 16, True, False, True): (5, 32, 3, 4), + (1536, 1536, 8192, 32, 32, False, True, True): (1, 128, 2, 4), + (1536, 1536, 8192, 32, 32, True, False, True): (1, 128, 2, 4), + (1536, 1536, 8192, 64, 64, False, True, True): (1, 128, 3, 4), + (1536, 1536, 8192, 64, 64, True, False, True): (1, 128, 3, 4), + (1536, 1536, 8192, 128, 128, False, True, True): (1, 64, 1, 32), + (1536, 1536, 8192, 128, 128, True, False, True): (4, 64, 2, 32), + (1536, 1536, 16384, 16, 16, False, True, True): (1, 64, 3, 4), + (1536, 1536, 16384, 16, 16, True, False, True): (1, 64, 3, 4), + (1536, 1536, 16384, 32, 32, False, True, True): (1, 256, 2, 4), + (1536, 1536, 16384, 32, 32, True, False, True): (1, 128, 3, 4), + (1536, 1536, 16384, 64, 64, False, True, True): (1, 256, 3, 4), + (1536, 1536, 16384, 64, 64, True, False, True): (3, 256, 3, 4), + (1536, 1536, 16384, 128, 128, False, True, True): (1, 128, 1, 32), + (1536, 1536, 16384, 128, 128, True, False, True): (4, 128, 2, 32), + (1536, 1536, 32768, 16, 16, False, True, True): (1, 128, 3, 4), + (1536, 1536, 32768, 16, 16, True, False, True): (1, 128, 3, 4), + (1536, 1536, 32768, 32, 32, False, True, True): (1, 256, 3, 4), + (1536, 1536, 32768, 32, 32, True, False, True): (1, 256, 
3, 4), + (1536, 1536, 32768, 64, 64, False, True, True): (1, 512, 3, 4), + (1536, 1536, 32768, 64, 64, True, False, True): (1, 512, 3, 4), + (1536, 1536, 32768, 128, 128, False, True, True): (1, 256, 1, 32), + (1536, 1536, 32768, 128, 128, True, False, True): (4, 256, 2, 32), + (1536, 1536, 65536, 16, 16, False, True, True): (5, 256, 3, 4), + (1536, 1536, 65536, 16, 16, True, False, True): (1, 256, 3, 4), + (1536, 1536, 65536, 32, 32, False, True, True): (1, 512, 3, 4), + (1536, 1536, 65536, 32, 32, True, False, True): (1, 512, 3, 4), + (1536, 1536, 65536, 64, 64, False, True, True): (1, 1024, 3, 4), + (1536, 1536, 65536, 64, 64, True, False, True): (1, 1024, 3, 4), + (1536, 1536, 65536, 128, 128, False, True, True): (1, 512, 1, 32), + (1536, 1536, 65536, 128, 128, True, False, True): (4, 512, 2, 32), + (1536, 1536, 131072, 16, 16, False, True, True): (3, 512, 3, 4), + (1536, 1536, 131072, 16, 16, True, False, True): (1, 512, 3, 4), + (1536, 1536, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), + (1536, 1536, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), + (1536, 1536, 131072, 64, 64, False, True, True): (1, 2048, 3, 4), + (1536, 1536, 131072, 64, 64, True, False, True): (1, 2048, 3, 4), + (1536, 1536, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), + (1536, 1536, 131072, 128, 128, True, False, True): (4, 1024, 2, 32), + (2048, 2048, 256, 16, 16, False, True, True): (1, 4, 3, 4), + (2048, 2048, 256, 16, 16, True, False, True): (1, 4, 3, 4), + (2048, 2048, 256, 32, 32, False, True, True): (3, 8, 3, 4), + (2048, 2048, 256, 32, 32, True, False, True): (3, 8, 3, 4), + (2048, 2048, 256, 64, 64, False, True, True): (4, 4, 4, 8), + (2048, 2048, 256, 64, 64, True, False, True): (8, 4, 4, 8), + (2048, 2048, 256, 128, 128, False, True, True): (3, 2, 1, 32), + (2048, 2048, 256, 128, 128, True, False, True): (3, 2, 1, 32), + (2048, 2048, 512, 16, 16, False, True, True): (4, 8, 3, 2), + (2048, 2048, 512, 16, 16, True, False, True): (4, 8, 3, 2), + (2048, 2048, 512, 32, 32, False, True, True): (3, 8, 3, 4), + (2048, 2048, 512, 32, 32, True, False, True): (1, 16, 2, 4), + (2048, 2048, 512, 64, 64, False, True, True): (4, 8, 2, 4), + (2048, 2048, 512, 64, 64, True, False, True): (4, 8, 2, 4), + (2048, 2048, 512, 128, 128, False, True, True): (1, 4, 1, 32), + (2048, 2048, 512, 128, 128, True, False, True): (4, 4, 1, 32), + (2048, 2048, 1024, 16, 16, False, True, True): (4, 8, 3, 4), + (2048, 2048, 1024, 16, 16, True, False, True): (4, 8, 3, 4), + (2048, 2048, 1024, 32, 32, False, True, True): (4, 16, 3, 4), + (2048, 2048, 1024, 32, 32, True, False, True): (1, 16, 3, 4), + (2048, 2048, 1024, 64, 64, False, True, True): (2, 16, 2, 4), + (2048, 2048, 1024, 64, 64, True, False, True): (2, 16, 2, 4), + (2048, 2048, 1024, 128, 128, False, True, True): (8, 8, 1, 32), + (2048, 2048, 1024, 128, 128, True, False, True): (4, 8, 1, 32), + (2048, 2048, 2048, 16, 16, False, True, True): (4, 32, 3, 1), + (2048, 2048, 2048, 16, 16, True, False, True): (3, 32, 3, 2), + (2048, 2048, 2048, 32, 32, False, True, True): (1, 32, 3, 4), + (2048, 2048, 2048, 32, 32, True, False, True): (1, 32, 3, 4), + (2048, 2048, 2048, 64, 64, False, True, True): (2, 32, 2, 4), + (2048, 2048, 2048, 64, 64, True, False, True): (2, 32, 2, 4), + (2048, 2048, 2048, 128, 128, False, True, True): (6, 16, 1, 32), + (2048, 2048, 2048, 128, 128, True, False, True): (4, 16, 1, 32), + (2048, 2048, 4096, 16, 16, False, True, True): (4, 64, 3, 1), + (2048, 2048, 4096, 16, 16, True, False, True): (1, 64, 3, 1), + (2048, 2048, 4096, 32, 32, 
False, True, True): (1, 64, 3, 4), + (2048, 2048, 4096, 32, 32, True, False, True): (4, 64, 3, 4), + (2048, 2048, 4096, 64, 64, False, True, True): (2, 64, 2, 4), + (2048, 2048, 4096, 64, 64, True, False, True): (2, 64, 2, 4), + (2048, 2048, 4096, 128, 128, False, True, True): (4, 32, 1, 32), + (2048, 2048, 4096, 128, 128, True, False, True): (4, 32, 1, 32), + (2048, 2048, 8192, 16, 16, False, True, True): (4, 128, 3, 1), + (2048, 2048, 8192, 16, 16, True, False, True): (1, 128, 3, 1), + (2048, 2048, 8192, 32, 32, False, True, True): (4, 128, 3, 4), + (2048, 2048, 8192, 32, 32, True, False, True): (4, 64, 3, 4), + (2048, 2048, 8192, 64, 64, False, True, True): (1, 128, 2, 4), + (2048, 2048, 8192, 64, 64, True, False, True): (2, 128, 2, 4), + (2048, 2048, 8192, 128, 128, False, True, True): (1, 64, 1, 32), + (2048, 2048, 8192, 128, 128, True, False, True): (4, 64, 1, 32), + (2048, 2048, 16384, 16, 16, False, True, True): (4, 256, 3, 1), + (2048, 2048, 16384, 16, 16, True, False, True): (1, 256, 3, 1), + (2048, 2048, 16384, 32, 32, False, True, True): (1, 256, 3, 4), + (2048, 2048, 16384, 32, 32, True, False, True): (1, 128, 3, 4), + (2048, 2048, 16384, 64, 64, False, True, True): (1, 256, 2, 4), + (2048, 2048, 16384, 64, 64, True, False, True): (1, 256, 2, 4), + (2048, 2048, 16384, 128, 128, False, True, True): (1, 128, 1, 32), + (2048, 2048, 16384, 128, 128, True, False, True): (4, 128, 1, 32), + (2048, 2048, 32768, 16, 16, False, True, True): (8, 512, 3, 1), + (2048, 2048, 32768, 16, 16, True, False, True): (1, 512, 3, 1), + (2048, 2048, 32768, 32, 32, False, True, True): (1, 512, 3, 4), + (2048, 2048, 32768, 32, 32, True, False, True): (1, 256, 3, 4), + (2048, 2048, 32768, 64, 64, False, True, True): (1, 512, 2, 4), + (2048, 2048, 32768, 64, 64, True, False, True): (1, 512, 2, 4), + (2048, 2048, 32768, 128, 128, False, True, True): (1, 256, 1, 32), + (2048, 2048, 32768, 128, 128, True, False, True): (4, 256, 1, 32), + (2048, 2048, 65536, 16, 16, False, True, True): (4, 1024, 3, 1), + (2048, 2048, 65536, 16, 16, True, False, True): (1, 1024, 3, 1), + (2048, 2048, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), + (2048, 2048, 65536, 32, 32, True, False, True): (1, 512, 3, 4), + (2048, 2048, 65536, 64, 64, False, True, True): (1, 1024, 2, 4), + (2048, 2048, 65536, 64, 64, True, False, True): (1, 1024, 2, 4), + (2048, 2048, 65536, 128, 128, False, True, True): (1, 512, 1, 32), + (2048, 2048, 65536, 128, 128, True, False, True): (4, 512, 1, 32), + (2048, 2048, 131072, 16, 16, False, True, True): (4, 2048, 3, 1), + (2048, 2048, 131072, 16, 16, True, False, True): (1, 2048, 3, 1), + (2048, 2048, 131072, 32, 32, False, True, True): (1, 2048, 3, 4), + (2048, 2048, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), + (2048, 2048, 131072, 64, 64, False, True, True): (1, 2048, 2, 4), + (2048, 2048, 131072, 64, 64, True, False, True): (1, 2048, 2, 4), + (2048, 2048, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), + (2048, 2048, 131072, 128, 128, True, False, True): (4, 1024, 1, 32), + (3072, 768, 256, 16, 16, False, True, True): (4, 4, 3, 2), + (3072, 768, 256, 16, 16, True, False, True): (1, 2, 6, 4), + (3072, 768, 256, 32, 32, False, True, True): (1, 4, 6, 4), + (3072, 768, 256, 32, 32, True, False, True): (5, 4, 3, 4), + (3072, 768, 256, 64, 64, False, True, True): (4, 4, 3, 8), + (3072, 768, 256, 64, 64, True, False, True): (4, 4, 3, 8), + (3072, 768, 256, 128, 128, False, True, True): (1, 2, 1, 32), + (3072, 768, 256, 128, 128, True, False, True): (5, 2, 1, 32), + (3072, 768, 512, 
16, 16, False, True, True): (4, 4, 3, 4), + (3072, 768, 512, 16, 16, True, False, True): (1, 4, 3, 4), + (3072, 768, 512, 32, 32, False, True, True): (3, 8, 3, 4), + (3072, 768, 512, 32, 32, True, False, True): (3, 8, 3, 4), + (3072, 768, 512, 64, 64, False, True, True): (2, 8, 3, 8), + (3072, 768, 512, 64, 64, True, False, True): (2, 8, 3, 8), + (3072, 768, 512, 128, 128, False, True, True): (1, 4, 2, 32), + (3072, 768, 512, 128, 128, True, False, True): (1, 4, 1, 32), + (3072, 768, 1024, 16, 16, False, True, True): (1, 16, 3, 2), + (3072, 768, 1024, 16, 16, True, False, True): (3, 16, 3, 2), + (3072, 768, 1024, 32, 32, False, True, True): (1, 16, 3, 4), + (3072, 768, 1024, 32, 32, True, False, True): (3, 16, 3, 4), + (3072, 768, 1024, 64, 64, False, True, True): (4, 16, 3, 8), + (3072, 768, 1024, 64, 64, True, False, True): (4, 16, 3, 4), + (3072, 768, 1024, 128, 128, False, True, True): (5, 8, 1, 32), + (3072, 768, 1024, 128, 128, True, False, True): (5, 8, 1, 32), + (3072, 768, 2048, 16, 16, False, True, True): (4, 32, 3, 2), + (3072, 768, 2048, 16, 16, True, False, True): (1, 32, 3, 2), + (3072, 768, 2048, 32, 32, False, True, True): (1, 32, 3, 4), + (3072, 768, 2048, 32, 32, True, False, True): (1, 32, 2, 4), + (3072, 768, 2048, 64, 64, False, True, True): (2, 32, 3, 4), + (3072, 768, 2048, 64, 64, True, False, True): (4, 32, 3, 4), + (3072, 768, 2048, 128, 128, False, True, True): (1, 16, 1, 32), + (3072, 768, 2048, 128, 128, True, False, True): (1, 16, 1, 32), + (3072, 768, 4096, 16, 16, False, True, True): (3, 64, 3, 2), + (3072, 768, 4096, 16, 16, True, False, True): (1, 64, 3, 2), + (3072, 768, 4096, 32, 32, False, True, True): (1, 64, 3, 4), + (3072, 768, 4096, 32, 32, True, False, True): (1, 32, 3, 4), + (3072, 768, 4096, 64, 64, False, True, True): (2, 64, 3, 4), + (3072, 768, 4096, 64, 64, True, False, True): (2, 64, 3, 4), + (3072, 768, 4096, 128, 128, False, True, True): (1, 32, 1, 32), + (3072, 768, 4096, 128, 128, True, False, True): (1, 32, 1, 32), + (3072, 768, 8192, 16, 16, False, True, True): (4, 128, 3, 1), + (3072, 768, 8192, 16, 16, True, False, True): (1, 32, 3, 4), + (3072, 768, 8192, 32, 32, False, True, True): (1, 64, 3, 4), + (3072, 768, 8192, 32, 32, True, False, True): (1, 64, 3, 4), + (3072, 768, 8192, 64, 64, False, True, True): (2, 128, 3, 4), + (3072, 768, 8192, 64, 64, True, False, True): (2, 128, 3, 4), + (3072, 768, 8192, 128, 128, False, True, True): (1, 64, 1, 32), + (3072, 768, 8192, 128, 128, True, False, True): (1, 64, 1, 32), + (3072, 768, 16384, 16, 16, False, True, True): (4, 256, 3, 1), + (3072, 768, 16384, 16, 16, True, False, True): (1, 64, 3, 4), + (3072, 768, 16384, 32, 32, False, True, True): (1, 128, 3, 4), + (3072, 768, 16384, 32, 32, True, False, True): (1, 128, 3, 4), + (3072, 768, 16384, 64, 64, False, True, True): (2, 256, 3, 4), + (3072, 768, 16384, 64, 64, True, False, True): (2, 256, 3, 4), + (3072, 768, 16384, 128, 128, False, True, True): (1, 128, 1, 32), + (3072, 768, 16384, 128, 128, True, False, True): (1, 128, 1, 32), + (3072, 768, 32768, 16, 16, False, True, True): (4, 512, 3, 1), + (3072, 768, 32768, 16, 16, True, False, True): (1, 128, 3, 4), + (3072, 768, 32768, 32, 32, False, True, True): (1, 256, 3, 4), + (3072, 768, 32768, 32, 32, True, False, True): (1, 256, 3, 4), + (3072, 768, 32768, 64, 64, False, True, True): (2, 512, 3, 4), + (3072, 768, 32768, 64, 64, True, False, True): (2, 512, 3, 4), + (3072, 768, 32768, 128, 128, False, True, True): (1, 256, 1, 32), + (3072, 768, 32768, 128, 128, True, False, True): (1, 
256, 1, 32), + (3072, 768, 50432, 16, 16, False, True, True): (4, 788, 3, 1), + (3072, 768, 50432, 16, 16, True, False, True): (1, 197, 3, 4), + (3072, 768, 50432, 32, 32, False, True, True): (1, 394, 3, 4), + (3072, 768, 50432, 32, 32, True, False, True): (1, 394, 3, 4), + (3072, 768, 50432, 64, 64, False, True, True): (1, 788, 3, 4), + (3072, 768, 50432, 64, 64, True, False, True): (2, 788, 3, 4), + (3072, 768, 50432, 128, 128, False, True, True): (1, 394, 1, 32), + (3072, 768, 50432, 128, 128, True, False, True): (1, 394, 1, 32), + (3072, 768, 65536, 16, 16, False, True, True): (4, 1024, 3, 1), + (3072, 768, 65536, 16, 16, True, False, True): (1, 256, 3, 4), + (3072, 768, 65536, 32, 32, False, True, True): (1, 512, 3, 4), + (3072, 768, 65536, 32, 32, True, False, True): (1, 512, 3, 4), + (3072, 768, 65536, 64, 64, False, True, True): (2, 1024, 3, 4), + (3072, 768, 65536, 64, 64, True, False, True): (2, 1024, 3, 4), + (3072, 768, 65536, 128, 128, False, True, True): (1, 512, 1, 32), + (3072, 768, 65536, 128, 128, True, False, True): (1, 512, 1, 32), + (3072, 768, 131072, 16, 16, False, True, True): (4, 2048, 3, 1), + (3072, 768, 131072, 16, 16, True, False, True): (1, 512, 3, 4), + (3072, 768, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), + (3072, 768, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), + (3072, 768, 131072, 64, 64, False, True, True): (2, 2048, 3, 4), + (3072, 768, 131072, 64, 64, True, False, True): (2, 2048, 3, 4), + (3072, 768, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), + (3072, 768, 131072, 128, 128, True, False, True): (1, 1024, 1, 32), + (3072, 3072, 256, 16, 16, False, True, True): (1, 4, 5, 2), + (3072, 3072, 256, 16, 16, True, False, True): (1, 4, 3, 2), + (3072, 3072, 256, 32, 32, False, True, True): (1, 4, 4, 4), + (3072, 3072, 256, 32, 32, True, False, True): (1, 4, 3, 4), + (3072, 3072, 256, 64, 64, False, True, True): (2, 4, 3, 8), + (3072, 3072, 256, 64, 64, True, False, True): (2, 4, 3, 8), + (3072, 3072, 256, 128, 128, False, True, True): (6, 2, 1, 32), + (3072, 3072, 256, 128, 128, True, False, True): (8, 2, 2, 32), + (3072, 3072, 512, 16, 16, False, True, True): (2, 4, 3, 4), + (3072, 3072, 512, 16, 16, True, False, True): (2, 4, 3, 4), + (3072, 3072, 512, 32, 32, False, True, True): (2, 8, 3, 4), + (3072, 3072, 512, 32, 32, True, False, True): (2, 8, 3, 4), + (3072, 3072, 512, 64, 64, False, True, True): (2, 8, 3, 8), + (3072, 3072, 512, 64, 64, True, False, True): (2, 8, 3, 8), + (3072, 3072, 512, 128, 128, False, True, True): (5, 4, 1, 32), + (3072, 3072, 512, 128, 128, True, False, True): (5, 4, 2, 32), + (3072, 3072, 1024, 16, 16, False, True, True): (1, 16, 3, 2), + (3072, 3072, 1024, 16, 16, True, False, True): (1, 16, 3, 2), + (3072, 3072, 1024, 32, 32, False, True, True): (2, 16, 3, 4), + (3072, 3072, 1024, 32, 32, True, False, True): (1, 16, 3, 4), + (3072, 3072, 1024, 64, 64, False, True, True): (1, 16, 3, 4), + (3072, 3072, 1024, 64, 64, True, False, True): (1, 16, 3, 4), + (3072, 3072, 1024, 128, 128, False, True, True): (1, 8, 1, 32), + (3072, 3072, 1024, 128, 128, True, False, True): (3, 8, 2, 32), + (3072, 3072, 2048, 16, 16, False, True, True): (1, 32, 3, 2), + (3072, 3072, 2048, 16, 16, True, False, True): (1, 16, 2, 4), + (3072, 3072, 2048, 32, 32, False, True, True): (1, 32, 2, 4), + (3072, 3072, 2048, 32, 32, True, False, True): (1, 32, 3, 4), + (3072, 3072, 2048, 64, 64, False, True, True): (1, 32, 3, 4), + (3072, 3072, 2048, 64, 64, True, False, True): (1, 32, 3, 4), + (3072, 3072, 2048, 128, 128, False, 
True, True): (1, 16, 1, 32), + (3072, 3072, 2048, 128, 128, True, False, True): (4, 16, 2, 32), + (3072, 3072, 4096, 16, 16, False, True, True): (2, 16, 3, 4), + (3072, 3072, 4096, 16, 16, True, False, True): (2, 16, 3, 4), + (3072, 3072, 4096, 32, 32, False, True, True): (1, 64, 2, 4), + (3072, 3072, 4096, 32, 32, True, False, True): (1, 32, 3, 4), + (3072, 3072, 4096, 64, 64, False, True, True): (1, 64, 3, 4), + (3072, 3072, 4096, 64, 64, True, False, True): (1, 64, 3, 4), + (3072, 3072, 4096, 128, 128, False, True, True): (1, 32, 1, 32), + (3072, 3072, 4096, 128, 128, True, False, True): (2, 32, 2, 32), + (3072, 3072, 8192, 16, 16, False, True, True): (2, 32, 3, 4), + (3072, 3072, 8192, 16, 16, True, False, True): (2, 32, 3, 4), + (3072, 3072, 8192, 32, 32, False, True, True): (1, 64, 3, 4), + (3072, 3072, 8192, 32, 32, True, False, True): (1, 64, 3, 4), + (3072, 3072, 8192, 64, 64, False, True, True): (1, 128, 3, 4), + (3072, 3072, 8192, 64, 64, True, False, True): (1, 128, 3, 4), + (3072, 3072, 8192, 128, 128, False, True, True): (1, 64, 1, 32), + (3072, 3072, 8192, 128, 128, True, False, True): (4, 64, 2, 32), + (3072, 3072, 16384, 16, 16, False, True, True): (2, 64, 3, 4), + (3072, 3072, 16384, 16, 16, True, False, True): (1, 64, 3, 4), + (3072, 3072, 16384, 32, 32, False, True, True): (1, 128, 3, 4), + (3072, 3072, 16384, 32, 32, True, False, True): (1, 128, 3, 4), + (3072, 3072, 16384, 64, 64, False, True, True): (1, 256, 3, 4), + (3072, 3072, 16384, 64, 64, True, False, True): (1, 256, 3, 4), + (3072, 3072, 16384, 128, 128, False, True, True): (1, 128, 1, 32), + (3072, 3072, 16384, 128, 128, True, False, True): (4, 128, 2, 32), + (3072, 3072, 32768, 16, 16, False, True, True): (3, 128, 3, 4), + (3072, 3072, 32768, 16, 16, True, False, True): (1, 128, 3, 4), + (3072, 3072, 32768, 32, 32, False, True, True): (1, 256, 3, 4), + (3072, 3072, 32768, 32, 32, True, False, True): (1, 256, 3, 4), + (3072, 3072, 32768, 64, 64, False, True, True): (1, 512, 3, 4), + (3072, 3072, 32768, 64, 64, True, False, True): (1, 512, 3, 4), + (3072, 3072, 32768, 128, 128, False, True, True): (1, 256, 1, 32), + (3072, 3072, 32768, 128, 128, True, False, True): (4, 256, 2, 32), + (3072, 3072, 65536, 16, 16, False, True, True): (5, 256, 3, 4), + (3072, 3072, 65536, 16, 16, True, False, True): (2, 256, 3, 4), + (3072, 3072, 65536, 32, 32, False, True, True): (1, 512, 3, 4), + (3072, 3072, 65536, 32, 32, True, False, True): (1, 512, 3, 4), + (3072, 3072, 65536, 64, 64, False, True, True): (1, 1024, 3, 4), + (3072, 3072, 65536, 64, 64, True, False, True): (1, 1024, 3, 4), + (3072, 3072, 65536, 128, 128, False, True, True): (1, 512, 1, 32), + (3072, 3072, 65536, 128, 128, True, False, True): (4, 512, 2, 32), + (3072, 3072, 131072, 16, 16, False, True, True): (5, 512, 3, 4), + (3072, 3072, 131072, 16, 16, True, False, True): (1, 512, 3, 4), + (3072, 3072, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), + (3072, 3072, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), + (3072, 3072, 131072, 64, 64, False, True, True): (1, 2048, 3, 4), + (3072, 3072, 131072, 64, 64, True, False, True): (1, 2048, 3, 4), + (3072, 3072, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), + (3072, 3072, 131072, 128, 128, True, False, True): (4, 1024, 2, 32), + (4096, 4096, 256, 16, 16, False, True, True): (1, 4, 3, 2), + (4096, 4096, 256, 16, 16, True, False, True): (1, 2, 3, 4), + (4096, 4096, 256, 32, 32, False, True, True): (4, 4, 4, 4), + (4096, 4096, 256, 32, 32, True, False, True): (4, 4, 4, 4), + (4096, 4096, 256, 
64, 64, False, True, True): (1, 4, 3, 8), + (4096, 4096, 256, 64, 64, True, False, True): (4, 4, 2, 4), + (4096, 4096, 256, 128, 128, False, True, True): (1, 2, 1, 32), + (4096, 4096, 256, 128, 128, True, False, True): (3, 2, 1, 32), + (4096, 4096, 512, 16, 16, False, True, True): (1, 4, 3, 4), + (4096, 4096, 512, 16, 16, True, False, True): (5, 8, 3, 2), + (4096, 4096, 512, 32, 32, False, True, True): (4, 8, 3, 4), + (4096, 4096, 512, 32, 32, True, False, True): (4, 8, 3, 4), + (4096, 4096, 512, 64, 64, False, True, True): (1, 8, 2, 4), + (4096, 4096, 512, 64, 64, True, False, True): (1, 8, 2, 4), + (4096, 4096, 512, 128, 128, False, True, True): (4, 4, 1, 32), + (4096, 4096, 512, 128, 128, True, False, True): (4, 4, 1, 32), + (4096, 4096, 1024, 16, 16, False, True, True): (1, 8, 3, 4), + (4096, 4096, 1024, 16, 16, True, False, True): (1, 8, 3, 4), + (4096, 4096, 1024, 32, 32, False, True, True): (1, 16, 3, 4), + (4096, 4096, 1024, 32, 32, True, False, True): (1, 16, 3, 4), + (4096, 4096, 1024, 64, 64, False, True, True): (4, 16, 2, 4), + (4096, 4096, 1024, 64, 64, True, False, True): (4, 16, 2, 4), + (4096, 4096, 1024, 128, 128, False, True, True): (4, 8, 1, 32), + (4096, 4096, 1024, 128, 128, True, False, True): (4, 8, 1, 32), + (4096, 4096, 2048, 16, 16, False, True, True): (1, 32, 3, 1), + (4096, 4096, 2048, 16, 16, True, False, True): (6, 8, 3, 4), + (4096, 4096, 2048, 32, 32, False, True, True): (1, 32, 3, 4), + (4096, 4096, 2048, 32, 32, True, False, True): (1, 32, 3, 4), + (4096, 4096, 2048, 64, 64, False, True, True): (4, 32, 2, 4), + (4096, 4096, 2048, 64, 64, True, False, True): (4, 32, 2, 4), + (4096, 4096, 2048, 128, 128, False, True, True): (4, 16, 1, 32), + (4096, 4096, 2048, 128, 128, True, False, True): (4, 16, 1, 32), + (4096, 4096, 4096, 16, 16, False, True, True): (1, 16, 3, 4), + (4096, 4096, 4096, 16, 16, True, False, True): (1, 64, 3, 1), + (4096, 4096, 4096, 32, 32, False, True, True): (1, 64, 3, 4), + (4096, 4096, 4096, 32, 32, True, False, True): (1, 32, 3, 4), + (4096, 4096, 4096, 64, 64, False, True, True): (4, 64, 2, 4), + (4096, 4096, 4096, 64, 64, True, False, True): (4, 64, 2, 4), + (4096, 4096, 4096, 128, 128, False, True, True): (4, 32, 1, 32), + (4096, 4096, 4096, 128, 128, True, False, True): (4, 32, 1, 32), + (4096, 4096, 8192, 16, 16, False, True, True): (4, 128, 3, 1), + (4096, 4096, 8192, 16, 16, True, False, True): (1, 128, 3, 1), + (4096, 4096, 8192, 32, 32, False, True, True): (1, 128, 3, 4), + (4096, 4096, 8192, 32, 32, True, False, True): (1, 64, 3, 4), + (4096, 4096, 8192, 64, 64, False, True, True): (4, 128, 2, 4), + (4096, 4096, 8192, 64, 64, True, False, True): (4, 128, 2, 4), + (4096, 4096, 8192, 128, 128, False, True, True): (4, 64, 1, 32), + (4096, 4096, 8192, 128, 128, True, False, True): (4, 64, 1, 32), + (4096, 4096, 16384, 16, 16, False, True, True): (1, 64, 3, 4), + (4096, 4096, 16384, 16, 16, True, False, True): (1, 256, 3, 1), + (4096, 4096, 16384, 32, 32, False, True, True): (1, 256, 3, 4), + (4096, 4096, 16384, 32, 32, True, False, True): (1, 128, 3, 4), + (4096, 4096, 16384, 64, 64, False, True, True): (4, 256, 2, 4), + (4096, 4096, 16384, 64, 64, True, False, True): (4, 256, 2, 4), + (4096, 4096, 16384, 128, 128, False, True, True): (4, 128, 1, 32), + (4096, 4096, 16384, 128, 128, True, False, True): (4, 128, 1, 32), + (4096, 4096, 32768, 16, 16, False, True, True): (1, 128, 3, 4), + (4096, 4096, 32768, 16, 16, True, False, True): (1, 512, 3, 1), + (4096, 4096, 32768, 32, 32, False, True, True): (1, 512, 3, 4), + (4096, 4096, 
32768, 32, 32, True, False, True): (1, 256, 3, 4), + (4096, 4096, 32768, 64, 64, False, True, True): (4, 512, 2, 4), + (4096, 4096, 32768, 64, 64, True, False, True): (4, 512, 2, 4), + (4096, 4096, 32768, 128, 128, False, True, True): (4, 256, 1, 32), + (4096, 4096, 32768, 128, 128, True, False, True): (4, 256, 1, 32), + (4096, 4096, 65536, 16, 16, False, True, True): (1, 256, 3, 4), + (4096, 4096, 65536, 16, 16, True, False, True): (1, 1024, 3, 1), + (4096, 4096, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), + (4096, 4096, 65536, 32, 32, True, False, True): (1, 512, 3, 4), + (4096, 4096, 65536, 64, 64, False, True, True): (4, 1024, 2, 4), + (4096, 4096, 65536, 64, 64, True, False, True): (2, 1024, 2, 4), + (4096, 4096, 65536, 128, 128, False, True, True): (4, 512, 1, 32), + (4096, 4096, 65536, 128, 128, True, False, True): (4, 512, 1, 32), + (4096, 4096, 131072, 16, 16, False, True, True): (2, 2048, 3, 1), + (4096, 4096, 131072, 16, 16, True, False, True): (1, 2048, 3, 1), + (4096, 4096, 131072, 32, 32, False, True, True): (2, 2048, 3, 4), + (4096, 4096, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), + (4096, 4096, 131072, 64, 64, False, True, True): (2, 2048, 2, 4), + (4096, 4096, 131072, 64, 64, True, False, True): (2, 2048, 2, 4), + (4096, 4096, 131072, 128, 128, False, True, True): (4, 1024, 1, 32), + (4096, 4096, 131072, 128, 128, True, False, True): (4, 1024, 1, 32), + (5120, 1280, 65792, 16, 16, False, True, True): (2, 1028, 3, 1), + (5120, 1280, 65792, 16, 16, True, False, True): (1, 257, 3, 4), + (5120, 1280, 65792, 32, 32, False, True, True): (1, 514, 3, 4), + (5120, 1280, 65792, 32, 32, True, False, True): (1, 514, 3, 4), + (5120, 1280, 65792, 64, 64, False, True, True): (1, 1028, 3, 4), + (5120, 1280, 65792, 64, 64, True, False, True): (5, 1028, 3, 4), + (5120, 1280, 65792, 128, 128, False, True, True): (1, 514, 1, 32), + (5120, 1280, 65792, 128, 128, True, False, True): (4, 514, 2, 32), + (6144, 6144, 256, 16, 16, False, True, True): (2, 2, 3, 4), + (6144, 6144, 256, 16, 16, True, False, True): (2, 2, 3, 4), + (6144, 6144, 256, 32, 32, False, True, True): (2, 4, 3, 4), + (6144, 6144, 256, 32, 32, True, False, True): (2, 4, 3, 4), + (6144, 6144, 256, 64, 64, False, True, True): (1, 4, 3, 4), + (6144, 6144, 256, 64, 64, True, False, True): (1, 4, 3, 4), + (6144, 6144, 256, 128, 128, False, True, True): (1, 2, 1, 32), + (6144, 6144, 256, 128, 128, True, False, True): (5, 2, 2, 32), + (6144, 6144, 512, 16, 16, False, True, True): (4, 8, 3, 2), + (6144, 6144, 512, 16, 16, True, False, True): (4, 8, 3, 2), + (6144, 6144, 512, 32, 32, False, True, True): (2, 8, 3, 4), + (6144, 6144, 512, 32, 32, True, False, True): (2, 8, 3, 4), + (6144, 6144, 512, 64, 64, False, True, True): (1, 8, 3, 4), + (6144, 6144, 512, 64, 64, True, False, True): (1, 8, 3, 4), + (6144, 6144, 512, 128, 128, False, True, True): (1, 4, 1, 32), + (6144, 6144, 512, 128, 128, True, False, True): (4, 4, 2, 32), + (6144, 6144, 1024, 16, 16, False, True, True): (4, 16, 3, 2), + (6144, 6144, 1024, 16, 16, True, False, True): (4, 4, 3, 4), + (6144, 6144, 1024, 32, 32, False, True, True): (1, 16, 3, 4), + (6144, 6144, 1024, 32, 32, True, False, True): (1, 16, 3, 4), + (6144, 6144, 1024, 64, 64, False, True, True): (1, 16, 3, 4), + (6144, 6144, 1024, 64, 64, True, False, True): (1, 16, 3, 4), + (6144, 6144, 1024, 128, 128, False, True, True): (1, 8, 1, 32), + (6144, 6144, 1024, 128, 128, True, False, True): (4, 8, 2, 32), + (6144, 6144, 2048, 16, 16, False, True, True): (1, 8, 3, 4), + (6144, 6144, 2048, 16, 
16, True, False, True): (4, 8, 3, 4), + (6144, 6144, 2048, 32, 32, False, True, True): (1, 16, 3, 4), + (6144, 6144, 2048, 32, 32, True, False, True): (1, 16, 3, 4), + (6144, 6144, 2048, 64, 64, False, True, True): (1, 32, 3, 4), + (6144, 6144, 2048, 64, 64, True, False, True): (3, 32, 3, 4), + (6144, 6144, 2048, 128, 128, False, True, True): (1, 16, 1, 32), + (6144, 6144, 2048, 128, 128, True, False, True): (1, 16, 2, 32), + (6144, 6144, 4096, 16, 16, False, True, True): (3, 16, 3, 4), + (6144, 6144, 4096, 16, 16, True, False, True): (4, 16, 3, 4), + (6144, 6144, 4096, 32, 32, False, True, True): (1, 32, 3, 4), + (6144, 6144, 4096, 32, 32, True, False, True): (1, 32, 3, 4), + (6144, 6144, 4096, 64, 64, False, True, True): (1, 64, 3, 4), + (6144, 6144, 4096, 64, 64, True, False, True): (1, 64, 3, 4), + (6144, 6144, 4096, 128, 128, False, True, True): (1, 32, 1, 32), + (6144, 6144, 4096, 128, 128, True, False, True): (4, 32, 2, 32), + (6144, 6144, 8192, 16, 16, False, True, True): (1, 32, 3, 4), + (6144, 6144, 8192, 16, 16, True, False, True): (4, 32, 3, 4), + (6144, 6144, 8192, 32, 32, False, True, True): (1, 64, 3, 4), + (6144, 6144, 8192, 32, 32, True, False, True): (1, 64, 3, 4), + (6144, 6144, 8192, 64, 64, False, True, True): (1, 128, 3, 4), + (6144, 6144, 8192, 64, 64, True, False, True): (1, 128, 3, 4), + (6144, 6144, 8192, 128, 128, False, True, True): (1, 64, 1, 32), + (6144, 6144, 8192, 128, 128, True, False, True): (4, 64, 2, 32), + (6144, 6144, 16384, 16, 16, False, True, True): (1, 64, 3, 4), + (6144, 6144, 16384, 16, 16, True, False, True): (4, 64, 3, 4), + (6144, 6144, 16384, 32, 32, False, True, True): (1, 128, 3, 4), + (6144, 6144, 16384, 32, 32, True, False, True): (1, 128, 3, 4), + (6144, 6144, 16384, 64, 64, False, True, True): (1, 256, 3, 4), + (6144, 6144, 16384, 64, 64, True, False, True): (1, 256, 3, 4), + (6144, 6144, 16384, 128, 128, False, True, True): (1, 128, 1, 32), + (6144, 6144, 16384, 128, 128, True, False, True): (4, 128, 2, 32), + (6144, 6144, 32768, 16, 16, False, True, True): (1, 128, 3, 4), + (6144, 6144, 32768, 16, 16, True, False, True): (4, 128, 3, 4), + (6144, 6144, 32768, 32, 32, False, True, True): (1, 256, 3, 4), + (6144, 6144, 32768, 32, 32, True, False, True): (1, 256, 3, 4), + (6144, 6144, 32768, 64, 64, False, True, True): (1, 512, 3, 4), + (6144, 6144, 32768, 64, 64, True, False, True): (1, 512, 3, 4), + (6144, 6144, 32768, 128, 128, False, True, True): (1, 256, 1, 32), + (6144, 6144, 32768, 128, 128, True, False, True): (4, 256, 2, 32), + (6144, 6144, 65536, 16, 16, False, True, True): (1, 256, 3, 4), + (6144, 6144, 65536, 16, 16, True, False, True): (2, 256, 3, 4), + (6144, 6144, 65536, 32, 32, False, True, True): (1, 512, 3, 4), + (6144, 6144, 65536, 32, 32, True, False, True): (1, 512, 3, 4), + (6144, 6144, 65536, 64, 64, False, True, True): (1, 1024, 3, 4), + (6144, 6144, 65536, 64, 64, True, False, True): (1, 1024, 3, 4), + (6144, 6144, 65536, 128, 128, False, True, True): (1, 512, 1, 32), + (6144, 6144, 65536, 128, 128, True, False, True): (4, 512, 2, 32), + (6144, 6144, 131072, 16, 16, False, True, True): (1, 512, 3, 4), + (6144, 6144, 131072, 16, 16, True, False, True): (2, 512, 3, 4), + (6144, 6144, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), + (6144, 6144, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), + (6144, 6144, 131072, 64, 64, False, True, True): (1, 2048, 3, 4), + (6144, 6144, 131072, 64, 64, True, False, True): (1, 2048, 3, 4), + (6144, 6144, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), + (6144, 
6144, 131072, 128, 128, True, False, True): (4, 1024, 2, 32), + (8192, 8192, 256, 16, 16, False, True, True): (2, 2, 4, 4), + (8192, 8192, 256, 16, 16, True, False, True): (1, 1, 3, 4), + (8192, 8192, 256, 32, 32, False, True, True): (2, 4, 3, 4), + (8192, 8192, 256, 32, 32, True, False, True): (2, 4, 3, 4), + (8192, 8192, 256, 64, 64, False, True, True): (4, 4, 2, 4), + (8192, 8192, 256, 64, 64, True, False, True): (4, 4, 2, 4), + (8192, 8192, 256, 128, 128, False, True, True): (1, 2, 1, 32), + (8192, 8192, 256, 128, 128, True, False, True): (4, 2, 1, 32), + (8192, 8192, 512, 16, 16, False, True, True): (1, 4, 3, 4), + (8192, 8192, 512, 16, 16, True, False, True): (3, 4, 3, 4), + (8192, 8192, 512, 32, 32, False, True, True): (1, 8, 3, 4), + (8192, 8192, 512, 32, 32, True, False, True): (6, 8, 3, 4), + (8192, 8192, 512, 64, 64, False, True, True): (4, 8, 2, 4), + (8192, 8192, 512, 64, 64, True, False, True): (4, 8, 2, 4), + (8192, 8192, 512, 128, 128, False, True, True): (4, 4, 1, 32), + (8192, 8192, 512, 128, 128, True, False, True): (4, 4, 1, 32), + (8192, 8192, 1024, 16, 16, False, True, True): (1, 4, 3, 4), + (8192, 8192, 1024, 16, 16, True, False, True): (1, 32, 3, 1), + (8192, 8192, 1024, 32, 32, False, True, True): (1, 16, 3, 4), + (8192, 8192, 1024, 32, 32, True, False, True): (1, 16, 3, 4), + (8192, 8192, 1024, 64, 64, False, True, True): (4, 16, 2, 4), + (8192, 8192, 1024, 64, 64, True, False, True): (4, 16, 2, 4), + (8192, 8192, 1024, 128, 128, False, True, True): (4, 8, 1, 32), + (8192, 8192, 1024, 128, 128, True, False, True): (4, 8, 1, 32), + (8192, 8192, 2048, 16, 16, False, True, True): (4, 8, 3, 4), + (8192, 8192, 2048, 16, 16, True, False, True): (1, 32, 3, 1), + (8192, 8192, 2048, 32, 32, False, True, True): (1, 32, 3, 4), + (8192, 8192, 2048, 32, 32, True, False, True): (1, 16, 4, 4), + (8192, 8192, 2048, 64, 64, False, True, True): (4, 32, 2, 4), + (8192, 8192, 2048, 64, 64, True, False, True): (4, 32, 2, 4), + (8192, 8192, 2048, 128, 128, False, True, True): (4, 16, 1, 32), + (8192, 8192, 2048, 128, 128, True, False, True): (4, 16, 1, 32), + (8192, 8192, 4096, 16, 16, False, True, True): (3, 16, 3, 4), + (8192, 8192, 4096, 16, 16, True, False, True): (2, 64, 3, 1), + (8192, 8192, 4096, 32, 32, False, True, True): (1, 64, 3, 4), + (8192, 8192, 4096, 32, 32, True, False, True): (1, 32, 3, 4), + (8192, 8192, 4096, 64, 64, False, True, True): (4, 64, 2, 4), + (8192, 8192, 4096, 64, 64, True, False, True): (2, 64, 2, 4), + (8192, 8192, 4096, 128, 128, False, True, True): (4, 32, 1, 32), + (8192, 8192, 4096, 128, 128, True, False, True): (4, 32, 1, 32), + (8192, 8192, 8192, 16, 16, False, True, True): (2, 128, 3, 1), + (8192, 8192, 8192, 16, 16, True, False, True): (2, 128, 3, 1), + (8192, 8192, 8192, 32, 32, False, True, True): (1, 128, 3, 4), + (8192, 8192, 8192, 32, 32, True, False, True): (1, 64, 3, 4), + (8192, 8192, 8192, 64, 64, False, True, True): (4, 128, 2, 4), + (8192, 8192, 8192, 64, 64, True, False, True): (2, 128, 2, 4), + (8192, 8192, 8192, 128, 128, False, True, True): (4, 64, 1, 32), + (8192, 8192, 8192, 128, 128, True, False, True): (4, 64, 1, 32), + (8192, 8192, 16384, 16, 16, False, True, True): (1, 64, 3, 4), + (8192, 8192, 16384, 16, 16, True, False, True): (1, 256, 3, 1), + (8192, 8192, 16384, 32, 32, False, True, True): (1, 256, 3, 4), + (8192, 8192, 16384, 32, 32, True, False, True): (1, 128, 3, 4), + (8192, 8192, 16384, 64, 64, False, True, True): (2, 256, 2, 4), + (8192, 8192, 16384, 64, 64, True, False, True): (2, 256, 2, 4), + (8192, 8192, 
16384, 128, 128, False, True, True): (4, 128, 1, 32), + (8192, 8192, 16384, 128, 128, True, False, True): (4, 128, 1, 32), + (8192, 8192, 32768, 16, 16, False, True, True): (1, 512, 3, 1), + (8192, 8192, 32768, 16, 16, True, False, True): (1, 512, 3, 1), + (8192, 8192, 32768, 32, 32, False, True, True): (1, 512, 3, 4), + (8192, 8192, 32768, 32, 32, True, False, True): (1, 256, 3, 4), + (8192, 8192, 32768, 64, 64, False, True, True): (2, 512, 2, 4), + (8192, 8192, 32768, 64, 64, True, False, True): (2, 512, 2, 4), + (8192, 8192, 32768, 128, 128, False, True, True): (4, 256, 1, 32), + (8192, 8192, 32768, 128, 128, True, False, True): (4, 256, 1, 32), + (8192, 8192, 65536, 16, 16, False, True, True): (1, 256, 3, 4), + (8192, 8192, 65536, 16, 16, True, False, True): (1, 1024, 3, 1), + (8192, 8192, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), + (8192, 8192, 65536, 32, 32, True, False, True): (1, 512, 3, 4), + (8192, 8192, 65536, 64, 64, False, True, True): (4, 1024, 2, 4), + (8192, 8192, 65536, 64, 64, True, False, True): (2, 1024, 2, 4), + (8192, 8192, 65536, 128, 128, False, True, True): (4, 512, 1, 32), + (8192, 8192, 65536, 128, 128, True, False, True): (4, 512, 1, 32), + (8192, 8192, 131072, 16, 16, False, True, True): (1, 2048, 3, 1), + (8192, 8192, 131072, 16, 16, True, False, True): (2, 2048, 3, 1), + (8192, 8192, 131072, 32, 32, False, True, True): (4, 2048, 3, 4), + (8192, 8192, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), + (8192, 8192, 131072, 64, 64, False, True, True): (2, 2048, 2, 4), + (8192, 8192, 131072, 64, 64, True, False, True): (2, 2048, 2, 4), + (8192, 8192, 131072, 128, 128, False, True, True): (4, 1024, 1, 32), + (8192, 8192, 131072, 128, 128, True, False, True): (4, 1024, 1, 32), + (16384, 16384, 256, 16, 16, False, True, True): (1, 2, 3, 4), + (16384, 16384, 256, 16, 16, True, False, True): (1, 2, 3, 4), + (16384, 16384, 256, 32, 32, False, True, True): (1, 4, 3, 4), + (16384, 16384, 256, 32, 32, True, False, True): (1, 4, 3, 4), + (16384, 16384, 256, 64, 64, False, True, True): (2, 4, 2, 4), + (16384, 16384, 256, 64, 64, True, False, True): (2, 4, 2, 4), + (16384, 16384, 256, 128, 128, False, True, True): (2, 2, 1, 32), + (16384, 16384, 256, 128, 128, True, False, True): (2, 2, 1, 32), + (16384, 16384, 512, 16, 16, False, True, True): (1, 2, 3, 4), + (16384, 16384, 512, 16, 16, True, False, True): (5, 2, 3, 4), + (16384, 16384, 512, 32, 32, False, True, True): (1, 8, 3, 4), + (16384, 16384, 512, 32, 32, True, False, True): (1, 4, 3, 4), + (16384, 16384, 512, 64, 64, False, True, True): (4, 8, 2, 4), + (16384, 16384, 512, 64, 64, True, False, True): (4, 8, 2, 4), + (16384, 16384, 512, 128, 128, False, True, True): (4, 4, 1, 32), + (16384, 16384, 512, 128, 128, True, False, True): (4, 4, 1, 32), + (16384, 16384, 1024, 16, 16, False, True, True): (1, 4, 3, 4), + (16384, 16384, 1024, 16, 16, True, False, True): (2, 16, 3, 1), + (16384, 16384, 1024, 32, 32, False, True, True): (1, 16, 3, 4), + (16384, 16384, 1024, 32, 32, True, False, True): (1, 8, 3, 4), + (16384, 16384, 1024, 64, 64, False, True, True): (4, 16, 2, 4), + (16384, 16384, 1024, 64, 64, True, False, True): (4, 16, 2, 4), + (16384, 16384, 1024, 128, 128, False, True, True): (4, 8, 1, 32), + (16384, 16384, 1024, 128, 128, True, False, True): (4, 8, 1, 32), + (16384, 16384, 2048, 16, 16, False, True, True): (1, 8, 3, 4), + (16384, 16384, 2048, 16, 16, True, False, True): (2, 32, 3, 1), + (16384, 16384, 2048, 32, 32, False, True, True): (1, 32, 3, 4), + (16384, 16384, 2048, 32, 32, True, False, 
True): (1, 16, 3, 4), + (16384, 16384, 2048, 64, 64, False, True, True): (4, 32, 2, 4), + (16384, 16384, 2048, 64, 64, True, False, True): (2, 32, 2, 4), + (16384, 16384, 2048, 128, 128, False, True, True): (4, 16, 1, 32), + (16384, 16384, 2048, 128, 128, True, False, True): (4, 16, 1, 32), + (16384, 16384, 4096, 16, 16, False, True, True): (1, 16, 3, 4), + (16384, 16384, 4096, 16, 16, True, False, True): (2, 64, 3, 1), + (16384, 16384, 4096, 32, 32, False, True, True): (1, 64, 3, 4), + (16384, 16384, 4096, 32, 32, True, False, True): (1, 32, 3, 4), + (16384, 16384, 4096, 64, 64, False, True, True): (4, 64, 2, 4), + (16384, 16384, 4096, 64, 64, True, False, True): (2, 64, 2, 4), + (16384, 16384, 4096, 128, 128, False, True, True): (4, 32, 1, 32), + (16384, 16384, 4096, 128, 128, True, False, True): (4, 32, 1, 32), + (16384, 16384, 8192, 16, 16, False, True, True): (1, 128, 3, 1), + (16384, 16384, 8192, 16, 16, True, False, True): (2, 128, 3, 1), + (16384, 16384, 8192, 32, 32, False, True, True): (1, 128, 3, 4), + (16384, 16384, 8192, 32, 32, True, False, True): (1, 64, 3, 4), + (16384, 16384, 8192, 64, 64, False, True, True): (2, 128, 2, 4), + (16384, 16384, 8192, 64, 64, True, False, True): (2, 128, 2, 4), + (16384, 16384, 8192, 128, 128, False, True, True): (4, 64, 1, 32), + (16384, 16384, 8192, 128, 128, True, False, True): (4, 64, 1, 32), + (16384, 16384, 16384, 16, 16, False, True, True): (1, 64, 3, 4), + (16384, 16384, 16384, 16, 16, True, False, True): (2, 256, 3, 1), + (16384, 16384, 16384, 32, 32, False, True, True): (1, 256, 3, 4), + (16384, 16384, 16384, 32, 32, True, False, True): (1, 128, 3, 4), + (16384, 16384, 16384, 64, 64, False, True, True): (2, 256, 2, 4), + (16384, 16384, 16384, 64, 64, True, False, True): (2, 256, 2, 4), + (16384, 16384, 16384, 128, 128, False, True, True): (4, 128, 1, 32), + (16384, 16384, 16384, 128, 128, True, False, True): (4, 128, 1, 32), + (16384, 16384, 32768, 16, 16, False, True, True): (1, 512, 3, 1), + (16384, 16384, 32768, 16, 16, True, False, True): (1, 128, 3, 4), + (16384, 16384, 32768, 32, 32, False, True, True): (2, 512, 3, 4), + (16384, 16384, 32768, 32, 32, True, False, True): (1, 256, 4, 4), + (16384, 16384, 32768, 64, 64, False, True, True): (2, 512, 2, 4), + (16384, 16384, 32768, 64, 64, True, False, True): (2, 512, 2, 4), + (16384, 16384, 32768, 128, 128, False, True, True): (4, 256, 1, 32), + (16384, 16384, 32768, 128, 128, True, False, True): (4, 256, 1, 32), + (16384, 16384, 65536, 16, 16, False, True, True): (1, 256, 3, 4), + (16384, 16384, 65536, 16, 16, True, False, True): (1, 1024, 3, 1), + (16384, 16384, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), + (16384, 16384, 65536, 32, 32, True, False, True): (1, 512, 4, 4), + (16384, 16384, 65536, 64, 64, False, True, True): (2, 1024, 2, 4), + (16384, 16384, 65536, 64, 64, True, False, True): (2, 1024, 2, 4), + (16384, 16384, 65536, 128, 128, False, True, True): (4, 512, 1, 32), + (16384, 16384, 65536, 128, 128, True, False, True): (4, 512, 1, 32), + (16384, 16384, 131072, 16, 16, False, True, True): (1, 1024, 4, 4), + (16384, 16384, 131072, 16, 16, True, False, True): (2, 2048, 3, 1), + (16384, 16384, 131072, 32, 32, False, True, True): (1, 1024, 2, 4), + (16384, 16384, 131072, 32, 32, True, False, True): (1, 1024, 2, 4), + (16384, 16384, 131072, 64, 64, False, True, True): (4, 2048, 2, 4), + (16384, 16384, 131072, 64, 64, True, False, True): (2, 2048, 2, 4), + (16384, 16384, 131072, 128, 128, False, True, True): (4, 1024, 1, 32), + (16384, 16384, 131072, 128, 128, True, 
False, True): (4, 1024, 1, 32), + }, + ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.float32, 0.56)): { + (192, 192, 256, 64, 64, False, True, True): (1, 4, 3, 8), + (192, 192, 256, 64, 64, True, False, True): (1, 4, 3, 8), + (192, 192, 512, 64, 64, False, True, True): (2, 8, 3, 8), + (192, 192, 512, 64, 64, True, False, True): (5, 8, 3, 8), + (192, 192, 1024, 64, 64, False, True, True): (2, 16, 4, 8), + (192, 192, 1024, 64, 64, True, False, True): (1, 16, 3, 8), + (192, 192, 2048, 64, 64, False, True, True): (3, 32, 3, 8), + (192, 192, 2048, 64, 64, True, False, True): (5, 32, 5, 8), + (192, 192, 4096, 64, 64, False, True, True): (3, 64, 2, 8), + (192, 192, 4096, 64, 64, True, False, True): (1, 64, 3, 8), + (192, 192, 8192, 64, 64, False, True, True): (3, 128, 3, 8), + (192, 192, 8192, 64, 64, True, False, True): (6, 128, 3, 4), + (192, 192, 16384, 64, 64, False, True, True): (1, 256, 1, 8), + (192, 192, 16384, 64, 64, True, False, True): (1, 256, 3, 4), + (192, 192, 32768, 64, 64, False, True, True): (1, 512, 1, 8), + (192, 192, 32768, 64, 64, True, False, True): (1, 512, 3, 4), + (192, 192, 65536, 64, 64, False, True, True): (1, 1024, 1, 8), + (192, 192, 65536, 64, 64, True, False, True): (1, 1024, 3, 4), + (192, 192, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), + (192, 192, 131072, 64, 64, True, False, True): (3, 2048, 1, 4), + (384, 384, 256, 128, 128, False, True, True): (1, 2, 1, 32), + (384, 384, 256, 128, 128, True, False, True): (1, 2, 1, 32), + (384, 384, 512, 128, 128, False, True, True): (1, 4, 1, 32), + (384, 384, 512, 128, 128, True, False, True): (2, 4, 1, 32), + (384, 384, 1024, 128, 128, False, True, True): (1, 8, 1, 32), + (384, 384, 1024, 128, 128, True, False, True): (4, 8, 1, 32), + (384, 384, 2048, 128, 128, False, True, True): (1, 16, 1, 32), + (384, 384, 2048, 128, 128, True, False, True): (1, 16, 1, 32), + (384, 384, 4096, 128, 128, False, True, True): (1, 32, 1, 32), + (384, 384, 4096, 128, 128, True, False, True): (2, 32, 2, 32), + (384, 384, 8192, 128, 128, False, True, True): (1, 64, 1, 32), + (384, 384, 8192, 128, 128, True, False, True): (1, 64, 2, 32), + (384, 384, 16384, 128, 128, False, True, True): (1, 128, 1, 32), + (384, 384, 16384, 128, 128, True, False, True): (4, 128, 1, 32), + (384, 384, 32768, 128, 128, False, True, True): (3, 256, 1, 32), + (384, 384, 32768, 128, 128, True, False, True): (3, 256, 1, 32), + (384, 384, 65536, 128, 128, False, True, True): (3, 512, 1, 32), + (384, 384, 65536, 128, 128, True, False, True): (3, 512, 1, 32), + (384, 384, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), + (384, 384, 131072, 128, 128, True, False, True): (3, 1024, 1, 32), + }, + ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.int8, 0.5)): { + (1280, 5120, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), + (1280, 5120, 65792, 32, 32, True, False, True): (1, 514, 3, 2), + (1280, 5120, 65792, 64, 64, False, True, True): (2, 514, 1, 4), + (1280, 5120, 65792, 64, 64, True, False, True): (1, 514, 3, 2), + (1280, 5120, 65792, 128, 128, False, True, True): (2, 514, 1, 8), + (1280, 5120, 65792, 128, 128, True, False, True): (1, 514, 2, 4), + (1280, 5120, 65792, 256, 256, False, True, True): (1, 257, 1, 32), + (1280, 5120, 65792, 256, 256, True, False, True): (1, 257, 1, 32), + (5120, 1280, 65792, 32, 32, False, True, True): (3, 1028, 1, 8), + (5120, 1280, 65792, 32, 32, True, False, True): (1, 514, 1, 2), + (5120, 1280, 65792, 64, 64, False, True, True): (1, 514, 1, 4), + (5120, 1280, 65792, 64, 64, True, False, True): (2, 514, 
2, 2), + (5120, 1280, 65792, 128, 128, False, True, True): (2, 514, 1, 8), + (5120, 1280, 65792, 128, 128, True, False, True): (2, 514, 2, 4), + (5120, 1280, 65792, 256, 256, False, True, True): (1, 257, 1, 32), + (5120, 1280, 65792, 256, 256, True, False, True): (1, 257, 1, 32), + }, + ("scatter_mm", "NVIDIA A100-SXM4-80GB", (0, torch.bfloat16, 0.5)): { + (256, 256, 256, 16, 16): (1, 1, 16, 16, 1, 2), + (256, 256, 256, 32, 32): (1, 1, 16, 16, 1, 4), + (256, 256, 256, 64, 64): (1, 1, 16, 16, 1, 1), + (256, 256, 256, 128, 128): (2, 4, 16, 64, 1, 4), + (256, 256, 512, 16, 16): (1, 1, 16, 16, 1, 4), + (256, 256, 512, 32, 32): (1, 1, 16, 32, 1, 4), + (256, 256, 512, 64, 64): (1, 1, 16, 32, 1, 1), + (256, 256, 512, 128, 128): (1, 1, 32, 32, 1, 4), + (256, 256, 1024, 16, 16): (1, 1, 16, 16, 1, 4), + (256, 256, 1024, 32, 32): (1, 2, 16, 32, 1, 1), + (256, 256, 1024, 64, 64): (1, 1, 32, 32, 1, 2), + (256, 256, 1024, 128, 128): (1, 1, 32, 64, 1, 4), + (256, 256, 2048, 16, 16): (1, 1, 16, 64, 1, 8), + (256, 256, 2048, 32, 32): (2, 1, 32, 64, 1, 2), + (256, 256, 2048, 64, 64): (1, 1, 32, 32, 1, 1), + (256, 256, 2048, 128, 128): (1, 1, 64, 64, 1, 4), + (256, 256, 4096, 16, 16): (1, 1, 16, 64, 1, 1), + (256, 256, 4096, 32, 32): (2, 2, 32, 64, 1, 2), + (256, 256, 4096, 64, 64): (1, 1, 32, 128, 1, 4), + (256, 256, 4096, 128, 128): (1, 1, 64, 64, 1, 4), + (256, 256, 8192, 16, 16): (1, 2, 16, 64, 1, 2), + (256, 256, 8192, 32, 32): (1, 1, 32, 64, 1, 2), + (256, 256, 8192, 64, 64): (1, 1, 32, 64, 1, 2), + (256, 256, 8192, 128, 128): (1, 1, 64, 64, 1, 4), + (256, 256, 16384, 16, 16): (1, 1, 16, 64, 1, 2), + (256, 256, 16384, 32, 32): (1, 1, 32, 64, 1, 2), + (256, 256, 16384, 64, 64): (1, 1, 64, 64, 1, 2), + (256, 256, 16384, 128, 128): (2, 16, 64, 64, 1, 4), + (256, 256, 32768, 16, 16): (1, 1, 16, 128, 1, 2), + (256, 256, 32768, 32, 32): (1, 1, 32, 64, 1, 2), + (256, 256, 32768, 64, 64): (1, 1, 64, 64, 1, 2), + (256, 256, 32768, 128, 128): (2, 32, 64, 64, 1, 4), + (256, 256, 65536, 16, 16): (1, 1, 16, 64, 1, 1), + (256, 256, 65536, 32, 32): (1, 1, 32, 64, 1, 2), + (256, 256, 65536, 64, 64): (1, 1, 64, 32, 1, 1), + (256, 256, 65536, 128, 128): (2, 32, 64, 64, 1, 4), + (256, 256, 131072, 16, 16): (1, 1, 16, 64, 1, 1), + (256, 256, 131072, 32, 32): (1, 1, 32, 64, 1, 2), + (256, 256, 131072, 64, 64): (4, 1, 64, 32, 1, 1), + (256, 256, 131072, 128, 128): (2, 64, 64, 64, 1, 4), + (512, 512, 256, 16, 16): (1, 1, 16, 16, 1, 2), + (512, 512, 256, 32, 32): (1, 1, 16, 32, 1, 1), + (512, 512, 256, 64, 64): (1, 2, 16, 32, 1, 1), + (512, 512, 256, 128, 128): (2, 16, 64, 16, 2, 4), + (512, 512, 512, 16, 16): (1, 1, 16, 16, 1, 4), + (512, 512, 512, 32, 32): (1, 1, 16, 32, 1, 1), + (512, 512, 512, 64, 64): (1, 1, 32, 32, 1, 2), + (512, 512, 512, 128, 128): (2, 8, 32, 64, 1, 4), + (512, 512, 1024, 16, 16): (1, 1, 16, 64, 1, 8), + (512, 512, 1024, 32, 32): (1, 1, 32, 32, 3, 1), + (512, 512, 1024, 64, 64): (1, 4, 32, 64, 1, 2), + (512, 512, 1024, 128, 128): (1, 4, 64, 64, 1, 4), + (512, 512, 2048, 16, 16): (1, 1, 16, 64, 1, 2), + (512, 512, 2048, 32, 32): (1, 1, 32, 64, 1, 2), + (512, 512, 2048, 64, 64): (1, 1, 64, 64, 3, 4), + (512, 512, 2048, 128, 128): (1, 1, 64, 64, 1, 4), + (512, 512, 4096, 16, 16): (1, 1, 16, 64, 1, 2), + (512, 512, 4096, 32, 32): (2, 64, 32, 64, 1, 2), + (512, 512, 4096, 64, 64): (1, 1, 64, 64, 3, 4), + (512, 512, 4096, 128, 128): (1, 1, 64, 64, 1, 4), + (512, 512, 8192, 16, 16): (1, 2, 16, 128, 1, 2), + (512, 512, 8192, 32, 32): (1, 1, 32, 64, 1, 2), + (512, 512, 8192, 64, 64): (1, 1, 64, 64, 1, 2), + 
(512, 512, 8192, 128, 128): (1, 1, 64, 64, 1, 4), + (512, 512, 16384, 16, 16): (1, 2, 16, 128, 1, 2), + (512, 512, 16384, 32, 32): (1, 1, 32, 64, 1, 2), + (512, 512, 16384, 64, 64): (1, 1, 64, 64, 3, 2), + (512, 512, 16384, 128, 128): (2, 1, 64, 64, 1, 4), + (512, 512, 32768, 16, 16): (1, 2, 16, 128, 1, 2), + (512, 512, 32768, 32, 32): (1, 1, 32, 64, 1, 2), + (512, 512, 32768, 64, 64): (1, 1, 64, 64, 3, 4), + (512, 512, 32768, 128, 128): (2, 1, 64, 64, 1, 4), + (512, 512, 65536, 16, 16): (1, 2, 16, 128, 1, 2), + (512, 512, 65536, 32, 32): (1, 1, 32, 64, 1, 2), + (512, 512, 65536, 64, 64): (1, 1, 64, 64, 3, 4), + (512, 512, 65536, 128, 128): (2, 1, 64, 64, 1, 4), + (512, 512, 131072, 16, 16): (1, 1, 16, 64, 1, 1), + (512, 512, 131072, 32, 32): (1, 1, 32, 64, 1, 2), + (512, 512, 131072, 64, 64): (1, 1, 64, 64, 3, 4), + (512, 512, 131072, 128, 128): (2, 4, 64, 64, 1, 4), + (1024, 1024, 256, 16, 16): (1, 1, 16, 16, 1, 4), + (1024, 1024, 256, 32, 32): (2, 16, 32, 16, 3, 4), + (1024, 1024, 256, 64, 64): (1, 4, 32, 32, 1, 2), + (1024, 1024, 256, 128, 128): (1, 4, 128, 16, 3, 16), + (1024, 1024, 512, 16, 16): (1, 1, 16, 64, 1, 2), + (1024, 1024, 512, 32, 32): (2, 2, 32, 64, 1, 2), + (1024, 1024, 512, 64, 64): (2, 8, 64, 64, 3, 4), + (1024, 1024, 512, 128, 128): (1, 4, 64, 64, 1, 8), + (1024, 1024, 1024, 16, 16): (1, 1, 16, 64, 1, 2), + (1024, 1024, 1024, 32, 32): (1, 1, 32, 64, 1, 2), + (1024, 1024, 1024, 64, 64): (1, 8, 64, 64, 3, 4), + (1024, 1024, 1024, 128, 128): (1, 8, 64, 64, 1, 4), + (1024, 1024, 2048, 16, 16): (1, 2, 16, 64, 1, 2), + (1024, 1024, 2048, 32, 32): (1, 1, 32, 64, 1, 2), + (1024, 1024, 2048, 64, 64): (2, 16, 64, 64, 2, 2), + (1024, 1024, 2048, 128, 128): (2, 32, 64, 64, 1, 4), + (1024, 1024, 4096, 16, 16): (2, 16, 16, 128, 1, 2), + (1024, 1024, 4096, 32, 32): (1, 16, 32, 64, 3, 2), + (1024, 1024, 4096, 64, 64): (1, 1, 64, 64, 3, 4), + (1024, 1024, 4096, 128, 128): (2, 64, 128, 64, 1, 4), + (1024, 1024, 8192, 16, 16): (2, 16, 16, 128, 1, 2), + (1024, 1024, 8192, 32, 32): (1, 16, 32, 64, 3, 2), + (1024, 1024, 8192, 64, 64): (1, 1, 64, 64, 3, 4), + (1024, 1024, 8192, 128, 128): (2, 1, 64, 64, 1, 4), + (1024, 1024, 16384, 16, 16): (1, 2, 16, 128, 1, 2), + (1024, 1024, 16384, 32, 32): (1, 16, 32, 64, 3, 2), + (1024, 1024, 16384, 64, 64): (1, 1, 64, 64, 3, 4), + (1024, 1024, 16384, 128, 128): (2, 16, 128, 64, 1, 4), + (1024, 1024, 32768, 16, 16): (1, 1, 16, 128, 1, 2), + (1024, 1024, 32768, 32, 32): (1, 1, 32, 128, 1, 2), + (1024, 1024, 32768, 64, 64): (1, 32, 64, 32, 2, 1), + (1024, 1024, 32768, 128, 128): (2, 8, 128, 64, 1, 4), + (1024, 1024, 65536, 16, 16): (3, 2, 16, 128, 1, 2), + (1024, 1024, 65536, 32, 32): (1, 1, 32, 128, 1, 2), + (1024, 1024, 65536, 64, 64): (2, 4, 64, 32, 2, 1), + (1024, 1024, 65536, 128, 128): (2, 8, 128, 64, 1, 4), + (1024, 1024, 131072, 16, 16): (2, 1, 16, 128, 1, 2), + (1024, 1024, 131072, 32, 32): (1, 1, 32, 128, 1, 2), + (1024, 1024, 131072, 64, 64): (1, 4, 64, 32, 2, 1), + (1024, 1024, 131072, 128, 128): (4, 1, 128, 64, 1, 4), + (2048, 2048, 256, 16, 16): (1, 1, 16, 64, 1, 8), + (2048, 2048, 256, 32, 32): (1, 1, 32, 32, 3, 1), + (2048, 2048, 256, 64, 64): (1, 1, 32, 32, 2, 1), + (2048, 2048, 256, 128, 128): (1, 4, 64, 64, 1, 8), + (2048, 2048, 512, 16, 16): (1, 2, 16, 64, 1, 2), + (2048, 2048, 512, 32, 32): (1, 2, 32, 64, 1, 4), + (2048, 2048, 512, 64, 64): (1, 4, 64, 64, 1, 8), + (2048, 2048, 512, 128, 128): (1, 4, 64, 64, 1, 4), + (2048, 2048, 1024, 16, 16): (1, 2, 16, 128, 1, 2), + (2048, 2048, 1024, 32, 32): (1, 1, 32, 64, 1, 2), + (2048, 2048, 
1024, 64, 64): (1, 8, 64, 64, 1, 4), + (2048, 2048, 1024, 128, 128): (1, 8, 128, 64, 1, 4), + (2048, 2048, 2048, 16, 16): (3, 4, 16, 128, 1, 2), + (2048, 2048, 2048, 32, 32): (1, 16, 32, 64, 5, 2), + (2048, 2048, 2048, 64, 64): (1, 1, 64, 64, 3, 4), + (2048, 2048, 2048, 128, 128): (1, 8, 128, 64, 1, 4), + (2048, 2048, 4096, 16, 16): (1, 2, 16, 128, 1, 2), + (2048, 2048, 4096, 32, 32): (1, 8, 32, 64, 3, 2), + (2048, 2048, 4096, 64, 64): (1, 1, 64, 64, 3, 4), + (2048, 2048, 4096, 128, 128): (1, 8, 128, 64, 1, 4), + (2048, 2048, 8192, 16, 16): (2, 4, 16, 128, 1, 2), + (2048, 2048, 8192, 32, 32): (1, 4, 32, 128, 3, 2), + (2048, 2048, 8192, 64, 64): (1, 8, 64, 64, 3, 2), + (2048, 2048, 8192, 128, 128): (1, 8, 128, 64, 1, 4), + (2048, 2048, 16384, 16, 16): (1, 2, 16, 128, 1, 2), + (2048, 2048, 16384, 32, 32): (1, 4, 32, 128, 3, 2), + (2048, 2048, 16384, 64, 64): (1, 8, 64, 64, 3, 2), + (2048, 2048, 16384, 128, 128): (1, 4, 128, 64, 1, 4), + (2048, 2048, 32768, 16, 16): (3, 2, 16, 128, 1, 2), + (2048, 2048, 32768, 32, 32): (1, 1, 32, 128, 3, 2), + (2048, 2048, 32768, 64, 64): (1, 1, 64, 64, 3, 2), + (2048, 2048, 32768, 128, 128): (1, 4, 128, 64, 1, 4), + (2048, 2048, 65536, 16, 16): (1, 2, 16, 128, 1, 2), + (2048, 2048, 65536, 32, 32): (1, 4, 32, 128, 1, 2), + (2048, 2048, 65536, 64, 64): (1, 1, 64, 64, 3, 2), + (2048, 2048, 65536, 128, 128): (1, 2, 128, 64, 1, 4), + (2048, 2048, 131072, 16, 16): (4, 2, 16, 128, 1, 2), + (2048, 2048, 131072, 32, 32): (1, 1, 32, 128, 3, 2), + (2048, 2048, 131072, 64, 64): (1, 1, 64, 64, 3, 2), + (2048, 2048, 131072, 128, 128): (1, 2, 128, 64, 1, 4), + (4096, 4096, 256, 16, 16): (1, 1, 16, 64, 1, 2), + (4096, 4096, 256, 32, 32): (1, 1, 32, 64, 3, 4), + (4096, 4096, 256, 64, 64): (1, 1, 64, 64, 3, 4), + (4096, 4096, 256, 128, 128): (3, 4, 128, 32, 1, 4), + (4096, 4096, 512, 16, 16): (1, 2, 16, 128, 1, 2), + (4096, 4096, 512, 32, 32): (1, 2, 32, 64, 3, 2), + (4096, 4096, 512, 64, 64): (1, 4, 64, 64, 1, 4), + (4096, 4096, 512, 128, 128): (1, 4, 128, 64, 1, 4), + (4096, 4096, 1024, 16, 16): (1, 2, 16, 128, 1, 2), + (4096, 4096, 1024, 32, 32): (1, 8, 32, 64, 3, 2), + (4096, 4096, 1024, 64, 64): (1, 4, 64, 64, 1, 4), + (4096, 4096, 1024, 128, 128): (2, 4, 128, 64, 1, 4), + (4096, 4096, 2048, 16, 16): (1, 1, 16, 128, 1, 2), + (4096, 4096, 2048, 32, 32): (1, 4, 32, 128, 1, 4), + (4096, 4096, 2048, 64, 64): (1, 1, 64, 64, 3, 4), + (4096, 4096, 2048, 128, 128): (1, 16, 128, 64, 1, 4), + (4096, 4096, 4096, 16, 16): (1, 1, 16, 64, 3, 1), + (4096, 4096, 4096, 32, 32): (1, 4, 32, 64, 3, 2), + (4096, 4096, 4096, 64, 64): (1, 1, 64, 64, 3, 4), + (4096, 4096, 4096, 128, 128): (5, 1, 128, 64, 1, 4), + (4096, 4096, 8192, 16, 16): (1, 1, 16, 128, 1, 2), + (4096, 4096, 8192, 32, 32): (1, 1, 32, 128, 3, 2), + (4096, 4096, 8192, 64, 64): (1, 1, 64, 64, 3, 4), + (4096, 4096, 8192, 128, 128): (2, 1, 128, 64, 1, 4), + (4096, 4096, 16384, 16, 16): (1, 1, 16, 128, 1, 2), + (4096, 4096, 16384, 32, 32): (1, 1, 32, 128, 3, 2), + (4096, 4096, 16384, 64, 64): (1, 1, 64, 64, 4, 4), + (4096, 4096, 16384, 128, 128): (2, 1, 128, 64, 1, 4), + (4096, 4096, 32768, 16, 16): (3, 1, 16, 128, 1, 2), + (4096, 4096, 32768, 32, 32): (1, 1, 32, 128, 3, 2), + (4096, 4096, 32768, 64, 64): (1, 1, 64, 64, 3, 4), + (4096, 4096, 32768, 128, 128): (2, 1, 128, 64, 1, 4), + (4096, 4096, 65536, 16, 16): (2, 2, 16, 128, 1, 2), + (4096, 4096, 65536, 32, 32): (1, 1, 32, 128, 4, 2), + (4096, 4096, 65536, 64, 64): (1, 1, 64, 64, 4, 4), + (4096, 4096, 65536, 128, 128): (2, 1, 128, 64, 1, 4), + (4096, 4096, 131072, 16, 16): (2, 
1, 16, 128, 1, 2), + (4096, 4096, 131072, 32, 32): (1, 1, 32, 128, 3, 2), + (4096, 4096, 131072, 64, 64): (1, 1, 64, 64, 3, 4), + (4096, 4096, 131072, 128, 128): (2, 1, 128, 64, 1, 4), + (8192, 8192, 256, 16, 16): (1, 2, 16, 64, 1, 2), + (8192, 8192, 256, 32, 32): (1, 1, 32, 64, 1, 2), + (8192, 8192, 256, 64, 64): (1, 2, 64, 64, 1, 4), + (8192, 8192, 256, 128, 128): (3, 16, 128, 16, 1, 2), + (8192, 8192, 512, 16, 16): (1, 2, 16, 128, 1, 2), + (8192, 8192, 512, 32, 32): (1, 4, 32, 64, 3, 2), + (8192, 8192, 512, 64, 64): (2, 8, 64, 64, 4, 4), + (8192, 8192, 512, 128, 128): (1, 8, 128, 64, 1, 4), + (8192, 8192, 1024, 16, 16): (4, 2, 16, 128, 1, 2), + (8192, 8192, 1024, 32, 32): (1, 8, 32, 128, 1, 2), + (8192, 8192, 1024, 64, 64): (1, 16, 64, 64, 3, 2), + (8192, 8192, 1024, 128, 128): (2, 16, 128, 64, 2, 4), + (8192, 8192, 2048, 16, 16): (2, 1, 16, 64, 4, 1), + (8192, 8192, 2048, 32, 32): (1, 16, 32, 64, 5, 2), + (8192, 8192, 2048, 64, 64): (1, 16, 64, 64, 3, 2), + (8192, 8192, 2048, 128, 128): (2, 16, 128, 64, 2, 4), + (8192, 8192, 4096, 16, 16): (1, 1, 16, 64, 4, 1), + (8192, 8192, 4096, 32, 32): (1, 16, 32, 64, 5, 2), + (8192, 8192, 4096, 64, 64): (1, 16, 64, 64, 3, 2), + (8192, 8192, 4096, 128, 128): (2, 64, 128, 64, 2, 4), + (8192, 8192, 8192, 16, 16): (1, 1, 16, 64, 4, 1), + (8192, 8192, 8192, 32, 32): (1, 8, 32, 128, 5, 4), + (8192, 8192, 8192, 64, 64): (1, 8, 64, 64, 3, 2), + (8192, 8192, 8192, 128, 128): (2, 8, 128, 64, 1, 4), + (8192, 8192, 16384, 16, 16): (1, 1, 16, 64, 4, 1), + (8192, 8192, 16384, 32, 32): (1, 8, 32, 64, 5, 2), + (8192, 8192, 16384, 64, 64): (1, 8, 64, 64, 3, 2), + (8192, 8192, 16384, 128, 128): (1, 8, 128, 64, 1, 4), + (8192, 8192, 32768, 16, 16): (1, 1, 16, 64, 4, 1), + (8192, 8192, 32768, 32, 32): (1, 8, 32, 64, 5, 2), + (8192, 8192, 32768, 64, 64): (3, 8, 64, 64, 3, 2), + (8192, 8192, 32768, 128, 128): (2, 8, 128, 64, 1, 4), + (8192, 8192, 65536, 16, 16): (1, 1, 16, 64, 4, 1), + (8192, 8192, 65536, 32, 32): (5, 4, 32, 64, 3, 2), + (8192, 8192, 65536, 64, 64): (1, 8, 64, 64, 3, 2), + (8192, 8192, 65536, 128, 128): (2, 8, 128, 64, 1, 4), + (8192, 8192, 131072, 16, 16): (2, 1, 16, 64, 4, 1), + (8192, 8192, 131072, 32, 32): (1, 4, 32, 64, 5, 2), + (8192, 8192, 131072, 64, 64): (1, 4, 64, 128, 3, 4), + (8192, 8192, 131072, 128, 128): (2, 8, 128, 64, 1, 4), + (16384, 16384, 256, 16, 16): (1, 2, 16, 128, 1, 2), + (16384, 16384, 256, 32, 32): (1, 4, 32, 64, 3, 2), + (16384, 16384, 256, 64, 64): (2, 4, 64, 64, 4, 4), + (16384, 16384, 256, 128, 128): (1, 4, 128, 64, 1, 16), + (16384, 16384, 512, 16, 16): (1, 2, 16, 128, 3, 2), + (16384, 16384, 512, 32, 32): (1, 4, 32, 128, 5, 4), + (16384, 16384, 512, 64, 64): (1, 8, 64, 64, 3, 2), + (16384, 16384, 512, 128, 128): (2, 8, 128, 64, 1, 4), + (16384, 16384, 1024, 16, 16): (1, 2, 16, 128, 1, 2), + (16384, 16384, 1024, 32, 32): (1, 8, 32, 64, 5, 2), + (16384, 16384, 1024, 64, 64): (1, 16, 64, 64, 3, 2), + (16384, 16384, 1024, 128, 128): (5, 16, 128, 64, 2, 4), + (16384, 16384, 2048, 16, 16): (1, 2, 16, 128, 1, 2), + (16384, 16384, 2048, 32, 32): (1, 8, 32, 64, 5, 2), + (16384, 16384, 2048, 64, 64): (1, 16, 64, 64, 3, 2), + (16384, 16384, 2048, 128, 128): (4, 32, 128, 64, 2, 4), + (16384, 16384, 4096, 16, 16): (3, 2, 16, 128, 1, 2), + (16384, 16384, 4096, 32, 32): (1, 4, 32, 64, 5, 2), + (16384, 16384, 4096, 64, 64): (2, 16, 64, 64, 3, 2), + (16384, 16384, 4096, 128, 128): (3, 32, 128, 64, 2, 4), + (16384, 16384, 8192, 16, 16): (1, 2, 16, 128, 1, 2), + (16384, 16384, 8192, 32, 32): (1, 4, 32, 64, 5, 2), + (16384, 16384, 8192, 
64, 64): (4, 8, 64, 64, 3, 2), + (16384, 16384, 8192, 128, 128): (5, 8, 128, 64, 1, 4), + (16384, 16384, 16384, 16, 16): (1, 2, 16, 128, 1, 2), + (16384, 16384, 16384, 32, 32): (1, 4, 32, 64, 5, 2), + (16384, 16384, 16384, 64, 64): (2, 4, 64, 128, 3, 4), + (16384, 16384, 16384, 128, 128): (4, 8, 128, 64, 1, 4), + (16384, 16384, 32768, 16, 16): (4, 2, 16, 128, 1, 2), + (16384, 16384, 32768, 32, 32): (1, 4, 32, 64, 5, 2), + (16384, 16384, 32768, 64, 64): (1, 8, 64, 64, 3, 2), + (16384, 16384, 32768, 128, 128): (2, 512, 128, 64, 2, 4), + (16384, 16384, 65536, 16, 16): (3, 2, 16, 128, 1, 2), + (16384, 16384, 65536, 32, 32): (1, 4, 32, 64, 5, 2), + (16384, 16384, 65536, 64, 64): (1, 4, 64, 128, 3, 4), + (16384, 16384, 65536, 128, 128): (2, 1024, 128, 64, 2, 4), + (16384, 16384, 131072, 16, 16): (1, 2, 16, 128, 1, 2), + (16384, 16384, 131072, 32, 32): (1, 4, 32, 64, 5, 2), + (16384, 16384, 131072, 64, 64): (3, 4, 64, 128, 3, 4), + (16384, 16384, 131072, 128, 128): (4, 2048, 128, 64, 2, 4), + }, + ("scatter_mm", "NVIDIA A100-SXM4-80GB", (0, torch.float16, 0.5)): { + (256, 256, 256, 16, 16): (5, 4, 16, 16, 1, 4), + (256, 256, 256, 32, 32): (5, 2, 32, 16, 1, 4), + (256, 256, 256, 64, 64): (4, 1, 32, 32, 1, 8), + (256, 256, 256, 128, 128): (2, 1, 32, 32, 1, 4), + (256, 256, 512, 16, 16): (2, 2, 16, 32, 1, 4), + (256, 256, 512, 32, 32): (4, 8, 32, 32, 1, 8), + (256, 256, 512, 64, 64): (4, 8, 32, 64, 1, 4), + (256, 256, 512, 128, 128): (4, 8, 32, 64, 1, 4), + (256, 256, 1024, 16, 16): (4, 2, 16, 64, 1, 2), + (256, 256, 1024, 32, 32): (4, 16, 32, 64, 1, 2), + (256, 256, 1024, 64, 64): (4, 16, 32, 64, 1, 4), + (256, 256, 1024, 128, 128): (4, 16, 64, 64, 1, 8), + (256, 256, 2048, 16, 16): (2, 16, 16, 64, 1, 8), + (256, 256, 2048, 32, 32): (4, 16, 32, 64, 1, 2), + (256, 256, 2048, 64, 64): (4, 16, 32, 64, 1, 4), + (256, 256, 2048, 128, 128): (4, 16, 64, 64, 1, 4), + (256, 256, 4096, 16, 16): (4, 32, 16, 64, 1, 1), + (256, 256, 4096, 32, 32): (2, 64, 32, 64, 1, 2), + (256, 256, 4096, 64, 64): (4, 64, 64, 64, 1, 4), + (256, 256, 4096, 128, 128): (4, 32, 64, 64, 1, 4), + (256, 256, 8192, 16, 16): (4, 64, 16, 64, 1, 1), + (256, 256, 8192, 32, 32): (4, 128, 32, 64, 1, 2), + (256, 256, 8192, 64, 64): (4, 64, 64, 64, 1, 4), + (256, 256, 8192, 128, 128): (4, 64, 64, 64, 1, 4), + (256, 256, 16384, 16, 16): (4, 128, 16, 64, 1, 1), + (256, 256, 16384, 32, 32): (2, 128, 32, 64, 1, 2), + (256, 256, 16384, 64, 64): (4, 32, 32, 128, 1, 4), + (256, 256, 16384, 128, 128): (4, 16, 64, 64, 1, 4), + (256, 256, 32768, 16, 16): (4, 64, 16, 64, 1, 1), + (256, 256, 32768, 32, 32): (2, 256, 32, 64, 1, 2), + (256, 256, 32768, 64, 64): (4, 32, 32, 128, 1, 4), + (256, 256, 32768, 128, 128): (4, 32, 64, 64, 1, 4), + (256, 256, 65536, 16, 16): (4, 128, 16, 64, 1, 1), + (256, 256, 65536, 32, 32): (4, 1, 32, 64, 1, 2), + (256, 256, 65536, 64, 64): (2, 1, 64, 64, 1, 2), + (256, 256, 65536, 128, 128): (4, 32, 64, 64, 1, 4), + (256, 256, 131072, 16, 16): (4, 64, 16, 64, 1, 1), + (256, 256, 131072, 32, 32): (2, 1, 32, 64, 1, 2), + (256, 256, 131072, 64, 64): (4, 32, 32, 128, 1, 4), + (256, 256, 131072, 128, 128): (4, 32, 64, 64, 1, 4), + (512, 512, 256, 16, 16): (4, 16, 16, 16, 1, 4), + (512, 512, 256, 32, 32): (2, 4, 32, 16, 1, 4), + (512, 512, 256, 64, 64): (2, 16, 64, 16, 3, 8), + (512, 512, 256, 128, 128): (4, 16, 64, 16, 1, 4), + (512, 512, 512, 16, 16): (1, 1, 16, 64, 1, 8), + (512, 512, 512, 32, 32): (2, 4, 16, 32, 1, 1), + (512, 512, 512, 64, 64): (2, 1, 32, 32, 1, 2), + (512, 512, 512, 128, 128): (4, 8, 32, 64, 1, 4), + (512, 512, 
1024, 16, 16): (2, 8, 16, 64, 1, 8), + (512, 512, 1024, 32, 32): (4, 16, 32, 64, 1, 2), + (512, 512, 1024, 64, 64): (4, 16, 64, 64, 1, 4), + (512, 512, 1024, 128, 128): (2, 8, 64, 64, 1, 4), + (512, 512, 2048, 16, 16): (4, 16, 16, 64, 1, 4), + (512, 512, 2048, 32, 32): (4, 16, 32, 64, 1, 2), + (512, 512, 2048, 64, 64): (4, 16, 64, 64, 1, 8), + (512, 512, 2048, 128, 128): (4, 16, 64, 64, 1, 4), + (512, 512, 4096, 16, 16): (4, 32, 16, 128, 1, 2), + (512, 512, 4096, 32, 32): (4, 32, 32, 64, 1, 2), + (512, 512, 4096, 64, 64): (4, 32, 64, 64, 1, 4), + (512, 512, 4096, 128, 128): (4, 32, 64, 64, 1, 4), + (512, 512, 8192, 16, 16): (2, 32, 16, 128, 1, 2), + (512, 512, 8192, 32, 32): (4, 64, 32, 64, 1, 2), + (512, 512, 8192, 64, 64): (4, 128, 64, 64, 1, 2), + (512, 512, 8192, 128, 128): (4, 64, 64, 64, 1, 4), + (512, 512, 16384, 16, 16): (4, 32, 16, 64, 1, 1), + (512, 512, 16384, 32, 32): (4, 64, 32, 64, 1, 2), + (512, 512, 16384, 64, 64): (4, 16, 64, 64, 1, 4), + (512, 512, 16384, 128, 128): (4, 32, 64, 64, 1, 4), + (512, 512, 32768, 16, 16): (7, 16, 16, 128, 1, 2), + (512, 512, 32768, 32, 32): (4, 64, 32, 64, 1, 2), + (512, 512, 32768, 64, 64): (2, 32, 64, 64, 3, 2), + (512, 512, 32768, 128, 128): (2, 32, 64, 64, 1, 4), + (512, 512, 65536, 16, 16): (2, 32, 16, 64, 1, 1), + (512, 512, 65536, 32, 32): (4, 64, 32, 64, 1, 2), + (512, 512, 65536, 64, 64): (3, 32, 64, 64, 3, 2), + (512, 512, 65536, 128, 128): (4, 16, 64, 64, 1, 4), + (512, 512, 131072, 16, 16): (3, 32, 16, 128, 1, 2), + (512, 512, 131072, 32, 32): (4, 64, 32, 64, 1, 2), + (512, 512, 131072, 64, 64): (2, 32, 64, 64, 3, 2), + (512, 512, 131072, 128, 128): (3, 1, 64, 64, 1, 4), + (1024, 1024, 256, 16, 16): (4, 16, 16, 16, 1, 4), + (1024, 1024, 256, 32, 32): (4, 16, 32, 16, 1, 4), + (1024, 1024, 256, 64, 64): (4, 4, 64, 32, 1, 16), + (1024, 1024, 256, 128, 128): (4, 16, 64, 16, 1, 8), + (1024, 1024, 512, 16, 16): (2, 8, 16, 64, 1, 8), + (1024, 1024, 512, 32, 32): (3, 2, 32, 64, 1, 2), + (1024, 1024, 512, 64, 64): (4, 8, 32, 64, 1, 8), + (1024, 1024, 512, 128, 128): (4, 8, 64, 64, 1, 8), + (1024, 1024, 1024, 16, 16): (2, 2, 16, 64, 1, 2), + (1024, 1024, 1024, 32, 32): (2, 8, 32, 64, 1, 2), + (1024, 1024, 1024, 64, 64): (2, 8, 32, 128, 1, 4), + (1024, 1024, 1024, 128, 128): (2, 8, 64, 64, 1, 4), + (1024, 1024, 2048, 16, 16): (2, 16, 16, 128, 3, 2), + (1024, 1024, 2048, 32, 32): (4, 32, 32, 64, 1, 2), + (1024, 1024, 2048, 64, 64): (4, 16, 64, 64, 1, 4), + (1024, 1024, 2048, 128, 128): (4, 32, 64, 64, 1, 4), + (1024, 1024, 4096, 16, 16): (4, 16, 16, 128, 1, 2), + (1024, 1024, 4096, 32, 32): (3, 32, 32, 64, 1, 2), + (1024, 1024, 4096, 64, 64): (4, 32, 64, 64, 1, 4), + (1024, 1024, 4096, 128, 128): (4, 32, 64, 64, 1, 4), + (1024, 1024, 8192, 16, 16): (5, 16, 16, 128, 1, 2), + (1024, 1024, 8192, 32, 32): (2, 32, 32, 64, 3, 2), + (1024, 1024, 8192, 64, 64): (1, 16, 64, 64, 3, 2), + (1024, 1024, 8192, 128, 128): (4, 32, 64, 64, 1, 4), + (1024, 1024, 16384, 16, 16): (4, 16, 16, 128, 1, 2), + (1024, 1024, 16384, 32, 32): (1, 32, 32, 64, 3, 2), + (1024, 1024, 16384, 64, 64): (4, 16, 64, 64, 3, 2), + (1024, 1024, 16384, 128, 128): (4, 32, 128, 64, 1, 4), + (1024, 1024, 32768, 16, 16): (3, 16, 16, 128, 1, 2), + (1024, 1024, 32768, 32, 32): (1, 8, 32, 64, 3, 2), + (1024, 1024, 32768, 64, 64): (4, 16, 64, 64, 3, 2), + (1024, 1024, 32768, 128, 128): (4, 8, 128, 64, 2, 4), + (1024, 1024, 65536, 16, 16): (1, 2, 16, 128, 1, 2), + (1024, 1024, 65536, 32, 32): (2, 4, 32, 64, 3, 2), + (1024, 1024, 65536, 64, 64): (5, 16, 64, 64, 3, 2), + (1024, 1024, 65536, 128, 
128): (5, 8, 128, 64, 2, 4), + (1024, 1024, 131072, 16, 16): (5, 2, 16, 128, 1, 2), + (1024, 1024, 131072, 32, 32): (1, 2, 32, 64, 3, 2), + (1024, 1024, 131072, 64, 64): (5, 16, 64, 64, 3, 2), + (1024, 1024, 131072, 128, 128): (2, 1, 128, 64, 2, 4), + (2048, 2048, 256, 16, 16): (4, 4, 16, 64, 1, 8), + (2048, 2048, 256, 32, 32): (4, 8, 32, 32, 1, 8), + (2048, 2048, 256, 64, 64): (4, 16, 64, 16, 1, 8), + (2048, 2048, 256, 128, 128): (4, 4, 128, 32, 3, 8), + (2048, 2048, 512, 16, 16): (2, 2, 16, 64, 1, 2), + (2048, 2048, 512, 32, 32): (2, 4, 32, 64, 3, 2), + (2048, 2048, 512, 64, 64): (4, 4, 64, 64, 1, 8), + (2048, 2048, 512, 128, 128): (4, 8, 64, 64, 1, 4), + (2048, 2048, 1024, 16, 16): (1, 8, 16, 64, 1, 2), + (2048, 2048, 1024, 32, 32): (2, 16, 32, 64, 3, 2), + (2048, 2048, 1024, 64, 64): (4, 8, 64, 64, 1, 4), + (2048, 2048, 1024, 128, 128): (4, 8, 128, 64, 1, 4), + (2048, 2048, 2048, 16, 16): (5, 4, 16, 128, 1, 2), + (2048, 2048, 2048, 32, 32): (1, 16, 32, 64, 3, 2), + (2048, 2048, 2048, 64, 64): (2, 8, 64, 64, 1, 4), + (2048, 2048, 2048, 128, 128): (2, 8, 128, 64, 1, 4), + (2048, 2048, 4096, 16, 16): (4, 2, 16, 128, 1, 2), + (2048, 2048, 4096, 32, 32): (2, 16, 32, 64, 3, 2), + (2048, 2048, 4096, 64, 64): (2, 8, 64, 64, 3, 2), + (2048, 2048, 4096, 128, 128): (4, 8, 128, 64, 1, 4), + (2048, 2048, 8192, 16, 16): (5, 4, 16, 128, 1, 2), + (2048, 2048, 8192, 32, 32): (2, 8, 32, 64, 3, 2), + (2048, 2048, 8192, 64, 64): (4, 8, 64, 64, 3, 2), + (2048, 2048, 8192, 128, 128): (4, 8, 128, 64, 1, 4), + (2048, 2048, 16384, 16, 16): (3, 2, 16, 128, 1, 2), + (2048, 2048, 16384, 32, 32): (2, 4, 32, 128, 3, 2), + (2048, 2048, 16384, 64, 64): (4, 8, 64, 64, 3, 2), + (2048, 2048, 16384, 128, 128): (4, 4, 128, 64, 1, 4), + (2048, 2048, 32768, 16, 16): (3, 2, 16, 128, 1, 2), + (2048, 2048, 32768, 32, 32): (3, 4, 32, 128, 3, 2), + (2048, 2048, 32768, 64, 64): (6, 4, 64, 64, 3, 2), + (2048, 2048, 32768, 128, 128): (3, 4, 128, 64, 1, 4), + (2048, 2048, 65536, 16, 16): (6, 2, 16, 128, 1, 2), + (2048, 2048, 65536, 32, 32): (1, 2, 32, 128, 1, 2), + (2048, 2048, 65536, 64, 64): (5, 4, 64, 64, 3, 2), + (2048, 2048, 65536, 128, 128): (5, 1, 128, 64, 2, 4), + (2048, 2048, 131072, 16, 16): (3, 2, 16, 128, 1, 2), + (2048, 2048, 131072, 32, 32): (2, 1, 32, 128, 3, 2), + (2048, 2048, 131072, 64, 64): (4, 1, 64, 64, 3, 2), + (2048, 2048, 131072, 128, 128): (3, 1, 128, 64, 2, 4), + (4096, 4096, 256, 16, 16): (5, 8, 16, 32, 1, 4), + (4096, 4096, 256, 32, 32): (4, 16, 32, 16, 2, 4), + (4096, 4096, 256, 64, 64): (2, 1, 64, 64, 3, 4), + (4096, 4096, 256, 128, 128): (4, 4, 128, 32, 1, 4), + (4096, 4096, 512, 16, 16): (4, 2, 16, 128, 1, 2), + (4096, 4096, 512, 32, 32): (4, 8, 32, 64, 1, 2), + (4096, 4096, 512, 64, 64): (4, 4, 64, 64, 1, 4), + (4096, 4096, 512, 128, 128): (4, 8, 128, 64, 2, 4), + (4096, 4096, 1024, 16, 16): (1, 2, 16, 128, 1, 2), + (4096, 4096, 1024, 32, 32): (6, 8, 32, 64, 3, 2), + (4096, 4096, 1024, 64, 64): (2, 16, 64, 64, 4, 4), + (4096, 4096, 1024, 128, 128): (2, 4, 128, 64, 2, 4), + (4096, 4096, 2048, 16, 16): (3, 1, 16, 128, 1, 2), + (4096, 4096, 2048, 32, 32): (1, 4, 32, 64, 5, 2), + (4096, 4096, 2048, 64, 64): (3, 16, 64, 64, 3, 2), + (4096, 4096, 2048, 128, 128): (4, 32, 128, 64, 2, 4), + (4096, 4096, 4096, 16, 16): (1, 2, 16, 128, 1, 2), + (4096, 4096, 4096, 32, 32): (1, 4, 32, 64, 3, 2), + (4096, 4096, 4096, 64, 64): (1, 1, 64, 64, 4, 4), + (4096, 4096, 4096, 128, 128): (2, 1, 128, 128, 1, 8), + (4096, 4096, 8192, 16, 16): (3, 1, 16, 128, 1, 2), + (4096, 4096, 8192, 32, 32): (2, 2, 32, 64, 5, 2), + 
(4096, 4096, 8192, 64, 64): (4, 16, 64, 64, 3, 2), + (4096, 4096, 8192, 128, 128): (4, 16, 128, 64, 2, 4), + (4096, 4096, 16384, 16, 16): (1, 2, 16, 128, 1, 2), + (4096, 4096, 16384, 32, 32): (4, 2, 32, 64, 5, 2), + (4096, 4096, 16384, 64, 64): (4, 16, 64, 64, 3, 2), + (4096, 4096, 16384, 128, 128): (4, 16, 128, 64, 2, 4), + (4096, 4096, 32768, 16, 16): (3, 1, 16, 128, 1, 2), + (4096, 4096, 32768, 32, 32): (3, 1, 32, 128, 1, 4), + (4096, 4096, 32768, 64, 64): (3, 1, 64, 64, 3, 4), + (4096, 4096, 32768, 128, 128): (5, 16, 128, 64, 2, 4), + (4096, 4096, 65536, 16, 16): (5, 1, 16, 128, 1, 2), + (4096, 4096, 65536, 32, 32): (5, 1, 32, 128, 1, 4), + (4096, 4096, 65536, 64, 64): (1, 1, 64, 64, 3, 4), + (4096, 4096, 65536, 128, 128): (3, 16, 128, 64, 2, 4), + (4096, 4096, 131072, 16, 16): (3, 1, 16, 128, 1, 2), + (4096, 4096, 131072, 32, 32): (3, 1, 32, 128, 3, 2), + (4096, 4096, 131072, 64, 64): (2, 1, 64, 64, 3, 4), + (4096, 4096, 131072, 128, 128): (1, 1, 128, 64, 1, 4), + (8192, 8192, 256, 16, 16): (4, 16, 16, 16, 1, 4), + (8192, 8192, 256, 32, 32): (1, 16, 32, 16, 4, 4), + (8192, 8192, 256, 64, 64): (4, 16, 64, 16, 3, 8), + (8192, 8192, 256, 128, 128): (4, 16, 128, 16, 1, 2), + (8192, 8192, 512, 16, 16): (2, 8, 16, 64, 1, 4), + (8192, 8192, 512, 32, 32): (4, 8, 32, 64, 3, 2), + (8192, 8192, 512, 64, 64): (2, 8, 64, 64, 4, 4), + (8192, 8192, 512, 128, 128): (4, 8, 128, 64, 2, 4), + (8192, 8192, 1024, 16, 16): (4, 16, 16, 64, 1, 8), + (8192, 8192, 1024, 32, 32): (2, 8, 32, 64, 5, 2), + (8192, 8192, 1024, 64, 64): (1, 16, 64, 64, 3, 2), + (8192, 8192, 1024, 128, 128): (5, 16, 128, 64, 2, 4), + (8192, 8192, 2048, 16, 16): (7, 2, 16, 128, 1, 2), + (8192, 8192, 2048, 32, 32): (1, 16, 32, 64, 5, 2), + (8192, 8192, 2048, 64, 64): (4, 16, 64, 64, 3, 2), + (8192, 8192, 2048, 128, 128): (6, 16, 128, 64, 2, 4), + (8192, 8192, 4096, 16, 16): (4, 2, 16, 128, 1, 2), + (8192, 8192, 4096, 32, 32): (2, 8, 32, 64, 5, 2), + (8192, 8192, 4096, 64, 64): (3, 16, 64, 64, 3, 2), + (8192, 8192, 4096, 128, 128): (3, 64, 128, 64, 2, 4), + (8192, 8192, 8192, 16, 16): (4, 2, 16, 128, 1, 2), + (8192, 8192, 8192, 32, 32): (1, 4, 32, 128, 5, 4), + (8192, 8192, 8192, 64, 64): (4, 4, 64, 64, 1, 4), + (8192, 8192, 8192, 128, 128): (2, 2, 128, 128, 3, 8), + (8192, 8192, 16384, 16, 16): (1, 2, 16, 128, 1, 2), + (8192, 8192, 16384, 32, 32): (4, 8, 32, 64, 5, 2), + (8192, 8192, 16384, 64, 64): (5, 8, 64, 64, 3, 2), + (8192, 8192, 16384, 128, 128): (3, 16, 128, 64, 2, 4), + (8192, 8192, 32768, 16, 16): (7, 2, 16, 128, 1, 2), + (8192, 8192, 32768, 32, 32): (3, 4, 32, 64, 3, 2), + (8192, 8192, 32768, 64, 64): (2, 8, 64, 64, 3, 2), + (8192, 8192, 32768, 128, 128): (6, 16, 128, 64, 2, 4), + (8192, 8192, 65536, 16, 16): (9, 2, 16, 128, 1, 2), + (8192, 8192, 65536, 32, 32): (7, 4, 32, 64, 5, 2), + (8192, 8192, 65536, 64, 64): (4, 8, 64, 64, 3, 2), + (8192, 8192, 65536, 128, 128): (3, 16, 128, 64, 2, 4), + (8192, 8192, 131072, 16, 16): (9, 2, 16, 128, 1, 2), + (8192, 8192, 131072, 32, 32): (1, 8, 32, 64, 5, 2), + (8192, 8192, 131072, 64, 64): (1, 8, 64, 64, 3, 2), + (8192, 8192, 131072, 128, 128): (4, 16, 128, 64, 2, 4), + (16384, 16384, 256, 16, 16): (5, 16, 16, 16, 1, 4), + (16384, 16384, 256, 32, 32): (4, 16, 32, 16, 4, 4), + (16384, 16384, 256, 64, 64): (4, 16, 64, 16, 3, 8), + (16384, 16384, 256, 128, 128): (4, 16, 128, 16, 1, 2), + (16384, 16384, 512, 16, 16): (2, 8, 16, 64, 1, 4), + (16384, 16384, 512, 32, 32): (1, 4, 32, 64, 5, 2), + (16384, 16384, 512, 64, 64): (4, 8, 64, 64, 1, 4), + (16384, 16384, 512, 128, 128): (3, 8, 128, 
64, 2, 4), + (16384, 16384, 1024, 16, 16): (4, 2, 16, 128, 1, 2), + (16384, 16384, 1024, 32, 32): (4, 8, 32, 64, 5, 2), + (16384, 16384, 1024, 64, 64): (6, 16, 64, 64, 3, 2), + (16384, 16384, 1024, 128, 128): (3, 16, 128, 64, 2, 4), + (16384, 16384, 2048, 16, 16): (3, 2, 16, 128, 1, 2), + (16384, 16384, 2048, 32, 32): (1, 8, 32, 64, 5, 2), + (16384, 16384, 2048, 64, 64): (5, 16, 64, 64, 3, 2), + (16384, 16384, 2048, 128, 128): (2, 32, 128, 64, 2, 4), + (16384, 16384, 4096, 16, 16): (2, 2, 16, 128, 1, 2), + (16384, 16384, 4096, 32, 32): (1, 4, 32, 64, 3, 2), + (16384, 16384, 4096, 64, 64): (2, 8, 64, 64, 3, 2), + (16384, 16384, 4096, 128, 128): (3, 16, 128, 64, 2, 4), + (16384, 16384, 8192, 16, 16): (3, 2, 16, 128, 1, 2), + (16384, 16384, 8192, 32, 32): (2, 4, 32, 64, 5, 2), + (16384, 16384, 8192, 64, 64): (4, 8, 64, 64, 3, 2), + (16384, 16384, 8192, 128, 128): (8, 32, 128, 64, 2, 4), + (16384, 16384, 16384, 16, 16): (1, 2, 16, 256, 1, 4), + (16384, 16384, 16384, 32, 32): (1, 4, 32, 128, 3, 4), + (16384, 16384, 16384, 64, 64): (5, 4, 64, 64, 1, 4), + (16384, 16384, 16384, 128, 128): (4, 8, 128, 64, 2, 4), + (16384, 16384, 32768, 16, 16): (2, 2, 16, 128, 1, 2), + (16384, 16384, 32768, 32, 32): (1, 4, 32, 64, 3, 2), + (16384, 16384, 32768, 64, 64): (5, 4, 64, 64, 1, 4), + (16384, 16384, 32768, 128, 128): (5, 8, 128, 64, 2, 4), + (16384, 16384, 65536, 16, 16): (8, 2, 16, 128, 1, 2), + (16384, 16384, 65536, 32, 32): (6, 4, 32, 64, 5, 2), + (16384, 16384, 65536, 64, 64): (2, 4, 64, 64, 1, 4), + (16384, 16384, 65536, 128, 128): (4, 8, 128, 64, 2, 4), + (16384, 16384, 131072, 16, 16): (3, 1, 16, 128, 1, 2), + (16384, 16384, 131072, 32, 32): (1, 4, 32, 64, 3, 2), + (16384, 16384, 131072, 64, 64): (4, 4, 64, 64, 1, 4), + (16384, 16384, 131072, 128, 128): (1, 8, 128, 64, 2, 4), + (32768, 32768, 256, 16, 16): (4, 16, 16, 16, 1, 4), + (32768, 32768, 512, 16, 16): (4, 2, 16, 128, 1, 2), + (32768, 32768, 1024, 16, 16): (3, 2, 16, 128, 1, 2), + (32768, 32768, 2048, 16, 16): (4, 2, 16, 128, 1, 2), + (32768, 32768, 4096, 16, 16): (5, 4, 16, 64, 1, 1), + (32768, 32768, 8192, 16, 16): (4, 4, 16, 64, 1, 1), + (32768, 32768, 16384, 16, 16): (4, 4, 16, 64, 1, 1), + (32768, 32768, 32768, 16, 16): (5, 4, 16, 64, 1, 1), + }, + ("scatter_mm", "NVIDIA A100-SXM4-80GB", (0, torch.float32, 0.5)): { + (256, 256, 256, 16, 16): (1, 1, 16, 16, 1, 8), + (256, 256, 256, 32, 32): (1, 1, 16, 16, 1, 4), + (256, 256, 256, 64, 64): (1, 1, 16, 16, 1, 4), + (256, 256, 256, 128, 128): (1, 1, 16, 16, 1, 1), + (256, 256, 512, 16, 16): (1, 1, 16, 16, 1, 4), + (256, 256, 512, 32, 32): (1, 16, 16, 16, 1, 1), + (256, 256, 512, 64, 64): (1, 1, 16, 16, 1, 1), + (256, 256, 512, 128, 128): (1, 1, 32, 32, 1, 4), + (256, 256, 1024, 16, 16): (1, 1, 16, 32, 1, 2), + (256, 256, 1024, 32, 32): (1, 4, 16, 16, 1, 1), + (256, 256, 1024, 64, 64): (1, 1, 32, 32, 1, 4), + (256, 256, 1024, 128, 128): (1, 1, 32, 32, 1, 4), + (256, 256, 2048, 16, 16): (1, 2, 16, 32, 1, 2), + (256, 256, 2048, 32, 32): (1, 1, 16, 32, 1, 2), + (256, 256, 2048, 64, 64): (2, 1, 16, 32, 1, 2), + (256, 256, 2048, 128, 128): (1, 1, 16, 16, 1, 1), + (256, 256, 4096, 16, 16): (1, 1, 16, 32, 1, 2), + (256, 256, 4096, 32, 32): (1, 1, 16, 32, 1, 2), + (256, 256, 4096, 64, 64): (1, 1, 32, 32, 1, 4), + (256, 256, 4096, 128, 128): (3, 1, 32, 64, 1, 4), + (256, 256, 8192, 16, 16): (1, 32, 16, 64, 1, 2), + (256, 256, 8192, 32, 32): (1, 1, 32, 64, 1, 4), + (256, 256, 8192, 64, 64): (1, 1, 32, 64, 1, 4), + (256, 256, 8192, 128, 128): (2, 1, 64, 32, 1, 4), + (256, 256, 16384, 16, 16): (1, 1, 16, 
64, 1, 2), + (256, 256, 16384, 32, 32): (1, 1, 32, 64, 1, 4), + (256, 256, 16384, 64, 64): (1, 128, 64, 64, 1, 4), + (256, 256, 16384, 128, 128): (2, 1, 64, 32, 1, 4), + (256, 256, 32768, 16, 16): (2, 128, 16, 64, 1, 1), + (256, 256, 32768, 32, 32): (1, 1, 32, 64, 1, 4), + (256, 256, 32768, 64, 64): (1, 128, 64, 64, 1, 4), + (256, 256, 32768, 128, 128): (2, 1, 64, 64, 1, 4), + (256, 256, 65536, 16, 16): (1, 1, 16, 64, 1, 2), + (256, 256, 65536, 32, 32): (1, 1, 32, 64, 1, 4), + (256, 256, 65536, 64, 64): (2, 1, 64, 64, 1, 4), + (256, 256, 65536, 128, 128): (1, 1, 128, 32, 1, 4), + (256, 256, 131072, 16, 16): (3, 128, 16, 64, 1, 1), + (256, 256, 131072, 32, 32): (1, 1, 32, 64, 1, 4), + (256, 256, 131072, 64, 64): (2, 1, 64, 64, 1, 4), + (256, 256, 131072, 128, 128): (1, 8192, 64, 16, 1, 4), + (512, 512, 256, 16, 16): (1, 2, 16, 16, 1, 1), + (512, 512, 256, 32, 32): (1, 4, 16, 16, 1, 1), + (512, 512, 256, 64, 64): (1, 16, 16, 16, 1, 1), + (512, 512, 256, 128, 128): (1, 1, 16, 32, 1, 4), + (512, 512, 512, 16, 16): (1, 8, 16, 32, 1, 2), + (512, 512, 512, 32, 32): (1, 8, 16, 32, 1, 2), + (512, 512, 512, 64, 64): (1, 2, 16, 32, 1, 2), + (512, 512, 512, 128, 128): (1, 1, 32, 32, 1, 4), + (512, 512, 1024, 16, 16): (1, 1, 16, 32, 1, 2), + (512, 512, 1024, 32, 32): (1, 1, 16, 32, 1, 2), + (512, 512, 1024, 64, 64): (1, 1, 16, 32, 1, 2), + (512, 512, 1024, 128, 128): (1, 1, 64, 32, 1, 4), + (512, 512, 2048, 16, 16): (1, 16, 16, 64, 1, 2), + (512, 512, 2048, 32, 32): (1, 1, 32, 32, 1, 4), + (512, 512, 2048, 64, 64): (1, 1, 32, 32, 1, 4), + (512, 512, 2048, 128, 128): (2, 1, 32, 32, 1, 4), + (512, 512, 4096, 16, 16): (2, 64, 16, 64, 1, 1), + (512, 512, 4096, 32, 32): (1, 64, 32, 64, 1, 4), + (512, 512, 4096, 64, 64): (1, 1, 32, 32, 1, 4), + (512, 512, 4096, 128, 128): (1, 1, 64, 32, 1, 4), + (512, 512, 8192, 16, 16): (2, 64, 16, 64, 1, 1), + (512, 512, 8192, 32, 32): (1, 256, 32, 32, 1, 1), + (512, 512, 8192, 64, 64): (1, 64, 64, 64, 1, 4), + (512, 512, 8192, 128, 128): (2, 1, 64, 32, 1, 8), + (512, 512, 16384, 16, 16): (2, 64, 16, 64, 1, 1), + (512, 512, 16384, 32, 32): (1, 128, 32, 32, 1, 1), + (512, 512, 16384, 64, 64): (1, 64, 64, 64, 1, 4), + (512, 512, 16384, 128, 128): (3, 1, 64, 32, 1, 8), + (512, 512, 32768, 16, 16): (2, 64, 16, 64, 1, 1), + (512, 512, 32768, 32, 32): (1, 128, 32, 32, 1, 1), + (512, 512, 32768, 64, 64): (1, 64, 64, 64, 1, 4), + (512, 512, 32768, 128, 128): (2, 1, 64, 32, 1, 8), + (512, 512, 65536, 16, 16): (2, 32, 16, 64, 1, 1), + (512, 512, 65536, 32, 32): (1, 128, 32, 32, 1, 1), + (512, 512, 65536, 64, 64): (1, 64, 64, 64, 1, 4), + (512, 512, 65536, 128, 128): (2, 1, 64, 32, 1, 8), + (512, 512, 131072, 16, 16): (2, 32, 16, 64, 1, 1), + (512, 512, 131072, 32, 32): (1, 128, 32, 32, 1, 1), + (512, 512, 131072, 64, 64): (3, 64, 64, 64, 1, 4), + (512, 512, 131072, 128, 128): (1, 8192, 64, 16, 1, 4), + (1024, 1024, 256, 16, 16): (1, 4, 16, 32, 1, 2), + (1024, 1024, 256, 32, 32): (1, 4, 16, 32, 1, 2), + (1024, 1024, 256, 64, 64): (1, 1, 16, 32, 1, 2), + (1024, 1024, 256, 128, 128): (1, 1, 16, 16, 1, 1), + (1024, 1024, 512, 16, 16): (1, 8, 16, 32, 1, 2), + (1024, 1024, 512, 32, 32): (1, 8, 16, 32, 1, 1), + (1024, 1024, 512, 64, 64): (1, 8, 32, 32, 1, 4), + (1024, 1024, 512, 128, 128): (2, 1, 32, 32, 1, 4), + (1024, 1024, 1024, 16, 16): (1, 16, 16, 32, 1, 2), + (1024, 1024, 1024, 32, 32): (1, 16, 32, 64, 1, 4), + (1024, 1024, 1024, 64, 64): (1, 16, 32, 64, 1, 4), + (1024, 1024, 1024, 128, 128): (1, 1, 32, 32, 1, 4), + (1024, 1024, 2048, 16, 16): (2, 32, 16, 64, 1, 1), + (1024, 1024, 
2048, 32, 32): (1, 32, 32, 64, 1, 4), + (1024, 1024, 2048, 64, 64): (1, 32, 64, 64, 1, 4), + (1024, 1024, 2048, 128, 128): (1, 1, 32, 64, 1, 4), + (1024, 1024, 4096, 16, 16): (2, 16, 16, 64, 1, 1), + (1024, 1024, 4096, 32, 32): (1, 64, 32, 32, 1, 1), + (1024, 1024, 4096, 64, 64): (1, 64, 64, 64, 1, 4), + (1024, 1024, 4096, 128, 128): (2, 64, 64, 32, 1, 8), + (1024, 1024, 8192, 16, 16): (2, 16, 16, 64, 1, 1), + (1024, 1024, 8192, 32, 32): (1, 64, 32, 32, 1, 1), + (1024, 1024, 8192, 64, 64): (1, 64, 64, 64, 1, 4), + (1024, 1024, 8192, 128, 128): (4, 1, 32, 64, 1, 4), + (1024, 1024, 16384, 16, 16): (2, 16, 16, 64, 1, 1), + (1024, 1024, 16384, 32, 32): (1, 64, 32, 32, 1, 1), + (1024, 1024, 16384, 64, 64): (1, 32, 64, 64, 1, 4), + (1024, 1024, 16384, 128, 128): (2, 64, 64, 32, 1, 4), + (1024, 1024, 32768, 16, 16): (2, 16, 16, 64, 1, 1), + (1024, 1024, 32768, 32, 32): (1, 64, 32, 32, 1, 1), + (1024, 1024, 32768, 64, 64): (1, 32, 64, 64, 1, 4), + (1024, 1024, 32768, 128, 128): (4, 1, 32, 64, 1, 4), + (1024, 1024, 65536, 16, 16): (2, 16, 16, 64, 1, 1), + (1024, 1024, 65536, 32, 32): (1, 32, 32, 32, 1, 1), + (1024, 1024, 65536, 64, 64): (2, 32, 64, 64, 1, 4), + (1024, 1024, 65536, 128, 128): (4, 1, 64, 32, 1, 4), + (1024, 1024, 131072, 16, 16): (2, 16, 16, 64, 1, 1), + (1024, 1024, 131072, 32, 32): (1, 32, 32, 32, 1, 1), + (1024, 1024, 131072, 64, 64): (1, 16, 64, 64, 1, 4), + (1024, 1024, 131072, 128, 128): (1, 8192, 64, 16, 1, 4), + (2048, 2048, 256, 16, 16): (1, 4, 16, 32, 1, 2), + (2048, 2048, 256, 32, 32): (1, 8, 16, 32, 1, 1), + (2048, 2048, 256, 64, 64): (1, 8, 32, 32, 1, 4), + (2048, 2048, 256, 128, 128): (1, 4, 64, 64, 1, 8), + (2048, 2048, 512, 16, 16): (2, 8, 16, 32, 1, 2), + (2048, 2048, 512, 32, 32): (2, 8, 32, 64, 1, 4), + (2048, 2048, 512, 64, 64): (2, 4, 64, 64, 1, 4), + (2048, 2048, 512, 128, 128): (1, 8, 32, 64, 1, 4), + (2048, 2048, 1024, 16, 16): (2, 16, 16, 64, 3, 1), + (2048, 2048, 1024, 32, 32): (1, 32, 32, 32, 1, 1), + (2048, 2048, 1024, 64, 64): (1, 16, 64, 64, 1, 4), + (2048, 2048, 1024, 128, 128): (2, 4, 64, 64, 1, 8), + (2048, 2048, 2048, 16, 16): (2, 16, 16, 64, 1, 1), + (2048, 2048, 2048, 32, 32): (1, 32, 32, 32, 1, 1), + (2048, 2048, 2048, 64, 64): (1, 16, 64, 64, 1, 4), + (2048, 2048, 2048, 128, 128): (2, 32, 32, 64, 1, 4), + (2048, 2048, 4096, 16, 16): (3, 2, 16, 64, 1, 1), + (2048, 2048, 4096, 32, 32): (3, 4, 32, 32, 1, 1), + (2048, 2048, 4096, 64, 64): (1, 16, 64, 64, 1, 4), + (2048, 2048, 4096, 128, 128): (2, 32, 64, 32, 1, 4), + (2048, 2048, 8192, 16, 16): (3, 4, 16, 64, 1, 1), + (2048, 2048, 8192, 32, 32): (2, 4, 32, 32, 1, 1), + (2048, 2048, 8192, 64, 64): (2, 32, 64, 32, 1, 2), + (2048, 2048, 8192, 128, 128): (4, 1, 32, 64, 1, 4), + (2048, 2048, 16384, 16, 16): (3, 4, 16, 64, 1, 1), + (2048, 2048, 16384, 32, 32): (1, 4, 32, 32, 1, 1), + (2048, 2048, 16384, 64, 64): (2, 8, 64, 32, 1, 2), + (2048, 2048, 16384, 128, 128): (2, 8, 64, 32, 1, 4), + (2048, 2048, 32768, 16, 16): (2, 4, 16, 64, 1, 1), + (2048, 2048, 32768, 32, 32): (2, 8, 32, 32, 1, 1), + (2048, 2048, 32768, 64, 64): (1, 16, 64, 32, 1, 2), + (2048, 2048, 32768, 128, 128): (4, 1, 32, 64, 1, 4), + (2048, 2048, 65536, 16, 16): (3, 4, 16, 64, 1, 1), + (2048, 2048, 65536, 32, 32): (1, 8, 32, 32, 1, 1), + (2048, 2048, 65536, 64, 64): (1, 8, 64, 32, 1, 2), + (2048, 2048, 65536, 128, 128): (4, 1, 64, 32, 1, 4), + (2048, 2048, 131072, 16, 16): (2, 4, 16, 64, 1, 1), + (2048, 2048, 131072, 32, 32): (1, 8, 32, 32, 1, 1), + (2048, 2048, 131072, 64, 64): (3, 1, 64, 32, 1, 2), + (2048, 2048, 131072, 128, 128): (1, 
8192, 128, 16, 1, 8), + (4096, 4096, 256, 16, 16): (2, 4, 16, 32, 1, 2), + (4096, 4096, 256, 32, 32): (1, 4, 32, 64, 1, 4), + (4096, 4096, 256, 64, 64): (1, 4, 64, 64, 1, 4), + (4096, 4096, 256, 128, 128): (1, 4, 32, 64, 1, 4), + (4096, 4096, 512, 16, 16): (2, 8, 16, 64, 3, 1), + (4096, 4096, 512, 32, 32): (2, 16, 32, 32, 1, 1), + (4096, 4096, 512, 64, 64): (1, 8, 64, 64, 1, 4), + (4096, 4096, 512, 128, 128): (1, 8, 32, 64, 1, 4), + (4096, 4096, 1024, 16, 16): (1, 8, 16, 64, 3, 1), + (4096, 4096, 1024, 32, 32): (1, 16, 32, 32, 1, 1), + (4096, 4096, 1024, 64, 64): (1, 16, 64, 32, 1, 2), + (4096, 4096, 1024, 128, 128): (1, 16, 32, 64, 1, 4), + (4096, 4096, 2048, 16, 16): (1, 16, 16, 64, 3, 1), + (4096, 4096, 2048, 32, 32): (1, 16, 32, 32, 1, 1), + (4096, 4096, 2048, 64, 64): (3, 16, 64, 32, 1, 2), + (4096, 4096, 2048, 128, 128): (4, 8, 32, 64, 1, 4), + (4096, 4096, 4096, 16, 16): (1, 8, 16, 64, 3, 1), + (4096, 4096, 4096, 32, 32): (1, 1, 32, 32, 1, 1), + (4096, 4096, 4096, 64, 64): (2, 16, 64, 32, 1, 2), + (4096, 4096, 4096, 128, 128): (4, 8, 32, 64, 1, 4), + (4096, 4096, 8192, 16, 16): (1, 8, 16, 64, 3, 1), + (4096, 4096, 8192, 32, 32): (2, 1, 32, 32, 1, 1), + (4096, 4096, 8192, 64, 64): (1, 16, 64, 32, 1, 2), + (4096, 4096, 8192, 128, 128): (2, 1, 32, 64, 1, 4), + (4096, 4096, 16384, 16, 16): (1, 8, 16, 64, 3, 1), + (4096, 4096, 16384, 32, 32): (1, 1, 32, 32, 1, 1), + (4096, 4096, 16384, 64, 64): (2, 8, 64, 32, 1, 2), + (4096, 4096, 16384, 128, 128): (2, 1, 32, 64, 1, 4), + (4096, 4096, 32768, 16, 16): (1, 8, 16, 64, 3, 1), + (4096, 4096, 32768, 32, 32): (1, 1, 32, 32, 1, 1), + (4096, 4096, 32768, 64, 64): (1, 8, 64, 32, 1, 2), + (4096, 4096, 32768, 128, 128): (2, 1, 32, 64, 1, 4), + (4096, 4096, 65536, 16, 16): (1, 8, 16, 64, 3, 1), + (4096, 4096, 65536, 32, 32): (3, 1, 32, 32, 1, 1), + (4096, 4096, 65536, 64, 64): (3, 4, 64, 32, 1, 2), + (4096, 4096, 65536, 128, 128): (2, 1, 32, 64, 1, 4), + (4096, 4096, 131072, 16, 16): (1, 8, 16, 64, 3, 1), + (4096, 4096, 131072, 32, 32): (1, 1, 32, 32, 1, 1), + (4096, 4096, 131072, 64, 64): (2, 8, 64, 32, 1, 2), + (4096, 4096, 131072, 128, 128): (1, 8192, 128, 16, 1, 8), + (8192, 8192, 256, 16, 16): (2, 4, 16, 64, 3, 1), + (8192, 8192, 256, 32, 32): (1, 8, 32, 32, 1, 1), + (8192, 8192, 256, 64, 64): (1, 4, 64, 64, 1, 4), + (8192, 8192, 256, 128, 128): (1, 4, 32, 64, 1, 4), + (8192, 8192, 512, 16, 16): (1, 4, 16, 64, 3, 1), + (8192, 8192, 512, 32, 32): (1, 16, 32, 32, 1, 1), + (8192, 8192, 512, 64, 64): (2, 4, 64, 64, 1, 4), + (8192, 8192, 512, 128, 128): (2, 1, 32, 64, 1, 4), + (8192, 8192, 1024, 16, 16): (3, 8, 16, 64, 3, 1), + (8192, 8192, 1024, 32, 32): (1, 16, 32, 32, 1, 1), + (8192, 8192, 1024, 64, 64): (1, 8, 64, 32, 1, 2), + (8192, 8192, 1024, 128, 128): (2, 4, 32, 64, 1, 4), + (8192, 8192, 2048, 16, 16): (1, 8, 16, 64, 3, 1), + (8192, 8192, 2048, 32, 32): (1, 16, 32, 32, 1, 1), + (8192, 8192, 2048, 64, 64): (2, 8, 64, 32, 1, 2), + (8192, 8192, 2048, 128, 128): (4, 1, 32, 64, 1, 4), + (8192, 8192, 4096, 16, 16): (1, 8, 16, 64, 3, 1), + (8192, 8192, 4096, 32, 32): (1, 16, 32, 32, 1, 1), + (8192, 8192, 4096, 64, 64): (1, 4, 64, 32, 1, 2), + (8192, 8192, 4096, 128, 128): (3, 1, 32, 64, 1, 4), + (8192, 8192, 8192, 16, 16): (1, 8, 16, 64, 3, 1), + (8192, 8192, 8192, 32, 32): (1, 8, 32, 32, 1, 1), + (8192, 8192, 8192, 64, 64): (1, 8, 64, 32, 1, 2), + (8192, 8192, 8192, 128, 128): (4, 1, 32, 64, 1, 4), + (8192, 8192, 16384, 16, 16): (3, 4, 16, 64, 3, 1), + (8192, 8192, 16384, 32, 32): (1, 8, 32, 32, 1, 1), + (8192, 8192, 16384, 64, 64): (2, 2, 64, 32, 
1, 2), + (8192, 8192, 16384, 128, 128): (7, 1, 32, 64, 1, 4), + (8192, 8192, 32768, 16, 16): (1, 4, 16, 64, 3, 1), + (8192, 8192, 32768, 32, 32): (1, 8, 32, 32, 1, 1), + (8192, 8192, 32768, 64, 64): (3, 2, 64, 32, 1, 2), + (8192, 8192, 32768, 128, 128): (6, 1, 32, 64, 1, 4), + (8192, 8192, 65536, 16, 16): (1, 4, 16, 64, 3, 1), + (8192, 8192, 65536, 32, 32): (4, 8, 32, 32, 1, 1), + (8192, 8192, 65536, 64, 64): (1, 2, 64, 32, 1, 2), + (8192, 8192, 65536, 128, 128): (4, 1, 32, 64, 1, 4), + (8192, 8192, 131072, 16, 16): (1, 4, 16, 64, 3, 1), + (8192, 8192, 131072, 32, 32): (1, 8, 32, 32, 1, 1), + (8192, 8192, 131072, 64, 64): (5, 4, 64, 32, 1, 2), + (8192, 8192, 131072, 128, 128): (1, 4096, 128, 16, 1, 8), + (16384, 16384, 256, 16, 16): (1, 4, 16, 64, 3, 1), + (16384, 16384, 256, 32, 32): (1, 8, 32, 32, 1, 1), + (16384, 16384, 256, 64, 64): (1, 4, 64, 32, 1, 2), + (16384, 16384, 256, 128, 128): (1, 4, 32, 64, 1, 4), + (16384, 16384, 512, 16, 16): (1, 8, 16, 64, 3, 1), + (16384, 16384, 512, 32, 32): (1, 16, 32, 32, 1, 1), + (16384, 16384, 512, 64, 64): (1, 4, 64, 32, 1, 2), + (16384, 16384, 512, 128, 128): (3, 1, 32, 64, 1, 4), + (16384, 16384, 1024, 16, 16): (1, 8, 16, 64, 3, 1), + (16384, 16384, 1024, 32, 32): (1, 16, 32, 32, 1, 1), + (16384, 16384, 1024, 64, 64): (2, 4, 64, 32, 1, 2), + (16384, 16384, 1024, 128, 128): (1, 2, 32, 64, 1, 4), + (16384, 16384, 2048, 16, 16): (1, 4, 16, 64, 3, 1), + (16384, 16384, 2048, 32, 32): (1, 16, 32, 32, 1, 1), + (16384, 16384, 2048, 64, 64): (3, 4, 64, 32, 1, 2), + (16384, 16384, 2048, 128, 128): (2, 1, 32, 64, 1, 4), + (16384, 16384, 4096, 16, 16): (4, 8, 16, 64, 3, 1), + (16384, 16384, 4096, 32, 32): (5, 16, 32, 32, 1, 1), + (16384, 16384, 4096, 64, 64): (3, 2, 64, 32, 1, 2), + (16384, 16384, 4096, 128, 128): (2, 1, 32, 64, 1, 4), + (16384, 16384, 8192, 16, 16): (1, 4, 16, 64, 3, 1), + (16384, 16384, 8192, 32, 32): (1, 4, 32, 32, 1, 1), + (16384, 16384, 8192, 64, 64): (1, 2, 64, 32, 1, 2), + (16384, 16384, 8192, 128, 128): (2, 1, 32, 64, 1, 4), + (16384, 16384, 16384, 16, 16): (1, 8, 16, 64, 3, 1), + (16384, 16384, 16384, 32, 32): (1, 4, 32, 32, 1, 1), + (16384, 16384, 16384, 64, 64): (1, 2, 64, 32, 1, 2), + (16384, 16384, 16384, 128, 128): (3, 1, 32, 64, 1, 4), + (16384, 16384, 32768, 16, 16): (1, 4, 16, 64, 3, 1), + (16384, 16384, 32768, 32, 32): (1, 2, 32, 32, 1, 1), + (16384, 16384, 32768, 64, 64): (3, 2, 64, 32, 1, 2), + (16384, 16384, 32768, 128, 128): (3, 1, 32, 64, 1, 4), + (16384, 16384, 65536, 16, 16): (1, 8, 16, 64, 3, 1), + (16384, 16384, 65536, 32, 32): (1, 4, 32, 32, 1, 1), + (16384, 16384, 65536, 64, 64): (4, 4, 64, 32, 1, 2), + (16384, 16384, 65536, 128, 128): (5, 1, 32, 64, 1, 4), + (16384, 16384, 131072, 16, 16): (1, 2, 16, 64, 3, 1), + (16384, 16384, 131072, 32, 32): (1, 4, 32, 32, 1, 1), + (16384, 16384, 131072, 64, 64): (1, 2, 64, 32, 1, 2), + (16384, 16384, 131072, 128, 128): (1, 4096, 128, 16, 1, 8), + }, + # END GENERATED DATA +} + +if __name__ == "__main__": + for dtype in [torch.int8]: + for op in ["_int_bsr_dense_addmm"]: + main(op=op, force=False, dtype=dtype) + for dtype in [torch.float16, torch.bfloat16, torch.float32, torch.int8]: + for op in ["bsr_dense_addmm"]: + main(op=op, force=False, dtype=dtype) diff --git a/torchao/prototype/sparsity/superblock/blocksparse.py b/torchao/prototype/sparsity/superblock/blocksparse.py index d15d959d8f..1d204956c4 100644 --- a/torchao/prototype/sparsity/superblock/blocksparse.py +++ b/torchao/prototype/sparsity/superblock/blocksparse.py @@ -6,34 +6,10 @@ from torch.utils._python_dispatch 
import return_and_correct_aliasing from torchao.quantization.quant_api import _get_linear_subclass_inserter from torchao.utils import TorchAOBaseTensor -# from torchao.prototype.sparsity.blocksparse._triton_ops import bsr_dense_addmm as torchao_bsr_dense_addmm - -aten = torch.ops.aten - - -# quantization support -# @torch.library.custom_op("blocksparse::bsr_to_dense", mutates_args=()) -# def bsr_to_dense( -# crow_indices: torch.Tensor, -# col_indices: torch.Tensor, -# values: torch.Tensor, -# M: int, -# K: int, -# ) -> torch.Tensor: -# return torch.sparse_bsr_tensor( -# crow_indices=crow_indices, col_indices=col_indices, values=values, size=(M, K) -# ).to_dense() +from .bsr_triton_ops import bsr_dense_addmm as torchao_bsr_dense_addmm -# @torch.library.register_fake("blocksparse::bsr_to_dense") -# def bsr_to_dense_abstract( -# crow_indices: torch.Tensor, -# col_indices: torch.Tensor, -# values: torch.Tensor, -# M: int, -# K: int, -# ) -> torch.Tensor: -# return torch.empty((M, K), dtype=values.dtype, device=values.device) +aten = torch.ops.aten @torch.library.custom_op("blocksparse::int_addmm", mutates_args=()) @@ -127,10 +103,9 @@ def blocksparse_addmm( weight_bsr = torch.sparse_bsr_tensor(crow_indices, col_indices, values, size=(M, K)) N_padded = x_padded.shape[1] out = x_padded.new_empty((M, N_padded)) - bsr_dense_addmm( + torchao_bsr_dense_addmm( out, weight_bsr, - # x, x_padded, alpha=1, beta=0, @@ -157,23 +132,21 @@ def blocksparse_addmm_abstract( # Subclass definition class BlockSparseTensor(TorchAOBaseTensor): - # TODO: Use NJT as a field to store max/min seqlen bsr_crow_indices: Optional[torch.Tensor] bsr_col_indices: Optional[torch.Tensor] bsr_values: Optional[torch.Tensor] - # bsr_nt: Optional[torch.Tensor] + blocksize: int __slots__ = ["bsr_crow_indices", "bsr_col_indices", "bsr_values"] - # __slots__ = ["bsr_crow_indices", "bsr_col_indices", "bsr_values", "bsr_nt"] @staticmethod def __new__( # noqa: PYI034 cls, shape: torch.Size, + blocksize: int, bsr_crow_indices: Optional[torch.Tensor], bsr_col_indices: Optional[torch.Tensor], bsr_values: Optional[torch.Tensor], - # bsr_nt: Optional[torch.Tensor], requires_grad: bool = False, ): if bsr_values is None: @@ -190,7 +163,7 @@ def __new__( # noqa: PYI034 "requires_grad": requires_grad, } tensor = torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs) # type: ignore[attr-defined] - # tensor.bsr_nt = bsr_nt + tensor.blocksize = blocksize tensor.bsr_crow_indices = bsr_crow_indices tensor.bsr_values = bsr_values tensor.bsr_col_indices = bsr_col_indices @@ -200,56 +173,63 @@ def __repr__(self) -> str: # type: ignore[override] assert hasattr(self, "shape") return f"{self.__class__.__name__}(shape={self.shape})" - def __tensor_flatten__(self) -> Tuple[List[str], Tuple[torch.Size, bool]]: + def __tensor_flatten__(self) -> Tuple[List[str], Tuple[torch.Size, bool, int]]: inner_tensors = list( filter(lambda x: getattr(self, x) is not None, self.__slots__) ) - tensor_meta = (self.shape, self.requires_grad) + tensor_meta = (self.shape, self.requires_grad, self.blocksize) return inner_tensors, tensor_meta @classmethod def __tensor_unflatten__( cls, inner_tensors, - tensor_meta: Tuple[torch.Size, bool], + tensor_meta: Tuple[torch.Size, bool, int], outer_size, outer_stride, ) -> torch.Tensor: - shape, requires_grad = tensor_meta + shape, requires_grad, blocksize = tensor_meta return cls( shape=shape, + blocksize=blocksize, bsr_crow_indices=inner_tensors.get("bsr_crow_indices", None), bsr_col_indices=inner_tensors.get("bsr_col_indices", None), 
bsr_values=inner_tensors.get("bsr_values", None), - # bsr_nt=inner_tensors.get("bsr_nt", None), requires_grad=requires_grad, ) @classmethod def from_dense(cls, dense_tensor, blocksize): bsr_tensor = dense_tensor.to_sparse_bsr(blocksize) - # print("A") - # bsr_nt = torch.nested.nested_tensor_from_jagged(bsr_tensor.values().detach(), bsr_tensor.crow_indices().detach()).detach() return cls( shape=dense_tensor.shape, + blocksize=blocksize, bsr_crow_indices=bsr_tensor.crow_indices(), bsr_col_indices=bsr_tensor.col_indices(), bsr_values=bsr_tensor.values(), - # bsr_nt=bsr_nt, requires_grad=False, ) def apply_fn_to_shard(self, func): return BlockSparseTensor( shape=self.shape, + blocksize=self.blocksize, bsr_crow_indices=func(self.bsr_crow_indices), bsr_col_indices=func(self.bsr_col_indices), bsr_values=func(self.bsr_values), - # bsr_nt=func(self.bsr_nt), requires_grad=self.requires_grad, ) + def dense(self): + return torch.sparse_bsr_tensor( + crow_indices=self.bsr_crow_indices, + col_indices=self.bsr_col_indices, + values=self.bsr_values, + size=self.shape, + ).to_dense() + + # Subclass op dispatch registration implements = BlockSparseTensor.implements @@ -270,10 +250,10 @@ def block_sparse_unsqueeze(func, types, args, kwargs): assert bsr.dim() == 2 assert not bsr.requires_grad return BlockSparseTensor(bsr.shape + (1,), + bsr.blocksize, bsr.crow_indices(), bsr.col_indices(), bsr.values().unsqueeze(-1)) - # bsr.bsr_nt) @implements(aten.mul.Tensor) @@ -293,10 +273,10 @@ def my_mul(bsr, t): masked_t = t_blocked.transpose(0, 1).index_select(0, bsr.col_indices()) new_values = bsr.values() * masked_t return BlockSparseTensor(bsr.shape, + bsr.blocksize, bsr.crow_indices(), bsr.col_indices(), new_values) - # bsr_nt) if isinstance(bsr, torch.Tensor) and isinstance(t, BlockSparseTensor): return my_mul(t, bsr) @@ -305,16 +285,23 @@ def my_mul(bsr, t): @implements(aten.sum.dim_IntList) def block_sparse_sum(func, types, args, kwargs): - breakpoint() bsr, dim = args assert type(dim) == list assert len(dim) == 1 dim = dim[0] bsr_dim = bsr.dim() assert dim == 1 - ret = bsr.values.detach().sum(dim=1).view(bsr.shape[0], -1).sum(1, keepdim=True).detach() - assert ret.dim() + 1 == bsr_dim - return ret + out = torch.empty((bsr.shape[0], bsr.shape[2]), dtype=bsr.dtype, device=bsr.device) + crow_indices = bsr.crow_indices() + blocksize = bsr.blocksize + + for i in range(crow_indices.shape[0]-1): + start, stop = crow_indices[i], crow_indices[i+1] + temp_sum = bsr.values()[start:stop] + temp_sum = temp_sum.sum(dim=0).sum(dim=1) + out[i * blocksize : (i + 1) * blocksize] = temp_sum + + return out @implements(aten.values.default) @@ -336,16 +323,6 @@ def block_sparse_col_indices(func, types, args, kwargs): def block_sparse__nnz(func, types, args, kwargs): return args[0].bsr_values.shape[0] -@implements(aten.to_dense.default) -def block_sparse_to_dense(func, types, args, kwargs): - return torch.sparse_bsr_tensor( - crow_indices=args[0].crow_indices, - col_indices=args[0].col_indices, - values=args[0].values, - size=args[0].shape, - ).to_dense() - - def next_power_of_two(n): assert n > 0 return 2 ** (n.bit_length()) @@ -367,7 +344,6 @@ def block_sparse_linear(func, types, args, kwargs): # TODO: Replace this with mul + sum for the mv case similar to # https://github.com/pytorch/pytorch/blob/a9685767773157440c162caaf125856e04e2981f/torch/_inductor/decomposition.py#L292 # use .to_dense to get a baseline implementation that works and then use NJT for .sum and such - # breakpoint() # if x.size(-1) == 1: # # print("USING THIS") 
# # breakpoint() @@ -379,10 +355,10 @@ def block_sparse_linear(func, types, args, kwargs): # special_ret = out_orig + bias # return special_ret # else: - N_padded = max(16, next_power_of_two(N)) - x_padded = torch.nn.functional.pad(x, (0, N_padded - N), 'constant', 0) + # N_padded = max(16, next_power_of_two(N)) + # x_padded = torch.nn.functional.pad(x, (0, N_padded - N), 'constant', 0) out = torch.ops.blocksparse.addmm( - x_padded, + x, w.crow_indices(), w.col_indices(), w.values(), @@ -390,15 +366,11 @@ def block_sparse_linear(func, types, args, kwargs): K, None, ) - # import pdb; pdb.set_trace() - # return out.view(x_orig.size(0), -1, M) - out_orig = out[:, :x.size(-1)].t().reshape(x_orig.shape[:-1] + (M,)) + # out_orig = out[:, :x.size(-1)].t().reshape(x_orig.shape[:-1] + (M,)) + out_orig = out.t() if bias is None: - # if x.size(-1) == 1: - # assert special_ret.size() == out_orig.size() return out_orig - # if x.size(-1) == 1: - # assert special_ret.size() == out_orig.size() + return out_orig + bias diff --git a/torchao/prototype/sparsity/superblock/bsr_triton_ops.py b/torchao/prototype/sparsity/superblock/bsr_triton_ops.py new file mode 100644 index 0000000000..7bd0483dd7 --- /dev/null +++ b/torchao/prototype/sparsity/superblock/bsr_triton_ops.py @@ -0,0 +1,2541 @@ +# mypy: allow-untyped-decorators +# mypy: allow-untyped-defs +import math +import os +import weakref +from functools import lru_cache +from typing import Optional + +import torch +from torch._dynamo.utils import warn_once +from torch.utils._triton import has_triton + +from ._triton_ops_meta import get_meta + + +TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE = int( + os.getenv("TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE", 2) +) + + +def check(cond, msg): + if not cond: + raise ValueError(msg) + + +def check_bsr_layout(f_name, t): + check( + t.layout == torch.sparse_bsr, + f"{f_name}(): only BSR sparse format is supported for the sparse argument.", + ) + + +def check_device(f_name, t, device): + check( + t.device == device and t.device.type == "cuda", + f"{f_name}(): all inputs are expected to be on the same GPU device.", + ) + + +def check_mm_compatible_shapes(f_name, lhs, rhs): + check( + lhs.dim() >= 2 and rhs.dim() >= 2, + f"{f_name}(): all inputs involved in the matrix product are expected to be at least 2D, " + f"but got lhs.dim() == {lhs.dim()} and rhs.dim() == {rhs.dim()}.", + ) + + _m, kl = lhs.shape[-2:] + kr, _n = rhs.shape[-2:] + + check( + kl == kr, + f"{f_name}(): arguments' sizes involved in the matrix product are not compatible for matrix multiplication, " + f"got lhs.shape[-1] == {kl} which is not equal to rhs.shape[-2] == {kr}.", + ) + + +def check_dtype(f_name, t, dtype, *additional_dtypes): + check( + t.dtype == dtype + and t.dtype + in ((torch.half, torch.bfloat16, torch.float) + tuple(*additional_dtypes)), + f"{f_name}(): all inputs are expected to be of the same dtype " + f"and one of (half, bfloat16, float32) or {additional_dtypes}, " + f"but got dtype == {t.dtype}.", + ) + + +def check_blocksize(f_name, blocksize): + assert len(blocksize) == 2 + + def is_power_of_two(v): + return not (v & (v - 1)) + + def is_compatible_blocksize(b): + res = True + for blocksize in b: + # Triton loads only blocks which are at least 16 and powers of 2. 
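+ # For example (illustrative): blocksizes (16, 16) and (32, 64) pass this check,
+ # while (8, 16) fails the minimum-size requirement and (24, 24) fails the
+ # power-of-two test, since 24 & 23 == 16 != 0.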
+ res = (blocksize >= 16 and is_power_of_two(blocksize)) and res + return res + + check( + is_compatible_blocksize(blocksize), + f"{f_name}(): sparse inputs' blocksize ({blocksize[0]}, {blocksize[1]}) " + "should be at least 16 and a power of 2 in each dimension.", + ) + + +def make_triton_contiguous(t): + """Return input as a triton-contiguous tensor. + + A triton-contiguous tensor is defined as a tensor that has strides + with minimal value smaller than or equal to 1. + + While triton kernels support triton-non-contiguous tensors (all + strides being greater than 1) arguments, a considerable slow-down + occurs because tensor data is copied element-wise rather than + chunk-wise. Zero strides is assumed to not have this defect. + """ + if min(t.stride()) > 1: + # TODO: investigate if contiguity along other axes than the + # last one can be beneficial for performance + return t.contiguous() + else: + return t + + +def broadcast_batch_dims(f_name, *tensors): + try: + return torch.broadcast_shapes(*(t.shape[:-2] for t in tensors)) + except Exception: + check(False, f"{f_name}(): inputs' batch dimensions are not broadcastable!") + + +def slicer(dim, slice_range, *tensors): + for t in tensors: + slices = [slice(None)] * t.dim() + slices[dim] = slice_range + yield t[slices] + + +def multidim_slicer(dims, slices, *tensors): + for t in tensors: + s = [slice(None)] * t.dim() + for d, d_slice in zip(dims, slices): + if d is not None: + s[d] = d_slice + yield t[s] + + +def ptr_stride_extractor(*tensors): + for t in tensors: + yield t + yield from t.stride() + + +def grid_partitioner(full_grid, grid_blocks, tensor_dims_map): + assert 0 <= len(full_grid) <= 3 + assert 0 <= len(grid_blocks) <= 3 + + import itertools + + def generate_grid_points(): + for fg, mg in zip(full_grid, grid_blocks): + yield range(0, fg, mg) + + def generate_sliced_tensors(slices): + for t, t_dims in tensor_dims_map.items(): + yield next(multidim_slicer(t_dims, slices, t)) + + for grid_point in itertools.product(*generate_grid_points()): + grid = [ + min(fg - gp, mg) for fg, gp, mg in zip(full_grid, grid_point, grid_blocks) + ] + slices = [slice(gp, gp + g) for gp, g in zip(grid_point, grid)] + # grid_points are iterated in a "contiguous" order, i.e. + # left dimensions traversed slower than right dimensions. + # This order is reversed for CUDA grids. + yield grid[::-1], *generate_sliced_tensors(slices) + + +def launch_kernel(kernel, tensor_dims_map, full_grid, grid_blocks=None): + # cuda_max_grid = (2 ** 31 - 1, 2 ** 16 - 1, 2 ** 16 - 1) + cuda_max_grid = (2147483647, 65535, 65535)[::-1] + if grid_blocks is None: + grid_blocks = cuda_max_grid + else: + + def valid_grid_dim(g, mg): + if g is None: + return mg + else: + # grid must be at least 1 and no greater than mg + return max(1, min(g, mg)) + + grid_blocks = tuple( + valid_grid_dim(g, mg) for g, mg in zip(grid_blocks, cuda_max_grid) + ) # type: ignore[assignment] + + for grid, *sliced_tensors in grid_partitioner( + full_grid, grid_blocks, tensor_dims_map + ): + kernel(grid, *sliced_tensors) + + +def prepare_inputs(bsr, *dense_tensors): + # Introduce fake batch dimension if not present for convenience. 
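+ # For a plain 2-D call this means, e.g., an (M, K) BSR weight and a (K, N)
+ # dense operand are viewed as batches of size 1, so the broadcasting and
+ # squashing below handle batched and unbatched inputs through one code path.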
+ crow_indices = bsr.crow_indices().unsqueeze(0) + col_indices = bsr.col_indices().unsqueeze(0) + values = make_triton_contiguous(bsr.values().unsqueeze(0)) + tensors = [make_triton_contiguous(t.unsqueeze(0)) for t in dense_tensors] + + # Compute broadcasted batch dimension + batch_dims_broadcasted = torch.broadcast_shapes( + values.shape[:-3], *(t.shape[:-2] for t in tensors) + ) + + # Broadcast batch dimensions and squash. + # The result can be either a view or a copy. + def batch_broadcast_and_squash(t, batch_dims, invariant_dims): + return t.broadcast_to(batch_dims + invariant_dims).flatten( + 0, len(batch_dims) - 1 + ) + + crow_indices = batch_broadcast_and_squash( + crow_indices, batch_dims_broadcasted, (-1,) + ) + + col_indices = batch_broadcast_and_squash(col_indices, batch_dims_broadcasted, (-1,)) + values = batch_broadcast_and_squash( + values, batch_dims_broadcasted, values.shape[-3:] + ) + tensors = [ + batch_broadcast_and_squash(t, batch_dims_broadcasted, t.shape[-2:]) + for t in tensors + ] + + return crow_indices, col_indices, values, *tensors + + +def broadcast_batch_dims_bsr(f_name, bsr, *tensors): + batch_shape = broadcast_batch_dims(f_name, bsr, *tensors) + + crow_indices = bsr.crow_indices().broadcast_to(batch_shape + (-1,)) + col_indices = bsr.col_indices().broadcast_to(batch_shape + (-1,)) + values = bsr.values().broadcast_to(batch_shape + bsr.values().shape[-3:]) + size = batch_shape + bsr.shape[-2:] + return torch.sparse_compressed_tensor( + crow_indices, col_indices, values, size=size, layout=bsr.layout + ) + + +# NOTE: this function will ALWAYS create a view +def tile_to_blocksize(t, blocksize): + *rest, m, n = t.shape + new_shape = rest + [ + m // blocksize[0], + blocksize[0], + n // blocksize[1], + blocksize[1], + ] + # using .view instead of .reshape to ensure that the result is + # indeed a view: + return t.view(new_shape).transpose(-3, -2) + + +def as1Dbatch(tensor): + """Return tensor as 3D tensor by either prepending new dimensions to + the tensor shape (when ``tensor.ndim < 3``), or by collapsing + starting dimensions into the first dimension (when ``tensor.ndim > + 3``). + """ + while tensor.ndim < 3: + tensor = tensor.unsqueeze(0) + if tensor.ndim > 3: + tensor = tensor.flatten(0, tensor.ndim - 3) + assert tensor.ndim == 3, tensor.shape + return tensor + + +def scatter_mm(blocks, others, indices_data, *, accumulators=None): + """Scattered matrix multiplication of tensors. + + A scattered matrix multiplication is defined as a series of matrix + multiplications applied to input tensors according to the input + and output mappings specified by indices data. + + The following indices data formats are supported for defining a + scattered matrix multiplication operation (:attr:`indices_data[0]` + holds the name of the indices data format as specified below): + + - ``"scatter_mm"`` - matrix multiplications scattered in batches + of tensors. + + If :attr:`blocks` is a :math:`(* \times M \times K) tensor, + :attr:`others` is a :math:`(* \times K \times N)` tensor, + :attr:`accumulators` is a :math:`(* \times M \times N)` tensor, + and :attr:`indices = indices_data['indices']` is a :math:`(* + \times 3)` tensor, then the operation is equivalent to the + following code:: + + c_offsets, pq = indices_data[1:] + for r in range(len(c_offsets) - 1): + for g in range(c_offsets[r], c_offsets[r + 1]): + p, q = pq[g] + accumulators[r] += blocks[p] @ others[q] + + - ``"bsr_strided_mm"`` - matrix multiplications scattered in + batches of tensors and a tensor. 
+ + If :attr:`blocks` is a :math:`(Ms \times Ks)` tensor, + :attr:`others` is a :math:`(* \times K \times N)` tensor, + :attr:`accumulators` is a :math:`(* \times M \times N)` tensor, then + the operation is equivalent to the following code:: + + c_indices, r_offsets, p_offsets, q_offsets, meta = indices_data[1:] + for b in range(nbatches): + for i, r in enumerate(r_offsets): + r0, r1 = divmod(r, N) + acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns] + for g in range(c_indices[i], c_indices[i+1]): + p = p_offsets[g] + q0, q1 = divmod(q_offsets[g], N) + acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns] + + where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are + integer multiples of ``Ms`` and ``Ks``, respectively. + + - ``"bsr_strided_mm_compressed"`` - matrix multiplications + scattered in batches of tensors and a tensor. A memory and + processor efficient version of ``"bsr_strided_mm"`` format. If + :attr:`blocks` is a :math:`(Ms \times Ks)` tensor, :attr:`others` + is a :math:`(* \times K \times N)` tensor, :attr:`accumulators` + is a :math:`(* \times M \times N)` tensor, then the operation is + equivalent to the following code:: + + c_indices, r_offsets, q_offsets, meta = indices_data[1:] + for b in range(nbatches): + for r in r_offsets: + m = (r // N) // Ms + n = (r % N) // Ns + r0, r1 = divmod(r, N) + c0, c1 = c_indices[m], c_indices[m + 1] + acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns] + for i, p in enumerate(range(c0, c1)): + q = q_offsets[n * c1 + (SPLIT_N - n) * c0 + i] + q0, q1 = divmod(q, N) + acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns] + + where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are + integer multiples of ``Ms`` and ``Ks``, respectively. + + Notice that the order of ``r_offsets`` items can be arbitrary; + this property enables defining swizzle operators via + rearrangements of ``r_offsets`` items. + + Auxiliary functions are provided for pre-computing + :attr:`indices_data`. For example, + :func:`bsr_scatter_mm_indices_data` is used to define indices data + for matrix multiplication of BSR and strided tensors. + + Parameters + ---------- + blocks (Tensor): a 3-D tensor of first matrices to be multiplied + + others (Tensor): a tensor of second matrices to be multiplied. If + ``indices_data[0]=="scatter_mm"``, the tensor is a 1-D batch + tensor of second input matrices to be multiplied. Otherwise, the + second input matrices are slices of the :attr:`others` tensor. + indices_data (tuple): format data that defines the inputs and + outputs of scattered matrix multiplications. + + Keyword arguments + ----------------- + + accumulators (Tensor, optional): a tensor of matrix product + accumulators. If ``indices_data[0]=="scatter_mm"``, the tensor + is a 1-D batch tensor of output matrices. Otherwise, output + matrices are slices of the :attr:`accumulators` tensor.
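+
+     Example
+     -------
+
+     A minimal sketch of the ``"scatter_mm"`` format (illustrative only; it
+     assumes a CUDA device so that the triton kernels can be used)::
+
+         blocks = torch.randn(2, 16, 16, device="cuda")
+         others = torch.randn(2, 16, 16, device="cuda")
+         c_offsets = torch.tensor([0, 2], dtype=torch.int32, device="cuda")
+         pq = torch.tensor([[0, 0], [1, 1]], dtype=torch.int32, device="cuda")
+         out = scatter_mm(blocks, others, ("scatter_mm", c_offsets, pq))
+         # out[0] accumulates blocks[0] @ others[0] + blocks[1] @ others[1]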
+ """ + indices_format = indices_data[0] + + assert blocks.ndim == 3 + _P, Ms, Ks = blocks.shape + + if indices_format == "scatter_mm": + c_offsets, pq = indices_data[1:] + + assert others.ndim == 3 + _Q, Ks_, Ns = others.shape + assert Ks == Ks_ + + if accumulators is None: + R = c_offsets.shape[0] - 1 + accumulators = torch.zeros( + (R, Ms, Ns), dtype=blocks.dtype, device=blocks.device + ) + else: + R, Ms_, Ns_ = accumulators.shape + assert Ms_ == Ms + assert Ns_ == Ns + + if Ms % 16 or Ks % 16 or Ns % 16 or _scatter_mm2 is None: + for r in range(c_offsets.shape[0] - 1): + g0 = c_offsets[r] + g1 = c_offsets[r + 1] + for g in range(g0, g1): + p, q = pq[g] + accumulators[r] += blocks[p] @ others[q] + else: + _scatter_mm2(blocks, others, c_offsets, pq, accumulators) + return accumulators + + elif indices_format == "bsr_strided_mm": + others_shape = others.shape + others = as1Dbatch(others) + + B, K, N = others.shape + assert K % Ks == 0 + + c_indices, r_offsets, p_offsets, q_offsets, meta = indices_data[1:] + SPLIT_N = meta["SPLIT_N"] + + if accumulators is None: + M = Ms + (r_offsets.max().item() + 1) // N + accumulators = torch.zeros( + (*others_shape[:-2], M, N), dtype=blocks.dtype, device=blocks.device + ) + else: + M, N_ = accumulators.shape[-2:] + assert N_ == N + + accumulators_shape = accumulators.shape + accumulators = as1Dbatch(accumulators) + + Ns = N // SPLIT_N + + if Ms % 16 or Ks % 16 or Ns % 16 or _scatter_mm6 is None: + accumulators.zero_() + for b in range(B): + for r in range(r_offsets.shape[0]): + r_ = r_offsets[r].item() + g0 = c_indices[r].item() + g1 = c_indices[r + 1].item() + r0, r1 = divmod(r_, N) + acc = accumulators[b, r0 : r0 + Ms, r1 : r1 + Ns] + for g in range(g0, g1): + p, q = p_offsets[g], q_offsets[g] + q0, q1 = divmod(q.item(), N) + acc += blocks[p] @ others[b, q0 : q0 + Ks, q1 : q1 + Ns] + else: + _scatter_mm6( + blocks, + others, + c_indices, + r_offsets, + p_offsets, + q_offsets, + meta, + accumulators, + ) + return accumulators.view(accumulators_shape) + + elif indices_format == "bsr_strided_mm_compressed": + others_shape = others.shape + others = as1Dbatch(others) + + B, K, N = others.shape + assert K % Ks == 0 + + c_indices, r_offsets, q_offsets, meta = indices_data[1:] + SPLIT_N = meta["SPLIT_N"] + + if accumulators is None: + M = Ms + (r_offsets.max().item() + 1) // N + accumulators = torch.zeros( + (*others_shape[:-2], M, N), dtype=blocks.dtype, device=blocks.device + ) + else: + M, N_ = accumulators.shape[-2:] + assert N_ == N + + accumulators_shape = accumulators.shape + accumulators = as1Dbatch(accumulators) + + Ns = N // SPLIT_N + + if Ms % 16 or Ks % 16 or Ns % 16 or _scatter_mm6 is None: + for b in range(B): + for j in range(len(r_offsets)): + r0, r1 = divmod(r_offsets[j].item(), N) + m = r0 // Ms + n = r1 // Ns + c0 = c_indices[m].item() + c1 = c_indices[m + 1].item() + acc = accumulators[b, r0 : r0 + Ms, r1 : r1 + Ns] + for i, p in enumerate(range(c0, c1)): + q = q_offsets[n * c1 + (SPLIT_N - n) * c0 + i].item() + q0, q1 = divmod(q, N) + acc += blocks[p] @ others[b, q0 : q0 + Ks, q1 : q1 + Ns] + else: + p_offsets = torch.empty( + (0,), dtype=q_offsets.dtype, device=q_offsets.device + ) + _scatter_mm6( + blocks, + others, + c_indices, + r_offsets, + p_offsets, + q_offsets, + meta, + accumulators, + ) + return accumulators.view(accumulators_shape) + + else: + raise NotImplementedError(indices_format) + + +def scatter_mm_meta( + M, + K, + N, + Ms, + Ks, + GROUP_SIZE=None, + TILE_M=None, + TILE_N=None, + SPLIT_N=None, + num_warps=None, + 
num_stages=None, + **extra, +): + if {TILE_M, TILE_N, SPLIT_N, num_warps, num_stages, GROUP_SIZE} == {None}: + device_name = torch.cuda.get_device_name() + meta = get_meta( + "scatter_mm", + (M, K, N, Ms, Ks), + device_name, + version=(0, torch.float16, 0.5), + ) + if meta is not None: + meta.update(**extra) + return meta + # The following parameters are optimized for the performance + # equilibrium points of bsr-dense and dense-dense matrix + # multiplications when using GPU card NVIDIA GeForce RTX 2060 + # SUPER. For points far from the performance equilibrium + # points as well as for other GPU cards, the optimal + # parameters are likely different from what specified below. + if (M, K, N) == (256,) * 3: + if (Ms, Ks) == (16, 16): + SPLIT_N = 1 + TILE_M = 16 + TILE_N = 16 + GROUP_SIZE = 4 + num_stages = 1 + num_warps = 4 # noqa: E225,E231,E702 + elif (Ms, Ks) == (32, 32): + SPLIT_N = 2 + TILE_M = 32 + TILE_N = 16 + GROUP_SIZE = 4 + num_stages = 1 + num_warps = 4 # noqa: E225,E231,E702 + elif (Ms, Ks) == (64, 64): + SPLIT_N = 1 + TILE_M = 32 + TILE_N = 32 + GROUP_SIZE = 4 + num_stages = 1 + num_warps = 4 # noqa: E225,E231,E702 + elif (Ms, Ks) == (128, 128): + SPLIT_N = 1 + TILE_M = 32 + TILE_N = 32 + GROUP_SIZE = 2 + num_stages = 1 + num_warps = 4 # noqa: E225,E231,E702 + elif (M, K, N) == (512,) * 3: + if (Ms, Ks) == (16, 16): + SPLIT_N = 8 + TILE_M = 16 + TILE_N = 64 + GROUP_SIZE = 2 + num_stages = 1 + num_warps = 2 # noqa: E225,E231,E702 + elif (Ms, Ks) == (32, 32): + SPLIT_N = 8 + TILE_M = 32 + TILE_N = 64 + GROUP_SIZE = 4 + num_stages = 1 + num_warps = 2 # noqa: E225,E231,E702 + elif (Ms, Ks) == (64, 64): + SPLIT_N = 4 + TILE_M = 32 + TILE_N = 128 + GROUP_SIZE = 4 + num_stages = 1 + num_warps = 4 # noqa: E225,E231,E702 + elif (Ms, Ks) == (128, 128): + SPLIT_N = 8 + TILE_M = 64 + TILE_N = 64 + GROUP_SIZE = 4 + num_stages = 1 + num_warps = 4 # noqa: E225,E231,E702 + elif (M, K, N) == (1024,) * 3: + if (Ms, Ks) == (16, 16): + SPLIT_N = 4 + TILE_M = 16 + TILE_N = 128 + GROUP_SIZE = 2 + num_stages = 1 + num_warps = 1 # noqa: E225,E231,E702 + elif (Ms, Ks) == (32, 32): + SPLIT_N = 8 + TILE_M = 32 + TILE_N = 64 + GROUP_SIZE = 2 + num_stages = 1 + num_warps = 1 # noqa: E225,E231,E702 + elif (Ms, Ks) == (64, 64): + SPLIT_N = 16 + TILE_M = 64 + TILE_N = 64 + GROUP_SIZE = 4 + num_stages = 1 + num_warps = 2 # noqa: E225,E231,E702 + elif (Ms, Ks) == (128, 128): + SPLIT_N = 16 + TILE_M = 64 + TILE_N = 64 + GROUP_SIZE = 4 + num_stages = 1 + num_warps = 4 # noqa: E225,E231,E702 + elif (Ms, Ks) == (256, 256): + SPLIT_N = 16 + TILE_M = 64 + TILE_N = 64 + GROUP_SIZE = 2 + num_stages = 1 + num_warps = 4 # noqa: E225,E231,E702 + elif (M, K, N) == (2048,) * 3: + if (Ms, Ks) == (16, 16): + SPLIT_N = 4 + TILE_M = 16 + TILE_N = 128 + GROUP_SIZE = 8 + num_stages = 1 + num_warps = 1 # noqa: E225,E231,E702 + elif (Ms, Ks) == (32, 32): + SPLIT_N = 4 + TILE_M = 32 + TILE_N = 64 + GROUP_SIZE = 4 + num_stages = 1 + num_warps = 1 # noqa: E225,E231,E702 + elif (Ms, Ks) == (64, 64): + SPLIT_N = 4 + TILE_M = 64 + TILE_N = 128 + GROUP_SIZE = 4 + num_stages = 1 + num_warps = 4 # noqa: E225,E231,E702 + elif (Ms, Ks) == (128, 128): + SPLIT_N = 8 + TILE_M = 64 + TILE_N = 64 + GROUP_SIZE = 4 + num_stages = 1 + num_warps = 4 # noqa: E225,E231,E702 + elif (Ms, Ks) == (256, 256): + SPLIT_N = 4 + TILE_M = 64 + TILE_N = 64 + GROUP_SIZE = 2 + num_stages = 1 + num_warps = 4 # noqa: E225,E231,E702 + elif (M, K, N) == (4096,) * 3: + if (Ms, Ks) == (16, 16): + SPLIT_N = 2 + TILE_M = 16 + TILE_N = 256 + GROUP_SIZE = 2 + num_stages = 1 
+ num_warps = 2 # noqa: E225,E231,E702 + elif (Ms, Ks) == (32, 32): + SPLIT_N = 2 + TILE_M = 32 + TILE_N = 64 + GROUP_SIZE = 2 + num_stages = 1 + num_warps = 1 # noqa: E225,E231,E702 + elif (Ms, Ks) == (64, 64): + SPLIT_N = 2 + TILE_M = 64 + TILE_N = 128 + GROUP_SIZE = 2 + num_stages = 1 + num_warps = 4 # noqa: E225,E231,E702 + + if SPLIT_N is None: + # Assume NVIDIA GeForce RTX 2060 SUPER: + # With the probality of 92% (99.9% when N > 512), the + # performance will not be worse more than 2% from the + # performance when using an optimal value. Otherwise, when N + # <= 512, using the following heuristics may give upto 15% + # lower performance. + SPLIT_N = { + 16: 1, + 32: 2, + 64: 4, + 128: 8, + 256: 16, + 512: 8, + 1024: 16, + 4096: 32, + 8192: 64, + }.get(N, 16) + if Ms >= 512 and N >= 2048: + SPLIT_N = 1 + Ns = N // SPLIT_N + if TILE_M is None: + TILE_M = min(64 if Ns < 512 else 32, Ms) + if TILE_N is None: + TILE_N = min(64 if Ns < 512 else 32, Ns) + num_stages = num_stages or 1 + if num_warps is None: + if min(M, N) > 1024: + num_warps = {16: 1, 32: 1, 64: 2}.get(Ms, 4) + elif min(M, N) == 1024: + num_warps = {16: 1, 32: 1, 64: 2}.get(Ms, 4) + elif min(M, N) == 256: + num_warps = {16: 1, 32: 4}.get(Ms, 4) + else: + num_warps = {16: 1, 32: 2}.get(Ms, 4) + GROUP_SIZE = GROUP_SIZE or 4 + + assert TILE_M <= Ms, dict(TILE_M=TILE_M, Ms=Ms) + assert TILE_N <= Ns, dict(TILE_N=TILE_N, Ns=Ns) + assert Ms <= M, dict(M=M, Ms=Ms) + assert Ns <= N, dict(N=N, Ns=Ns) + assert Ks <= K, dict(K=K, Ks=Ks) + + return dict( + TILE_M=TILE_M, + TILE_N=TILE_N, + GROUP_SIZE=GROUP_SIZE, + num_stages=num_stages, + num_warps=num_warps, + SPLIT_N=SPLIT_N, + **extra, + ) + + +def bsr_dense_addmm_meta( + M, + K, + N, + Ms, + Ks, + beta, + alpha, + SPLIT_N=None, + GROUP_SIZE_ROW=None, + num_warps=None, + num_stages=None, + sparsity=None, + dtype=None, + out_dtype=None, + _version=0, + **extra, +): + # Specifying _version is useful for situations when one wants to + # discard existing triton kernel tuning results, say, in testing + # bsr_dense_addmm_meta functionality. + if dtype is None: + dtype = torch.float16 + if out_dtype is None: + out_dtype = dtype + if sparsity is None: + sparsity = 0.5 + if {SPLIT_N, num_warps, num_stages, GROUP_SIZE_ROW} == {None}: + device_name = torch.cuda.get_device_name() + key = (M, K, N, Ms, Ks, beta == 0, beta == 1, alpha == 1) + if dtype is out_dtype: + version_dtype = dtype + else: + version_dtype = dtype, out_dtype + meta = get_meta( + "bsr_dense_addmm", + key, + device_name, + version=(_version, version_dtype, sparsity), + ) + if meta is None and sparsity != 0.5: + meta = get_meta( + "bsr_dense_addmm", + key, + device_name, + version=(_version, version_dtype, 0.5), + ) + if meta is None and dtype is not out_dtype: + meta = get_meta( + "bsr_dense_addmm", key, device_name, version=(_version, dtype, 0.5) + ) + if meta is None: + # find approximate meta such that N % SPLIT_N == 0. 
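+ # For instance (hypothetical numbers): if the table only holds an entry
+ # for N=4096 tuned with SPLIT_N=16 (column blocks 256 wide) and the
+ # requested N is 8192, the loop below keeps the 256-wide column blocks
+ # and rescales to SPLIT_N = 8192 // 256 = 32.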
+ matching_meta = get_meta( + "bsr_dense_addmm", + (*key[:2], "*", *key[3:]), + device_name, + version=(_version, version_dtype, 0.5), + ) + if matching_meta is None and dtype is not out_dtype: + matching_meta = get_meta( + "bsr_dense_addmm", + (*key[:2], "*", *key[3:]), + device_name, + version=(_version, dtype, 0.5), + ) + for mkey in sorted(matching_meta or {}): + meta_ = matching_meta[mkey] + n = mkey[2] + split_n = meta_["SPLIT_N"] + c = n // split_n + if N % c == 0 and n <= N: + meta = dict(meta_) + meta["SPLIT_N"] = N // c + if meta is not None: + meta.update(**extra) + return meta + else: + # see [Computing optimal kernel parameters] in + # _triton_ops_meta.py for ways to avoid this warning + # message + warn_once( + "bsr_dense_addmm uses non-optimal triton kernel parameters" + f" for {M=} {K=} {N=} {Ms=}, {Ks=} {beta=} {alpha=} {dtype=} {out_dtype=}" + ) + + SPLIT_N = SPLIT_N or max(N // Ms, 1) + GROUP_SIZE_ROW = GROUP_SIZE_ROW or 4 + num_stages = num_stages or 1 + num_warps = num_warps or 4 + return dict( + SPLIT_N=SPLIT_N, + GROUP_SIZE_ROW=GROUP_SIZE_ROW, + num_stages=num_stages, + num_warps=num_warps, + **extra, + ) + + +class TensorAsKey: + """A light-weight wrapper of a tensor that enables storing tensors as + keys with efficient memory reference based comparison as an + approximation to data equality based keys. + + Motivation: the hash value of a torch tensor is based on the tensor + instance, not on data equality, which makes the usage of tensors as + keys less useful. For instance, the result of + ``len({a.crow_indices(), a.crow_indices()})`` is `2`, although the + tensors returned by the `crow_indices` method calls are equal and, in + fact, share the same data storage. + On the other hand, for efficient caching of tensors we want to + avoid calling torch.equal that compares tensors item-wise. + + TensorAsKey offers a compromise in that it guarantees key equality + of tensors that reference data in the same storage in the same + manner and without accessing underlying data. However, this + approach does not always guarantee correctness. For instance, for + a complex tensor ``x``, we have ``TensorAsKey(x) == + TensorAsKey(x.conj())`` while ``torch.equal(x, x.conj())`` would + return False. + """ + + def __init__(self, obj): + def get_tensor_key(obj): + # Warning: TensorAsKey does not track negative nor + # conjugate bits of its input object because in the use + # case of wrapping compressed/plain indices of compressed + # sparse tensors (that are always integer tensors with + # non-negative items) these bits are never set. However, + # when extending the use of TensorAsKey to float or + # complex tensors, the values of these bits (see is_neg + # and is_conj methods) must be included in the key as + # well.
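+ # The tuple built below (data_ptr, storage_offset, shape, stride,
+ # dtype) identifies how a tensor views its storage, so two tensors
+ # that view the same storage in the same way compare equal without
+ # reading any of their data.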
+ assert not (obj.dtype.is_floating_point or obj.dtype.is_complex), obj.dtype + return ( + obj.data_ptr(), + obj.storage_offset(), + obj.shape, + obj.stride(), + obj.dtype, + ) + + self._obj_ref = weakref.ref(obj) + if obj.layout is torch.strided: + self.key = get_tensor_key(obj) + elif obj.layout in {torch.sparse_csr, torch.sparse_bsr}: + self.key = ( + get_tensor_key(obj.crow_indices()), + get_tensor_key(obj.col_indices()), + ) + elif obj.layout in {torch.sparse_csc, torch.sparse_bsc}: + self.key = ( + get_tensor_key(obj.ccol_indices()), + get_tensor_key(obj.row_indices()), + ) + else: + raise NotImplementedError(obj.layout) + self._hash = hash(self.key) + + def __hash__(self): + return self._hash + + def __eq__(self, other): + if not isinstance(other, TensorAsKey): + return False + if self.obj is None or other.obj is None: + # dead objects always compare unequal unless these are + # same objects + return self is other + return self.key == other.key + + @property + def obj(self): + """Return object if alive, otherwise None.""" + return self._obj_ref() + + +@lru_cache(maxsize=TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE) +def _bsr_scatter_mm_indices_data( + indices_format, M, K, N, Ms, Ks, nbatches, SPLIT_N, compressed_sparse_tensor_as_key +): + bsr = compressed_sparse_tensor_as_key.obj + assert bsr is not None + crow_indices, col_indices = bsr.crow_indices(), bsr.col_indices() + device = crow_indices.device + indices_dtype = torch.int32 + + if indices_format == "bsr_strided_mm_compressed": + Ns = N // SPLIT_N + q_offsets_lst = [] + b = torch.arange(SPLIT_N, dtype=indices_dtype, device=device) * Ns + for m in range(M // Ms): + r0 = crow_indices[m].item() + r1 = crow_indices[m + 1].item() + if r1 == r0: + continue + q_offsets_lst.append( + (col_indices[r0:r1] * (Ks * N)).repeat(SPLIT_N) + + b.repeat_interleave(r1 - r0) + ) + q_offsets = torch.cat(q_offsets_lst) + crow_indices_diff = crow_indices.diff() + non_zero_row_indices = crow_indices_diff.nonzero() + a = non_zero_row_indices * (Ms * N) + r_offsets = (a + b).view(-1) + c_indices = crow_indices + # swizzle operation: mm elements with longer sums are computed first: + nnz_per_row = crow_indices_diff[non_zero_row_indices].repeat_interleave(SPLIT_N) + nnz_per_row, indices = nnz_per_row.sort(descending=True, stable=True) + r_offsets = r_offsets[indices] + return (indices_format, c_indices, r_offsets, q_offsets) + + elif indices_format == "bsr_strided_mm": + Ns = N // SPLIT_N + p_offsets_lst = [] + q_offsets_lst = [] + b = torch.arange(SPLIT_N, dtype=indices_dtype, device=device) * Ns + for m in range(M // Ms): + r0 = crow_indices[m].item() + r1 = crow_indices[m + 1].item() + if r1 == r0: + continue + p_offsets_lst.append( + torch.arange(r0, r1, dtype=indices_dtype, device=device).repeat(SPLIT_N) + ) + q_offsets_lst.append( + (col_indices[r0:r1] * (Ks * N)).repeat(SPLIT_N) + + b.repeat_interleave(r1 - r0) + ) + q_offsets = torch.cat(q_offsets_lst) + crow_indices_diff = crow_indices.diff() + non_zero_row_indices = crow_indices_diff.nonzero() + a = non_zero_row_indices * (Ms * N) + r_offsets = (a + b).view(-1) + c_indices = torch.cat( + ( + crow_indices[:1], + torch.cumsum( + crow_indices_diff[non_zero_row_indices].repeat_interleave(SPLIT_N), + 0, + ), + ) + ) + p_offsets = torch.cat(p_offsets_lst) + return (indices_format, c_indices, r_offsets, p_offsets, q_offsets) + + elif indices_format == "scatter_mm": + Ns = Ms + c_indices = [0] + pq_offsets = [] + # todo: eliminate inner for-loops for efficiency + for b in range(nbatches): + for m in 
range(M // Ms): + r0 = crow_indices[m].item() + r1 = crow_indices[m + 1].item() + for n in range(N // Ns): + c_indices.append(c_indices[-1] + r1 - r0) + for t in range(r1 - r0): + p = r0 + t + q = (col_indices[p].item() + b * (K // Ks)) * (N // Ns) + n + pq_offsets.append([p, q]) + + return ( + indices_format, + torch.tensor(c_indices, dtype=indices_dtype, device=device), + torch.tensor(pq_offsets, dtype=indices_dtype, device=device), + ) + + else: + raise ValueError( + f"Invalid {indices_format=}. Expected bsr_strided_mm_compressed|bsr_strided_mm|scatter_mm" + ) + + +def bsr_scatter_mm_indices_data( + bsr, other, indices_format="bsr_strided_mm_compressed", **meta_input +): + """Computes indices data for :func:`scatter_mm` used in BSR and + strided tensor matrix multiplication. + """ + assert bsr.dense_dim() == 0 + assert bsr.ndim == 2 # no batch dims + blocksize = bsr.values().shape[-2:] + M, K = bsr.shape + Ms, Ks = blocksize + K_, N = other.shape[-2:] + assert K_ == K + nbatches = other.shape[:-2].numel() + + meta = scatter_mm_meta(M, K, N, Ms, Ks, **meta_input) + if "allow_tf32" not in meta_input: + meta.update(allow_tf32=bsr.dtype in {torch.float16, torch.bfloat16}) + SPLIT_N = meta["SPLIT_N"] + indices_data = _bsr_scatter_mm_indices_data( + indices_format, M, K, N, Ms, Ks, nbatches, SPLIT_N, TensorAsKey(bsr) + ) + + if indices_format == "bsr_strided_mm_compressed": + meta.update(is_compressed=True) + return indices_data + (meta,) + elif indices_format == "bsr_strided_mm": + meta.update(is_compressed=False) + return indices_data + (meta,) + else: + return indices_data + + +def bsr_scatter_mm(bsr, other, indices_data=None, out=None): + """BSR @ strided -> strided""" + + assert bsr.ndim == 2 + assert other.ndim >= 2 + + Ms, Ks, Ns = bsr.shape[-2], bsr.shape[-1], other.shape[-1] + blocksize = bsr.values().shape[-2:] + + if indices_data is None: + indices_data = bsr_scatter_mm_indices_data( + bsr, other, indices_format="bsr_strided_mm_compressed" + ) + + indices_format = indices_data[0] + + if out is None: + out = torch.empty( + (*other.shape[:-2], Ms, Ns), dtype=bsr.dtype, device=bsr.device + ) + out_shape = out.shape + out = as1Dbatch(out) + + if bsr._nnz() == 0: + out.zero_() + elif indices_format in {"bsr_strided_mm_compressed", "bsr_strided_mm"}: + out.zero_() + scatter_mm(bsr.values(), other, indices_data, accumulators=out) + elif indices_format == "scatter_mm": + nbatches = other.shape[:-2].numel() + accumulators = torch.zeros( + ( + nbatches * Ms // blocksize[0] * Ns // blocksize[0], + blocksize[0], + blocksize[0], + ), + dtype=bsr.dtype, + device=bsr.device, + ) + others = ( + as1Dbatch(other) + .transpose(-2, -1) + .view( + nbatches, + Ns // blocksize[0], + blocksize[0], + Ks // blocksize[1], + blocksize[1], + ) + .movedim( + (3, 1, 4, 2), (1, 2, 3, 4) + ) # equivalent to .transpose(-3, -2).transpose(-2, -1).transpose(-4, -3) + .flatten(0, 2) + ) + scatter_mm(bsr.values(), others, indices_data, accumulators=accumulators) + out.copy_( + accumulators.unflatten( + 0, (nbatches, Ms // blocksize[0], Ns // blocksize[0]) + ) + .movedim( + (1, 2, 3, 4), (3, 1, 4, 2) + ) # equivalent to .transpose(-4, -3).transpose(-2, -1).transpose(-3, -2) + .reshape(nbatches, Ns, Ms) + .transpose(-2, -1) + ) + else: + raise NotImplementedError(indices_format) + + return out.view(out_shape) + + +def _int_bsr_dense_addmm( + input: torch.Tensor, + bsr: torch.Tensor, + dense: torch.Tensor, + *, + beta=1, + alpha=1, + left_alpha: Optional[torch.Tensor] = None, + right_alpha: Optional[torch.Tensor] = None, + 
out: Optional[torch.Tensor] = None, + skip_checks: bool = False, + max_grid: Optional[tuple[Optional[int], Optional[int], Optional[int]]] = None, + meta: Optional[dict] = None, +): + if out is None and dense.dtype is torch.int8: + f_name = "_int_bsr_dense_addmm" + crow_indices = bsr.crow_indices() + batch_ndim = crow_indices.dim() - 1 + M = bsr.shape[batch_ndim] + N = dense.shape[-1] + original_batch_dims_broadcasted = broadcast_batch_dims(f_name, bsr, dense) + out = torch.empty( + original_batch_dims_broadcasted + (M, N), + dtype=torch.int32, + device=dense.device, + ) + return bsr_dense_addmm( + input, + bsr, + dense, + beta=beta, + alpha=alpha, + left_alpha=left_alpha, + right_alpha=right_alpha, + out=out, + skip_checks=skip_checks, + max_grid=max_grid, + meta=meta, + ) + + +def bsr_dense_addmm( + input: torch.Tensor, + bsr: torch.Tensor, + dense: torch.Tensor, + *, + beta=1, + alpha=1, + left_alpha: Optional[torch.Tensor] = None, + right_alpha: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + skip_checks: bool = False, + max_grid: Optional[tuple[Optional[int], Optional[int], Optional[int]]] = None, + meta: Optional[dict] = None, +): + """Compute + + out = beta * input + left_alpha.reshape(-1, 1) * (alpha * (bsr @ dense)) * right_alpha.reshape(1, -1) + + where left_alpha, right_alpha are (* + 1)-D tensors when + specified, otherwise, these are treated as tensors filled with + ones. + """ + f_name = "bsr_dense_addmm" + values = bsr.values() + crow_indices = bsr.crow_indices() + col_indices = bsr.col_indices() + batch_ndim = crow_indices.dim() - 1 + M, K = bsr.shape[batch_ndim : batch_ndim + 2] + blocksize = values.shape[batch_ndim + 1 : batch_ndim + 3] + N = dense.shape[-1] + + # todo: implement checks + + original_batch_dims_broadcasted = broadcast_batch_dims(f_name, bsr, dense) + if out is None: + out = dense.new_empty(original_batch_dims_broadcasted + (M, N)) + + if bsr._nnz() == 0 or alpha == 0 or N == 0 or M == 0 or K == 0: + if beta == 0: + out.zero_() + else: + out.copy_(input) + if beta != 1: + out.mul_(beta) + return out + + left_alpha_is_one = False + right_alpha_is_one = False + if left_alpha is None: + left_alpha_is_one = True + left_alpha = dense.new_empty(()).expand( + *original_batch_dims_broadcasted, M, N + ) # not referenced + else: + left_alpha = left_alpha.view(*original_batch_dims_broadcasted, M, 1).expand( + *original_batch_dims_broadcasted, M, N + ) + + if right_alpha is None: + right_alpha_is_one = True + right_alpha = dense.new_empty(()).expand( + *original_batch_dims_broadcasted, M, N + ) # not referenced + else: + right_alpha = right_alpha.view(*original_batch_dims_broadcasted, 1, N).expand( + *original_batch_dims_broadcasted, M, N + ) + assert left_alpha.stride()[-1] == 0 + assert right_alpha.stride()[-2] == 0 + + if meta is None: + sparsity = round(1 - bsr._nnz() * blocksize[0] * blocksize[1] / (M * K), 2) + meta = bsr_dense_addmm_meta( + M, + K, + N, + blocksize[0], + blocksize[1], + beta, + alpha, + sparsity=sparsity, + dtype=dense.dtype, + out_dtype=out.dtype, + ) + out_backup = out + + ( + crow_indices, + col_indices, + values, + input, + dense, + left_alpha, + right_alpha, + out, + ) = prepare_inputs(bsr, input, dense, left_alpha, right_alpha, out) + + BM, BK = blocksize + SPLIT_N = meta.get("SPLIT_N", N // BM) + BN = N // SPLIT_N + + out_untiled = out + out = tile_to_blocksize(out, (BM, BN)) + dense = tile_to_blocksize(dense, (BK, BN)) + input = tile_to_blocksize(input, (BM, BN)) + left_alpha = tile_to_blocksize(left_alpha, (BM, 
BN)) + right_alpha = tile_to_blocksize(right_alpha, (BM, BN)) + + # tl.dot supports float16, float32, int32 as accumulator types. + dot_out_dtype = { + torch.float16: tl.float32, + torch.bfloat16: tl.float32, + torch.float32: tl.float64, + torch.float64: tl.float64, + torch.int8: tl.int32, + torch.int32: tl.int32, + }[out.dtype] + + n_batches = dense.size(0) + n_block_rows = crow_indices.size(-1) - 1 + n_block_cols = dense.size(-3) + + full_grid = (n_batches, n_block_cols, n_block_rows) + if max_grid is not None: + grid_blocks = tuple(max_grid[:3][::-1]) + (None,) * (3 - len(max_grid[:3])) + else: + grid_blocks = None + + tensor_dims_map = { + values: (0, None, None), + crow_indices: (0, None, -1), + col_indices: (0, None, None), + input: (0, -3, -4), + dense: (0, -3, None), + left_alpha: (0, -3, -4), + right_alpha: (0, -3, -4), + out: (0, -3, -4), + } + + assert alpha != 0 + + def kernel(grid, *sliced_tensors): + _bsr_strided_addmm_kernel[grid]( + *ptr_stride_extractor(*sliced_tensors), + beta, + alpha, + beta_is_one=beta == 1, + beta_is_nonzero=beta != 0, + alpha_is_one=alpha == 1, + left_alpha_is_one=left_alpha_is_one, + right_alpha_is_one=right_alpha_is_one, + BLOCKSIZE_ROW=BM, + BLOCKSIZE_INNER=BK, + BLOCKSIZE_COL=BN, + allow_tf32=dot_out_dtype == tl.float32, + acc_dtype=dot_out_dtype, + **meta, + ) + + launch_kernel(kernel, tensor_dims_map, full_grid, grid_blocks) + + if out.data_ptr() != out_backup.data_ptr(): + # prepare_inputs has made a copy of out, copy its content back + # to out_backup: + out_backup.copy_(out_untiled.view(out_backup.shape)) + + return out_backup + + +if has_triton(): + import triton + import triton.language as tl + + @triton.jit + def _sampled_addmm_kernel( + alpha, + beta, + IS_BETA_ZERO: tl.constexpr, + BLOCKSIZE_ROW: tl.constexpr, + BLOCKSIZE_COL: tl.constexpr, + k, + TILE_K: tl.constexpr, + values_ptr, + values_batch_stride, + values_nnz_stride, + values_row_block_stride, + values_col_block_stride, + crow_indices_ptr, + crow_indices_batch_stride, + crow_indices_stride, + col_indices_ptr, + col_indices_batch_stride, + col_indices_stride, + mat1_ptr, + mat1_batch_stride, + mat1_tiled_row_stride, + mat1_tiled_col_stride, + mat1_row_block_stride, + mat1_col_block_stride, + mat2_ptr, + mat2_batch_stride, + mat2_tiled_row_stride, + mat2_tiled_col_stride, + mat2_row_block_stride, + mat2_col_block_stride, + acc_dtype: tl.constexpr, + allow_tf32: tl.constexpr, + ): + batch_pid = tl.program_id(axis=1) + row_block_pid = tl.program_id(axis=0) + + crow_indices_offset_ptr = ( + crow_indices_ptr + + crow_indices_batch_stride * batch_pid + + crow_indices_stride * row_block_pid + ) + nnz_offset = tl.load(crow_indices_offset_ptr) + nnz_offset_next = tl.load(crow_indices_offset_ptr + crow_indices_stride) + + # Compute nnz for the row with number row_block_pid. + # If it is zero, skip the row. + row_nnz = nnz_offset_next - nnz_offset + if row_nnz == 0: + return + + row_block_arange = tl.arange(0, BLOCKSIZE_ROW) + col_block_arange = tl.arange(0, BLOCKSIZE_COL) + + # Pointers are set to the first block of the current row. + values_block_ptrs = ( + values_ptr + + values_batch_stride * batch_pid + + values_nnz_stride * nnz_offset + + values_row_block_stride * row_block_arange[:, None] + + values_col_block_stride * col_block_arange[None, :] + ) + + col_index_nnz_ptr = ( + col_indices_ptr + + col_indices_batch_stride * batch_pid + + col_indices_stride * nnz_offset + ) + + # Advance mat1 to the current tiled row, ignore columns. 
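+        # The columns (the K dimension) of mat1 are traversed below in
+        # TILE_K-sized chunks; positions past k in the last chunk are
+        # masked out of the loads.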
+ mat1_block_ptrs = ( + mat1_ptr + + mat1_batch_stride * batch_pid + + mat1_tiled_row_stride * row_block_pid + + mat1_row_block_stride * row_block_arange[:, None] + ) + + # Advance mat2 in batch and block col dimension. + mat2_block_ptrs = ( + mat2_ptr + + mat2_batch_stride * batch_pid + + mat2_col_block_stride * col_block_arange[None, :] + ) + + k_tile_arange = tl.arange(0, TILE_K) + for _ in range(row_nnz): + acc_block = tl.zeros((BLOCKSIZE_ROW, BLOCKSIZE_COL), dtype=acc_dtype) + + # find column block index + col_block = tl.load(col_index_nnz_ptr) + + for k_tile in range(0, k, TILE_K): + k_offsets = k_tile + k_tile_arange + mask_k = k_offsets < k + + mat1_block = tl.load( + mat1_block_ptrs + mat1_col_block_stride * k_offsets[None, :], + mask=mask_k[None, :], + other=0.0, + ) + + mat2_block = tl.load( + mat2_block_ptrs + + mat2_tiled_col_stride * col_block + + mat2_row_block_stride * k_offsets[:, None], + mask=mask_k[:, None], + other=0.0, + ) + + acc_block += tl.dot( + mat1_block, mat2_block, allow_tf32=allow_tf32, out_dtype=acc_dtype + ) + + if IS_BETA_ZERO: + acc_block *= alpha + else: + acc_block = alpha * acc_block + beta * tl.load(values_block_ptrs) + + # write result + tl.store(values_block_ptrs, acc_block.to(values_ptr.dtype.element_ty)) + + # advance val/col_index ptrs to the next block in the row. + values_block_ptrs += values_nnz_stride + col_index_nnz_ptr += col_indices_stride + + @triton.jit + def _bsr_strided_dense_rowspace_kernel( + # values prologue + values_ptr, + values_batch_stride, + values_nnz_stride, + values_row_block_stride, + values_col_block_stride, + # values epilogue + # crow_indices prologue + crow_indices_ptr, + crow_indices_batch_stride, + crow_indices_stride, + # crow_indices epilogue + # col_indices prologue + col_indices_ptr, + col_indices_batch_stride, + col_indices_stride, + # col_indices epilogue + # dense prologue + dense_ptr, + dense_batch_stride, + dense_tiled_row_stride, + dense_tiled_col_stride, + dense_row_block_stride, + dense_col_block_stride, + # dense epilogue + # output prologue + output_ptr, + output_batch_stride, + output_tiled_row_stride, + output_tiled_col_stride, + output_row_block_stride, + output_col_block_stride, + # output epilogue + # + # gh-113754: Always keep all constexpr arguments at the end of + # triton kernel arguments list because with triton 2.1 or + # earlier non-contiguous outputs will corrupt CUDA state due + # to a triton bug (fixed in openai/triton#2262). + BLOCKSIZE_ROW: tl.constexpr, + BLOCKSIZE_COL: tl.constexpr, + acc_dtype: tl.constexpr, + allow_tf32: tl.constexpr, + GROUP_SIZE_ROW: tl.constexpr, + ): + batch_pid = tl.program_id(axis=2) + row_block_pid = tl.program_id(axis=0) + col_block_pid = tl.program_id(axis=1) + n_block_rows = tl.num_programs(axis=0) + n_block_cols = tl.num_programs(axis=1) + + row_block_pid, col_block_pid = tl.swizzle2d( + row_block_pid, col_block_pid, n_block_rows, n_block_cols, GROUP_SIZE_ROW + ) + + crow_indices_offset_ptr = ( + crow_indices_ptr + + crow_indices_batch_stride * batch_pid + + crow_indices_stride * row_block_pid + ) + nnz_offset = tl.load(crow_indices_offset_ptr) + nnz_offset_next = tl.load(crow_indices_offset_ptr + crow_indices_stride) + + # Compute nnz for the row with number row_block_pid. + # If it is zero, skip the row. + row_nnz = nnz_offset_next - nnz_offset + if row_nnz == 0: + return + + row_block_arange = tl.arange(0, BLOCKSIZE_ROW) + col_block_arange = tl.arange(0, BLOCKSIZE_COL) + + # Pointers are set to the first block of the current row. 
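+        # values_block_ptrs addresses a (BLOCKSIZE_ROW, BLOCKSIZE_COL) block
+        # of the BSR values tensor and is advanced block by block along the
+        # current row of blocks in the loop further below.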
+ values_block_ptrs = ( + values_ptr + + values_batch_stride * batch_pid + + values_nnz_stride * nnz_offset + + values_row_block_stride * row_block_arange[:, None] + + values_col_block_stride * col_block_arange[None, :] + ) + + # NOTE: dense is advanced into all dimensions but the tiled row one. + # That will be advanced in the loop according to values in col_indices. + dense_block_ptrs = ( + dense_ptr + + dense_batch_stride * batch_pid + + dense_tiled_col_stride * col_block_pid + + dense_row_block_stride * col_block_arange[:, None] + + dense_col_block_stride * row_block_arange[None, :] + ) + + # Pointers are set to exact write-to locations + output_ptrs = ( + output_ptr + + output_batch_stride * batch_pid + + output_tiled_row_stride * row_block_pid + + output_tiled_col_stride * col_block_pid + + output_row_block_stride * row_block_arange[:, None] + + output_col_block_stride * row_block_arange[None, :] + ) + + # Set pointer to the first nonzero element in the current row + col_index_nnz_ptr = ( + col_indices_ptr + + col_indices_batch_stride * batch_pid + + col_indices_stride * nnz_offset + ) + + output_acc_block = tl.zeros((BLOCKSIZE_ROW, BLOCKSIZE_COL), dtype=acc_dtype) + for _ in range(row_nnz): + values_block = tl.load(values_block_ptrs) + + # find which row of dense needs to get loaded + # for multiplication with values_block. + dense_row_idx = tl.load(col_index_nnz_ptr) + dense_block = tl.load( + dense_block_ptrs + dense_tiled_row_stride * dense_row_idx + ) + + # do block mm + output_acc_block += tl.dot( + values_block, dense_block, allow_tf32=allow_tf32, out_dtype=acc_dtype + ) + + # move val/col_index ptrs to the next block in the row + values_block_ptrs += values_nnz_stride + col_index_nnz_ptr += col_indices_stride + + # write back the result + tl.store(output_ptrs, output_acc_block.to(output_ptr.dtype.element_ty)) + + def _run_sampled_addmm_kernel( + alpha, + beta, + is_beta_zero, + blocksize, + k, + tile_k, + values, + crow_indices, + col_indices, + mat1, + mat2, + max_grid, + ): + n_batches = values.size(0) + n_block_rows = crow_indices.size(-1) - 1 + + full_grid = (n_batches, n_block_rows) + if max_grid is not None: + grid_blocks = tuple(max_grid[:2][::-1]) + (None,) * (2 - len(max_grid[:2])) + else: + grid_blocks = None + tensor_dims_map = { + values: (0, None), + crow_indices: (0, -1), + col_indices: (0, None), + mat1: (0, -4), + mat2: (0, None), + } + if values.dtype in (torch.half, torch.bfloat16): + acc_dtype = tl.float32 + allow_tf32 = True + else: + acc_dtype = tl.float64 + allow_tf32 = False + + def kernel(grid, *sliced_tensors): + _sampled_addmm_kernel[grid]( + alpha, + beta, + is_beta_zero, + *blocksize, + k, + tile_k, + *ptr_stride_extractor(*sliced_tensors), + acc_dtype=acc_dtype, + allow_tf32=allow_tf32, + num_stages=1, + num_warps=4, + ) + + launch_kernel(kernel, tensor_dims_map, full_grid, grid_blocks) + + def sampled_addmm( + input: torch.Tensor, + mat1: torch.Tensor, + mat2: torch.Tensor, + *, + beta=1.0, + alpha=1.0, + out: Optional[torch.Tensor] = None, + skip_checks: bool = False, + max_grid: Optional[tuple[Optional[int], Optional[int], Optional[int]]] = None, + ): + f_name = "sampled_addmm" + + check_bsr_layout(f_name, input) + input_broadcasted = broadcast_batch_dims_bsr(f_name, input, mat1, mat2) + + if not skip_checks: + check_device(f_name, mat1, input.device) + check_device(f_name, mat2, input.device) + if beta != 0.0 and input.dtype is torch.bool: + check( + False, + f"{f_name}(): having beta == {beta} not equal to 0.0 with boolean mask is not 
allowed.", + ) + if input.dtype is not torch.bool: + check_dtype(f_name, mat1, input.dtype) + check_dtype(f_name, mat2, input.dtype) + else: + check_dtype(f_name, mat1, mat2.dtype) + check_mm_compatible_shapes(f_name, mat1, mat2) + if out is not None: + check_bsr_layout(f_name, out) + check_device(f_name, out, mat1.device) + check_dtype(f_name, out, input.dtype) + check( + out.shape == input_broadcasted.shape and out._nnz() == input._nnz(), + f"{f_name}(): Expects `out` to be of shape {input_broadcasted.shape} " + f"and with nnz equal to {input_broadcasted._nnz()} " + f"but got out.shape = {out.shape} and out.nnz = {out._nnz()}", + ) + + if out is None: + out = input_broadcasted.to(mat1.dtype, copy=True) + else: + out.copy_(input_broadcasted) + + if out.numel() == 0 or out._nnz() == 0: + return out + + blocksize = out.values().shape[-2:] + k = mat1.size(-1) + + # NOTE: (m, 0) @ (0, n) == zeros(m, n) + if alpha == 0.0 or k == 0: + out.values().mul_(beta) + return out + + # prepare inputs by reshaping them to be kernel-compatible + out_backup = out + crow_indices, col_indices, values, mat1, mat2 = prepare_inputs(out, mat1, mat2) + + mat1 = tile_to_blocksize(mat1, (blocksize[0], k)) + mat2 = tile_to_blocksize(mat2, (k, blocksize[1])) + tile_k = max(*blocksize) + + _run_sampled_addmm_kernel( + alpha, + beta, + beta == 0.0, + blocksize, + k, + tile_k, + values, + crow_indices, + col_indices, + mat1, + mat2, + max_grid, + ) + + # If nnz x block strides are not the same in out_backup.values and values, + # it means that out_backup.values and values are not the views of each other, + # so we have to copy. + if out_backup.values().stride()[-3:] != values.stride()[-3:]: + out_backup.values().copy_(values.reshape(out_backup.values().shape)) + return out_backup + + def bsr_dense_mm( + bsr: torch.Tensor, + dense: torch.Tensor, + *, + out: Optional[torch.Tensor] = None, + skip_checks: bool = False, + max_grid: Optional[tuple[Optional[int], Optional[int], Optional[int]]] = None, + meta: Optional[dict] = None, + ): + f_name = "bsr_dense_mm" + m, _kl = bsr.shape[-2:] + if not skip_checks: + check_bsr_layout(f_name, bsr) + check_device(f_name, bsr, dense.device) + check_dtype(f_name, bsr, dense.dtype, (torch.int8,)) + check_mm_compatible_shapes(f_name, bsr, dense) + + n = dense.size(-1) + row_block, col_block = bsr.values().shape[-2:] + check_blocksize(f_name, (row_block, col_block)) + check( + not n % 16, + f"{f_name}(): dense.size(-1) == {n} should be divisible by 16", + ) + else: + _kr, n = dense.shape[-2:] + + original_batch_dims_broadcasted = broadcast_batch_dims(f_name, bsr, dense) + + if out is not None and not skip_checks: + expected_out_shape = original_batch_dims_broadcasted + (m, n) + check( + out.shape == expected_out_shape, + "bsr_dense_mm(): `out` argument has wrong shape, " + f"expected {expected_out_shape}, but got {out.shape}.", + ) + check( + out.is_contiguous() or out.transpose(-2, -1).is_contiguous(), + "bsr_dense_mm(): only row-major/col-major `out` arguments are supported, " + "i.e. 
(out.is_contiguous() or out.transpose(-2, -1).is_contiguous()) " + "should be True.", + ) + + # Allocate out + if out is None: + out = dense.new_empty(original_batch_dims_broadcasted + (m, n)) + + # Short circuit if lhs is zero + if bsr._nnz() == 0: + return out.zero_() + + # with beta==0, addmm ignores input content, so we can use out + # as a placeholder for input because their shapes match: + return bsr_dense_addmm(out, bsr, dense, alpha=1, beta=0, out=out) + + @triton.jit + def _bsr_softmax_kernel( + crow_indices_ptr, + crow_indices_batch_stride, + crow_indices_stride, + values_ptr, + values_batch_stride, + values_row_block_stride, + values_nnz_col_block_stride, + row_block, + col_block, + MAX_ROW_NNZ: tl.constexpr, + TILE: tl.constexpr, + ): + batch_pid = tl.program_id(axis=2) + row_block_offset_pid = tl.program_id(axis=1) + row_block_pid = tl.program_id(axis=0) + + crow_indices_offset_ptr = ( + crow_indices_ptr + + crow_indices_batch_stride * batch_pid + + crow_indices_stride * row_block_pid + ) + nnz_offset = tl.load(crow_indices_offset_ptr) + nnz_offset_next = tl.load(crow_indices_offset_ptr + crow_indices_stride) + + # Compute nnz for the row with number row_block_pid. + # If it is zero, skip the row. + row_nnz = nnz_offset_next - nnz_offset + if row_nnz == 0: + return + + row_arange = tl.arange(0, TILE) + mask = row_arange < row_nnz * col_block + + curr_row_values_ptrs = ( + values_ptr + + values_batch_stride * batch_pid + + values_row_block_stride * row_block_offset_pid + + nnz_offset * col_block + ) + + # find max in the row + row_tile = tl.load( + curr_row_values_ptrs + row_arange, mask=mask, other=-float("inf") + ).to(tl.float32) + max_row_value = tl.max(row_tile, axis=0) + for _ in range(TILE, MAX_ROW_NNZ, TILE): + row_arange += TILE + mask = row_arange < row_nnz * col_block + row_tile = tl.load( + curr_row_values_ptrs + row_arange, mask=mask, other=-float("inf") + ).to(tl.float32) + curr_max_row_value = tl.max(row_tile, axis=0) + max_row_value = tl.where( + max_row_value > curr_max_row_value, max_row_value, curr_max_row_value + ) + + # find denominator for stable softmax + num = tl.exp(row_tile - max_row_value) + denom = tl.sum(num, axis=0) + for _ in range(TILE, MAX_ROW_NNZ, TILE): + row_arange -= TILE + mask = row_arange < row_nnz * col_block + row_tile = tl.load( + curr_row_values_ptrs + row_arange, mask=mask, other=-float("inf") + ).to(tl.float32) + num = tl.exp(row_tile - max_row_value) + denom += tl.sum(num, axis=0) + + # populate output + tl.store( + curr_row_values_ptrs + row_arange, + (num / denom).to(values_ptr.dtype.element_ty), + mask=mask, + ) + for _ in range(TILE, MAX_ROW_NNZ, TILE): + row_arange += TILE + mask = row_arange < row_nnz * col_block + row_tile = tl.load( + curr_row_values_ptrs + row_arange, mask=mask, other=-float("inf") + ).to(tl.float32) + num = tl.exp(row_tile - max_row_value) + tl.store( + curr_row_values_ptrs + row_arange, + (num / denom).to(values_ptr.dtype.element_ty), + mask=mask, + ) + + def bsr_softmax(input, max_row_nnz=None): + f_name = "bsr_softmax" + + check_bsr_layout(f_name, input) + check_dtype(f_name, input, input.dtype) + + if input._nnz() == 0 or input.numel() == 0: + return input.clone() + + m, n = input.shape[-2:] + nnz = input._nnz() + row_block, col_block = input.values().shape[-2:] + + if max_row_nnz is None: + max_row_nnz = triton.next_power_of_2(n) + else: + max_row_nnz = triton.next_power_of_2(max_row_nnz) + + crow_indices = input.crow_indices().unsqueeze(0).flatten(0, -2) + # reshape values from + # (b1, ..., bn, nnz, 
row_block, col_block) to + # (b1 * ... * bn, row_block, nnz * col_block). + # This simplifies batch dim manipulation and unlocks + # the possibility to access all nnzs in any given row. + if input.values().transpose(-3, -2).is_contiguous(): + # Need to clone to avoid `contiguous` returning a view. + values = input.values().clone() + else: + values = input.values() + values = ( + values.transpose(-3, -2) + .contiguous() + .unsqueeze(0) + .flatten(0, -4) + .reshape(-1, row_block, nnz * col_block) + ) + full_grid = (values.shape[0], row_block, m // row_block) + grid_blocks = None + tensor_dims_map = { + # We span nnz number of blocks, not nnz + 1, + # hence crow_indices[..., :-1] + crow_indices[..., :-1]: (0, None, -1), + values: (0, None, None), + } + + def kernel(grid, *sliced_tensors): + _bsr_softmax_kernel[grid]( + *ptr_stride_extractor(*sliced_tensors), + row_block, + col_block, + max_row_nnz, + # Triton's max numel is bounded by 2 ** 17. + min(2**17, max_row_nnz), + ) + + launch_kernel(kernel, tensor_dims_map, full_grid, grid_blocks) + + values = ( + values.reshape(-1, row_block, nnz, col_block) + .transpose(-3, -2) + .reshape(*input.values().shape) + ) + + return torch.sparse_compressed_tensor( + input.crow_indices().clone(), + input.col_indices().clone(), + values, + size=input.shape, + layout=input.layout, + ) + + def _scaled_dot_product_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor], + dropout_p: float = 0.0, + is_causal: bool = False, + scale: Optional[float] = None, + ): + f_name = "_scaled_dot_product_attention" + check(not is_causal, f"{f_name}(): is_causal == True is not supported.") + check(attn_mask is not None, f"{f_name}(): attn_mask == None is not supported.") + assert attn_mask is not None + + check( + attn_mask.layout == torch.sparse_bsr, + f"{f_name}(): " + f"attn_mask.layout must be {torch.sparse_bsr}, but got " + f"attn_mask.layout == {attn_mask.layout}.", + ) + + check_device(f_name, key, query.device) + check_device(f_name, value, query.device) + check_device(f_name, attn_mask, query.device) + + check_dtype(f_name, key, query.dtype) + check_dtype(f_name, value, query.dtype) + if attn_mask.dtype is not torch.bool: + check_dtype(f_name, attn_mask, query.dtype) + + sdpa = sampled_addmm( + attn_mask, query, key.transpose(-2, -1), beta=0.0, skip_checks=False + ) + if scale is None and query.size(-1) == 0 or scale == 0.0: + check( + False, + f"{f_name}(): current value of scale == {scale} " + "results in division by zero.", + ) + scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale + sdpa.values().mul_(scale_factor) + sdpa = bsr_softmax(sdpa) + torch.nn.functional.dropout(sdpa.values(), p=dropout_p, inplace=True) + sdpa = bsr_dense_mm(sdpa, value) + return sdpa + + @triton.jit + def _scatter_mm2_kernel( + M: tl.constexpr, + K: tl.constexpr, + N: tl.constexpr, + blocks_ptr, + blocks_stride_P, + blocks_stride_M, + blocks_stride_K, + others_ptr, + others_stride_Q, + others_stride_K, + others_stride_N, + accumulators_ptr, + accumulators_stride_R, + accumulators_stride_M, + accumulators_stride_N, + pq_offsets_ptr, + pq_offsets_stride, + pq_ptr, + pq_stride_T, + pq_stride_1, + dot_out_dtype: tl.constexpr, + TILE_M: tl.constexpr, + TILE_N: tl.constexpr, + allow_tf32: tl.constexpr, + ): + Ms = M // TILE_M + + pid_t = tl.program_id(axis=0) + + pid = tl.program_id(axis=1) + pid_m = pid // Ms + pid_n = pid % Ms + + rm = pid_m * TILE_M + tl.arange(0, TILE_M) + rn = pid_n * TILE_N + tl.arange(0, 
TILE_N) + rk = tl.arange(0, K) + + A_ptr = blocks_ptr + ( + rm[:, None] * blocks_stride_M + rk[None, :] * blocks_stride_K + ) + B_ptr = others_ptr + ( + rk[:, None] * others_stride_K + rn[None, :] * others_stride_N + ) + + g0 = tl.load(pq_offsets_ptr + pid_t * pq_offsets_stride) + g1 = tl.load(pq_offsets_ptr + (pid_t + 1) * pq_offsets_stride) + + if g0 == g1: + return + + acc_block = tl.zeros((TILE_M, TILE_N), dtype=dot_out_dtype) + + for i in range(g0, g1): + p = tl.load(pq_ptr + i * pq_stride_T) + q = tl.load(pq_ptr + i * pq_stride_T + pq_stride_1) + A = tl.load(A_ptr + p * blocks_stride_P) + B = tl.load(B_ptr + q * others_stride_Q) + acc_block += tl.dot(A, B, out_dtype=dot_out_dtype, allow_tf32=allow_tf32) + + C_ptr = ( + accumulators_ptr + + pid_t * accumulators_stride_R + + ( + rm[:, None] * accumulators_stride_M + + rn[None, :] * accumulators_stride_N + ) + ) + tl.store(C_ptr, acc_block.to(accumulators_ptr.dtype.element_ty)) + + def _scatter_mm2( + blocks: torch.Tensor, + others: torch.Tensor, + pq_offsets: torch.Tensor, + pq_indices: torch.Tensor, + accumulators: torch.Tensor, + ): + _P, M, K = blocks.shape + _Q, _, N = others.shape + + meta = dict( + TILE_M=max(16, M // 4), TILE_N=max(16, N // 4), num_stages=1, num_warps=2 + ) + + def grid(META): + return ( + pq_offsets.shape[0] - 1, + triton.cdiv(M, META["TILE_M"]) * triton.cdiv(N, META["TILE_N"]), + 1, + ) + + dot_out_dtype = { + torch.float16: tl.float32, + torch.bfloat16: tl.float32, + torch.float32: tl.float64, + torch.float64: tl.float64, + }[accumulators.dtype] + if "allow_tf32" not in meta: + meta.update(allow_tf32=dot_out_dtype == tl.float32) + _scatter_mm2_kernel[grid]( + M, + K, + N, + blocks, + blocks.stride(0), + blocks.stride(1), + blocks.stride(2), + others, + others.stride(0), + others.stride(1), + others.stride(2), + accumulators, + accumulators.stride(0), + accumulators.stride(1), + accumulators.stride(2), + pq_offsets, + pq_offsets.stride(0), + pq_indices, + pq_indices.stride(0), + pq_indices.stride(1), + dot_out_dtype=dot_out_dtype, + **meta, + ) + + @triton.jit + def _scatter_mm6_kernel( + nbatches, + Ms, + Ks: tl.constexpr, + N, + blocks_ptr, + blocks_stride_P, + blocks_stride_M, + blocks_stride_K, + others_ptr, + others_stride_B, + others_stride_K, + others_stride_N, + accumulators_ptr, + accumulators_stride_B, + accumulators_stride_M, + accumulators_stride_N, + c_indices_ptr, + r_offsets_ptr, + p_offsets_ptr, + q_offsets_ptr, + is_compressed: tl.constexpr, + dot_out_dtype: tl.constexpr, + SPLIT_N: tl.constexpr, + TILE_M: tl.constexpr, + TILE_N: tl.constexpr, + GROUP_SIZE: tl.constexpr, + allow_tf32: tl.constexpr, + ): + Ns = N // SPLIT_N + BLOCKS_M = Ms // TILE_M + BLOCKS_N = Ns // TILE_N + + pid_t_ = tl.program_id(axis=0) + pid = tl.program_id(axis=1) + pid_b = pid_t_ % nbatches + pid_t = pid_t_ // nbatches + + num_pid_in_group = GROUP_SIZE * BLOCKS_N + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE + group_size_m = min(BLOCKS_M - first_pid_m, GROUP_SIZE) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + rm = pid_m * TILE_M + tl.arange(0, TILE_M) + rn = pid_n * TILE_N + tl.arange(0, TILE_N) + rk = tl.arange(0, Ks) + A_ptr = blocks_ptr + ( + rm[:, None] * blocks_stride_M + rk[None, :] * blocks_stride_K + ) + B_ptr = ( + others_ptr + + pid_b * others_stride_B + + (rk[:, None] * others_stride_K + rn[None, :] * others_stride_N) + ) + + # When is_compressed is True, r is the only variable that + # depends on pid_t. 
This property allows sorting r values + # before calling the kernel. The sorting of r is equivalent to + # defining swizzle operator outside of the kernel. + r = tl.load(r_offsets_ptr + pid_t) + + if is_compressed: + m = (r // N) // Ms + n = (r % N) // Ns + r0 = tl.load(c_indices_ptr + m) + r1 = tl.load(c_indices_ptr + m + 1) + g0 = n * r1 + (SPLIT_N - n) * r0 + nnz = r1 - r0 + else: + g0 = tl.load(c_indices_ptr + pid_t) + g1 = tl.load(c_indices_ptr + pid_t + 1) + nnz = g1 - g0 + + q_ptr = q_offsets_ptr + g0 + acc_block = tl.zeros((TILE_M, TILE_N), dtype=dot_out_dtype) + + if is_compressed: + A_ptr += r0 * blocks_stride_P # type: ignore[possibly-undefined] + for _ in range(nnz): + q = tl.load(q_ptr) + B = tl.load(B_ptr + q) + A = tl.load(A_ptr) + acc_block += tl.dot( + A, B, out_dtype=dot_out_dtype, allow_tf32=allow_tf32 + ) + A_ptr += blocks_stride_P + q_ptr += 1 + else: + p_ptr = p_offsets_ptr + g0 + for _ in range(nnz): + q = tl.load(q_ptr) + B = tl.load(B_ptr + q) + p = tl.load(p_ptr) + A = tl.load(A_ptr + p * blocks_stride_P) + p_ptr += 1 + q_ptr += 1 + acc_block += tl.dot( + A, B, out_dtype=dot_out_dtype, allow_tf32=allow_tf32 + ) + + C_ptr = ( + accumulators_ptr + + r + + pid_b * accumulators_stride_B + + ( + rm[:, None] * accumulators_stride_M + + rn[None, :] * accumulators_stride_N + ) + ) + tl.store(C_ptr, acc_block.to(accumulators_ptr.dtype.element_ty)) + + def _scatter_mm6( + blocks: torch.Tensor, + others: torch.Tensor, + c_indices: torch.Tensor, + r_offsets: torch.Tensor, + p_offsets: torch.Tensor, + q_offsets: torch.Tensor, + meta: dict, + accumulators: torch.Tensor, + force_contiguous: bool = True, + ): + SPLIT_N = meta["SPLIT_N"] + _P, Ms, Ks = blocks.shape + B, _K, N = others.shape + B_, _M, N_ = accumulators.shape + assert N_ == N + Ns = N // SPLIT_N + assert B_ == B + + def grid(META): + return ( + r_offsets.shape[0] * B, + triton.cdiv(Ms, META["TILE_M"]) * triton.cdiv(Ns, META["TILE_N"]), + ) + + dot_out_dtype = { + torch.float16: tl.float32, + torch.bfloat16: tl.float32, + torch.float32: tl.float64, + torch.float64: tl.float64, + }[accumulators.dtype] + if "allow_tf32" not in meta: + meta.update(allow_tf32=dot_out_dtype == tl.float32) + + assert c_indices.stride(0) == 1 + assert r_offsets.stride(0) == 1 + assert p_offsets.stride(0) == 1 + assert q_offsets.stride(0) == 1 + + # Re non-contiguous tensor arguments. Sometimes triton kernel + # launches may fail with + # + # RuntimeError: Triton Error [CUDA]: an illegal memory access was encountered + # + # that appears to be case when the size of a non-contiguous + # tensor argument is larger than a certain threshold. Could + # this be related to shared memory or L1 cache size of a GPU + # card? In anycase, ensuring that tensor arguments are + # contiguous seems to avoid the above exception. So, in the + # following we'll always convert tensor arguments to + # C-contiguous tensors. 
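+        # When force_contiguous is set, a non-contiguous accumulators tensor
+        # is computed into a contiguous copy and the result is copied back
+        # after the kernel launch (see the final copy_ below).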
+ + if force_contiguous: + blocks = blocks.contiguous() + others = others.contiguous() + if not accumulators.is_contiguous(): + accumulators_ = accumulators.contiguous() + else: + accumulators_ = accumulators + else: + accumulators_ = accumulators + + _scatter_mm6_kernel[grid]( + B, + Ms, + Ks, + N, + blocks, + blocks.stride(0), + blocks.stride(1), + blocks.stride(2), + others, + others.stride(0), + others.stride(1), + others.stride(2), + accumulators_, + accumulators_.stride(0), + accumulators_.stride(1), + accumulators_.stride(2), + c_indices, + r_offsets, + p_offsets, + q_offsets, + dot_out_dtype=dot_out_dtype, + **meta, + ) + + if force_contiguous and not accumulators.is_contiguous(): + accumulators.copy_(accumulators_) + + def next_power_of_two(n): + assert n > 0 + return 2 ** (n.bit_length()) + + @triton.jit + def _bsr_strided_addmm_kernel( + # values prologue + values_ptr, + values_batch_stride, + values_nnz_stride, + values_row_block_stride, + values_col_block_stride, + # values epilogue + # crow_indices prologue + crow_indices_ptr, + crow_indices_batch_stride, + crow_indices_stride, + # crow_indices epilogue + # col_indices prologue + col_indices_ptr, + col_indices_batch_stride, + col_indices_stride, + # col_indices epilogue + # input prologue + input_ptr, + input_batch_stride, + input_tiled_row_stride, + input_tiled_col_stride, + input_row_block_stride, + input_col_block_stride, + # input epilogue + # dense prologue + dense_ptr, + dense_batch_stride, + dense_tiled_row_stride, + dense_tiled_col_stride, + dense_row_block_stride, + dense_col_block_stride, + # dense epilogue + # left_alpha prologue + left_alpha_ptr, + left_alpha_batch_stride, + left_alpha_tiled_row_stride, + left_alpha_tiled_col_stride: tl.constexpr, + left_alpha_row_block_stride, + left_alpha_col_block_stride: tl.constexpr, + # left_alpha epilogue + # right_alpha prologue + right_alpha_ptr, + right_alpha_batch_stride, + right_alpha_tiled_row_stride: tl.constexpr, + right_alpha_tiled_col_stride, + right_alpha_row_block_stride: tl.constexpr, + right_alpha_col_block_stride, + # right_alpha epilogue + # output prologue + output_ptr, + output_batch_stride, + output_tiled_row_stride, + output_tiled_col_stride, + output_row_block_stride, + output_col_block_stride, + # output epilogue + beta, + alpha, + beta_is_one: tl.constexpr, + beta_is_nonzero: tl.constexpr, + alpha_is_one: tl.constexpr, + left_alpha_is_one: tl.constexpr, + right_alpha_is_one: tl.constexpr, + BLOCKSIZE_ROW: tl.constexpr, + BLOCKSIZE_COL: tl.constexpr, + BLOCKSIZE_INNER: tl.constexpr, + acc_dtype: tl.constexpr, + allow_tf32: tl.constexpr, + GROUP_SIZE_ROW: tl.constexpr, + SPLIT_N: tl.constexpr, + ): + # left/right_alpha tensors are originally (* + 1)-dimensional + assert left_alpha_tiled_col_stride == 0 + assert left_alpha_col_block_stride == 0 + assert right_alpha_tiled_row_stride == 0 + assert right_alpha_row_block_stride == 0 + + batch_pid = tl.program_id(axis=2) + row_block_pid = tl.program_id(axis=0) + col_block_pid = tl.program_id(axis=1) + n_block_rows = tl.num_programs(axis=0) + n_block_cols = tl.num_programs(axis=1) + + row_block_pid, col_block_pid = tl.swizzle2d( + row_block_pid, col_block_pid, n_block_rows, n_block_cols, GROUP_SIZE_ROW + ) + + crow_indices_offset_ptr = ( + crow_indices_ptr + + crow_indices_batch_stride * batch_pid + + crow_indices_stride * row_block_pid + ) + nnz_offset = tl.load(crow_indices_offset_ptr) + nnz_offset_next = tl.load(crow_indices_offset_ptr + crow_indices_stride) + + # Compute nnz for the row with number 
row_block_pid. + row_nnz = nnz_offset_next - nnz_offset + + row_block_arange = tl.arange(0, BLOCKSIZE_ROW) + inner_block_arange = tl.arange(0, BLOCKSIZE_INNER) + + if BLOCKSIZE_COL < 16 or BLOCKSIZE_COL % 16 != 0: + PADDED_BLOCKSIZE_COL : tl.constexpr = 16 + else: + PADDED_BLOCKSIZE_COL: tl.constexpr = BLOCKSIZE_COL + + col_block_arange = tl.arange(0, PADDED_BLOCKSIZE_COL) + + # Pointers are set to the first block of the current row. + values_block_ptrs = ( + values_ptr + + values_batch_stride * batch_pid + + values_nnz_stride * nnz_offset + + values_row_block_stride * row_block_arange[:, None] + + values_col_block_stride * inner_block_arange[None, :] + ) + + # NOTE: dense is advanced into all dimensions but the tiled row one. + # That will be advanced in the loop according to values in col_indices. + dense_block_ptrs = ( + dense_ptr + + dense_batch_stride * batch_pid + + dense_tiled_col_stride * col_block_pid + + dense_row_block_stride * inner_block_arange[:, None] + + dense_col_block_stride * col_block_arange[None, :] + ) + + # Pointers are set to exact write-to locations + output_ptrs = ( + output_ptr + + output_batch_stride * batch_pid + + output_tiled_row_stride * row_block_pid + + output_tiled_col_stride * col_block_pid + + output_row_block_stride * row_block_arange[:, None] + + output_col_block_stride * col_block_arange[None, :] + ) + + # Set pointer to the first nonzero element in the current row + col_index_nnz_ptr = ( + col_indices_ptr + + col_indices_batch_stride * batch_pid + + col_indices_stride * nnz_offset + ) + + output_acc_block = tl.zeros((BLOCKSIZE_ROW, PADDED_BLOCKSIZE_COL), dtype=acc_dtype) + + for _ in range(row_nnz): + values_block = tl.load(values_block_ptrs) + + # find which row of dense needs to get loaded + # for multiplication with values_block. 
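+            # The dense tile load below is masked along the column dimension
+            # so that, when BLOCKSIZE_COL is padded up to PADDED_BLOCKSIZE_COL,
+            # only the first BLOCKSIZE_COL columns contribute to the product.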
+ dense_row_idx = tl.load(col_index_nnz_ptr) + offsets = tl.arange(0, 16)[None, :] + dense_block = tl.load( + dense_block_ptrs + dense_tiled_row_stride * dense_row_idx, + mask=offsets < BLOCKSIZE_COL, + ) + + # do block mm + output_acc_block += tl.dot( + values_block, dense_block, allow_tf32=allow_tf32, out_dtype=acc_dtype + ) + + # move val/col_index ptrs to the next block in the row + values_block_ptrs += values_nnz_stride + col_index_nnz_ptr += col_indices_stride + + if not alpha_is_one: + output_acc_block *= alpha + + if not left_alpha_is_one: + left_alpha_ptrs = ( + left_alpha_ptr + + left_alpha_batch_stride * batch_pid + + left_alpha_tiled_row_stride * row_block_pid + + left_alpha_tiled_col_stride * col_block_pid + + left_alpha_row_block_stride * row_block_arange[:, None] + + left_alpha_col_block_stride * col_block_arange[None, :] + ) + output_acc_block *= tl.load(left_alpha_ptrs) + + if not right_alpha_is_one: + right_alpha_ptrs = ( + right_alpha_ptr + + right_alpha_batch_stride * batch_pid + + right_alpha_tiled_row_stride * row_block_pid + + right_alpha_tiled_col_stride * col_block_pid + + right_alpha_row_block_stride * row_block_arange[:, None] + + right_alpha_col_block_stride * col_block_arange[None, :] + ) + output_acc_block *= tl.load(right_alpha_ptrs) + + if beta_is_nonzero: + input_ptrs = ( + input_ptr + + input_batch_stride * batch_pid + + input_tiled_row_stride * row_block_pid + + input_tiled_col_stride * col_block_pid + + input_row_block_stride * row_block_arange[:, None] + + input_col_block_stride * col_block_arange[None, :] + ) + if beta_is_one: + output_acc_block += tl.load(input_ptrs) + else: + output_acc_block += beta * tl.load(input_ptrs) + + # write back the result + tl.store(output_ptrs, output_acc_block.to(output_ptr.dtype.element_ty)) + +else: + bsr_softmax = None # type: ignore[assignment] + bsr_dense_mm = None # type: ignore[assignment] + sampled_addmm = None # type: ignore[assignment] + _scaled_dot_product_attention = None # type: ignore[assignment] + _scatter_mm2 = None # type: ignore[assignment] + _scatter_mm6 = None # type: ignore[assignment] + _bsr_strided_addmm_kernel = None # type: ignore[assignment] From 44985d21091d86ec0014d2d49f7f7696042d03df Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Thu, 23 Jan 2025 15:29:54 -0800 Subject: [PATCH 06/23] wip --- torchao/_models/llama/generate.py | 17 ++++++++ .../sparsity/superblock/_triton_ops_meta.py | 41 ++++++++----------- .../sparsity/superblock/benchmark.py | 3 +- .../sparsity/superblock/bsr_triton_ops.py | 35 +++++++++++++--- 4 files changed, 67 insertions(+), 29 deletions(-) diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py index a537ccd6d2..834fe0b4ee 100644 --- a/torchao/_models/llama/generate.py +++ b/torchao/_models/llama/generate.py @@ -23,6 +23,7 @@ from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, get_model_size_in_bytes torch.sparse.SparseSemiStructuredTensor._FORCE_CUTLASS = False +torch.backends.cuda.enable_cudnn_sdp(True) class HostEvent: @@ -812,6 +813,22 @@ def ffn_or_attn_only(mod, fqn): sparsify_(model, block_sparse_weight(blocksize=64), filter_fn=ffn_only) + + # from torchao.prototype.sparsity.superblock._triton_ops_meta import optimize_bsr_dense_addmm + # for M, K, N in [(14336, 4096, 8192), (4096, 14336, 8192)]: + # optimize_bsr_dense_addmm( + # M, + # K, + # N, + # 64, + # 64, + # beta=0, + # alpha=1, + # sparsity=0.9, + # dtype=torch.bfloat16, + # opname="bsr_dense_addmm", + # verbose=True, + # ) model_size = get_model_size_in_bytes(model, 
ignore_embeddings=True) / 1e9 diff --git a/torchao/prototype/sparsity/superblock/_triton_ops_meta.py b/torchao/prototype/sparsity/superblock/_triton_ops_meta.py index 08471ac058..17e3d98e5e 100644 --- a/torchao/prototype/sparsity/superblock/_triton_ops_meta.py +++ b/torchao/prototype/sparsity/superblock/_triton_ops_meta.py @@ -753,7 +753,7 @@ def optimize_bsr_dense_addmm( verbose=False, opname=None, ): - torch.manual_seed(0) + # torch.manual_seed(0) bsr = create_blocked_tensor( 0, m, k, (bm, bk), sparsity, dtype, device ).to_sparse_bsr((bm, bk)) @@ -802,6 +802,8 @@ def main(op="scatter_mm", force=False, dtype=torch.float16, verbose=True): ] sizes3_lst = [3 * sz for sz in [64, 128] + sizes_lst if sz <= 2048] shapes_lst = [(sz, sz) for sz in sizes_lst[:-4] + sizes3_lst] + + shapes_lst=[] shapes_lst.extend([(3072, 768), (768, 3072)]) shapes_lst.extend([(5120, 1280), (1280, 5120)]) if dtype is torch.int8: @@ -809,7 +811,7 @@ def main(op="scatter_mm", force=False, dtype=torch.float16, verbose=True): blocksize_lst = [(32, 32), (64, 64), (128, 128), (256, 256)] else: blocksize_lst = [(16, 16), (32, 32), (64, 64), (128, 128)] - sparsity_lst = [0.5, 0.7, 0.3][:1] + sparsity_lst = [0.9][:1] for sparsity in sparsity_lst: print(f"{op, dtype, sparsity=}") try: @@ -826,21 +828,6 @@ def main(op="scatter_mm", force=False, dtype=torch.float16, verbose=True): if M == K and N == 50432: continue print(f"{M, K, N, (BM, BK)=}") - for alpha, beta in [(1, 1), (1, 0)]: - optimize_bsr_dense_addmm( - M, - K, - N, - BM, - BK, - beta=beta, - alpha=alpha, - force=force, - sparsity=sparsity, - dtype=dtype, - verbose=verbose, - opname=op, - ) else: raise NotImplementedError(op) except KeyboardInterrupt: @@ -7748,9 +7735,17 @@ def test_func(): } if __name__ == "__main__": - for dtype in [torch.int8]: - for op in ["_int_bsr_dense_addmm"]: - main(op=op, force=False, dtype=dtype) - for dtype in [torch.float16, torch.bfloat16, torch.float32, torch.int8]: - for op in ["bsr_dense_addmm"]: - main(op=op, force=False, dtype=dtype) + for M, K, N in [(14336, 4096, 16), (4096, 14336, 16), (14336, 4096, 16), (4096, 14336, 16)]: + optimize_bsr_dense_addmm( + M, + K, + N, + 64, + 64, + beta=0, + alpha=1, + sparsity=0.9, + dtype=torch.bfloat16, + opname="bsr_dense_addmm", + verbose=True, + ) diff --git a/torchao/prototype/sparsity/superblock/benchmark.py b/torchao/prototype/sparsity/superblock/benchmark.py index b87834afae..9f6d70e8b0 100644 --- a/torchao/prototype/sparsity/superblock/benchmark.py +++ b/torchao/prototype/sparsity/superblock/benchmark.py @@ -65,7 +65,8 @@ def main(args): ).eval() # Fake sparsity necessary for BSR, since we find based on SuperBlock - sparsifier_or_none = simulate_sparsity(model, args) + # sparsifier_or_none = simulate_sparsity(model, args) + sparsifier_or_none = None if sparsifier_or_none is not None: sparsifier_or_none.squash_mask() diff --git a/torchao/prototype/sparsity/superblock/bsr_triton_ops.py b/torchao/prototype/sparsity/superblock/bsr_triton_ops.py index 7bd0483dd7..28100b2bb9 100644 --- a/torchao/prototype/sparsity/superblock/bsr_triton_ops.py +++ b/torchao/prototype/sparsity/superblock/bsr_triton_ops.py @@ -815,11 +815,36 @@ def bsr_dense_addmm_meta( else: # see [Computing optimal kernel parameters] in # _triton_ops_meta.py for ways to avoid this warning - # message - warn_once( - "bsr_dense_addmm uses non-optimal triton kernel parameters" - f" for {M=} {K=} {N=} {Ms=}, {Ks=} {beta=} {alpha=} {dtype=} {out_dtype=}" + # from ._triton_ops_meta import optimize_bsr_dense_addmm + # 
optimize_bsr_dense_addmm( + # M, + # K, + # 16, + # 64, + # 64, + # beta=beta, + # alpha=alpha, + # sparsity=sparsity, + # dtype=dtype, + # opname="bsr_dense_addmm", + # verbose=True, + # ) + # get padded key + padded_key = (M, K, 16, Ms, Ks, beta == 0, beta == 1, alpha == 1) + meta = get_meta( + "bsr_dense_addmm", + padded_key, + device_name, + version=(_version, version_dtype, sparsity), ) + # breakpoint() + # return meta + # message + # breakpoint() + # warn_once( + # "bsr_dense_addmm uses non-optimal triton kernel parameters" + # f" for {M=} {K=} {N=} {Ms=}, {Ks=} {beta=} {alpha=} {dtype=} {out_dtype=}" + # ) SPLIT_N = SPLIT_N or max(N // Ms, 1) GROUP_SIZE_ROW = GROUP_SIZE_ROW or 4 @@ -2474,7 +2499,7 @@ def _bsr_strided_addmm_kernel( # find which row of dense needs to get loaded # for multiplication with values_block. dense_row_idx = tl.load(col_index_nnz_ptr) - offsets = tl.arange(0, 16)[None, :] + offsets = tl.arange(0, PADDED_BLOCKSIZE_COL)[None, :] dense_block = tl.load( dense_block_ptrs + dense_tiled_row_stride * dense_row_idx, mask=offsets < BLOCKSIZE_COL, From b32bdfbc8e966962ec1cbdc364d6bb2fbc54e7c5 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Thu, 23 Jan 2025 16:00:49 -0800 Subject: [PATCH 07/23] wip --- .../sparsity/superblock/_triton_ops_meta.py | 7751 ----------------- .../sparsity/superblock/bsr_triton_ops.py | 1680 +--- .../superblock => sparsity}/blocksparse.py | 0 .../superblock => sparsity}/supermask.py | 0 4 files changed, 2 insertions(+), 9429 deletions(-) delete mode 100644 torchao/prototype/sparsity/superblock/_triton_ops_meta.py rename torchao/{prototype/sparsity/superblock => sparsity}/blocksparse.py (100%) rename torchao/{prototype/sparsity/superblock => sparsity}/supermask.py (100%) diff --git a/torchao/prototype/sparsity/superblock/_triton_ops_meta.py b/torchao/prototype/sparsity/superblock/_triton_ops_meta.py deleted file mode 100644 index 17e3d98e5e..0000000000 --- a/torchao/prototype/sparsity/superblock/_triton_ops_meta.py +++ /dev/null @@ -1,7751 +0,0 @@ -# mypy: allow-untyped-defs -"""Provides optimal triton kernel parameters. - -Aim ---- - -The usage of optimal triton kernel parameters may increase the -performance of operations several times. For example, for large tensor -shapes, the usage of a bsr tensor as mat1 argument in addmm-based -operations typically outperforms the corresponding operation with -strided-only inputs when the blocked representation of a tensor -provides a better alignement with memory access than what the strided -representation would provide. - -Pre-computed kernel parameters ------------------------------- - -This script finds and stores the optimal triton kernel parameters for -a specific set of shape configurations. For instance, the set of shape -configurations of the bsr_dense_addmm kernel is defined as - - input, out: M x N strided tensor - mat1: M x K bsr tensor with blocksize (BM, BK) and given sparsity - mat2: M x N strided tensor - dtype = float16, bfloat16, float32 - sparsity = 0.5 - M = 256, 512, ..., 16384 - K = M - N = 256, 512, ..., 131072 - BM = 16, 32, ..., 128 - BK = BM - alpha = 1 - beta = 0, 1 - GPUs: NVIDIA A100-SXM4-80GB - -Approximations --------------- - -It is practically infeasible to pre-compute optimal kernel parameter -for all possible shape configurations as well as for all existing -GPUs. 
Therefore, we'll assume that the pre-computed optimal parameters -are good enough approximations when -1) the used GPU is any of NVIDIA A100 Tensor Core GPUs, -2) the actual sparsity of mat1 is different from sparsity value 0.5. - -If a particular shape configuration does not fall in the set of -pre-computed kernel parameters, or it does not match with the listed -approximations above, or the used GPU device is not a NVIDIA A100 GPU, -then a reference set of triton kernel parameters will be used when -executing operations. The reference kernel parameters are defined in -torch/sparse/_triton_ops.py, see bsr_dense_addmm_meta function, for -instance. - -Computing optimal kernel parameters ------------------------------------ - -If the approximations listed above are unacceptable, e.g. when one -seeks a maximal performance possible, the optimal kernel parameters -for a particular GPU can be computed by simply running this script in -the pytorch developement tree:: - - cd /path/to/pytorch - python setup.py develop - python torch/sparse/_triton_ops_meta.py - -This will compute the optimal kernel parameters for the GPU device -available in the host system for all shape configurations listed in -"Pre-computed kernel parameters" above. The results will be stored in -the database of kernel parameters. Currently, this database is defined -as this module (see "BEGIN GENERATED DATA" comment below) that will be -modified when the script is run. Create a pytorch PR with the -corresponding modifications in this file to make the computed optimal -kernel parameters available for other users as pre-computed kernel -parameters. - -Moreover, one can compute the optimal kernel parameters for a specific -set of shape configurations and specific sparsity patterns. For that, -use tuning functions provided by this module: - - tune_bsr_dense_addmm(input, mat1, mat2, beta=1, alpha=1, out=None, verbose=False, store=False) -> meta - -The tuning functions return a dictionary of optimal kernel parameters -that can be passed to the corresponding operation, e.g. - - bsr_dense_addmm(..., meta=meta) - -Or, when store==True, the optimal kernel parameters will be stored in -the database of pre-computed kernel parameters in runtime so that all -addmm-based operations such as torch.addmm, torch.mm, -torch.nn.functional.linear will benefit from using the computed -optimal set of kernel parameters. - -Note that running tune_bsr_dense_addmm can take several minutes. So, -use it wisely, e.g. by implementing persisten storage of optimized -kernel parameters. See the source code of get_meta and -tune_bsr_dense_addmm to learn how to register a custom set of optimal -kernel parameters for addmm-based operations. - -""" -__all__ = ["get_meta", "tune_bsr_dense_addmm", "tune__int_bsr_dense_addmm"] - -import inspect -import itertools -import re -import warnings -from typing import Any - -import torch -from torch.hub import tqdm -from torch.testing import make_tensor - - -def get_meta(op, key, device_name=None, version=(0, torch.float16, 0.5), exact=False): - """Return triton kernel meta parameters of the specified op and its inputs key. - - Parameters - ---------- - op (str): The name of an operation that implementation uses meta parameters. - key (tuple): A tuple of op input parameters, e.g. shapes, etc. - device_name (optional, str): The name of a device for which op - parameters are provided. - version (optional, hashable): Specifies the version of parameters. 
- exact (optional, bool): When True, the returned data (if - available) corresponds exactly to the specified device_name and - version information. Otherwise, if the corresponding data is not - available but there exists a data set that is computed for a - similar GPU device, then this data set will be returned. - - Returns - ------- - result (dict): The requested mapping of parameter names and - values, or None when no data is available. If the input `key` - contains `"*"`, the result will be a dictionary of keys and - mappings that match with the given `key`. - """ - if device_name is None: - device_name = torch.cuda.get_device_name() - - op_data = _operation_device_version_data.get((op, device_name, version)) - if op_data is None and not exact: - # A lack of op data could be due to using a (slightly) - # different GPU model compared to a model for which optimal - # meta parameters have been computed. In the following we'll - # assume that there is a set of GPU models that all have - # a similar set of optimal meta parameters. - if re.match(r"NVIDIA A100[^\d]", device_name) is not None: - device_name = "NVIDIA A100-SXM4-80GB" - else: - return - op_data = _operation_device_version_data.get((op, device_name, version)) - if op_data is None: - return - - matching_data = {} - if "*" in key: - for op_key in op_data: - if [None for k1, k2 in zip(op_key, key) if k2 != "*" and k1 != k2]: - continue - matching_data[op_key] = op_data[op_key] - else: - values = op_data.get(key) - if values is not None: - matching_data[key] = values - matching_meta = {} - for op_key, values in matching_data.items(): - if op == "scatter_mm": - names = ( - "GROUP_SIZE", - "SPLIT_N", - "TILE_M", - "TILE_N", - "num_stages", - "num_warps", - ) - meta = dict(zip(names, values)) - elif op in {"bsr_dense_addmm", "_int_bsr_dense_addmm"}: - meta = dict( - zip(("GROUP_SIZE_ROW", "SPLIT_N", "num_stages", "num_warps"), values) - ) - else: - raise NotImplementedError(f"names for {op=}") - if "*" not in key: - return meta - - matching_meta[op_key] = meta - - if "*" in key: - return matching_meta - - -def update(op, device_name, version, key, value): - """Update the db of op parameters.""" - # skip storing possible optimization failures: - if not value: - warnings.warn( - f"skipping empty value for {op}: {device_name=} {version=} {key=}" - ) - return - if (op, device_name, version) in _operation_device_version_data: - if _operation_device_version_data[op, device_name, version].get(key) == value: - return - _operation_device_version_data[op, device_name, version][key] = value - else: - _operation_device_version_data[op, device_name, version] = {key: value} - - -def dump(): - """Store the current runtime db state to the module file.""" - current_file = inspect.getfile(dump) - f = open(current_file) - current_content = f.read() - f.close() - begin_data_str = "# BEGIN GENERATED DATA\n" - begin_data_index = current_content.find(begin_data_str) - end_data_index = current_content.find(" # END GENERATED DATA\n") - if begin_data_index == -1 or end_data_index == -1: - warnings.warn( - f"{current_file} cannot be updated:" - " BEGIN/END GENERATED DATA comment blocks appear to be corrupted" - ) - return - - def sort_key(key): - op, device_name, version = key - version = tuple( - (str(item) if isinstance(item, torch.dtype) else item) for item in version - ) - return (op, device_name, version) - - part1 = current_content[: begin_data_index + len(begin_data_str)] - part2 = current_content[end_data_index:] - data_part = [] - for op_key in 
sorted(_operation_device_version_data, key=sort_key): - data_part.append(" " + repr(op_key).replace("'", '"') + ": {") - op_data = _operation_device_version_data[op_key] - data_part.extend(f" {key}: {op_data[key]}," for key in sorted(op_data)) - data_part.append(" },") - new_content = part1 + "\n".join(data_part) + "\n" + part2 - if current_content != new_content: - f = open(current_file, "w") - f.write(new_content) - f.close() - - -def minimize( - target_func, - initial_parameters, - reference_parameters, - step_func, - max_step=2, - verbose=False, - all_values=None, -): - """Find a dict of parameters that minimizes the target function using - the initial dict of parameters and a step function that progresses - a specified parameter in a dict of parameters. - - Parameters - ---------- - target_func (callable): a functional with the signature - ``target_func(parameters: dict) -> float`` - initial_parameters (dict): a set of parameters used as an initial - value to the minimization process. - reference_parameters (dict): a set of parameters used as an - reference value with respect to which the speed up is computed. - step_func (callable): a functional with the signature - ``step_func(parameter_name:str, parameter_value:int, direction:int, parameters:dict) -> int`` - that increments or decrements (when ``direction`` is positive or - negative, respectively) the parameter with given name and value. - When return value is equal to ``parameter_value``, it means that - no step along the given direction can be made. - - Returns - ------- - parameters (dict): a set of parameters that minimizes the target - function. - speedup_incr (float): a speedup change given in percentage. - timing (float): the value of the target function at the parameters. - sensitivity_message (str): a message containing sensitivity. - information of parameters around the target function minimizer. - """ - - def to_key(parameters): - return tuple(parameters[k] for k in sorted(parameters)) - - def from_key(key, parameters): - return dict(zip(sorted(parameters), key)) - - if all_values is None: - all_values = {} - - directions = list(range(-max_step, max_step + 1)) - names = sorted(initial_parameters) - all_directions = [] - for d_tuple in itertools.product(*((directions,) * len(names))): - dist = sum(map(abs, d_tuple)) - if dist > 0 and dist <= max_step: - all_directions.append((dist, d_tuple)) - all_directions.sort() - - try: - reference_target = target_func(reference_parameters) - except Exception as msg: - if verbose and "out of resource" not in str(msg): - print(f"{reference_parameters=} lead to failure: {msg}.") - reference_target = None - - if reference_target is not None: - all_values[to_key(reference_parameters)] = reference_target - - parameters = initial_parameters - try: - initial_target = target_func(parameters) - except Exception as msg: - if reference_target is None: - if verbose: - print( - f"{initial_parameters=} lead to failure: {msg}. Optimization failed!" - ) - return {}, -1, -1, f"{msg}" - if verbose and "out of resource" not in str(msg): - print( - f"{initial_parameters=} lead to failure: {msg}. Using reference parameters instead of initial parameters." 
- ) - parameters = reference_parameters - initial_target = reference_target - - if reference_target is None: - if verbose: - print("Using initial parameters instead of reference parameters.") - reference_target = initial_target - - initial_key = to_key(parameters) - minimal_target = all_values[initial_key] = initial_target - pbar = tqdm( - total=len(all_directions), - desc="Tuning...", - disable=not verbose, - ncols=75, - ) - while True: - for i, (_, d_tuple) in enumerate(all_directions): - pbar.update(1) - next_parameters = parameters.copy() - for name, direction in zip(names, d_tuple): - value = next_parameters[name] - if direction == 0: - continue - next_value = step_func(name, value, direction, parameters) - if next_value == value: - break - next_parameters[name] = next_value - else: - next_key = to_key(next_parameters) - if next_key in all_values: - continue - try: - next_target = target_func(next_parameters) - except Exception as msg: - all_values[next_key] = str(msg) - if verbose and "out of resource" not in str(msg): - print(f"{next_parameters=} lead to failure: {msg}. Skipping.") - continue - all_values[next_key] = next_target - - if next_target < minimal_target: - minimal_target = next_target - parameters = next_parameters - pbar.total += i + 1 - break - else: - # ensure stable minimizer: - minimizer_keys = { - k - for k, v in all_values.items() - if isinstance(v, float) and abs(1 - v / minimal_target) < 0.001 - } - minimizer_key = ( - initial_key if initial_key in minimizer_keys else min(minimizer_keys) - ) - parameters = from_key(minimizer_key, parameters) - speedup_incr = (1 - minimal_target / reference_target) * 100 - if speedup_incr < 0: - if verbose: - print( - f"{speedup_incr=} is negative. Rerunning minimize with reference parameters as initial parameters." 
- ) - return minimize( - target_func, - reference_parameters, - reference_parameters, - step_func, - max_step=max_step, - verbose=verbose, - all_values=all_values, - ) - sensitivity = [] - for name in parameters: - value = parameters[name] - rel_diffs = [] - for direction in range(-max_step, max_step + 1): - if direction == 0: - continue - next_value = step_func(name, value, direction, parameters) - if next_value == value: - rel_diffs.append(0) - continue - next_parameters = parameters.copy() - next_parameters[name] = next_value - next_key = to_key(next_parameters) - next_target = all_values.get(next_key) - if next_target is None or isinstance(next_target, str): - rel_diffs.append(0) - continue - rel_diff = (next_target / minimal_target - 1) * 100 - rel_diffs.append(rel_diff) - sensitivity.append((max(rel_diffs), rel_diffs, name)) - - sensitivity_message = [f"timing0={initial_target:.3f}"] - for _, rel_diffs, name in sorted(sensitivity, reverse=True): - left_diffs = "|".join( - [f"{rel_diff:.1f}" for rel_diff in rel_diffs[:max_step]] - ) - right_diffs = "|".join( - [f"{rel_diff:.1f}" for rel_diff in rel_diffs[max_step:]] - ) - sensitivity_message.append( - f"{name}={parameters[name]} ({left_diffs}...{right_diffs} %)" - ) - sensitivity_message = ", ".join(sensitivity_message) - return parameters, speedup_incr, minimal_target, sensitivity_message - - -def create_blocked_tensor(B, M, N, blocksize, sparsity, dtype, device): - assert ( - sparsity <= 1.0 and sparsity >= 0.0 - ), "sparsity should be a value between 0 and 1" - assert M % blocksize[0] == 0 - assert N % blocksize[1] == 0 - shape = (B, M // blocksize[0], N // blocksize[1])[int(B == 0) :] - A = torch.bernoulli( - torch.full(shape, 1 - sparsity, dtype=torch.float32, device=device) - ).to(dtype) - expected_nnz = int((1 - sparsity) * M * N / (blocksize[0] * blocksize[1])) - nonzero_indices = A.flatten().nonzero() - actual_nnz = nonzero_indices.shape[0] - if actual_nnz > expected_nnz: - selected_nonzeros = torch.randperm(actual_nnz)[: actual_nnz - expected_nnz] - A.flatten()[nonzero_indices[selected_nonzeros]] = 0 - elif actual_nnz < expected_nnz: - zero_indices = (A == 0).flatten().nonzero() - selected_zeros = torch.randperm(zero_indices.shape[0])[ - : expected_nnz - actual_nnz - ] - A.flatten()[zero_indices[selected_zeros]] = 1 - A = torch.repeat_interleave(A, blocksize[0], dim=-2) - A = torch.repeat_interleave(A, blocksize[1], dim=-1) - return A - - -def optimize_scatter_mm( - m, k, n, bm, bk, dtype=torch.float16, device="cuda", sparsity=0.5, force=False -): - import triton - - from torch.sparse._triton_ops import bsr_scatter_mm, bsr_scatter_mm_indices_data - - key = (m, k, n, bm, bk) - - version = (0, dtype, sparsity) - device_name = torch.cuda.get_device_name() - - reference_meta = dict( - GROUP_SIZE=1, - TILE_M=16, - TILE_N=16, - SPLIT_N=n // 16, - num_stages=1, - num_warps=1, - ) - - initial_meta = get_meta( - "scatter_mm", key, device_name=device_name, version=version, exact=True - ) - if initial_meta is None: - initial_meta = get_meta( - "bsr_dense_addmm", - key, - device_name=device_name, - version=(0, dtype, 0.5), - exact=True, - ) - if initial_meta is None: - initial_meta = reference_meta - elif not force: - return - - torch.manual_seed(0) - bsr = create_blocked_tensor( - 0, m, k, (bm, bk), sparsity, dtype, device - ).to_sparse_bsr((bm, bk)) - dense = make_tensor(k, n, dtype=dtype, device=device) - - def bench(meta, bsr=bsr, dense=dense): - indices_data = bsr_scatter_mm_indices_data( - bsr, dense, 
indices_format="bsr_strided_mm_compressed", **meta - ) - - def test_func(): - return bsr_scatter_mm(bsr, dense, indices_data=indices_data) - - ms_min = triton.testing.do_bench(test_func, warmup=500, rep=100) - - return ms_min - - def step_meta_parameter(name, value, direction, meta, m=m, n=n, k=k, bm=bm, bk=bk): - # return next value in positive or negative direction, or - # input value if the step will result an invalid - # value. The input value is assumed to be valid. - - is_log = name in {"SPLIT_N", "TILE_M", "TILE_N", "num_warps"} - min_value = dict( - SPLIT_N=1, TILE_M=16, TILE_N=16, num_warps=1, num_stages=1, GROUP_SIZE=1 - )[name] - max_value = dict( - SPLIT_N=n // meta["TILE_N"], TILE_M=bm, TILE_N=n // meta["SPLIT_N"] - ).get(name) - value_step = dict( - SPLIT_N=2, TILE_M=2, TILE_N=2, num_warps=2, num_stages=1, GROUP_SIZE=1 - )[name] - if is_log: - next_value = ( - value * value_step**direction - if direction > 0 - else value // (value_step ** abs(direction)) - ) - else: - next_value = value + value_step * direction - if min_value is not None: - next_value = max(next_value, min_value) - if max_value is not None: - next_value = min(next_value, max_value) - if name == "SPLIT_N" and n % next_value != 0: - return value - # Hard-skip parameter combinations that break CUDA state for pytorch: - if (dtype, name, next_value, m, n, k, bm, bk) in { - (torch.float32, "num_warps", 32, 256, 256, 256, 16, 16), - (torch.float32, "num_warps", 16, 256, 256, 256, 32, 32), - (torch.float32, "num_warps", 16, 256, 256, 256, 64, 64), - (torch.float32, "num_warps", 16, 256, 256, 256, 128, 128), - (torch.float32, "num_warps", 16, 512, 512, 256, 128, 128), - } and re.match(r"NVIDIA A100[^\d]", device_name) is not None: - return value - return next_value - - meta, speedup, timing, _sensitivity_message = minimize( - bench, initial_meta, reference_meta, step_meta_parameter - ) - if initial_meta is not reference_meta and initial_meta == meta and not force: - return - print(f"{meta=} {speedup=:.1f} % {timing=:.3f} ms") - if speedup < 0: - return - device_name = torch.cuda.get_device_name() - - update( - "scatter_mm", device_name, version, key, tuple(meta[k] for k in sorted(meta)) - ) - - -def tune__int_bsr_dense_addmm( - input, - bsr, - dense, - *, - beta=1, - alpha=1, - out=None, - store=False, - verbose=False, - force=False, -): - return tune_bsr_dense_addmm( - input, - bsr, - dense, - beta=beta, - alpha=alpha, - out=out, - store=store, - verbose=verbose, - force=force, - opname="_int_bsr_dense_addmm", - ) - - -def tune_bsr_dense_addmm( - input, - bsr, - dense, - *, - beta=1, - alpha=1, - left_alpha=None, - right_alpha=None, - out=None, - store=False, - verbose=False, - force=False, - opname=None, -): - """Tune bsr_dense_addmm kernel parameters against the given inputs. - - When store is True, the tuning results will be stored in the - database of kernel parameters. 
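
    A minimal usage sketch (illustrative only; it assumes a CUDA device, a
    0.5-sparse BSR tensor built with create_blocked_tensor from this module,
    and illustrative shapes/blocksizes):

      import torch
      from torch.testing import make_tensor

      M = K = 1024
      N = 4096
      blocked = create_blocked_tensor(0, M, K, (64, 64), 0.5, torch.float16, "cuda")
      bsr = blocked.to_sparse_bsr((64, 64))
      dense = make_tensor(K, N, dtype=torch.float16, device="cuda")
      input = make_tensor(M, N, dtype=torch.float16, device="cuda")

      # Tune and store the result in the runtime parameter database.
      meta = tune_bsr_dense_addmm(input, bsr, dense, store=True, verbose=True)
      # The returned dict can also be passed explicitly, e.g.
      # torch.sparse._triton_ops.bsr_dense_addmm(input, bsr, dense, meta=meta)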
- """ - import triton - - if opname is None: - opname = "bsr_dense_addmm" - - if opname == "_int_bsr_dense_addmm": - from torch.sparse._triton_ops import _int_bsr_dense_addmm as bsr_dense_addmm - else: - from torch.sparse._triton_ops import bsr_dense_addmm - - N = dense.shape[-1] - values = bsr.values() - crow_indices = bsr.crow_indices() - batch_ndim = crow_indices.dim() - 1 - M, K = bsr.shape[batch_ndim : batch_ndim + 2] - BM, BK = values.shape[batch_ndim + 1 : batch_ndim + 3] - - # Reference parameters is a set of parameters that leads to a - # successful kernel call and the corresponding timing is used as a - # reference for computing speedups. Avoid changing the reference - # parameters when possible. - reference_meta = dict( - GROUP_SIZE_ROW=1, num_stages=1, num_warps=4, SPLIT_N=max(N // BM, 1) - ) - - # Compute the key of parameters: - sparsity = round(1 - bsr._nnz() * BM * BK / (M * K), 2) - dtype = bsr.dtype - if out is None: - out_dtype = dtype - else: - out_dtype = out.dtype - if out_dtype is dtype: - version_dtype = dtype - else: - version_dtype = (dtype, out_dtype) - version = (0, version_dtype, sparsity) - key = (M, K, N, BM, BK, beta == 0, beta == 1, alpha == 1) - - # For tuning, for an initial state, use parameters from the - # database if available, otherwise, use the reference parameters. - initial_meta = get_meta(opname, key, version=version, exact=True) - if initial_meta is None: - may_skip_update = False - initial_meta = get_meta(opname, key, version=(0, dtype, 0.5), exact=True) - if initial_meta is None: - initial_meta = reference_meta - elif not force: - return initial_meta - else: - may_skip_update = True - - # The target function that is minimized in the tuning process: - def bench(meta, input=input, bsr=bsr, dense=dense, alpha=alpha, out=out): - def test_func(): - return bsr_dense_addmm( - input, - bsr, - dense, - beta=beta, - alpha=alpha, - left_alpha=left_alpha, - right_alpha=right_alpha, - meta=meta, - out=out, - ) - - return triton.testing.do_bench(test_func, warmup=500, rep=100) - - # The step function that increments a specified meta parameter: - def step_meta_parameter(name, value, direction, meta, M=M, N=N, K=K, BM=BM, BK=BK): - # return next value in positive or negative direction, or - # input value if the step will result an invalid - # value. The input value is assumed to be valid. 
- is_log = name in {"SPLIT_N", "num_warps"} - min_value = dict(SPLIT_N=1, num_warps=1, num_stages=1, GROUP_SIZE_ROW=1)[name] - max_value = dict(SPLIT_N=max(N // BM, 1)).get(name) - value_step = dict(SPLIT_N=2, num_warps=2, num_stages=1, GROUP_SIZE_ROW=1)[name] - if is_log: - next_value = ( - value * value_step**direction - if direction > 0 - else value // (value_step ** abs(direction)) - ) - else: - next_value = value + value_step * direction - if min_value is not None: - next_value = max(next_value, min_value) - if max_value is not None: - next_value = min(next_value, max_value) - if name == "SPLIT_N" and N % next_value != 0: - return value - return next_value - - # Tune: - meta, speedup, timing, sensitivity_message = minimize( - bench, - initial_meta, - reference_meta, - step_meta_parameter, - max_step=2, - verbose=verbose, - ) - if verbose: - print(f"-> {sensitivity_message}, {speedup=:.1f} %, {timing=:.3f} ms") - - if store and not ( - may_skip_update and meta == initial_meta and initial_meta is not reference_meta - ): - device_name = torch.cuda.get_device_name() - update( - opname, - device_name, - version, - key, - tuple(meta[k] for k in sorted(meta)), - ) - - return meta - - -def optimize_bsr_dense_addmm( - m, - k, - n, - bm, - bk, - beta=1, - alpha=1, - use_left_alpha=False, - use_right_alpha=False, - dtype=torch.float16, - out_dtype=None, - device="cuda", - sparsity=0.5, - force=False, - verbose=False, - opname=None, -): - # torch.manual_seed(0) - bsr = create_blocked_tensor( - 0, m, k, (bm, bk), sparsity, dtype, device - ).to_sparse_bsr((bm, bk)) - dense = make_tensor(k, n, dtype=dtype, device=device) - input = make_tensor(m, n, dtype=dtype, device=device) - left_alpha = make_tensor(m, dtype=dtype, device=device) if use_left_alpha else None - right_alpha = ( - make_tensor(n, dtype=dtype, device=device) if use_right_alpha else None - ) - if out_dtype is not None: - out = dense.new_empty((m, n), dtype=out_dtype) - else: - out = None - tune_bsr_dense_addmm( - input, - bsr, - dense, - beta=beta, - alpha=alpha, - left_alpha=left_alpha, - right_alpha=right_alpha, - out=out, - store=True, - force=force, - verbose=verbose, - opname=opname, - ) - - -def main(op="scatter_mm", force=False, dtype=torch.float16, verbose=True): - import itertools - - sizes_lst = [ - 256, - 512, - 1024, - 2048, - 4096, - 8192, - 16384, - 32768, - 65536, - 131072, - 50432, - 65792, - ] - sizes3_lst = [3 * sz for sz in [64, 128] + sizes_lst if sz <= 2048] - shapes_lst = [(sz, sz) for sz in sizes_lst[:-4] + sizes3_lst] - - shapes_lst=[] - shapes_lst.extend([(3072, 768), (768, 3072)]) - shapes_lst.extend([(5120, 1280), (1280, 5120)]) - if dtype is torch.int8: - # triton does not support smaller blocks than 32 - blocksize_lst = [(32, 32), (64, 64), (128, 128), (256, 256)] - else: - blocksize_lst = [(16, 16), (32, 32), (64, 64), (128, 128)] - sparsity_lst = [0.9][:1] - for sparsity in sparsity_lst: - print(f"{op, dtype, sparsity=}") - try: - for (M, K), N, (BM, BK) in itertools.product( - shapes_lst, sizes_lst, blocksize_lst - ): - if not (BM <= M and BK <= K and M % BM == 0 and K % BK == 0): - continue - if op == "scatter_mm": - optimize_scatter_mm( - M, K, N, BM, BK, force=force, sparsity=sparsity, dtype=dtype - ) - elif op in {"bsr_dense_addmm", "_int_bsr_dense_addmm"}: - if M == K and N == 50432: - continue - print(f"{M, K, N, (BM, BK)=}") - else: - raise NotImplementedError(op) - except KeyboardInterrupt: - break - except Exception: - dump() - raise - dump() - - if 0: - # Check performance dependence on sparsity 
and apply - # adjustments when differences are noticable (more than 10%). - # - # When using NVIDIA A100 GPU, the performance dependence on - # sparsity is insignificant (0 % ... 10 %) for majority of - # shapes/blocksizes combinations. However, for a very few - # specific size combinations, the effect of sparsity on - # performance can be up to 20 %. - for (M, K), N, (BM, BK) in itertools.product( - shapes_lst, sizes_lst, blocksize_lst - ): - meta_lst: list = [] - key = (M, K, N, BM, BK) - for sparsity1 in sparsity_lst: - torch.manual_seed(0) - bsr = create_blocked_tensor( - 0, M, K, (BM, BK), sparsity1, dtype, device="cuda" - ).to_sparse_bsr((BM, BK)) - dense = make_tensor(K, N, dtype=dtype, device="cuda") - meta_lst = [] - for sparsity in sparsity_lst: - meta = get_meta(op, key, version=(0, dtype, sparsity), exact=True) - if meta is None: - continue - - def bench(meta, bsr=bsr, dense=dense): - import triton - - if op == "scatter_mm": - from torch.sparse._triton_ops import ( - bsr_scatter_mm, - bsr_scatter_mm_indices_data, - ) - - indices_data = bsr_scatter_mm_indices_data( - bsr, - dense, - indices_format="bsr_strided_mm_compressed", - **meta, - ) - - def test_func(): - return bsr_scatter_mm( - bsr, dense, indices_data=indices_data - ) - - else: - raise NotImplementedError(op) - - ms_min = triton.testing.do_bench(test_func, warmup=500, rep=100) - - return ms_min - - meta_lst.append( - (bench(meta), sparsity, tuple(meta[k] for k in sorted(meta))) - ) - if not meta_lst: - continue - meta_lst = sorted(meta_lst) - index = next( - i for i, item in enumerate(meta_lst) if item[1] == sparsity1 - ) - if meta_lst[0][2] == meta_lst[index][2]: - continue - speeddiff = (1 - meta_lst[index][0] / meta_lst[0][0]) * 100 - if abs(speeddiff) < 10: - continue - - print(sparsity1, index, key, meta_lst, speeddiff) - - if index > 0: - device_name = torch.cuda.get_device_name() - meta = get_meta( - op, key, version=(0, dtype, meta_lst[0][1]), exact=True - ) - update( - op, - device_name, - (0, dtype, sparsity1), - key, - tuple(meta[k] for k in sorted(meta)), - ) - print("update") - dump() - - -_operation_device_version_data: dict[Any, dict] = { - # Warning: the data in between the BEGIN/END DATA comment lines - # below is generated. It can be updated either manually or via - # calling dump function defined above. 
- # - # Legend [op: key -> data]: - # scatter_mm : M, K, N, Ms, Ks -> GROUP_SIZE, SPLIT_N, TILE_M, TILE_N, num_stages, num_warps - # bsr_dense_addmm : M, K, N, Ms, Ks, beta==0, beta==1, alpha==1 -> GROUP_SIZE_ROW, SPLIT_N, num_stages, num_warps - # - # BEGIN GENERATED DATA - ("_int_bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.int8, 0.5)): { - (192, 192, 256, 32, 32, False, True, True): (2, 8, 1, 4), - (192, 192, 256, 32, 32, True, False, True): (2, 8, 5, 4), - (192, 192, 512, 32, 32, False, True, True): (1, 16, 1, 4), - (192, 192, 512, 32, 32, True, False, True): (1, 16, 5, 4), - (192, 192, 1024, 32, 32, False, True, True): (1, 32, 1, 4), - (192, 192, 1024, 32, 32, True, False, True): (4, 32, 4, 4), - (192, 192, 2048, 32, 32, False, True, True): (2, 64, 1, 4), - (192, 192, 2048, 32, 32, True, False, True): (3, 16, 5, 4), - (192, 192, 4096, 32, 32, False, True, True): (1, 128, 1, 4), - (192, 192, 4096, 32, 32, True, False, True): (1, 128, 1, 4), - (192, 192, 8192, 32, 32, False, True, True): (1, 256, 1, 4), - (192, 192, 8192, 32, 32, True, False, True): (1, 64, 3, 4), - (192, 192, 16384, 32, 32, False, True, True): (2, 512, 1, 4), - (192, 192, 16384, 32, 32, True, False, True): (5, 128, 1, 4), - (192, 192, 32768, 32, 32, False, True, True): (1, 1024, 1, 4), - (192, 192, 32768, 32, 32, True, False, True): (1, 256, 1, 4), - (192, 192, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), - (192, 192, 65536, 32, 32, True, False, True): (1, 512, 1, 4), - (192, 192, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), - (192, 192, 131072, 32, 32, True, False, True): (2, 512, 1, 4), - (256, 256, 256, 32, 32, False, True, True): (4, 8, 1, 4), - (256, 256, 256, 32, 32, True, False, True): (1, 8, 6, 4), - (256, 256, 256, 64, 64, False, True, True): (1, 4, 1, 16), - (256, 256, 256, 64, 64, True, False, True): (1, 4, 4, 4), - (256, 256, 256, 128, 128, False, True, True): (3, 2, 1, 16), - (256, 256, 256, 128, 128, True, False, True): (1, 2, 1, 4), - (256, 256, 512, 32, 32, False, True, True): (2, 16, 1, 4), - (256, 256, 512, 32, 32, True, False, True): (2, 16, 4, 4), - (256, 256, 512, 64, 64, False, True, True): (7, 8, 1, 16), - (256, 256, 512, 64, 64, True, False, True): (3, 8, 3, 4), - (256, 256, 512, 128, 128, False, True, True): (1, 4, 1, 32), - (256, 256, 512, 128, 128, True, False, True): (1, 4, 1, 4), - (256, 256, 1024, 32, 32, False, True, True): (1, 32, 1, 4), - (256, 256, 1024, 32, 32, True, False, True): (1, 8, 6, 4), - (256, 256, 1024, 64, 64, False, True, True): (2, 16, 1, 16), - (256, 256, 1024, 64, 64, True, False, True): (1, 16, 5, 4), - (256, 256, 1024, 128, 128, False, True, True): (4, 8, 1, 32), - (256, 256, 1024, 128, 128, True, False, True): (1, 8, 2, 4), - (256, 256, 2048, 32, 32, False, True, True): (1, 64, 1, 4), - (256, 256, 2048, 32, 32, True, False, True): (2, 32, 3, 2), - (256, 256, 2048, 64, 64, False, True, True): (2, 32, 1, 16), - (256, 256, 2048, 64, 64, True, False, True): (1, 16, 3, 4), - (256, 256, 2048, 128, 128, False, True, True): (1, 16, 1, 32), - (256, 256, 2048, 128, 128, True, False, True): (1, 16, 2, 4), - (256, 256, 4096, 32, 32, False, True, True): (2, 128, 1, 4), - (256, 256, 4096, 32, 32, True, False, True): (1, 32, 3, 2), - (256, 256, 4096, 64, 64, False, True, True): (2, 64, 1, 8), - (256, 256, 4096, 64, 64, True, False, True): (1, 64, 3, 2), - (256, 256, 4096, 128, 128, False, True, True): (2, 32, 1, 32), - (256, 256, 4096, 128, 128, True, False, True): (3, 32, 2, 8), - (256, 256, 8192, 32, 32, False, True, True): (1, 256, 1, 4), - (256, 256, 
8192, 32, 32, True, False, True): (1, 64, 1, 4), - (256, 256, 8192, 64, 64, False, True, True): (1, 128, 1, 8), - (256, 256, 8192, 64, 64, True, False, True): (2, 128, 1, 4), - (256, 256, 8192, 128, 128, False, True, True): (4, 64, 1, 32), - (256, 256, 8192, 128, 128, True, False, True): (3, 64, 1, 4), - (256, 256, 16384, 32, 32, False, True, True): (1, 256, 1, 8), - (256, 256, 16384, 32, 32, True, False, True): (3, 128, 1, 4), - (256, 256, 16384, 64, 64, False, True, True): (2, 256, 1, 8), - (256, 256, 16384, 64, 64, True, False, True): (2, 256, 1, 4), - (256, 256, 16384, 128, 128, False, True, True): (2, 128, 1, 32), - (256, 256, 16384, 128, 128, True, False, True): (4, 128, 2, 4), - (256, 256, 32768, 32, 32, False, True, True): (2, 512, 1, 8), - (256, 256, 32768, 32, 32, True, False, True): (1, 256, 1, 4), - (256, 256, 32768, 64, 64, False, True, True): (1, 512, 1, 8), - (256, 256, 32768, 64, 64, True, False, True): (1, 512, 1, 4), - (256, 256, 32768, 128, 128, False, True, True): (2, 256, 1, 32), - (256, 256, 32768, 128, 128, True, False, True): (1, 256, 2, 4), - (256, 256, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), - (256, 256, 65536, 32, 32, True, False, True): (1, 512, 1, 4), - (256, 256, 65536, 64, 64, False, True, True): (1, 1024, 1, 8), - (256, 256, 65536, 64, 64, True, False, True): (1, 512, 1, 4), - (256, 256, 65536, 128, 128, False, True, True): (2, 512, 1, 16), - (256, 256, 65536, 128, 128, True, False, True): (1, 512, 1, 4), - (256, 256, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), - (256, 256, 65792, 32, 32, True, False, True): (1, 514, 1, 4), - (256, 256, 65792, 64, 64, False, True, True): (1, 1028, 1, 8), - (256, 256, 65792, 64, 64, True, False, True): (4, 257, 1, 4), - (256, 256, 65792, 128, 128, False, True, True): (2, 514, 1, 16), - (256, 256, 65792, 128, 128, True, False, True): (3, 514, 1, 4), - (256, 256, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), - (256, 256, 131072, 32, 32, True, False, True): (2, 1024, 1, 4), - (256, 256, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), - (256, 256, 131072, 64, 64, True, False, True): (2, 512, 1, 4), - (256, 256, 131072, 128, 128, False, True, True): (2, 1024, 1, 16), - (256, 256, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), - (384, 384, 256, 32, 32, False, True, True): (1, 8, 1, 4), - (384, 384, 256, 32, 32, True, False, True): (5, 8, 5, 4), - (384, 384, 256, 64, 64, False, True, True): (2, 4, 1, 16), - (384, 384, 256, 64, 64, True, False, True): (1, 4, 5, 4), - (384, 384, 512, 32, 32, False, True, True): (2, 16, 1, 4), - (384, 384, 512, 32, 32, True, False, True): (1, 16, 4, 4), - (384, 384, 512, 64, 64, False, True, True): (3, 8, 1, 16), - (384, 384, 512, 64, 64, True, False, True): (3, 8, 3, 4), - (384, 384, 1024, 32, 32, False, True, True): (2, 32, 1, 4), - (384, 384, 1024, 32, 32, True, False, True): (1, 8, 6, 4), - (384, 384, 1024, 64, 64, False, True, True): (2, 16, 1, 16), - (384, 384, 1024, 64, 64, True, False, True): (1, 16, 5, 4), - (384, 384, 2048, 32, 32, False, True, True): (1, 64, 1, 4), - (384, 384, 2048, 32, 32, True, False, True): (3, 16, 4, 4), - (384, 384, 2048, 64, 64, False, True, True): (2, 32, 1, 16), - (384, 384, 2048, 64, 64, True, False, True): (1, 16, 4, 4), - (384, 384, 4096, 32, 32, False, True, True): (4, 64, 1, 8), - (384, 384, 4096, 32, 32, True, False, True): (4, 32, 1, 4), - (384, 384, 4096, 64, 64, False, True, True): (1, 64, 1, 8), - (384, 384, 4096, 64, 64, True, False, True): (1, 64, 1, 4), - (384, 384, 8192, 32, 32, False, True, True): (1, 128, 1, 
8), - (384, 384, 8192, 32, 32, True, False, True): (3, 64, 1, 1), - (384, 384, 8192, 64, 64, False, True, True): (2, 128, 1, 8), - (384, 384, 8192, 64, 64, True, False, True): (1, 64, 2, 2), - (384, 384, 16384, 32, 32, False, True, True): (1, 256, 1, 8), - (384, 384, 16384, 32, 32, True, False, True): (1, 128, 1, 4), - (384, 384, 16384, 64, 64, False, True, True): (2, 256, 1, 8), - (384, 384, 16384, 64, 64, True, False, True): (2, 128, 1, 4), - (384, 384, 32768, 32, 32, False, True, True): (1, 512, 1, 8), - (384, 384, 32768, 32, 32, True, False, True): (1, 256, 1, 4), - (384, 384, 32768, 64, 64, False, True, True): (1, 512, 1, 8), - (384, 384, 32768, 64, 64, True, False, True): (1, 256, 3, 2), - (384, 384, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), - (384, 384, 65536, 32, 32, True, False, True): (1, 512, 3, 2), - (384, 384, 65536, 64, 64, False, True, True): (2, 1024, 1, 8), - (384, 384, 65536, 64, 64, True, False, True): (3, 256, 3, 4), - (384, 384, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), - (384, 384, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), - (384, 384, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), - (384, 384, 131072, 64, 64, True, False, True): (2, 512, 3, 4), - (512, 512, 256, 32, 32, False, True, True): (1, 8, 1, 4), - (512, 512, 256, 32, 32, True, False, True): (4, 8, 4, 4), - (512, 512, 256, 64, 64, False, True, True): (3, 4, 1, 16), - (512, 512, 256, 64, 64, True, False, True): (2, 4, 5, 4), - (512, 512, 256, 128, 128, False, True, True): (4, 2, 1, 16), - (512, 512, 256, 128, 128, True, False, True): (1, 2, 3, 4), - (512, 512, 256, 256, 256, False, True, True): (1, 1, 1, 32), - (512, 512, 256, 256, 256, True, False, True): (2, 1, 1, 32), - (512, 512, 512, 32, 32, False, True, True): (3, 16, 1, 4), - (512, 512, 512, 32, 32, True, False, True): (1, 8, 4, 2), - (512, 512, 512, 64, 64, False, True, True): (2, 8, 1, 16), - (512, 512, 512, 64, 64, True, False, True): (2, 8, 5, 4), - (512, 512, 512, 128, 128, False, True, True): (3, 4, 1, 16), - (512, 512, 512, 128, 128, True, False, True): (1, 4, 3, 4), - (512, 512, 512, 256, 256, False, True, True): (1, 2, 1, 32), - (512, 512, 512, 256, 256, True, False, True): (2, 2, 1, 32), - (512, 512, 1024, 32, 32, False, True, True): (2, 32, 1, 4), - (512, 512, 1024, 32, 32, True, False, True): (4, 16, 3, 2), - (512, 512, 1024, 64, 64, False, True, True): (4, 16, 1, 16), - (512, 512, 1024, 64, 64, True, False, True): (1, 8, 4, 4), - (512, 512, 1024, 128, 128, False, True, True): (1, 8, 1, 32), - (512, 512, 1024, 128, 128, True, False, True): (1, 8, 3, 4), - (512, 512, 1024, 256, 256, False, True, True): (4, 4, 1, 32), - (512, 512, 1024, 256, 256, True, False, True): (2, 4, 1, 32), - (512, 512, 2048, 32, 32, False, True, True): (3, 32, 1, 8), - (512, 512, 2048, 32, 32, True, False, True): (1, 16, 3, 4), - (512, 512, 2048, 64, 64, False, True, True): (1, 32, 1, 8), - (512, 512, 2048, 64, 64, True, False, True): (1, 32, 3, 2), - (512, 512, 2048, 128, 128, False, True, True): (4, 16, 1, 32), - (512, 512, 2048, 128, 128, True, False, True): (1, 16, 3, 4), - (512, 512, 2048, 256, 256, False, True, True): (1, 8, 1, 32), - (512, 512, 2048, 256, 256, True, False, True): (3, 8, 1, 32), - (512, 512, 4096, 32, 32, False, True, True): (1, 64, 1, 8), - (512, 512, 4096, 32, 32, True, False, True): (5, 32, 1, 4), - (512, 512, 4096, 64, 64, False, True, True): (1, 64, 1, 8), - (512, 512, 4096, 64, 64, True, False, True): (1, 64, 1, 4), - (512, 512, 4096, 128, 128, False, True, True): (5, 32, 1, 32), - (512, 512, 4096, 128, 
128, True, False, True): (2, 32, 3, 4), - (512, 512, 4096, 256, 256, False, True, True): (1, 16, 1, 32), - (512, 512, 4096, 256, 256, True, False, True): (3, 16, 1, 32), - (512, 512, 8192, 32, 32, False, True, True): (3, 128, 1, 8), - (512, 512, 8192, 32, 32, True, False, True): (3, 64, 1, 4), - (512, 512, 8192, 64, 64, False, True, True): (4, 128, 1, 8), - (512, 512, 8192, 64, 64, True, False, True): (1, 64, 3, 2), - (512, 512, 8192, 128, 128, False, True, True): (5, 64, 1, 32), - (512, 512, 8192, 128, 128, True, False, True): (1, 64, 2, 4), - (512, 512, 8192, 256, 256, False, True, True): (1, 32, 1, 32), - (512, 512, 8192, 256, 256, True, False, True): (1, 32, 1, 32), - (512, 512, 16384, 32, 32, False, True, True): (1, 256, 1, 8), - (512, 512, 16384, 32, 32, True, False, True): (2, 128, 1, 4), - (512, 512, 16384, 64, 64, False, True, True): (2, 256, 1, 8), - (512, 512, 16384, 64, 64, True, False, True): (1, 128, 3, 2), - (512, 512, 16384, 128, 128, False, True, True): (4, 128, 1, 16), - (512, 512, 16384, 128, 128, True, False, True): (2, 128, 1, 4), - (512, 512, 16384, 256, 256, False, True, True): (1, 64, 1, 32), - (512, 512, 16384, 256, 256, True, False, True): (2, 64, 1, 32), - (512, 512, 32768, 32, 32, False, True, True): (1, 512, 1, 8), - (512, 512, 32768, 32, 32, True, False, True): (2, 256, 1, 4), - (512, 512, 32768, 64, 64, False, True, True): (1, 512, 1, 8), - (512, 512, 32768, 64, 64, True, False, True): (1, 256, 3, 2), - (512, 512, 32768, 128, 128, False, True, True): (4, 256, 1, 16), - (512, 512, 32768, 128, 128, True, False, True): (2, 256, 1, 4), - (512, 512, 32768, 256, 256, False, True, True): (1, 128, 1, 32), - (512, 512, 32768, 256, 256, True, False, True): (2, 128, 1, 32), - (512, 512, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), - (512, 512, 65536, 32, 32, True, False, True): (2, 512, 1, 2), - (512, 512, 65536, 64, 64, False, True, True): (1, 1024, 1, 8), - (512, 512, 65536, 64, 64, True, False, True): (1, 512, 3, 2), - (512, 512, 65536, 128, 128, False, True, True): (4, 512, 1, 16), - (512, 512, 65536, 128, 128, True, False, True): (1, 512, 1, 4), - (512, 512, 65536, 256, 256, False, True, True): (1, 256, 1, 32), - (512, 512, 65536, 256, 256, True, False, True): (1, 256, 1, 32), - (512, 512, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), - (512, 512, 65792, 32, 32, True, False, True): (1, 514, 3, 2), - (512, 512, 65792, 64, 64, False, True, True): (1, 1028, 1, 8), - (512, 512, 65792, 64, 64, True, False, True): (2, 257, 3, 4), - (512, 512, 65792, 128, 128, False, True, True): (4, 514, 1, 16), - (512, 512, 65792, 128, 128, True, False, True): (1, 514, 1, 4), - (512, 512, 65792, 256, 256, False, True, True): (1, 257, 1, 32), - (512, 512, 65792, 256, 256, True, False, True): (2, 257, 1, 32), - (512, 512, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), - (512, 512, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), - (512, 512, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), - (512, 512, 131072, 64, 64, True, False, True): (1, 1024, 3, 2), - (512, 512, 131072, 128, 128, False, True, True): (4, 1024, 1, 16), - (512, 512, 131072, 128, 128, True, False, True): (1, 1024, 1, 4), - (512, 512, 131072, 256, 256, False, True, True): (1, 512, 1, 32), - (512, 512, 131072, 256, 256, True, False, True): (2, 512, 1, 32), - (768, 768, 256, 32, 32, False, True, True): (1, 8, 1, 4), - (768, 768, 256, 32, 32, True, False, True): (2, 8, 4, 4), - (768, 768, 256, 64, 64, False, True, True): (3, 4, 1, 16), - (768, 768, 256, 64, 64, True, False, True): (2, 4, 4, 4), - 
(768, 768, 256, 128, 128, False, True, True): (1, 2, 1, 8), - (768, 768, 256, 128, 128, True, False, True): (1, 2, 3, 4), - (768, 768, 512, 32, 32, False, True, True): (1, 16, 1, 4), - (768, 768, 512, 32, 32, True, False, True): (1, 4, 5, 4), - (768, 768, 512, 64, 64, False, True, True): (1, 8, 3, 32), - (768, 768, 512, 64, 64, True, False, True): (4, 8, 4, 4), - (768, 768, 512, 128, 128, False, True, True): (4, 4, 1, 16), - (768, 768, 512, 128, 128, True, False, True): (4, 4, 3, 4), - (768, 768, 1024, 32, 32, False, True, True): (1, 16, 1, 8), - (768, 768, 1024, 32, 32, True, False, True): (1, 8, 3, 4), - (768, 768, 1024, 64, 64, False, True, True): (3, 16, 1, 16), - (768, 768, 1024, 64, 64, True, False, True): (1, 8, 4, 4), - (768, 768, 1024, 128, 128, False, True, True): (3, 8, 1, 32), - (768, 768, 1024, 128, 128, True, False, True): (1, 8, 3, 4), - (768, 768, 2048, 32, 32, False, True, True): (2, 32, 1, 8), - (768, 768, 2048, 32, 32, True, False, True): (3, 16, 1, 4), - (768, 768, 2048, 64, 64, False, True, True): (1, 32, 1, 8), - (768, 768, 2048, 64, 64, True, False, True): (4, 8, 3, 4), - (768, 768, 2048, 128, 128, False, True, True): (1, 16, 1, 32), - (768, 768, 2048, 128, 128, True, False, True): (1, 16, 3, 4), - (768, 768, 4096, 32, 32, False, True, True): (1, 64, 1, 8), - (768, 768, 4096, 32, 32, True, False, True): (1, 32, 1, 1), - (768, 768, 4096, 64, 64, False, True, True): (2, 64, 1, 8), - (768, 768, 4096, 64, 64, True, False, True): (1, 32, 2, 2), - (768, 768, 4096, 128, 128, False, True, True): (1, 32, 1, 32), - (768, 768, 4096, 128, 128, True, False, True): (6, 32, 1, 4), - (768, 768, 8192, 32, 32, False, True, True): (1, 128, 1, 8), - (768, 768, 8192, 32, 32, True, False, True): (1, 64, 1, 4), - (768, 768, 8192, 64, 64, False, True, True): (1, 128, 1, 8), - (768, 768, 8192, 64, 64, True, False, True): (4, 32, 3, 4), - (768, 768, 8192, 128, 128, False, True, True): (2, 64, 1, 16), - (768, 768, 8192, 128, 128, True, False, True): (2, 64, 3, 4), - (768, 768, 16384, 32, 32, False, True, True): (1, 256, 1, 8), - (768, 768, 16384, 32, 32, True, False, True): (1, 128, 1, 4), - (768, 768, 16384, 64, 64, False, True, True): (1, 256, 1, 8), - (768, 768, 16384, 64, 64, True, False, True): (1, 128, 3, 2), - (768, 768, 16384, 128, 128, False, True, True): (2, 128, 1, 16), - (768, 768, 16384, 128, 128, True, False, True): (2, 128, 1, 4), - (768, 768, 32768, 32, 32, False, True, True): (1, 512, 1, 8), - (768, 768, 32768, 32, 32, True, False, True): (1, 256, 3, 2), - (768, 768, 32768, 64, 64, False, True, True): (2, 512, 1, 8), - (768, 768, 32768, 64, 64, True, False, True): (1, 256, 3, 2), - (768, 768, 32768, 128, 128, False, True, True): (2, 256, 1, 16), - (768, 768, 32768, 128, 128, True, False, True): (3, 256, 1, 4), - (768, 768, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), - (768, 768, 65536, 32, 32, True, False, True): (1, 512, 3, 2), - (768, 768, 65536, 64, 64, False, True, True): (2, 512, 1, 4), - (768, 768, 65536, 64, 64, True, False, True): (1, 512, 3, 2), - (768, 768, 65536, 128, 128, False, True, True): (2, 512, 1, 16), - (768, 768, 65536, 128, 128, True, False, True): (2, 512, 1, 4), - (768, 768, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), - (768, 768, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), - (768, 768, 131072, 64, 64, False, True, True): (2, 1024, 1, 4), - (768, 768, 131072, 64, 64, True, False, True): (2, 1024, 3, 2), - (768, 768, 131072, 128, 128, False, True, True): (2, 1024, 1, 16), - (768, 768, 131072, 128, 128, True, False, True): (2, 
1024, 1, 4), - (768, 3072, 256, 32, 32, False, True, True): (3, 8, 4, 8), - (768, 3072, 256, 32, 32, True, False, True): (3, 8, 5, 4), - (768, 3072, 256, 64, 64, False, True, True): (1, 4, 4, 16), - (768, 3072, 256, 64, 64, True, False, True): (1, 4, 4, 4), - (768, 3072, 256, 128, 128, False, True, True): (2, 2, 1, 8), - (768, 3072, 256, 128, 128, True, False, True): (2, 2, 4, 4), - (768, 3072, 256, 256, 256, False, True, True): (1, 1, 1, 32), - (768, 3072, 256, 256, 256, True, False, True): (1, 1, 1, 32), - (768, 3072, 512, 32, 32, False, True, True): (1, 16, 1, 4), - (768, 3072, 512, 32, 32, True, False, True): (2, 4, 4, 4), - (768, 3072, 512, 64, 64, False, True, True): (3, 8, 4, 16), - (768, 3072, 512, 64, 64, True, False, True): (1, 8, 4, 4), - (768, 3072, 512, 128, 128, False, True, True): (2, 4, 1, 8), - (768, 3072, 512, 128, 128, True, False, True): (4, 4, 3, 4), - (768, 3072, 512, 256, 256, False, True, True): (1, 2, 1, 32), - (768, 3072, 512, 256, 256, True, False, True): (1, 2, 1, 32), - (768, 3072, 1024, 32, 32, False, True, True): (1, 16, 1, 8), - (768, 3072, 1024, 32, 32, True, False, True): (3, 8, 3, 4), - (768, 3072, 1024, 64, 64, False, True, True): (2, 16, 1, 16), - (768, 3072, 1024, 64, 64, True, False, True): (1, 8, 3, 4), - (768, 3072, 1024, 128, 128, False, True, True): (1, 8, 1, 8), - (768, 3072, 1024, 128, 128, True, False, True): (3, 8, 4, 4), - (768, 3072, 1024, 256, 256, False, True, True): (1, 4, 1, 32), - (768, 3072, 1024, 256, 256, True, False, True): (4, 4, 1, 32), - (768, 3072, 2048, 32, 32, False, True, True): (3, 32, 1, 8), - (768, 3072, 2048, 32, 32, True, False, True): (4, 8, 3, 4), - (768, 3072, 2048, 64, 64, False, True, True): (5, 16, 1, 16), - (768, 3072, 2048, 64, 64, True, False, True): (6, 8, 3, 4), - (768, 3072, 2048, 128, 128, False, True, True): (2, 16, 1, 16), - (768, 3072, 2048, 128, 128, True, False, True): (1, 16, 4, 4), - (768, 3072, 2048, 256, 256, False, True, True): (1, 8, 1, 32), - (768, 3072, 2048, 256, 256, True, False, True): (1, 8, 1, 32), - (768, 3072, 4096, 32, 32, False, True, True): (1, 64, 1, 8), - (768, 3072, 4096, 32, 32, True, False, True): (1, 32, 3, 4), - (768, 3072, 4096, 64, 64, False, True, True): (1, 64, 1, 8), - (768, 3072, 4096, 64, 64, True, False, True): (2, 16, 3, 4), - (768, 3072, 4096, 128, 128, False, True, True): (1, 32, 1, 8), - (768, 3072, 4096, 128, 128, True, False, True): (2, 32, 2, 4), - (768, 3072, 4096, 256, 256, False, True, True): (1, 16, 1, 32), - (768, 3072, 4096, 256, 256, True, False, True): (1, 16, 1, 32), - (768, 3072, 8192, 32, 32, False, True, True): (1, 128, 1, 8), - (768, 3072, 8192, 32, 32, True, False, True): (1, 64, 1, 4), - (768, 3072, 8192, 64, 64, False, True, True): (1, 128, 1, 8), - (768, 3072, 8192, 64, 64, True, False, True): (2, 32, 3, 4), - (768, 3072, 8192, 128, 128, False, True, True): (2, 64, 1, 16), - (768, 3072, 8192, 128, 128, True, False, True): (2, 64, 3, 4), - (768, 3072, 8192, 256, 256, False, True, True): (1, 32, 1, 32), - (768, 3072, 8192, 256, 256, True, False, True): (1, 32, 1, 32), - (768, 3072, 16384, 32, 32, False, True, True): (1, 256, 1, 8), - (768, 3072, 16384, 32, 32, True, False, True): (1, 128, 1, 4), - (768, 3072, 16384, 64, 64, False, True, True): (1, 256, 1, 8), - (768, 3072, 16384, 64, 64, True, False, True): (2, 64, 3, 4), - (768, 3072, 16384, 128, 128, False, True, True): (2, 128, 1, 16), - (768, 3072, 16384, 128, 128, True, False, True): (2, 128, 3, 4), - (768, 3072, 16384, 256, 256, False, True, True): (1, 64, 1, 32), - (768, 3072, 16384, 256, 
256, True, False, True): (1, 64, 1, 32), - (768, 3072, 32768, 32, 32, False, True, True): (1, 512, 1, 8), - (768, 3072, 32768, 32, 32, True, False, True): (1, 256, 3, 2), - (768, 3072, 32768, 64, 64, False, True, True): (1, 512, 1, 8), - (768, 3072, 32768, 64, 64, True, False, True): (3, 128, 3, 4), - (768, 3072, 32768, 128, 128, False, True, True): (2, 256, 1, 16), - (768, 3072, 32768, 128, 128, True, False, True): (2, 256, 3, 4), - (768, 3072, 32768, 256, 256, False, True, True): (1, 128, 1, 32), - (768, 3072, 32768, 256, 256, True, False, True): (1, 128, 1, 32), - (768, 3072, 50432, 32, 32, False, True, True): (1, 788, 1, 8), - (768, 3072, 50432, 32, 32, True, False, True): (1, 394, 3, 2), - (768, 3072, 50432, 64, 64, False, True, True): (1, 788, 1, 8), - (768, 3072, 50432, 64, 64, True, False, True): (2, 197, 3, 4), - (768, 3072, 50432, 128, 128, False, True, True): (2, 394, 1, 16), - (768, 3072, 50432, 128, 128, True, False, True): (2, 394, 3, 4), - (768, 3072, 50432, 256, 256, False, True, True): (1, 197, 1, 32), - (768, 3072, 50432, 256, 256, True, False, True): (1, 197, 1, 32), - (768, 3072, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), - (768, 3072, 65536, 32, 32, True, False, True): (1, 512, 3, 2), - (768, 3072, 65536, 64, 64, False, True, True): (1, 1024, 1, 8), - (768, 3072, 65536, 64, 64, True, False, True): (2, 256, 3, 4), - (768, 3072, 65536, 128, 128, False, True, True): (2, 512, 1, 16), - (768, 3072, 65536, 128, 128, True, False, True): (2, 512, 3, 4), - (768, 3072, 65536, 256, 256, False, True, True): (1, 256, 1, 32), - (768, 3072, 65536, 256, 256, True, False, True): (1, 256, 1, 32), - (768, 3072, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), - (768, 3072, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), - (768, 3072, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), - (768, 3072, 131072, 64, 64, True, False, True): (2, 512, 3, 4), - (768, 3072, 131072, 128, 128, False, True, True): (2, 1024, 1, 16), - (768, 3072, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), - (768, 3072, 131072, 256, 256, False, True, True): (1, 512, 1, 32), - (768, 3072, 131072, 256, 256, True, False, True): (1, 512, 1, 32), - (1024, 1024, 256, 32, 32, False, True, True): (1, 8, 1, 4), - (1024, 1024, 256, 32, 32, True, False, True): (1, 8, 5, 4), - (1024, 1024, 256, 64, 64, False, True, True): (1, 4, 1, 16), - (1024, 1024, 256, 64, 64, True, False, True): (4, 4, 4, 4), - (1024, 1024, 256, 128, 128, False, True, True): (1, 2, 1, 8), - (1024, 1024, 256, 128, 128, True, False, True): (1, 2, 3, 8), - (1024, 1024, 256, 256, 256, False, True, True): (1, 1, 1, 32), - (1024, 1024, 256, 256, 256, True, False, True): (1, 1, 1, 32), - (1024, 1024, 512, 32, 32, False, True, True): (5, 16, 1, 4), - (1024, 1024, 512, 32, 32, True, False, True): (2, 8, 4, 2), - (1024, 1024, 512, 64, 64, False, True, True): (4, 8, 1, 16), - (1024, 1024, 512, 64, 64, True, False, True): (1, 4, 3, 4), - (1024, 1024, 512, 128, 128, False, True, True): (3, 4, 1, 16), - (1024, 1024, 512, 128, 128, True, False, True): (1, 4, 2, 4), - (1024, 1024, 512, 256, 256, False, True, True): (1, 2, 1, 32), - (1024, 1024, 512, 256, 256, True, False, True): (1, 2, 1, 32), - (1024, 1024, 1024, 32, 32, False, True, True): (1, 16, 1, 8), - (1024, 1024, 1024, 32, 32, True, False, True): (1, 8, 3, 4), - (1024, 1024, 1024, 64, 64, False, True, True): (3, 16, 1, 8), - (1024, 1024, 1024, 64, 64, True, False, True): (1, 16, 3, 2), - (1024, 1024, 1024, 128, 128, False, True, True): (1, 8, 1, 16), - (1024, 1024, 1024, 128, 128, True, 
False, True): (2, 8, 3, 8), - (1024, 1024, 1024, 256, 256, False, True, True): (1, 4, 1, 32), - (1024, 1024, 1024, 256, 256, True, False, True): (2, 4, 1, 32), - (1024, 1024, 2048, 32, 32, False, True, True): (2, 32, 1, 8), - (1024, 1024, 2048, 32, 32, True, False, True): (3, 16, 1, 4), - (1024, 1024, 2048, 64, 64, False, True, True): (1, 32, 1, 8), - (1024, 1024, 2048, 64, 64, True, False, True): (3, 32, 1, 4), - (1024, 1024, 2048, 128, 128, False, True, True): (4, 16, 1, 16), - (1024, 1024, 2048, 128, 128, True, False, True): (1, 16, 3, 4), - (1024, 1024, 2048, 256, 256, False, True, True): (1, 8, 1, 32), - (1024, 1024, 2048, 256, 256, True, False, True): (1, 8, 1, 32), - (1024, 1024, 4096, 32, 32, False, True, True): (4, 64, 1, 8), - (1024, 1024, 4096, 32, 32, True, False, True): (3, 32, 1, 4), - (1024, 1024, 4096, 64, 64, False, True, True): (3, 64, 1, 8), - (1024, 1024, 4096, 64, 64, True, False, True): (1, 32, 3, 2), - (1024, 1024, 4096, 128, 128, False, True, True): (4, 32, 1, 16), - (1024, 1024, 4096, 128, 128, True, False, True): (2, 32, 2, 4), - (1024, 1024, 4096, 256, 256, False, True, True): (1, 16, 1, 32), - (1024, 1024, 4096, 256, 256, True, False, True): (7, 16, 1, 32), - (1024, 1024, 8192, 32, 32, False, True, True): (1, 128, 1, 8), - (1024, 1024, 8192, 32, 32, True, False, True): (4, 64, 1, 4), - (1024, 1024, 8192, 64, 64, False, True, True): (2, 128, 1, 8), - (1024, 1024, 8192, 64, 64, True, False, True): (3, 32, 3, 4), - (1024, 1024, 8192, 128, 128, False, True, True): (4, 64, 1, 16), - (1024, 1024, 8192, 128, 128, True, False, True): (2, 64, 2, 4), - (1024, 1024, 8192, 256, 256, False, True, True): (1, 32, 1, 32), - (1024, 1024, 8192, 256, 256, True, False, True): (1, 32, 1, 32), - (1024, 1024, 16384, 32, 32, False, True, True): (1, 256, 1, 8), - (1024, 1024, 16384, 32, 32, True, False, True): (1, 128, 1, 4), - (1024, 1024, 16384, 64, 64, False, True, True): (1, 256, 1, 8), - (1024, 1024, 16384, 64, 64, True, False, True): (4, 64, 3, 4), - (1024, 1024, 16384, 128, 128, False, True, True): (4, 128, 1, 16), - (1024, 1024, 16384, 128, 128, True, False, True): (1, 128, 3, 4), - (1024, 1024, 16384, 256, 256, False, True, True): (1, 64, 1, 32), - (1024, 1024, 16384, 256, 256, True, False, True): (1, 64, 1, 32), - (1024, 1024, 32768, 32, 32, False, True, True): (1, 512, 1, 8), - (1024, 1024, 32768, 32, 32, True, False, True): (1, 256, 3, 2), - (1024, 1024, 32768, 64, 64, False, True, True): (1, 256, 1, 4), - (1024, 1024, 32768, 64, 64, True, False, True): (4, 128, 3, 4), - (1024, 1024, 32768, 128, 128, False, True, True): (4, 256, 1, 16), - (1024, 1024, 32768, 128, 128, True, False, True): (2, 256, 3, 4), - (1024, 1024, 32768, 256, 256, False, True, True): (1, 128, 1, 32), - (1024, 1024, 32768, 256, 256, True, False, True): (2, 128, 1, 32), - (1024, 1024, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), - (1024, 1024, 65536, 32, 32, True, False, True): (1, 512, 3, 2), - (1024, 1024, 65536, 64, 64, False, True, True): (1, 512, 1, 4), - (1024, 1024, 65536, 64, 64, True, False, True): (2, 256, 3, 4), - (1024, 1024, 65536, 128, 128, False, True, True): (4, 512, 1, 16), - (1024, 1024, 65536, 128, 128, True, False, True): (4, 512, 3, 4), - (1024, 1024, 65536, 256, 256, False, True, True): (1, 256, 1, 32), - (1024, 1024, 65536, 256, 256, True, False, True): (1, 256, 1, 32), - (1024, 1024, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), - (1024, 1024, 65792, 32, 32, True, False, True): (1, 514, 3, 2), - (1024, 1024, 65792, 64, 64, False, True, True): (2, 514, 1, 4), - (1024, 
1024, 65792, 64, 64, True, False, True): (4, 257, 3, 4), - (1024, 1024, 65792, 128, 128, False, True, True): (2, 514, 1, 16), - (1024, 1024, 65792, 128, 128, True, False, True): (2, 514, 2, 4), - (1024, 1024, 65792, 256, 256, False, True, True): (1, 257, 1, 32), - (1024, 1024, 65792, 256, 256, True, False, True): (1, 257, 1, 32), - (1024, 1024, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), - (1024, 1024, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), - (1024, 1024, 131072, 64, 64, False, True, True): (2, 1024, 1, 4), - (1024, 1024, 131072, 64, 64, True, False, True): (2, 512, 3, 4), - (1024, 1024, 131072, 128, 128, False, True, True): (4, 1024, 1, 16), - (1024, 1024, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), - (1024, 1024, 131072, 256, 256, False, True, True): (1, 512, 1, 32), - (1024, 1024, 131072, 256, 256, True, False, True): (1, 512, 1, 32), - (1280, 5120, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), - (1280, 5120, 65792, 32, 32, True, False, True): (1, 514, 3, 2), - (1280, 5120, 65792, 64, 64, False, True, True): (1, 1028, 1, 8), - (1280, 5120, 65792, 64, 64, True, False, True): (2, 257, 3, 4), - (1280, 5120, 65792, 128, 128, False, True, True): (2, 514, 1, 16), - (1280, 5120, 65792, 128, 128, True, False, True): (1, 514, 3, 4), - (1280, 5120, 65792, 256, 256, False, True, True): (1, 257, 1, 32), - (1280, 5120, 65792, 256, 256, True, False, True): (1, 257, 1, 32), - (1536, 1536, 256, 32, 32, False, True, True): (1, 8, 1, 4), - (1536, 1536, 256, 32, 32, True, False, True): (2, 8, 1, 8), - (1536, 1536, 256, 64, 64, False, True, True): (4, 4, 1, 16), - (1536, 1536, 256, 64, 64, True, False, True): (1, 4, 4, 4), - (1536, 1536, 256, 128, 128, False, True, True): (2, 2, 1, 16), - (1536, 1536, 256, 128, 128, True, False, True): (2, 2, 3, 4), - (1536, 1536, 256, 256, 256, False, True, True): (1, 1, 1, 32), - (1536, 1536, 256, 256, 256, True, False, True): (1, 1, 1, 32), - (1536, 1536, 512, 32, 32, False, True, True): (1, 8, 1, 8), - (1536, 1536, 512, 32, 32, True, False, True): (3, 4, 4, 4), - (1536, 1536, 512, 64, 64, False, True, True): (3, 8, 1, 16), - (1536, 1536, 512, 64, 64, True, False, True): (1, 4, 3, 4), - (1536, 1536, 512, 128, 128, False, True, True): (1, 4, 1, 16), - (1536, 1536, 512, 128, 128, True, False, True): (2, 4, 4, 4), - (1536, 1536, 512, 256, 256, False, True, True): (1, 2, 1, 32), - (1536, 1536, 512, 256, 256, True, False, True): (1, 2, 1, 32), - (1536, 1536, 1024, 32, 32, False, True, True): (4, 16, 1, 8), - (1536, 1536, 1024, 32, 32, True, False, True): (2, 8, 1, 4), - (1536, 1536, 1024, 64, 64, False, True, True): (2, 16, 1, 16), - (1536, 1536, 1024, 64, 64, True, False, True): (2, 4, 3, 4), - (1536, 1536, 1024, 128, 128, False, True, True): (3, 8, 1, 32), - (1536, 1536, 1024, 128, 128, True, False, True): (4, 8, 3, 4), - (1536, 1536, 1024, 256, 256, False, True, True): (1, 4, 1, 32), - (1536, 1536, 1024, 256, 256, True, False, True): (1, 4, 1, 32), - (1536, 1536, 2048, 32, 32, False, True, True): (1, 32, 1, 8), - (1536, 1536, 2048, 32, 32, True, False, True): (1, 16, 1, 4), - (1536, 1536, 2048, 64, 64, False, True, True): (1, 32, 1, 8), - (1536, 1536, 2048, 64, 64, True, False, True): (1, 16, 2, 2), - (1536, 1536, 2048, 128, 128, False, True, True): (2, 16, 1, 16), - (1536, 1536, 2048, 128, 128, True, False, True): (4, 16, 2, 4), - (1536, 1536, 2048, 256, 256, False, True, True): (1, 8, 1, 32), - (1536, 1536, 2048, 256, 256, True, False, True): (1, 8, 1, 32), - (1536, 1536, 4096, 32, 32, False, True, True): (1, 64, 1, 8), - (1536, 
1536, 4096, 32, 32, True, False, True): (1, 32, 1, 4), - (1536, 1536, 4096, 64, 64, False, True, True): (3, 64, 1, 8), - (1536, 1536, 4096, 64, 64, True, False, True): (1, 32, 3, 2), - (1536, 1536, 4096, 128, 128, False, True, True): (1, 32, 1, 8), - (1536, 1536, 4096, 128, 128, True, False, True): (2, 32, 2, 4), - (1536, 1536, 4096, 256, 256, False, True, True): (1, 16, 1, 32), - (1536, 1536, 4096, 256, 256, True, False, True): (2, 16, 1, 32), - (1536, 1536, 8192, 32, 32, False, True, True): (1, 128, 1, 8), - (1536, 1536, 8192, 32, 32, True, False, True): (1, 64, 1, 4), - (1536, 1536, 8192, 64, 64, False, True, True): (3, 128, 1, 8), - (1536, 1536, 8192, 64, 64, True, False, True): (1, 64, 3, 2), - (1536, 1536, 8192, 128, 128, False, True, True): (1, 64, 1, 8), - (1536, 1536, 8192, 128, 128, True, False, True): (1, 64, 2, 4), - (1536, 1536, 8192, 256, 256, False, True, True): (1, 32, 1, 32), - (1536, 1536, 8192, 256, 256, True, False, True): (2, 32, 1, 32), - (1536, 1536, 16384, 32, 32, False, True, True): (1, 256, 1, 8), - (1536, 1536, 16384, 32, 32, True, False, True): (1, 128, 3, 2), - (1536, 1536, 16384, 64, 64, False, True, True): (2, 128, 1, 4), - (1536, 1536, 16384, 64, 64, True, False, True): (2, 64, 3, 4), - (1536, 1536, 16384, 128, 128, False, True, True): (1, 128, 1, 8), - (1536, 1536, 16384, 128, 128, True, False, True): (2, 128, 2, 4), - (1536, 1536, 16384, 256, 256, False, True, True): (1, 64, 1, 32), - (1536, 1536, 16384, 256, 256, True, False, True): (2, 64, 1, 32), - (1536, 1536, 32768, 32, 32, False, True, True): (1, 512, 1, 8), - (1536, 1536, 32768, 32, 32, True, False, True): (1, 256, 3, 2), - (1536, 1536, 32768, 64, 64, False, True, True): (1, 256, 1, 4), - (1536, 1536, 32768, 64, 64, True, False, True): (3, 128, 3, 4), - (1536, 1536, 32768, 128, 128, False, True, True): (1, 256, 1, 8), - (1536, 1536, 32768, 128, 128, True, False, True): (1, 256, 2, 4), - (1536, 1536, 32768, 256, 256, False, True, True): (1, 128, 1, 32), - (1536, 1536, 32768, 256, 256, True, False, True): (2, 128, 1, 32), - (1536, 1536, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), - (1536, 1536, 65536, 32, 32, True, False, True): (1, 512, 3, 2), - (1536, 1536, 65536, 64, 64, False, True, True): (1, 512, 1, 4), - (1536, 1536, 65536, 64, 64, True, False, True): (1, 512, 3, 2), - (1536, 1536, 65536, 128, 128, False, True, True): (1, 512, 1, 8), - (1536, 1536, 65536, 128, 128, True, False, True): (1, 512, 3, 4), - (1536, 1536, 65536, 256, 256, False, True, True): (1, 256, 1, 32), - (1536, 1536, 65536, 256, 256, True, False, True): (2, 256, 1, 32), - (1536, 1536, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), - (1536, 1536, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), - (1536, 1536, 131072, 64, 64, False, True, True): (3, 1024, 1, 4), - (1536, 1536, 131072, 64, 64, True, False, True): (3, 512, 3, 4), - (1536, 1536, 131072, 128, 128, False, True, True): (1, 1024, 1, 8), - (1536, 1536, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), - (1536, 1536, 131072, 256, 256, False, True, True): (1, 512, 1, 32), - (1536, 1536, 131072, 256, 256, True, False, True): (2, 512, 1, 32), - (2048, 2048, 256, 32, 32, False, True, True): (3, 8, 1, 4), - (2048, 2048, 256, 32, 32, True, False, True): (1, 4, 4, 2), - (2048, 2048, 256, 64, 64, False, True, True): (2, 4, 1, 16), - (2048, 2048, 256, 64, 64, True, False, True): (1, 2, 3, 4), - (2048, 2048, 256, 128, 128, False, True, True): (1, 2, 1, 8), - (2048, 2048, 256, 128, 128, True, False, True): (1, 2, 4, 4), - (2048, 2048, 256, 256, 256, False, 
True, True): (1, 1, 1, 32), - (2048, 2048, 256, 256, 256, True, False, True): (1, 1, 1, 32), - (2048, 2048, 512, 32, 32, False, True, True): (3, 8, 1, 8), - (2048, 2048, 512, 32, 32, True, False, True): (4, 4, 3, 2), - (2048, 2048, 512, 64, 64, False, True, True): (1, 8, 1, 8), - (2048, 2048, 512, 64, 64, True, False, True): (1, 8, 3, 4), - (2048, 2048, 512, 128, 128, False, True, True): (1, 4, 1, 8), - (2048, 2048, 512, 128, 128, True, False, True): (1, 4, 4, 4), - (2048, 2048, 512, 256, 256, False, True, True): (1, 2, 1, 32), - (2048, 2048, 512, 256, 256, True, False, True): (2, 2, 1, 32), - (2048, 2048, 1024, 32, 32, False, True, True): (1, 16, 1, 8), - (2048, 2048, 1024, 32, 32, True, False, True): (3, 8, 1, 4), - (2048, 2048, 1024, 64, 64, False, True, True): (4, 16, 1, 8), - (2048, 2048, 1024, 64, 64, True, False, True): (1, 8, 3, 2), - (2048, 2048, 1024, 128, 128, False, True, True): (4, 8, 1, 16), - (2048, 2048, 1024, 128, 128, True, False, True): (2, 8, 2, 4), - (2048, 2048, 1024, 256, 256, False, True, True): (1, 4, 1, 32), - (2048, 2048, 1024, 256, 256, True, False, True): (3, 4, 1, 32), - (2048, 2048, 2048, 32, 32, False, True, True): (1, 32, 1, 8), - (2048, 2048, 2048, 32, 32, True, False, True): (1, 16, 1, 4), - (2048, 2048, 2048, 64, 64, False, True, True): (1, 32, 1, 8), - (2048, 2048, 2048, 64, 64, True, False, True): (1, 16, 3, 2), - (2048, 2048, 2048, 128, 128, False, True, True): (4, 16, 1, 16), - (2048, 2048, 2048, 128, 128, True, False, True): (2, 16, 2, 4), - (2048, 2048, 2048, 256, 256, False, True, True): (1, 8, 1, 32), - (2048, 2048, 2048, 256, 256, True, False, True): (1, 8, 1, 32), - (2048, 2048, 4096, 32, 32, False, True, True): (1, 64, 1, 8), - (2048, 2048, 4096, 32, 32, True, False, True): (1, 32, 1, 4), - (2048, 2048, 4096, 64, 64, False, True, True): (4, 64, 1, 8), - (2048, 2048, 4096, 64, 64, True, False, True): (2, 16, 3, 4), - (2048, 2048, 4096, 128, 128, False, True, True): (4, 32, 1, 8), - (2048, 2048, 4096, 128, 128, True, False, True): (1, 32, 2, 4), - (2048, 2048, 4096, 256, 256, False, True, True): (1, 16, 1, 32), - (2048, 2048, 4096, 256, 256, True, False, True): (4, 16, 1, 32), - (2048, 2048, 8192, 32, 32, False, True, True): (1, 128, 1, 8), - (2048, 2048, 8192, 32, 32, True, False, True): (1, 64, 1, 4), - (2048, 2048, 8192, 64, 64, False, True, True): (2, 64, 1, 4), - (2048, 2048, 8192, 64, 64, True, False, True): (2, 32, 3, 4), - (2048, 2048, 8192, 128, 128, False, True, True): (4, 64, 1, 8), - (2048, 2048, 8192, 128, 128, True, False, True): (2, 64, 2, 4), - (2048, 2048, 8192, 256, 256, False, True, True): (1, 32, 1, 32), - (2048, 2048, 8192, 256, 256, True, False, True): (4, 32, 1, 32), - (2048, 2048, 16384, 32, 32, False, True, True): (1, 256, 1, 8), - (2048, 2048, 16384, 32, 32, True, False, True): (1, 128, 3, 2), - (2048, 2048, 16384, 64, 64, False, True, True): (2, 128, 1, 4), - (2048, 2048, 16384, 64, 64, True, False, True): (2, 64, 3, 4), - (2048, 2048, 16384, 128, 128, False, True, True): (1, 128, 1, 8), - (2048, 2048, 16384, 128, 128, True, False, True): (2, 128, 2, 4), - (2048, 2048, 16384, 256, 256, False, True, True): (1, 64, 1, 32), - (2048, 2048, 16384, 256, 256, True, False, True): (4, 64, 1, 32), - (2048, 2048, 32768, 32, 32, False, True, True): (1, 512, 1, 8), - (2048, 2048, 32768, 32, 32, True, False, True): (1, 256, 3, 2), - (2048, 2048, 32768, 64, 64, False, True, True): (2, 256, 1, 4), - (2048, 2048, 32768, 64, 64, True, False, True): (2, 128, 3, 4), - (2048, 2048, 32768, 128, 128, False, True, True): (1, 256, 1, 8), - 
(2048, 2048, 32768, 128, 128, True, False, True): (2, 256, 2, 4), - (2048, 2048, 32768, 256, 256, False, True, True): (1, 128, 1, 32), - (2048, 2048, 32768, 256, 256, True, False, True): (4, 128, 1, 32), - (2048, 2048, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), - (2048, 2048, 65536, 32, 32, True, False, True): (1, 512, 3, 2), - (2048, 2048, 65536, 64, 64, False, True, True): (1, 512, 1, 4), - (2048, 2048, 65536, 64, 64, True, False, True): (2, 256, 3, 4), - (2048, 2048, 65536, 128, 128, False, True, True): (1, 512, 1, 8), - (2048, 2048, 65536, 128, 128, True, False, True): (1, 512, 2, 4), - (2048, 2048, 65536, 256, 256, False, True, True): (1, 256, 1, 32), - (2048, 2048, 65536, 256, 256, True, False, True): (4, 256, 1, 32), - (2048, 2048, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), - (2048, 2048, 65792, 32, 32, True, False, True): (1, 514, 3, 2), - (2048, 2048, 65792, 64, 64, False, True, True): (1, 514, 1, 4), - (2048, 2048, 65792, 64, 64, True, False, True): (2, 257, 3, 4), - (2048, 2048, 65792, 128, 128, False, True, True): (1, 514, 1, 8), - (2048, 2048, 65792, 128, 128, True, False, True): (1, 514, 2, 4), - (2048, 2048, 65792, 256, 256, False, True, True): (1, 257, 1, 32), - (2048, 2048, 65792, 256, 256, True, False, True): (1, 257, 1, 32), - (2048, 2048, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), - (2048, 2048, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), - (2048, 2048, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), - (2048, 2048, 131072, 64, 64, True, False, True): (2, 512, 3, 4), - (2048, 2048, 131072, 128, 128, False, True, True): (1, 1024, 1, 8), - (2048, 2048, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), - (2048, 2048, 131072, 256, 256, False, True, True): (1, 512, 1, 32), - (2048, 2048, 131072, 256, 256, True, False, True): (4, 512, 1, 32), - (3072, 768, 256, 32, 32, False, True, True): (5, 4, 1, 8), - (3072, 768, 256, 32, 32, True, False, True): (2, 2, 4, 4), - (3072, 768, 256, 64, 64, False, True, True): (1, 4, 1, 16), - (3072, 768, 256, 64, 64, True, False, True): (2, 2, 3, 4), - (3072, 768, 256, 128, 128, False, True, True): (5, 2, 1, 16), - (3072, 768, 256, 128, 128, True, False, True): (1, 2, 5, 4), - (3072, 768, 256, 256, 256, False, True, True): (1, 1, 1, 32), - (3072, 768, 256, 256, 256, True, False, True): (1, 1, 1, 32), - (3072, 768, 512, 32, 32, False, True, True): (1, 8, 1, 8), - (3072, 768, 512, 32, 32, True, False, True): (5, 4, 1, 4), - (3072, 768, 512, 64, 64, False, True, True): (1, 8, 1, 8), - (3072, 768, 512, 64, 64, True, False, True): (3, 2, 3, 4), - (3072, 768, 512, 128, 128, False, True, True): (3, 4, 1, 32), - (3072, 768, 512, 128, 128, True, False, True): (2, 4, 3, 4), - (3072, 768, 512, 256, 256, False, True, True): (1, 2, 1, 32), - (3072, 768, 512, 256, 256, True, False, True): (2, 2, 1, 32), - (3072, 768, 1024, 32, 32, False, True, True): (2, 16, 1, 8), - (3072, 768, 1024, 32, 32, True, False, True): (3, 8, 1, 4), - (3072, 768, 1024, 64, 64, False, True, True): (4, 16, 1, 8), - (3072, 768, 1024, 64, 64, True, False, True): (1, 8, 3, 2), - (3072, 768, 1024, 128, 128, False, True, True): (2, 8, 1, 32), - (3072, 768, 1024, 128, 128, True, False, True): (3, 8, 2, 4), - (3072, 768, 1024, 256, 256, False, True, True): (1, 4, 1, 32), - (3072, 768, 1024, 256, 256, True, False, True): (4, 4, 1, 32), - (3072, 768, 2048, 32, 32, False, True, True): (1, 32, 1, 8), - (3072, 768, 2048, 32, 32, True, False, True): (1, 16, 1, 4), - (3072, 768, 2048, 64, 64, False, True, True): (2, 32, 1, 8), - (3072, 768, 2048, 64, 
64, True, False, True): (2, 8, 3, 4), - (3072, 768, 2048, 128, 128, False, True, True): (2, 16, 1, 16), - (3072, 768, 2048, 128, 128, True, False, True): (2, 16, 1, 4), - (3072, 768, 2048, 256, 256, False, True, True): (1, 8, 1, 32), - (3072, 768, 2048, 256, 256, True, False, True): (2, 8, 1, 32), - (3072, 768, 4096, 32, 32, False, True, True): (1, 64, 1, 8), - (3072, 768, 4096, 32, 32, True, False, True): (1, 32, 1, 2), - (3072, 768, 4096, 64, 64, False, True, True): (2, 64, 1, 8), - (3072, 768, 4096, 64, 64, True, False, True): (2, 32, 2, 2), - (3072, 768, 4096, 128, 128, False, True, True): (1, 32, 1, 8), - (3072, 768, 4096, 128, 128, True, False, True): (2, 32, 2, 4), - (3072, 768, 4096, 256, 256, False, True, True): (1, 16, 1, 32), - (3072, 768, 4096, 256, 256, True, False, True): (4, 16, 1, 32), - (3072, 768, 8192, 32, 32, False, True, True): (1, 128, 1, 8), - (3072, 768, 8192, 32, 32, True, False, True): (3, 64, 1, 2), - (3072, 768, 8192, 64, 64, False, True, True): (1, 128, 1, 8), - (3072, 768, 8192, 64, 64, True, False, True): (2, 64, 2, 2), - (3072, 768, 8192, 128, 128, False, True, True): (1, 64, 1, 8), - (3072, 768, 8192, 128, 128, True, False, True): (2, 64, 2, 4), - (3072, 768, 8192, 256, 256, False, True, True): (1, 32, 1, 32), - (3072, 768, 8192, 256, 256, True, False, True): (4, 32, 1, 32), - (3072, 768, 16384, 32, 32, False, True, True): (1, 256, 1, 8), - (3072, 768, 16384, 32, 32, True, False, True): (1, 128, 1, 2), - (3072, 768, 16384, 64, 64, False, True, True): (2, 128, 1, 4), - (3072, 768, 16384, 64, 64, True, False, True): (1, 128, 2, 2), - (3072, 768, 16384, 128, 128, False, True, True): (1, 128, 1, 8), - (3072, 768, 16384, 128, 128, True, False, True): (1, 128, 1, 4), - (3072, 768, 16384, 256, 256, False, True, True): (1, 64, 1, 32), - (3072, 768, 16384, 256, 256, True, False, True): (4, 64, 1, 32), - (3072, 768, 32768, 32, 32, False, True, True): (1, 512, 1, 8), - (3072, 768, 32768, 32, 32, True, False, True): (1, 256, 1, 2), - (3072, 768, 32768, 64, 64, False, True, True): (1, 256, 1, 4), - (3072, 768, 32768, 64, 64, True, False, True): (2, 256, 2, 2), - (3072, 768, 32768, 128, 128, False, True, True): (1, 256, 1, 8), - (3072, 768, 32768, 128, 128, True, False, True): (2, 256, 1, 4), - (3072, 768, 32768, 256, 256, False, True, True): (1, 128, 1, 32), - (3072, 768, 32768, 256, 256, True, False, True): (4, 128, 1, 32), - (3072, 768, 50432, 32, 32, False, True, True): (1, 788, 1, 8), - (3072, 768, 50432, 32, 32, True, False, True): (1, 394, 1, 2), - (3072, 768, 50432, 64, 64, False, True, True): (2, 394, 1, 4), - (3072, 768, 50432, 64, 64, True, False, True): (2, 394, 2, 2), - (3072, 768, 50432, 128, 128, False, True, True): (1, 394, 1, 8), - (3072, 768, 50432, 128, 128, True, False, True): (2, 394, 1, 4), - (3072, 768, 50432, 256, 256, False, True, True): (1, 197, 1, 32), - (3072, 768, 50432, 256, 256, True, False, True): (1, 197, 1, 32), - (3072, 768, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), - (3072, 768, 65536, 32, 32, True, False, True): (1, 512, 1, 2), - (3072, 768, 65536, 64, 64, False, True, True): (1, 512, 1, 4), - (3072, 768, 65536, 64, 64, True, False, True): (2, 512, 2, 2), - (3072, 768, 65536, 128, 128, False, True, True): (1, 512, 1, 8), - (3072, 768, 65536, 128, 128, True, False, True): (2, 512, 1, 4), - (3072, 768, 65536, 256, 256, False, True, True): (1, 256, 1, 32), - (3072, 768, 65536, 256, 256, True, False, True): (4, 256, 1, 32), - (3072, 768, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), - (3072, 768, 131072, 32, 32, True, 
False, True): (1, 1024, 1, 2), - (3072, 768, 131072, 64, 64, False, True, True): (2, 1024, 1, 4), - (3072, 768, 131072, 64, 64, True, False, True): (2, 1024, 2, 2), - (3072, 768, 131072, 128, 128, False, True, True): (1, 1024, 1, 8), - (3072, 768, 131072, 128, 128, True, False, True): (2, 1024, 1, 4), - (3072, 768, 131072, 256, 256, False, True, True): (1, 512, 1, 32), - (3072, 768, 131072, 256, 256, True, False, True): (4, 512, 1, 32), - (3072, 3072, 256, 32, 32, False, True, True): (1, 4, 1, 8), - (3072, 3072, 256, 32, 32, True, False, True): (2, 2, 5, 4), - (3072, 3072, 256, 64, 64, False, True, True): (2, 4, 1, 16), - (3072, 3072, 256, 64, 64, True, False, True): (3, 2, 3, 4), - (3072, 3072, 256, 128, 128, False, True, True): (1, 2, 1, 8), - (3072, 3072, 256, 128, 128, True, False, True): (1, 2, 5, 4), - (3072, 3072, 256, 256, 256, False, True, True): (1, 1, 1, 32), - (3072, 3072, 256, 256, 256, True, False, True): (1, 1, 1, 32), - (3072, 3072, 512, 32, 32, False, True, True): (1, 8, 1, 8), - (3072, 3072, 512, 32, 32, True, False, True): (3, 2, 3, 4), - (3072, 3072, 512, 64, 64, False, True, True): (1, 8, 1, 8), - (3072, 3072, 512, 64, 64, True, False, True): (3, 2, 3, 4), - (3072, 3072, 512, 128, 128, False, True, True): (2, 4, 1, 8), - (3072, 3072, 512, 128, 128, True, False, True): (2, 4, 4, 4), - (3072, 3072, 512, 256, 256, False, True, True): (1, 2, 1, 32), - (3072, 3072, 512, 256, 256, True, False, True): (1, 2, 1, 32), - (3072, 3072, 1024, 32, 32, False, True, True): (1, 16, 1, 8), - (3072, 3072, 1024, 32, 32, True, False, True): (3, 8, 3, 4), - (3072, 3072, 1024, 64, 64, False, True, True): (2, 16, 1, 8), - (3072, 3072, 1024, 64, 64, True, False, True): (2, 4, 3, 4), - (3072, 3072, 1024, 128, 128, False, True, True): (1, 8, 1, 8), - (3072, 3072, 1024, 128, 128, True, False, True): (3, 8, 2, 4), - (3072, 3072, 1024, 256, 256, False, True, True): (1, 4, 1, 32), - (3072, 3072, 1024, 256, 256, True, False, True): (3, 4, 1, 32), - (3072, 3072, 2048, 32, 32, False, True, True): (1, 32, 1, 8), - (3072, 3072, 2048, 32, 32, True, False, True): (1, 16, 1, 4), - (3072, 3072, 2048, 64, 64, False, True, True): (1, 32, 1, 8), - (3072, 3072, 2048, 64, 64, True, False, True): (1, 16, 3, 2), - (3072, 3072, 2048, 128, 128, False, True, True): (1, 16, 1, 8), - (3072, 3072, 2048, 128, 128, True, False, True): (2, 16, 2, 4), - (3072, 3072, 2048, 256, 256, False, True, True): (1, 8, 1, 32), - (3072, 3072, 2048, 256, 256, True, False, True): (3, 8, 1, 32), - (3072, 3072, 4096, 32, 32, False, True, True): (1, 64, 1, 8), - (3072, 3072, 4096, 32, 32, True, False, True): (1, 32, 1, 4), - (3072, 3072, 4096, 64, 64, False, True, True): (1, 64, 1, 8), - (3072, 3072, 4096, 64, 64, True, False, True): (3, 16, 3, 4), - (3072, 3072, 4096, 128, 128, False, True, True): (1, 32, 1, 8), - (3072, 3072, 4096, 128, 128, True, False, True): (2, 32, 2, 4), - (3072, 3072, 4096, 256, 256, False, True, True): (1, 16, 1, 32), - (3072, 3072, 4096, 256, 256, True, False, True): (2, 16, 1, 32), - (3072, 3072, 8192, 32, 32, False, True, True): (1, 128, 1, 8), - (3072, 3072, 8192, 32, 32, True, False, True): (1, 64, 1, 2), - (3072, 3072, 8192, 64, 64, False, True, True): (1, 64, 1, 4), - (3072, 3072, 8192, 64, 64, True, False, True): (1, 64, 3, 2), - (3072, 3072, 8192, 128, 128, False, True, True): (1, 64, 1, 8), - (3072, 3072, 8192, 128, 128, True, False, True): (2, 64, 2, 4), - (3072, 3072, 8192, 256, 256, False, True, True): (1, 32, 1, 32), - (3072, 3072, 8192, 256, 256, True, False, True): (4, 32, 1, 32), - (3072, 3072, 
16384, 32, 32, False, True, True): (1, 256, 1, 8), - (3072, 3072, 16384, 32, 32, True, False, True): (1, 128, 3, 2), - (3072, 3072, 16384, 64, 64, False, True, True): (1, 128, 1, 4), - (3072, 3072, 16384, 64, 64, True, False, True): (2, 64, 3, 4), - (3072, 3072, 16384, 128, 128, False, True, True): (1, 128, 1, 8), - (3072, 3072, 16384, 128, 128, True, False, True): (1, 128, 2, 4), - (3072, 3072, 16384, 256, 256, False, True, True): (1, 64, 1, 32), - (3072, 3072, 16384, 256, 256, True, False, True): (4, 64, 1, 32), - (3072, 3072, 32768, 32, 32, False, True, True): (1, 512, 1, 8), - (3072, 3072, 32768, 32, 32, True, False, True): (1, 256, 3, 2), - (3072, 3072, 32768, 64, 64, False, True, True): (1, 256, 1, 4), - (3072, 3072, 32768, 64, 64, True, False, True): (1, 256, 3, 2), - (3072, 3072, 32768, 128, 128, False, True, True): (1, 256, 1, 8), - (3072, 3072, 32768, 128, 128, True, False, True): (1, 256, 2, 4), - (3072, 3072, 32768, 256, 256, False, True, True): (1, 128, 1, 32), - (3072, 3072, 32768, 256, 256, True, False, True): (4, 128, 1, 32), - (3072, 3072, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), - (3072, 3072, 65536, 32, 32, True, False, True): (1, 512, 3, 2), - (3072, 3072, 65536, 64, 64, False, True, True): (1, 512, 1, 4), - (3072, 3072, 65536, 64, 64, True, False, True): (2, 256, 3, 4), - (3072, 3072, 65536, 128, 128, False, True, True): (1, 512, 1, 8), - (3072, 3072, 65536, 128, 128, True, False, True): (1, 512, 3, 4), - (3072, 3072, 65536, 256, 256, False, True, True): (1, 256, 1, 32), - (3072, 3072, 65536, 256, 256, True, False, True): (4, 256, 1, 32), - (3072, 3072, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), - (3072, 3072, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), - (3072, 3072, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), - (3072, 3072, 131072, 64, 64, True, False, True): (1, 1024, 3, 2), - (3072, 3072, 131072, 128, 128, False, True, True): (1, 1024, 1, 8), - (3072, 3072, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), - (3072, 3072, 131072, 256, 256, False, True, True): (1, 512, 1, 32), - (3072, 3072, 131072, 256, 256, True, False, True): (4, 512, 1, 32), - (4096, 4096, 256, 32, 32, False, True, True): (1, 4, 1, 8), - (4096, 4096, 256, 32, 32, True, False, True): (5, 2, 3, 4), - (4096, 4096, 256, 64, 64, False, True, True): (3, 4, 1, 8), - (4096, 4096, 256, 64, 64, True, False, True): (3, 4, 3, 2), - (4096, 4096, 256, 128, 128, False, True, True): (1, 2, 1, 8), - (4096, 4096, 256, 128, 128, True, False, True): (2, 2, 4, 4), - (4096, 4096, 256, 256, 256, False, True, True): (1, 1, 1, 32), - (4096, 4096, 256, 256, 256, True, False, True): (1, 1, 1, 32), - (4096, 4096, 512, 32, 32, False, True, True): (1, 8, 1, 8), - (4096, 4096, 512, 32, 32, True, False, True): (1, 4, 1, 4), - (4096, 4096, 512, 64, 64, False, True, True): (1, 8, 1, 8), - (4096, 4096, 512, 64, 64, True, False, True): (3, 4, 2, 2), - (4096, 4096, 512, 128, 128, False, True, True): (2, 4, 1, 8), - (4096, 4096, 512, 128, 128, True, False, True): (2, 4, 2, 4), - (4096, 4096, 512, 256, 256, False, True, True): (2, 2, 1, 32), - (4096, 4096, 512, 256, 256, True, False, True): (2, 2, 1, 32), - (4096, 4096, 1024, 32, 32, False, True, True): (4, 16, 1, 8), - (4096, 4096, 1024, 32, 32, True, False, True): (1, 8, 1, 4), - (4096, 4096, 1024, 64, 64, False, True, True): (1, 16, 1, 8), - (4096, 4096, 1024, 64, 64, True, False, True): (4, 4, 3, 4), - (4096, 4096, 1024, 128, 128, False, True, True): (2, 8, 1, 8), - (4096, 4096, 1024, 128, 128, True, False, True): (1, 8, 3, 4), - (4096, 
4096, 1024, 256, 256, False, True, True): (1, 4, 1, 32), - (4096, 4096, 1024, 256, 256, True, False, True): (6, 4, 1, 32), - (4096, 4096, 2048, 32, 32, False, True, True): (1, 32, 1, 8), - (4096, 4096, 2048, 32, 32, True, False, True): (1, 16, 1, 4), - (4096, 4096, 2048, 64, 64, False, True, True): (4, 32, 1, 8), - (4096, 4096, 2048, 64, 64, True, False, True): (4, 8, 3, 4), - (4096, 4096, 2048, 128, 128, False, True, True): (2, 16, 1, 8), - (4096, 4096, 2048, 128, 128, True, False, True): (1, 16, 3, 4), - (4096, 4096, 2048, 256, 256, False, True, True): (1, 8, 1, 32), - (4096, 4096, 2048, 256, 256, True, False, True): (4, 8, 1, 32), - (4096, 4096, 4096, 32, 32, False, True, True): (1, 64, 1, 8), - (4096, 4096, 4096, 32, 32, True, False, True): (1, 32, 1, 4), - (4096, 4096, 4096, 64, 64, False, True, True): (1, 64, 1, 8), - (4096, 4096, 4096, 64, 64, True, False, True): (1, 32, 3, 2), - (4096, 4096, 4096, 128, 128, False, True, True): (1, 32, 1, 8), - (4096, 4096, 4096, 128, 128, True, False, True): (2, 32, 3, 4), - (4096, 4096, 4096, 256, 256, False, True, True): (1, 16, 1, 32), - (4096, 4096, 4096, 256, 256, True, False, True): (4, 16, 1, 32), - (4096, 4096, 8192, 32, 32, False, True, True): (1, 128, 1, 8), - (4096, 4096, 8192, 32, 32, True, False, True): (1, 64, 1, 4), - (4096, 4096, 8192, 64, 64, False, True, True): (1, 128, 1, 8), - (4096, 4096, 8192, 64, 64, True, False, True): (1, 64, 3, 2), - (4096, 4096, 8192, 128, 128, False, True, True): (1, 64, 1, 8), - (4096, 4096, 8192, 128, 128, True, False, True): (1, 64, 3, 4), - (4096, 4096, 8192, 256, 256, False, True, True): (1, 32, 1, 32), - (4096, 4096, 8192, 256, 256, True, False, True): (4, 32, 1, 32), - (4096, 4096, 16384, 32, 32, False, True, True): (1, 256, 1, 8), - (4096, 4096, 16384, 32, 32, True, False, True): (1, 128, 3, 2), - (4096, 4096, 16384, 64, 64, False, True, True): (1, 128, 1, 4), - (4096, 4096, 16384, 64, 64, True, False, True): (4, 64, 3, 4), - (4096, 4096, 16384, 128, 128, False, True, True): (1, 128, 1, 8), - (4096, 4096, 16384, 128, 128, True, False, True): (1, 128, 3, 4), - (4096, 4096, 16384, 256, 256, False, True, True): (1, 64, 1, 32), - (4096, 4096, 16384, 256, 256, True, False, True): (4, 64, 1, 32), - (4096, 4096, 32768, 32, 32, False, True, True): (1, 512, 1, 8), - (4096, 4096, 32768, 32, 32, True, False, True): (1, 256, 3, 2), - (4096, 4096, 32768, 64, 64, False, True, True): (1, 256, 1, 4), - (4096, 4096, 32768, 64, 64, True, False, True): (1, 256, 3, 2), - (4096, 4096, 32768, 128, 128, False, True, True): (1, 256, 1, 8), - (4096, 4096, 32768, 128, 128, True, False, True): (1, 256, 3, 4), - (4096, 4096, 32768, 256, 256, False, True, True): (1, 128, 1, 32), - (4096, 4096, 32768, 256, 256, True, False, True): (4, 128, 1, 32), - (4096, 4096, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), - (4096, 4096, 65536, 32, 32, True, False, True): (1, 512, 3, 2), - (4096, 4096, 65536, 64, 64, False, True, True): (1, 512, 1, 4), - (4096, 4096, 65536, 64, 64, True, False, True): (4, 256, 3, 4), - (4096, 4096, 65536, 128, 128, False, True, True): (1, 512, 1, 8), - (4096, 4096, 65536, 128, 128, True, False, True): (1, 512, 3, 4), - (4096, 4096, 65536, 256, 256, False, True, True): (1, 256, 1, 32), - (4096, 4096, 65536, 256, 256, True, False, True): (4, 256, 1, 32), - (4096, 4096, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), - (4096, 4096, 65792, 32, 32, True, False, True): (1, 514, 3, 2), - (4096, 4096, 65792, 64, 64, False, True, True): (1, 1028, 1, 8), - (4096, 4096, 65792, 64, 64, True, False, True): (1, 
514, 3, 2), - (4096, 4096, 65792, 128, 128, False, True, True): (1, 514, 1, 8), - (4096, 4096, 65792, 128, 128, True, False, True): (1, 514, 2, 4), - (4096, 4096, 65792, 256, 256, False, True, True): (1, 257, 1, 32), - (4096, 4096, 65792, 256, 256, True, False, True): (1, 257, 1, 32), - (4096, 4096, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), - (4096, 4096, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), - (4096, 4096, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), - (4096, 4096, 131072, 64, 64, True, False, True): (1, 1024, 3, 2), - (4096, 4096, 131072, 128, 128, False, True, True): (1, 1024, 1, 8), - (4096, 4096, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), - (4096, 4096, 131072, 256, 256, False, True, True): (1, 512, 1, 32), - (4096, 4096, 131072, 256, 256, True, False, True): (4, 512, 1, 32), - (5120, 1280, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), - (5120, 1280, 65792, 32, 32, True, False, True): (1, 514, 1, 2), - (5120, 1280, 65792, 64, 64, False, True, True): (1, 514, 1, 4), - (5120, 1280, 65792, 64, 64, True, False, True): (1, 514, 2, 2), - (5120, 1280, 65792, 128, 128, False, True, True): (1, 514, 1, 8), - (5120, 1280, 65792, 128, 128, True, False, True): (1, 514, 2, 4), - (5120, 1280, 65792, 256, 256, False, True, True): (1, 257, 1, 32), - (5120, 1280, 65792, 256, 256, True, False, True): (1, 257, 1, 32), - (6144, 6144, 256, 32, 32, False, True, True): (2, 4, 1, 8), - (6144, 6144, 256, 32, 32, True, False, True): (2, 1, 4, 4), - (6144, 6144, 256, 64, 64, False, True, True): (1, 4, 1, 8), - (6144, 6144, 256, 64, 64, True, False, True): (5, 1, 3, 4), - (6144, 6144, 256, 128, 128, False, True, True): (1, 2, 1, 8), - (6144, 6144, 256, 128, 128, True, False, True): (1, 2, 3, 4), - (6144, 6144, 256, 256, 256, False, True, True): (1, 1, 1, 32), - (6144, 6144, 256, 256, 256, True, False, True): (1, 1, 1, 32), - (6144, 6144, 512, 32, 32, False, True, True): (1, 8, 1, 8), - (6144, 6144, 512, 32, 32, True, False, True): (1, 4, 4, 2), - (6144, 6144, 512, 64, 64, False, True, True): (2, 8, 1, 8), - (6144, 6144, 512, 64, 64, True, False, True): (2, 2, 3, 4), - (6144, 6144, 512, 128, 128, False, True, True): (3, 4, 1, 8), - (6144, 6144, 512, 128, 128, True, False, True): (2, 4, 3, 4), - (6144, 6144, 512, 256, 256, False, True, True): (1, 2, 1, 32), - (6144, 6144, 512, 256, 256, True, False, True): (2, 2, 1, 32), - (6144, 6144, 1024, 32, 32, False, True, True): (1, 16, 1, 8), - (6144, 6144, 1024, 32, 32, True, False, True): (1, 8, 1, 4), - (6144, 6144, 1024, 64, 64, False, True, True): (1, 16, 1, 8), - (6144, 6144, 1024, 64, 64, True, False, True): (4, 4, 3, 4), - (6144, 6144, 1024, 128, 128, False, True, True): (1, 8, 1, 8), - (6144, 6144, 1024, 128, 128, True, False, True): (3, 8, 3, 4), - (6144, 6144, 1024, 256, 256, False, True, True): (1, 4, 1, 32), - (6144, 6144, 1024, 256, 256, True, False, True): (1, 4, 1, 32), - (6144, 6144, 2048, 32, 32, False, True, True): (1, 32, 1, 8), - (6144, 6144, 2048, 32, 32, True, False, True): (1, 16, 1, 4), - (6144, 6144, 2048, 64, 64, False, True, True): (1, 32, 1, 8), - (6144, 6144, 2048, 64, 64, True, False, True): (4, 8, 3, 4), - (6144, 6144, 2048, 128, 128, False, True, True): (1, 16, 1, 8), - (6144, 6144, 2048, 128, 128, True, False, True): (3, 16, 3, 4), - (6144, 6144, 2048, 256, 256, False, True, True): (1, 8, 1, 32), - (6144, 6144, 2048, 256, 256, True, False, True): (4, 8, 1, 32), - (6144, 6144, 4096, 32, 32, False, True, True): (1, 64, 1, 8), - (6144, 6144, 4096, 32, 32, True, False, True): (1, 32, 1, 4), - 
(6144, 6144, 4096, 64, 64, False, True, True): (1, 64, 1, 8), - (6144, 6144, 4096, 64, 64, True, False, True): (4, 16, 3, 4), - (6144, 6144, 4096, 128, 128, False, True, True): (1, 32, 1, 8), - (6144, 6144, 4096, 128, 128, True, False, True): (4, 32, 3, 4), - (6144, 6144, 4096, 256, 256, False, True, True): (1, 16, 1, 32), - (6144, 6144, 4096, 256, 256, True, False, True): (4, 16, 1, 32), - (6144, 6144, 8192, 32, 32, False, True, True): (1, 128, 1, 8), - (6144, 6144, 8192, 32, 32, True, False, True): (1, 64, 1, 4), - (6144, 6144, 8192, 64, 64, False, True, True): (1, 128, 1, 8), - (6144, 6144, 8192, 64, 64, True, False, True): (4, 32, 3, 4), - (6144, 6144, 8192, 128, 128, False, True, True): (1, 64, 1, 8), - (6144, 6144, 8192, 128, 128, True, False, True): (1, 64, 3, 4), - (6144, 6144, 8192, 256, 256, False, True, True): (1, 32, 1, 32), - (6144, 6144, 8192, 256, 256, True, False, True): (4, 32, 1, 32), - (6144, 6144, 16384, 32, 32, False, True, True): (1, 256, 1, 8), - (6144, 6144, 16384, 32, 32, True, False, True): (1, 128, 1, 4), - (6144, 6144, 16384, 64, 64, False, True, True): (1, 256, 1, 8), - (6144, 6144, 16384, 64, 64, True, False, True): (4, 64, 3, 4), - (6144, 6144, 16384, 128, 128, False, True, True): (1, 128, 1, 8), - (6144, 6144, 16384, 128, 128, True, False, True): (4, 128, 3, 4), - (6144, 6144, 16384, 256, 256, False, True, True): (1, 64, 1, 32), - (6144, 6144, 16384, 256, 256, True, False, True): (4, 64, 1, 32), - (6144, 6144, 32768, 32, 32, False, True, True): (1, 512, 1, 8), - (6144, 6144, 32768, 32, 32, True, False, True): (1, 256, 1, 4), - (6144, 6144, 32768, 64, 64, False, True, True): (1, 512, 1, 8), - (6144, 6144, 32768, 64, 64, True, False, True): (4, 128, 3, 4), - (6144, 6144, 32768, 128, 128, False, True, True): (1, 256, 1, 8), - (6144, 6144, 32768, 128, 128, True, False, True): (1, 256, 3, 4), - (6144, 6144, 32768, 256, 256, False, True, True): (1, 128, 1, 32), - (6144, 6144, 32768, 256, 256, True, False, True): (4, 128, 1, 32), - (6144, 6144, 65536, 32, 32, False, True, True): (1, 1024, 1, 8), - (6144, 6144, 65536, 32, 32, True, False, True): (1, 512, 1, 4), - (6144, 6144, 65536, 64, 64, False, True, True): (1, 1024, 1, 8), - (6144, 6144, 65536, 64, 64, True, False, True): (4, 256, 3, 4), - (6144, 6144, 65536, 128, 128, False, True, True): (1, 512, 1, 8), - (6144, 6144, 65536, 128, 128, True, False, True): (1, 512, 3, 4), - (6144, 6144, 65536, 256, 256, False, True, True): (1, 256, 1, 32), - (6144, 6144, 65536, 256, 256, True, False, True): (4, 256, 1, 32), - (6144, 6144, 131072, 32, 32, False, True, True): (1, 2048, 1, 8), - (6144, 6144, 131072, 32, 32, True, False, True): (1, 1024, 1, 4), - (6144, 6144, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), - (6144, 6144, 131072, 64, 64, True, False, True): (4, 512, 3, 4), - (6144, 6144, 131072, 128, 128, False, True, True): (1, 1024, 1, 8), - (6144, 6144, 131072, 128, 128, True, False, True): (1, 1024, 3, 4), - (6144, 6144, 131072, 256, 256, False, True, True): (1, 512, 1, 32), - (6144, 6144, 131072, 256, 256, True, False, True): (4, 512, 1, 32), - (8192, 8192, 256, 32, 32, False, True, True): (1, 4, 1, 8), - (8192, 8192, 256, 32, 32, True, False, True): (3, 2, 3, 4), - (8192, 8192, 256, 64, 64, False, True, True): (1, 4, 1, 4), - (8192, 8192, 256, 64, 64, True, False, True): (1, 4, 1, 4), - (8192, 8192, 256, 128, 128, False, True, True): (1, 2, 1, 8), - (8192, 8192, 256, 128, 128, True, False, True): (2, 2, 3, 4), - (8192, 8192, 256, 256, 256, False, True, True): (1, 1, 1, 32), - (8192, 8192, 256, 256, 256, 
True, False, True): (1, 1, 1, 32), - (8192, 8192, 512, 32, 32, False, True, True): (4, 8, 1, 8), - (8192, 8192, 512, 32, 32, True, False, True): (2, 4, 4, 2), - (8192, 8192, 512, 64, 64, False, True, True): (4, 4, 1, 4), - (8192, 8192, 512, 64, 64, True, False, True): (3, 2, 3, 4), - (8192, 8192, 512, 128, 128, False, True, True): (1, 4, 1, 8), - (8192, 8192, 512, 128, 128, True, False, True): (1, 4, 3, 4), - (8192, 8192, 512, 256, 256, False, True, True): (1, 2, 1, 32), - (8192, 8192, 512, 256, 256, True, False, True): (1, 2, 1, 32), - (8192, 8192, 1024, 32, 32, False, True, True): (4, 16, 1, 8), - (8192, 8192, 1024, 32, 32, True, False, True): (1, 8, 3, 2), - (8192, 8192, 1024, 64, 64, False, True, True): (4, 8, 1, 4), - (8192, 8192, 1024, 64, 64, True, False, True): (4, 4, 3, 4), - (8192, 8192, 1024, 128, 128, False, True, True): (1, 8, 1, 8), - (8192, 8192, 1024, 128, 128, True, False, True): (1, 8, 3, 4), - (8192, 8192, 1024, 256, 256, False, True, True): (1, 4, 1, 32), - (8192, 8192, 1024, 256, 256, True, False, True): (4, 4, 1, 32), - (8192, 8192, 2048, 32, 32, False, True, True): (4, 32, 1, 8), - (8192, 8192, 2048, 32, 32, True, False, True): (1, 16, 3, 2), - (8192, 8192, 2048, 64, 64, False, True, True): (4, 32, 1, 8), - (8192, 8192, 2048, 64, 64, True, False, True): (4, 8, 3, 4), - (8192, 8192, 2048, 128, 128, False, True, True): (4, 16, 1, 8), - (8192, 8192, 2048, 128, 128, True, False, True): (4, 16, 3, 4), - (8192, 8192, 2048, 256, 256, False, True, True): (1, 8, 1, 32), - (8192, 8192, 2048, 256, 256, True, False, True): (4, 8, 1, 32), - (8192, 8192, 4096, 32, 32, False, True, True): (4, 64, 1, 8), - (8192, 8192, 4096, 32, 32, True, False, True): (2, 32, 3, 2), - (8192, 8192, 4096, 64, 64, False, True, True): (4, 64, 1, 8), - (8192, 8192, 4096, 64, 64, True, False, True): (4, 16, 3, 4), - (8192, 8192, 4096, 128, 128, False, True, True): (4, 32, 1, 8), - (8192, 8192, 4096, 128, 128, True, False, True): (4, 32, 3, 4), - (8192, 8192, 4096, 256, 256, False, True, True): (1, 16, 1, 32), - (8192, 8192, 4096, 256, 256, True, False, True): (2, 16, 1, 32), - (8192, 8192, 8192, 32, 32, False, True, True): (4, 128, 1, 8), - (8192, 8192, 8192, 32, 32, True, False, True): (1, 64, 3, 2), - (8192, 8192, 8192, 64, 64, False, True, True): (4, 64, 1, 4), - (8192, 8192, 8192, 64, 64, True, False, True): (4, 32, 3, 4), - (8192, 8192, 8192, 128, 128, False, True, True): (4, 64, 1, 16), - (8192, 8192, 8192, 128, 128, True, False, True): (4, 64, 3, 4), - (8192, 8192, 8192, 256, 256, False, True, True): (1, 32, 1, 32), - (8192, 8192, 8192, 256, 256, True, False, True): (4, 32, 1, 32), - (8192, 8192, 16384, 32, 32, False, True, True): (4, 256, 1, 8), - (8192, 8192, 16384, 32, 32, True, False, True): (4, 128, 3, 2), - (8192, 8192, 16384, 64, 64, False, True, True): (4, 128, 1, 4), - (8192, 8192, 16384, 64, 64, True, False, True): (4, 64, 3, 4), - (8192, 8192, 16384, 128, 128, False, True, True): (4, 128, 1, 16), - (8192, 8192, 16384, 128, 128, True, False, True): (4, 128, 3, 4), - (8192, 8192, 16384, 256, 256, False, True, True): (1, 64, 1, 32), - (8192, 8192, 16384, 256, 256, True, False, True): (4, 64, 1, 32), - (8192, 8192, 32768, 32, 32, False, True, True): (4, 512, 1, 8), - (8192, 8192, 32768, 32, 32, True, False, True): (2, 256, 3, 2), - (8192, 8192, 32768, 64, 64, False, True, True): (4, 256, 1, 4), - (8192, 8192, 32768, 64, 64, True, False, True): (4, 128, 3, 4), - (8192, 8192, 32768, 128, 128, False, True, True): (4, 256, 1, 16), - (8192, 8192, 32768, 128, 128, True, False, True): (4, 256, 3, 
4), - (8192, 8192, 32768, 256, 256, False, True, True): (1, 128, 1, 32), - (8192, 8192, 32768, 256, 256, True, False, True): (4, 128, 1, 32), - (8192, 8192, 65536, 32, 32, False, True, True): (4, 1024, 1, 8), - (8192, 8192, 65536, 32, 32, True, False, True): (4, 512, 3, 2), - (8192, 8192, 65536, 64, 64, False, True, True): (4, 512, 1, 4), - (8192, 8192, 65536, 64, 64, True, False, True): (4, 256, 3, 4), - (8192, 8192, 65536, 128, 128, False, True, True): (4, 512, 1, 16), - (8192, 8192, 65536, 128, 128, True, False, True): (4, 512, 3, 4), - (8192, 8192, 65536, 256, 256, False, True, True): (1, 256, 1, 32), - (8192, 8192, 65536, 256, 256, True, False, True): (4, 256, 1, 32), - (8192, 8192, 65792, 32, 32, False, True, True): (4, 1028, 1, 8), - (8192, 8192, 65792, 32, 32, True, False, True): (1, 514, 3, 2), - (8192, 8192, 65792, 64, 64, False, True, True): (4, 1028, 1, 8), - (8192, 8192, 65792, 64, 64, True, False, True): (2, 257, 3, 4), - (8192, 8192, 65792, 128, 128, False, True, True): (4, 514, 1, 16), - (8192, 8192, 65792, 128, 128, True, False, True): (2, 514, 3, 4), - (8192, 8192, 65792, 256, 256, False, True, True): (1, 257, 1, 32), - (8192, 8192, 65792, 256, 256, True, False, True): (1, 257, 1, 32), - (8192, 8192, 131072, 32, 32, False, True, True): (4, 2048, 1, 8), - (8192, 8192, 131072, 32, 32, True, False, True): (4, 1024, 3, 2), - (8192, 8192, 131072, 64, 64, False, True, True): (4, 1024, 1, 4), - (8192, 8192, 131072, 64, 64, True, False, True): (4, 512, 3, 4), - (8192, 8192, 131072, 128, 128, False, True, True): (4, 1024, 1, 16), - (8192, 8192, 131072, 128, 128, True, False, True): (4, 1024, 3, 4), - (8192, 8192, 131072, 256, 256, False, True, True): (1, 512, 1, 32), - (8192, 8192, 131072, 256, 256, True, False, True): (4, 512, 1, 32), - (16384, 16384, 256, 32, 32, False, True, True): (4, 4, 1, 8), - (16384, 16384, 256, 32, 32, True, False, True): (2, 2, 4, 2), - (16384, 16384, 256, 64, 64, False, True, True): (2, 2, 1, 4), - (16384, 16384, 256, 64, 64, True, False, True): (5, 1, 3, 4), - (16384, 16384, 256, 128, 128, False, True, True): (6, 2, 1, 8), - (16384, 16384, 256, 128, 128, True, False, True): (6, 2, 3, 4), - (16384, 16384, 256, 256, 256, False, True, True): (1, 1, 1, 32), - (16384, 16384, 256, 256, 256, True, False, True): (1, 1, 1, 32), - (16384, 16384, 512, 32, 32, False, True, True): (4, 8, 1, 8), - (16384, 16384, 512, 32, 32, True, False, True): (1, 4, 4, 2), - (16384, 16384, 512, 64, 64, False, True, True): (4, 4, 1, 4), - (16384, 16384, 512, 64, 64, True, False, True): (2, 2, 3, 4), - (16384, 16384, 512, 128, 128, False, True, True): (4, 4, 1, 8), - (16384, 16384, 512, 128, 128, True, False, True): (4, 4, 3, 4), - (16384, 16384, 512, 256, 256, False, True, True): (1, 2, 1, 32), - (16384, 16384, 512, 256, 256, True, False, True): (2, 2, 1, 32), - (16384, 16384, 1024, 32, 32, False, True, True): (4, 16, 1, 8), - (16384, 16384, 1024, 32, 32, True, False, True): (1, 8, 3, 2), - (16384, 16384, 1024, 64, 64, False, True, True): (4, 8, 1, 4), - (16384, 16384, 1024, 64, 64, True, False, True): (4, 4, 3, 4), - (16384, 16384, 1024, 128, 128, False, True, True): (4, 4, 1, 8), - (16384, 16384, 1024, 128, 128, True, False, True): (4, 8, 3, 4), - (16384, 16384, 1024, 256, 256, False, True, True): (1, 4, 1, 32), - (16384, 16384, 1024, 256, 256, True, False, True): (4, 4, 1, 32), - (16384, 16384, 2048, 32, 32, False, True, True): (4, 32, 1, 8), - (16384, 16384, 2048, 32, 32, True, False, True): (2, 16, 3, 2), - (16384, 16384, 2048, 64, 64, False, True, True): (4, 16, 1, 4), - 
(16384, 16384, 2048, 64, 64, True, False, True): (4, 8, 3, 4), - (16384, 16384, 2048, 128, 128, False, True, True): (4, 16, 1, 8), - (16384, 16384, 2048, 128, 128, True, False, True): (4, 16, 3, 4), - (16384, 16384, 2048, 256, 256, False, True, True): (1, 8, 1, 32), - (16384, 16384, 2048, 256, 256, True, False, True): (4, 8, 1, 32), - (16384, 16384, 4096, 32, 32, False, True, True): (4, 64, 1, 8), - (16384, 16384, 4096, 32, 32, True, False, True): (2, 32, 3, 2), - (16384, 16384, 4096, 64, 64, False, True, True): (2, 32, 1, 4), - (16384, 16384, 4096, 64, 64, True, False, True): (4, 16, 3, 4), - (16384, 16384, 4096, 128, 128, False, True, True): (4, 32, 1, 8), - (16384, 16384, 4096, 128, 128, True, False, True): (4, 32, 3, 4), - (16384, 16384, 4096, 256, 256, False, True, True): (1, 16, 1, 32), - (16384, 16384, 4096, 256, 256, True, False, True): (4, 16, 1, 32), - (16384, 16384, 8192, 32, 32, False, True, True): (4, 128, 1, 8), - (16384, 16384, 8192, 32, 32, True, False, True): (2, 64, 3, 2), - (16384, 16384, 8192, 64, 64, False, True, True): (4, 64, 1, 4), - (16384, 16384, 8192, 64, 64, True, False, True): (4, 32, 3, 4), - (16384, 16384, 8192, 128, 128, False, True, True): (4, 64, 1, 16), - (16384, 16384, 8192, 128, 128, True, False, True): (4, 64, 3, 4), - (16384, 16384, 8192, 256, 256, False, True, True): (1, 32, 1, 32), - (16384, 16384, 8192, 256, 256, True, False, True): (4, 32, 1, 32), - (16384, 16384, 16384, 32, 32, False, True, True): (4, 256, 1, 8), - (16384, 16384, 16384, 32, 32, True, False, True): (2, 128, 3, 2), - (16384, 16384, 16384, 64, 64, False, True, True): (4, 128, 1, 4), - (16384, 16384, 16384, 64, 64, True, False, True): (4, 64, 3, 4), - (16384, 16384, 16384, 128, 128, False, True, True): (1, 64, 1, 8), - (16384, 16384, 16384, 128, 128, True, False, True): (4, 128, 3, 4), - (16384, 16384, 16384, 256, 256, False, True, True): (1, 64, 1, 32), - (16384, 16384, 16384, 256, 256, True, False, True): (4, 64, 1, 32), - (16384, 16384, 32768, 32, 32, False, True, True): (4, 512, 1, 8), - (16384, 16384, 32768, 32, 32, True, False, True): (1, 256, 3, 2), - (16384, 16384, 32768, 64, 64, False, True, True): (4, 256, 1, 4), - (16384, 16384, 32768, 64, 64, True, False, True): (4, 128, 3, 4), - (16384, 16384, 32768, 128, 128, False, True, True): (4, 256, 1, 16), - (16384, 16384, 32768, 128, 128, True, False, True): (4, 256, 3, 4), - (16384, 16384, 32768, 256, 256, False, True, True): (1, 128, 1, 32), - (16384, 16384, 32768, 256, 256, True, False, True): (4, 128, 1, 32), - (16384, 16384, 65536, 32, 32, False, True, True): (4, 1024, 1, 8), - (16384, 16384, 65536, 32, 32, True, False, True): (1, 512, 3, 2), - (16384, 16384, 65536, 64, 64, False, True, True): (2, 512, 1, 4), - (16384, 16384, 65536, 64, 64, True, False, True): (4, 256, 3, 4), - (16384, 16384, 65536, 128, 128, False, True, True): (4, 512, 1, 16), - (16384, 16384, 65536, 128, 128, True, False, True): (4, 512, 3, 4), - (16384, 16384, 65536, 256, 256, False, True, True): (1, 256, 1, 32), - (16384, 16384, 65536, 256, 256, True, False, True): (4, 256, 1, 32), - (16384, 16384, 65792, 32, 32, False, True, True): (4, 1028, 1, 8), - (16384, 16384, 65792, 32, 32, True, False, True): (1, 514, 3, 2), - (16384, 16384, 65792, 64, 64, False, True, True): (2, 514, 1, 4), - (16384, 16384, 65792, 64, 64, True, False, True): (2, 257, 3, 4), - (16384, 16384, 65792, 128, 128, False, True, True): (2, 514, 1, 16), - (16384, 16384, 65792, 128, 128, True, False, True): (2, 514, 3, 4), - (16384, 16384, 65792, 256, 256, False, True, True): (1, 257, 1, 
32), - (16384, 16384, 65792, 256, 256, True, False, True): (1, 257, 1, 32), - (16384, 16384, 131072, 32, 32, False, True, True): (4, 1024, 1, 8), - (16384, 16384, 131072, 32, 32, True, False, True): (4, 512, 3, 4), - (16384, 16384, 131072, 64, 64, False, True, True): (4, 1024, 1, 4), - (16384, 16384, 131072, 64, 64, True, False, True): (4, 1024, 3, 2), - (16384, 16384, 131072, 128, 128, False, True, True): (2, 1024, 3, 8), - (16384, 16384, 131072, 128, 128, True, False, True): (4, 1024, 3, 4), - (16384, 16384, 131072, 256, 256, False, True, True): (4, 512, 1, 32), - (16384, 16384, 131072, 256, 256, True, False, True): (4, 512, 1, 32), - (32768, 32768, 256, 32, 32, False, True, True): (4, 4, 1, 8), - (32768, 32768, 256, 32, 32, True, False, True): (1, 2, 4, 2), - (32768, 32768, 256, 64, 64, False, True, True): (2, 2, 1, 4), - (32768, 32768, 256, 64, 64, True, False, True): (2, 1, 3, 4), - (32768, 32768, 256, 128, 128, False, True, True): (4, 2, 1, 8), - (32768, 32768, 256, 128, 128, True, False, True): (4, 2, 3, 4), - (32768, 32768, 256, 256, 256, False, True, True): (1, 1, 1, 32), - (32768, 32768, 256, 256, 256, True, False, True): (1, 1, 1, 32), - (32768, 32768, 512, 32, 32, False, True, True): (4, 8, 1, 8), - (32768, 32768, 512, 32, 32, True, False, True): (1, 4, 3, 2), - (32768, 32768, 512, 64, 64, False, True, True): (4, 4, 1, 4), - (32768, 32768, 512, 64, 64, True, False, True): (4, 2, 3, 4), - (32768, 32768, 512, 128, 128, False, True, True): (1, 2, 1, 8), - (32768, 32768, 512, 128, 128, True, False, True): (4, 4, 3, 4), - (32768, 32768, 512, 256, 256, False, True, True): (1, 2, 1, 32), - (32768, 32768, 512, 256, 256, True, False, True): (2, 2, 1, 32), - (32768, 32768, 1024, 32, 32, False, True, True): (4, 16, 1, 8), - (32768, 32768, 1024, 32, 32, True, False, True): (1, 8, 4, 2), - (32768, 32768, 1024, 64, 64, False, True, True): (4, 8, 1, 4), - (32768, 32768, 1024, 64, 64, True, False, True): (4, 4, 3, 4), - (32768, 32768, 1024, 128, 128, False, True, True): (1, 4, 1, 8), - (32768, 32768, 1024, 128, 128, True, False, True): (4, 8, 3, 4), - (32768, 32768, 1024, 256, 256, False, True, True): (1, 4, 1, 32), - (32768, 32768, 1024, 256, 256, True, False, True): (1, 4, 1, 32), - (32768, 32768, 2048, 32, 32, False, True, True): (2, 32, 1, 8), - (32768, 32768, 2048, 32, 32, True, False, True): (1, 16, 4, 2), - (32768, 32768, 2048, 64, 64, False, True, True): (2, 16, 1, 4), - (32768, 32768, 2048, 64, 64, True, False, True): (4, 8, 3, 4), - (32768, 32768, 2048, 128, 128, False, True, True): (1, 8, 1, 8), - (32768, 32768, 2048, 128, 128, True, False, True): (4, 16, 3, 4), - (32768, 32768, 2048, 256, 256, False, True, True): (1, 8, 1, 32), - (32768, 32768, 2048, 256, 256, True, False, True): (4, 8, 1, 32), - (32768, 32768, 4096, 32, 32, False, True, True): (2, 64, 1, 8), - (32768, 32768, 4096, 32, 32, True, False, True): (2, 32, 3, 2), - (32768, 32768, 4096, 64, 64, False, True, True): (2, 32, 1, 4), - (32768, 32768, 4096, 64, 64, True, False, True): (2, 16, 3, 4), - (32768, 32768, 4096, 128, 128, False, True, True): (1, 16, 1, 8), - (32768, 32768, 4096, 128, 128, True, False, True): (2, 32, 3, 4), - (32768, 32768, 4096, 256, 256, False, True, True): (1, 16, 1, 32), - (32768, 32768, 4096, 256, 256, True, False, True): (4, 16, 1, 32), - (32768, 32768, 8192, 32, 32, False, True, True): (2, 128, 1, 8), - (32768, 32768, 8192, 32, 32, True, False, True): (2, 64, 3, 2), - (32768, 32768, 8192, 64, 64, False, True, True): (2, 64, 1, 4), - (32768, 32768, 8192, 64, 64, True, False, True): (2, 32, 3, 4), 
- (32768, 32768, 8192, 128, 128, False, True, True): (1, 32, 1, 8), - (32768, 32768, 8192, 128, 128, True, False, True): (4, 64, 3, 4), - (32768, 32768, 8192, 256, 256, False, True, True): (1, 32, 1, 32), - (32768, 32768, 8192, 256, 256, True, False, True): (4, 32, 1, 32), - (32768, 32768, 16384, 32, 32, False, True, True): (2, 256, 1, 8), - (32768, 32768, 16384, 32, 32, True, False, True): (2, 128, 4, 2), - (32768, 32768, 16384, 64, 64, False, True, True): (2, 128, 1, 4), - (32768, 32768, 16384, 64, 64, True, False, True): (4, 64, 3, 4), - (32768, 32768, 16384, 128, 128, False, True, True): (1, 64, 1, 8), - (32768, 32768, 16384, 128, 128, True, False, True): (4, 128, 3, 4), - (32768, 32768, 16384, 256, 256, False, True, True): (1, 64, 1, 32), - (32768, 32768, 16384, 256, 256, True, False, True): (2, 64, 1, 32), - (32768, 32768, 32768, 32, 32, False, True, True): (2, 512, 1, 8), - (32768, 32768, 32768, 32, 32, True, False, True): (4, 256, 3, 2), - (32768, 32768, 32768, 64, 64, False, True, True): (1, 256, 1, 4), - (32768, 32768, 32768, 64, 64, True, False, True): (2, 128, 3, 4), - (32768, 32768, 32768, 128, 128, False, True, True): (1, 128, 1, 8), - (32768, 32768, 32768, 128, 128, True, False, True): (2, 256, 3, 4), - (32768, 32768, 32768, 256, 256, False, True, True): (1, 128, 1, 32), - (32768, 32768, 32768, 256, 256, True, False, True): (1, 128, 1, 32), - (32768, 32768, 65536, 32, 32, False, True, True): (2, 512, 1, 8), - (32768, 32768, 65536, 32, 32, True, False, True): (3, 512, 4, 2), - (32768, 32768, 65536, 64, 64, False, True, True): (1, 512, 1, 4), - (32768, 32768, 65536, 64, 64, True, False, True): (2, 512, 3, 2), - (32768, 32768, 65536, 128, 128, False, True, True): (1, 256, 1, 8), - (32768, 32768, 65536, 128, 128, True, False, True): (2, 512, 3, 4), - (32768, 32768, 65536, 256, 256, False, True, True): (1, 256, 1, 32), - (32768, 32768, 65536, 256, 256, True, False, True): (1, 256, 1, 32), - }, - ("_int_bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.int8, 0.56)): { - (192, 192, 256, 64, 64, False, True, True): (3, 4, 3, 32), - (192, 192, 256, 64, 64, True, False, True): (1, 4, 3, 4), - (192, 192, 512, 64, 64, False, True, True): (1, 8, 1, 16), - (192, 192, 512, 64, 64, True, False, True): (1, 8, 5, 4), - (192, 192, 1024, 64, 64, False, True, True): (4, 16, 1, 16), - (192, 192, 1024, 64, 64, True, False, True): (3, 16, 3, 4), - (192, 192, 2048, 64, 64, False, True, True): (5, 32, 1, 8), - (192, 192, 2048, 64, 64, True, False, True): (2, 32, 4, 4), - (192, 192, 4096, 64, 64, False, True, True): (4, 64, 1, 16), - (192, 192, 4096, 64, 64, True, False, True): (1, 32, 4, 4), - (192, 192, 8192, 64, 64, False, True, True): (2, 128, 1, 8), - (192, 192, 8192, 64, 64, True, False, True): (3, 64, 1, 4), - (192, 192, 16384, 64, 64, False, True, True): (2, 256, 1, 8), - (192, 192, 16384, 64, 64, True, False, True): (1, 128, 3, 2), - (192, 192, 32768, 64, 64, False, True, True): (2, 512, 1, 8), - (192, 192, 32768, 64, 64, True, False, True): (3, 128, 1, 4), - (192, 192, 65536, 64, 64, False, True, True): (3, 1024, 1, 8), - (192, 192, 65536, 64, 64, True, False, True): (1, 512, 3, 4), - (192, 192, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), - (192, 192, 131072, 64, 64, True, False, True): (1, 512, 1, 4), - (384, 384, 256, 128, 128, False, True, True): (4, 2, 1, 16), - (384, 384, 256, 128, 128, True, False, True): (1, 2, 3, 4), - (384, 384, 512, 128, 128, False, True, True): (2, 4, 1, 16), - (384, 384, 512, 128, 128, True, False, True): (2, 4, 3, 4), - (384, 384, 1024, 128, 128, 
False, True, True): (3, 8, 1, 32), - (384, 384, 1024, 128, 128, True, False, True): (3, 8, 3, 4), - (384, 384, 2048, 128, 128, False, True, True): (3, 16, 1, 32), - (384, 384, 2048, 128, 128, True, False, True): (2, 16, 3, 4), - (384, 384, 4096, 128, 128, False, True, True): (3, 32, 1, 32), - (384, 384, 4096, 128, 128, True, False, True): (3, 32, 3, 4), - (384, 384, 8192, 128, 128, False, True, True): (2, 64, 1, 32), - (384, 384, 8192, 128, 128, True, False, True): (4, 64, 1, 4), - (384, 384, 16384, 128, 128, False, True, True): (2, 128, 1, 32), - (384, 384, 16384, 128, 128, True, False, True): (2, 128, 1, 4), - (384, 384, 32768, 128, 128, False, True, True): (3, 256, 1, 16), - (384, 384, 32768, 128, 128, True, False, True): (1, 256, 1, 4), - (384, 384, 65536, 128, 128, False, True, True): (4, 512, 1, 16), - (384, 384, 65536, 128, 128, True, False, True): (1, 512, 1, 4), - (384, 384, 131072, 128, 128, False, True, True): (4, 1024, 1, 16), - (384, 384, 131072, 128, 128, True, False, True): (1, 1024, 1, 4), - (768, 768, 256, 256, 256, False, True, True): (1, 1, 1, 32), - (768, 768, 256, 256, 256, True, False, True): (3, 1, 1, 32), - (768, 768, 512, 256, 256, False, True, True): (1, 2, 1, 32), - (768, 768, 512, 256, 256, True, False, True): (1, 2, 1, 32), - (768, 768, 1024, 256, 256, False, True, True): (1, 4, 1, 32), - (768, 768, 1024, 256, 256, True, False, True): (2, 4, 1, 32), - (768, 768, 2048, 256, 256, False, True, True): (1, 8, 1, 32), - (768, 768, 2048, 256, 256, True, False, True): (2, 8, 1, 32), - (768, 768, 4096, 256, 256, False, True, True): (1, 16, 1, 32), - (768, 768, 4096, 256, 256, True, False, True): (1, 16, 1, 32), - (768, 768, 8192, 256, 256, False, True, True): (1, 32, 1, 32), - (768, 768, 8192, 256, 256, True, False, True): (2, 32, 1, 32), - (768, 768, 16384, 256, 256, False, True, True): (1, 64, 1, 32), - (768, 768, 16384, 256, 256, True, False, True): (7, 64, 1, 32), - (768, 768, 32768, 256, 256, False, True, True): (1, 128, 1, 32), - (768, 768, 32768, 256, 256, True, False, True): (1, 128, 1, 32), - (768, 768, 65536, 256, 256, False, True, True): (1, 256, 1, 32), - (768, 768, 65536, 256, 256, True, False, True): (1, 256, 1, 32), - (768, 768, 131072, 256, 256, False, True, True): (1, 512, 1, 32), - (768, 768, 131072, 256, 256, True, False, True): (1, 512, 1, 32), - }, - ("_int_bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.int8, 1.0)): { - (256, 256, 256, 256, 256, False, True, True): (2, 1, 1, 4), - (256, 256, 256, 256, 256, True, False, True): (2, 1, 2, 1), - (256, 256, 512, 256, 256, False, True, True): (2, 1, 1, 2), - (256, 256, 512, 256, 256, True, False, True): (2, 2, 2, 8), - (256, 256, 1024, 256, 256, False, True, True): (1, 4, 1, 4), - (256, 256, 1024, 256, 256, True, False, True): (1, 2, 2, 4), - (256, 256, 2048, 256, 256, False, True, True): (1, 4, 1, 2), - (256, 256, 2048, 256, 256, True, False, True): (1, 8, 1, 2), - (256, 256, 4096, 256, 256, False, True, True): (1, 16, 1, 4), - (256, 256, 4096, 256, 256, True, False, True): (1, 16, 1, 2), - (256, 256, 8192, 256, 256, False, True, True): (1, 16, 3, 4), - (256, 256, 8192, 256, 256, True, False, True): (1, 8, 1, 4), - (256, 256, 16384, 256, 256, False, True, True): (2, 16, 1, 8), - (256, 256, 16384, 256, 256, True, False, True): (1, 32, 1, 2), - (256, 256, 32768, 256, 256, False, True, True): (1, 128, 1, 8), - (256, 256, 32768, 256, 256, True, False, True): (1, 128, 1, 4), - (256, 256, 65536, 256, 256, False, True, True): (1, 4, 1, 1), - (256, 256, 65536, 256, 256, True, False, True): (1, 128, 1, 4), 
- (256, 256, 65792, 256, 256, False, True, True): (1, 128, 2, 16), - (256, 256, 65792, 256, 256, True, False, True): (1, 16, 3, 4), - (256, 256, 131072, 256, 256, False, True, True): (1, 512, 1, 4), - (256, 256, 131072, 256, 256, True, False, True): (1, 512, 1, 2), - }, - ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.bfloat16, 0.5)): { - (16, 16, 16, 16, 16, False, False, False): (2, 1, 1, 2), - (16, 16, 16, 16, 16, False, False, True): (1, 1, 1, 4), - (16, 16, 16, 16, 16, False, True, False): (1, 1, 3, 16), - (16, 16, 16, 16, 16, False, True, True): (1, 1, 1, 8), - (16, 16, 16, 16, 16, True, False, False): (2, 1, 1, 8), - (16, 16, 16, 16, 16, True, False, True): (1, 1, 1, 8), - (16, 16, 32, 16, 16, False, False, False): (1, 2, 1, 8), - (16, 16, 32, 16, 16, False, False, True): (1, 2, 2, 4), - (16, 16, 32, 16, 16, False, True, False): (1, 1, 2, 4), - (16, 16, 32, 16, 16, False, True, True): (1, 1, 2, 4), - (16, 16, 32, 16, 16, True, False, False): (1, 1, 2, 4), - (16, 16, 32, 16, 16, True, False, True): (2, 2, 1, 2), - (16, 16, 64, 16, 16, False, False, False): (1, 4, 2, 4), - (16, 16, 64, 16, 16, False, False, True): (1, 2, 1, 2), - (16, 16, 64, 16, 16, False, True, False): (2, 1, 1, 2), - (16, 16, 64, 16, 16, False, True, True): (1, 4, 1, 8), - (16, 16, 64, 16, 16, True, False, False): (1, 4, 1, 1), - (16, 16, 64, 16, 16, True, False, True): (1, 4, 2, 4), - (16, 32, 16, 16, 16, False, False, False): (1, 1, 2, 2), - (16, 32, 16, 16, 16, False, False, True): (1, 1, 1, 4), - (16, 32, 16, 16, 16, False, True, False): (1, 1, 1, 2), - (16, 32, 16, 16, 16, False, True, True): (1, 1, 1, 1), - (16, 32, 16, 16, 16, True, False, False): (1, 1, 1, 2), - (16, 32, 16, 16, 16, True, False, True): (2, 1, 1, 2), - (16, 32, 16, 16, 32, False, False, False): (1, 1, 1, 4), - (16, 32, 16, 16, 32, False, False, True): (1, 1, 1, 8), - (16, 32, 16, 16, 32, False, True, False): (1, 1, 1, 8), - (16, 32, 16, 16, 32, False, True, True): (1, 1, 2, 4), - (16, 32, 16, 16, 32, True, False, False): (1, 1, 1, 2), - (16, 32, 16, 16, 32, True, False, True): (1, 1, 1, 1), - (16, 32, 32, 16, 16, False, False, False): (2, 2, 1, 4), - (16, 32, 32, 16, 16, False, False, True): (2, 2, 1, 2), - (16, 32, 32, 16, 16, False, True, False): (1, 1, 2, 8), - (16, 32, 32, 16, 16, False, True, True): (1, 2, 1, 1), - (16, 32, 32, 16, 16, True, False, False): (1, 1, 1, 8), - (16, 32, 32, 16, 16, True, False, True): (1, 2, 1, 4), - (16, 32, 32, 16, 32, False, False, False): (1, 1, 2, 8), - (16, 32, 32, 16, 32, False, False, True): (2, 1, 1, 8), - (16, 32, 32, 16, 32, False, True, False): (1, 1, 1, 4), - (16, 32, 32, 16, 32, False, True, True): (1, 1, 1, 4), - (16, 32, 32, 16, 32, True, False, False): (1, 2, 1, 8), - (16, 32, 32, 16, 32, True, False, True): (1, 1, 1, 4), - (16, 32, 64, 16, 16, False, False, False): (1, 4, 3, 8), - (16, 32, 64, 16, 16, False, False, True): (1, 4, 1, 4), - (16, 32, 64, 16, 16, False, True, False): (1, 4, 1, 4), - (16, 32, 64, 16, 16, False, True, True): (2, 4, 1, 4), - (16, 32, 64, 16, 16, True, False, False): (1, 2, 1, 4), - (16, 32, 64, 16, 16, True, False, True): (1, 2, 1, 4), - (16, 32, 64, 16, 32, False, False, False): (1, 4, 1, 8), - (16, 32, 64, 16, 32, False, False, True): (1, 4, 1, 4), - (16, 32, 64, 16, 32, False, True, False): (1, 4, 1, 2), - (16, 32, 64, 16, 32, False, True, True): (1, 2, 1, 4), - (16, 32, 64, 16, 32, True, False, False): (1, 2, 1, 4), - (16, 32, 64, 16, 32, True, False, True): (1, 2, 1, 2), - (16, 64, 16, 16, 32, False, False, False): (1, 1, 1, 2), - (16, 64, 16, 16, 32, 
False, False, True): (1, 1, 2, 2), - (16, 64, 16, 16, 32, False, True, False): (1, 1, 2, 8), - (16, 64, 16, 16, 32, False, True, True): (1, 1, 1, 4), - (16, 64, 16, 16, 32, True, False, False): (1, 1, 1, 8), - (16, 64, 16, 16, 32, True, False, True): (1, 1, 1, 4), - (16, 64, 32, 16, 32, False, False, False): (1, 2, 1, 2), - (16, 64, 32, 16, 32, False, False, True): (1, 2, 1, 4), - (16, 64, 32, 16, 32, False, True, False): (1, 2, 1, 4), - (16, 64, 32, 16, 32, False, True, True): (2, 2, 1, 4), - (16, 64, 32, 16, 32, True, False, False): (1, 2, 1, 4), - (16, 64, 32, 16, 32, True, False, True): (1, 2, 1, 8), - (16, 64, 64, 16, 32, False, False, False): (1, 2, 1, 4), - (16, 64, 64, 16, 32, False, False, True): (1, 4, 2, 2), - (16, 64, 64, 16, 32, False, True, False): (1, 1, 1, 4), - (16, 64, 64, 16, 32, False, True, True): (1, 4, 1, 2), - (16, 64, 64, 16, 32, True, False, False): (1, 2, 1, 4), - (16, 64, 64, 16, 32, True, False, True): (1, 4, 1, 4), - (32, 16, 16, 16, 16, False, False, False): (1, 1, 1, 8), - (32, 16, 16, 16, 16, False, False, True): (1, 1, 2, 4), - (32, 16, 16, 16, 16, False, True, False): (1, 1, 1, 4), - (32, 16, 16, 16, 16, False, True, True): (1, 1, 2, 4), - (32, 16, 16, 16, 16, True, False, False): (1, 1, 1, 2), - (32, 16, 16, 16, 16, True, False, True): (1, 1, 1, 4), - (32, 16, 32, 16, 16, False, False, False): (1, 1, 1, 4), - (32, 16, 32, 16, 16, False, False, True): (2, 2, 1, 4), - (32, 16, 32, 16, 16, False, True, False): (1, 2, 2, 2), - (32, 16, 32, 16, 16, False, True, True): (2, 2, 1, 4), - (32, 16, 32, 16, 16, True, False, False): (1, 2, 2, 8), - (32, 16, 32, 16, 16, True, False, True): (1, 2, 1, 2), - (32, 16, 64, 16, 16, False, False, False): (1, 4, 1, 4), - (32, 16, 64, 16, 16, False, False, True): (1, 4, 2, 4), - (32, 16, 64, 16, 16, False, True, False): (1, 2, 2, 2), - (32, 16, 64, 16, 16, False, True, True): (3, 4, 1, 4), - (32, 16, 64, 16, 16, True, False, False): (1, 2, 1, 2), - (32, 16, 64, 16, 16, True, False, True): (1, 2, 1, 4), - (32, 32, 16, 16, 16, False, False, False): (1, 1, 3, 4), - (32, 32, 16, 16, 16, False, False, True): (1, 1, 1, 4), - (32, 32, 16, 16, 16, False, True, False): (1, 1, 1, 2), - (32, 32, 16, 16, 16, False, True, True): (1, 1, 1, 4), - (32, 32, 16, 16, 16, True, False, False): (1, 1, 1, 4), - (32, 32, 16, 16, 16, True, False, True): (1, 1, 2, 2), - (32, 32, 16, 16, 32, False, False, False): (2, 1, 1, 4), - (32, 32, 16, 16, 32, False, False, True): (1, 1, 1, 4), - (32, 32, 16, 16, 32, False, True, False): (1, 1, 1, 4), - (32, 32, 16, 16, 32, False, True, True): (3, 1, 2, 4), - (32, 32, 16, 16, 32, True, False, False): (1, 1, 1, 4), - (32, 32, 16, 16, 32, True, False, True): (1, 1, 1, 4), - (32, 32, 16, 32, 32, False, False, False): (1, 1, 1, 8), - (32, 32, 16, 32, 32, False, False, True): (1, 1, 1, 4), - (32, 32, 16, 32, 32, False, True, False): (1, 1, 2, 1), - (32, 32, 16, 32, 32, False, True, True): (2, 1, 2, 2), - (32, 32, 16, 32, 32, True, False, False): (1, 1, 1, 8), - (32, 32, 16, 32, 32, True, False, True): (2, 1, 3, 4), - (32, 32, 32, 16, 16, False, False, False): (1, 2, 1, 4), - (32, 32, 32, 16, 16, False, False, True): (2, 2, 1, 4), - (32, 32, 32, 16, 16, False, True, False): (1, 1, 1, 8), - (32, 32, 32, 16, 16, False, True, True): (2, 2, 1, 4), - (32, 32, 32, 16, 16, True, False, False): (1, 1, 1, 4), - (32, 32, 32, 16, 16, True, False, True): (2, 2, 2, 4), - (32, 32, 32, 16, 32, False, False, False): (2, 2, 1, 8), - (32, 32, 32, 16, 32, False, False, True): (1, 2, 1, 2), - (32, 32, 32, 16, 32, False, True, False): (1, 
2, 1, 4), - (32, 32, 32, 16, 32, False, True, True): (1, 2, 1, 4), - (32, 32, 32, 16, 32, True, False, False): (1, 2, 1, 4), - (32, 32, 32, 16, 32, True, False, True): (1, 2, 1, 2), - (32, 32, 32, 32, 32, False, False, False): (1, 1, 3, 8), - (32, 32, 32, 32, 32, False, False, True): (1, 1, 1, 8), - (32, 32, 32, 32, 32, False, True, False): (2, 1, 3, 4), - (32, 32, 32, 32, 32, False, True, True): (2, 1, 1, 2), - (32, 32, 32, 32, 32, True, False, False): (1, 1, 1, 2), - (32, 32, 32, 32, 32, True, False, True): (4, 1, 1, 1), - (32, 32, 64, 16, 16, False, False, False): (1, 4, 1, 4), - (32, 32, 64, 16, 16, False, False, True): (1, 4, 1, 4), - (32, 32, 64, 16, 16, False, True, False): (1, 2, 1, 8), - (32, 32, 64, 16, 16, False, True, True): (1, 4, 1, 2), - (32, 32, 64, 16, 16, True, False, False): (2, 4, 1, 2), - (32, 32, 64, 16, 16, True, False, True): (1, 4, 1, 2), - (32, 32, 64, 16, 32, False, False, False): (1, 2, 1, 8), - (32, 32, 64, 16, 32, False, False, True): (1, 4, 2, 2), - (32, 32, 64, 16, 32, False, True, False): (1, 2, 1, 4), - (32, 32, 64, 16, 32, False, True, True): (1, 4, 1, 4), - (32, 32, 64, 16, 32, True, False, False): (1, 4, 2, 2), - (32, 32, 64, 16, 32, True, False, True): (3, 4, 2, 2), - (32, 32, 64, 32, 32, False, False, False): (2, 2, 1, 4), - (32, 32, 64, 32, 32, False, False, True): (1, 2, 1, 4), - (32, 32, 64, 32, 32, False, True, False): (1, 1, 1, 8), - (32, 32, 64, 32, 32, False, True, True): (1, 1, 1, 4), - (32, 32, 64, 32, 32, True, False, False): (1, 2, 1, 2), - (32, 32, 64, 32, 32, True, False, True): (3, 2, 1, 8), - (32, 64, 16, 16, 32, False, False, False): (1, 1, 2, 2), - (32, 64, 16, 16, 32, False, False, True): (1, 1, 1, 4), - (32, 64, 16, 16, 32, False, True, False): (1, 1, 2, 4), - (32, 64, 16, 16, 32, False, True, True): (1, 1, 1, 4), - (32, 64, 16, 16, 32, True, False, False): (1, 1, 1, 2), - (32, 64, 16, 16, 32, True, False, True): (2, 1, 2, 2), - (32, 64, 16, 32, 32, False, False, False): (1, 1, 1, 1), - (32, 64, 16, 32, 32, False, False, True): (2, 1, 1, 4), - (32, 64, 16, 32, 32, False, True, False): (1, 1, 1, 1), - (32, 64, 16, 32, 32, False, True, True): (1, 1, 2, 2), - (32, 64, 16, 32, 32, True, False, False): (1, 1, 2, 4), - (32, 64, 16, 32, 32, True, False, True): (1, 1, 1, 4), - (32, 64, 32, 16, 32, False, False, False): (2, 2, 1, 4), - (32, 64, 32, 16, 32, False, False, True): (1, 2, 1, 4), - (32, 64, 32, 16, 32, False, True, False): (1, 1, 1, 4), - (32, 64, 32, 16, 32, False, True, True): (2, 2, 3, 4), - (32, 64, 32, 16, 32, True, False, False): (1, 1, 1, 2), - (32, 64, 32, 16, 32, True, False, True): (1, 2, 1, 2), - (32, 64, 32, 32, 32, False, False, False): (1, 1, 1, 2), - (32, 64, 32, 32, 32, False, False, True): (2, 1, 1, 4), - (32, 64, 32, 32, 32, False, True, False): (1, 1, 1, 8), - (32, 64, 32, 32, 32, False, True, True): (1, 1, 2, 4), - (32, 64, 32, 32, 32, True, False, False): (2, 1, 1, 4), - (32, 64, 32, 32, 32, True, False, True): (1, 1, 2, 4), - (32, 64, 64, 16, 32, False, False, False): (1, 4, 1, 4), - (32, 64, 64, 16, 32, False, False, True): (1, 4, 2, 4), - (32, 64, 64, 16, 32, False, True, False): (1, 4, 2, 2), - (32, 64, 64, 16, 32, False, True, True): (1, 4, 1, 4), - (32, 64, 64, 16, 32, True, False, False): (1, 4, 1, 8), - (32, 64, 64, 16, 32, True, False, True): (1, 4, 2, 1), - (32, 64, 64, 32, 32, False, False, False): (1, 1, 1, 4), - (32, 64, 64, 32, 32, False, False, True): (2, 2, 1, 4), - (32, 64, 64, 32, 32, False, True, False): (1, 1, 1, 4), - (32, 64, 64, 32, 32, False, True, True): (2, 2, 1, 4), - (32, 64, 64, 
32, 32, True, False, False): (1, 2, 2, 4), - (32, 64, 64, 32, 32, True, False, True): (2, 2, 3, 4), - (64, 32, 16, 32, 32, False, False, False): (1, 1, 1, 4), - (64, 32, 16, 32, 32, False, False, True): (1, 1, 1, 4), - (64, 32, 16, 32, 32, False, True, False): (1, 1, 1, 8), - (64, 32, 16, 32, 32, False, True, True): (1, 1, 1, 4), - (64, 32, 16, 32, 32, True, False, False): (1, 1, 1, 16), - (64, 32, 16, 32, 32, True, False, True): (2, 1, 1, 4), - (64, 32, 32, 32, 32, False, False, False): (1, 1, 3, 4), - (64, 32, 32, 32, 32, False, False, True): (2, 1, 1, 4), - (64, 32, 32, 32, 32, False, True, False): (1, 1, 2, 4), - (64, 32, 32, 32, 32, False, True, True): (2, 1, 1, 4), - (64, 32, 32, 32, 32, True, False, False): (2, 1, 1, 16), - (64, 32, 32, 32, 32, True, False, True): (2, 1, 1, 4), - (64, 32, 64, 32, 32, False, False, False): (1, 2, 1, 4), - (64, 32, 64, 32, 32, False, False, True): (2, 2, 1, 4), - (64, 32, 64, 32, 32, False, True, False): (1, 1, 1, 4), - (64, 32, 64, 32, 32, False, True, True): (2, 2, 1, 4), - (64, 32, 64, 32, 32, True, False, False): (1, 2, 1, 8), - (64, 32, 64, 32, 32, True, False, True): (2, 2, 3, 4), - (64, 64, 16, 32, 32, False, False, False): (1, 1, 2, 16), - (64, 64, 16, 32, 32, False, False, True): (1, 1, 3, 4), - (64, 64, 16, 32, 32, False, True, False): (1, 1, 1, 2), - (64, 64, 16, 32, 32, False, True, True): (2, 1, 1, 4), - (64, 64, 16, 32, 32, True, False, False): (2, 1, 3, 2), - (64, 64, 16, 32, 32, True, False, True): (1, 1, 2, 4), - (64, 64, 32, 32, 32, False, False, False): (1, 1, 1, 8), - (64, 64, 32, 32, 32, False, False, True): (2, 1, 2, 4), - (64, 64, 32, 32, 32, False, True, False): (2, 1, 1, 4), - (64, 64, 32, 32, 32, False, True, True): (1, 1, 2, 4), - (64, 64, 32, 32, 32, True, False, False): (2, 1, 1, 4), - (64, 64, 32, 32, 32, True, False, True): (1, 1, 2, 4), - (64, 64, 64, 32, 32, False, False, False): (1, 2, 2, 4), - (64, 64, 64, 32, 32, False, False, True): (1, 2, 2, 2), - (64, 64, 64, 32, 32, False, True, False): (1, 2, 1, 2), - (64, 64, 64, 32, 32, False, True, True): (1, 2, 1, 4), - (64, 64, 64, 32, 32, True, False, False): (1, 2, 1, 4), - (64, 64, 64, 32, 32, True, False, True): (1, 2, 1, 4), - (192, 192, 256, 16, 16, False, True, True): (1, 8, 5, 4), - (192, 192, 256, 16, 16, True, False, True): (2, 8, 5, 2), - (192, 192, 256, 32, 32, False, True, True): (1, 8, 6, 4), - (192, 192, 256, 32, 32, True, False, True): (3, 8, 5, 2), - (192, 192, 512, 16, 16, False, True, True): (1, 16, 5, 2), - (192, 192, 512, 16, 16, True, False, True): (1, 8, 4, 2), - (192, 192, 512, 32, 32, False, True, True): (2, 16, 5, 4), - (192, 192, 512, 32, 32, True, False, True): (2, 8, 5, 2), - (192, 192, 1024, 16, 16, False, True, True): (1, 16, 3, 4), - (192, 192, 1024, 16, 16, True, False, True): (1, 16, 6, 2), - (192, 192, 1024, 32, 32, False, True, True): (1, 32, 3, 4), - (192, 192, 1024, 32, 32, True, False, True): (1, 16, 4, 2), - (192, 192, 2048, 16, 16, False, True, True): (1, 32, 1, 4), - (192, 192, 2048, 16, 16, True, False, True): (4, 32, 4, 2), - (192, 192, 2048, 32, 32, False, True, True): (1, 16, 3, 8), - (192, 192, 2048, 32, 32, True, False, True): (2, 32, 4, 2), - (192, 192, 4096, 16, 16, False, True, True): (2, 64, 1, 4), - (192, 192, 4096, 16, 16, True, False, True): (1, 32, 3, 2), - (192, 192, 4096, 32, 32, False, True, True): (1, 64, 1, 8), - (192, 192, 4096, 32, 32, True, False, True): (2, 32, 4, 4), - (192, 192, 8192, 16, 16, False, True, True): (1, 64, 1, 4), - (192, 192, 8192, 16, 16, True, False, True): (2, 32, 3, 1), - (192, 192, 8192, 
32, 32, False, True, True): (3, 128, 1, 4), - (192, 192, 8192, 32, 32, True, False, True): (1, 64, 3, 4), - (192, 192, 16384, 16, 16, False, True, True): (1, 128, 1, 4), - (192, 192, 16384, 16, 16, True, False, True): (4, 64, 3, 1), - (192, 192, 16384, 32, 32, False, True, True): (1, 128, 1, 4), - (192, 192, 16384, 32, 32, True, False, True): (1, 64, 3, 4), - (192, 192, 32768, 16, 16, False, True, True): (2, 256, 1, 2), - (192, 192, 32768, 16, 16, True, False, True): (1, 128, 3, 4), - (192, 192, 32768, 32, 32, False, True, True): (2, 256, 1, 4), - (192, 192, 32768, 32, 32, True, False, True): (4, 128, 3, 4), - (192, 192, 65536, 16, 16, False, True, True): (2, 512, 1, 2), - (192, 192, 65536, 16, 16, True, False, True): (2, 256, 3, 2), - (192, 192, 65536, 32, 32, False, True, True): (2, 512, 1, 4), - (192, 192, 65536, 32, 32, True, False, True): (1, 256, 3, 4), - (192, 192, 131072, 16, 16, False, True, True): (4, 1024, 1, 2), - (192, 192, 131072, 16, 16, True, False, True): (3, 512, 3, 2), - (192, 192, 131072, 32, 32, False, True, True): (1, 1024, 1, 2), - (192, 192, 131072, 32, 32, True, False, True): (1, 512, 3, 4), - (256, 256, 256, 16, 16, False, True, True): (4, 8, 5, 1), - (256, 256, 256, 16, 16, True, False, True): (2, 8, 4, 2), - (256, 256, 256, 32, 32, False, True, True): (2, 8, 5, 2), - (256, 256, 256, 32, 32, True, False, True): (1, 8, 5, 4), - (256, 256, 256, 64, 64, False, True, True): (2, 4, 4, 4), - (256, 256, 256, 64, 64, True, False, True): (1, 4, 3, 4), - (256, 256, 256, 128, 128, False, True, True): (4, 2, 2, 8), - (256, 256, 256, 128, 128, True, False, True): (1, 2, 2, 8), - (256, 256, 512, 16, 16, False, True, True): (1, 16, 5, 1), - (256, 256, 512, 16, 16, True, False, True): (3, 16, 3, 2), - (256, 256, 512, 32, 32, False, True, True): (2, 8, 5, 2), - (256, 256, 512, 32, 32, True, False, True): (1, 16, 4, 4), - (256, 256, 512, 64, 64, False, True, True): (1, 8, 4, 4), - (256, 256, 512, 64, 64, True, False, True): (3, 8, 3, 4), - (256, 256, 512, 128, 128, False, True, True): (1, 4, 2, 8), - (256, 256, 512, 128, 128, True, False, True): (1, 4, 2, 8), - (256, 256, 1024, 16, 16, False, True, True): (1, 16, 5, 4), - (256, 256, 1024, 16, 16, True, False, True): (5, 16, 4, 2), - (256, 256, 1024, 32, 32, False, True, True): (1, 32, 5, 2), - (256, 256, 1024, 32, 32, True, False, True): (2, 16, 5, 2), - (256, 256, 1024, 64, 64, False, True, True): (1, 16, 4, 4), - (256, 256, 1024, 64, 64, True, False, True): (1, 16, 4, 4), - (256, 256, 1024, 128, 128, False, True, True): (1, 8, 2, 8), - (256, 256, 1024, 128, 128, True, False, True): (1, 8, 2, 8), - (256, 256, 2048, 16, 16, False, True, True): (1, 16, 4, 4), - (256, 256, 2048, 16, 16, True, False, True): (2, 32, 5, 1), - (256, 256, 2048, 32, 32, False, True, True): (1, 64, 4, 1), - (256, 256, 2048, 32, 32, True, False, True): (2, 32, 4, 2), - (256, 256, 2048, 64, 64, False, True, True): (8, 16, 5, 4), - (256, 256, 2048, 64, 64, True, False, True): (1, 16, 4, 4), - (256, 256, 2048, 128, 128, False, True, True): (2, 16, 2, 8), - (256, 256, 2048, 128, 128, True, False, True): (1, 16, 2, 8), - (256, 256, 4096, 16, 16, False, True, True): (1, 64, 1, 4), - (256, 256, 4096, 16, 16, True, False, True): (1, 16, 3, 2), - (256, 256, 4096, 32, 32, False, True, True): (6, 32, 3, 2), - (256, 256, 4096, 32, 32, True, False, True): (4, 32, 4, 2), - (256, 256, 4096, 64, 64, False, True, True): (6, 64, 3, 4), - (256, 256, 4096, 64, 64, True, False, True): (2, 64, 3, 4), - (256, 256, 4096, 128, 128, False, True, True): (1, 32, 2, 8), - (256, 256, 
4096, 128, 128, True, False, True): (1, 32, 2, 8), - (256, 256, 8192, 16, 16, False, True, True): (2, 32, 3, 4), - (256, 256, 8192, 16, 16, True, False, True): (4, 64, 3, 2), - (256, 256, 8192, 32, 32, False, True, True): (1, 64, 3, 4), - (256, 256, 8192, 32, 32, True, False, True): (3, 128, 1, 2), - (256, 256, 8192, 64, 64, False, True, True): (9, 128, 1, 4), - (256, 256, 8192, 64, 64, True, False, True): (8, 128, 1, 4), - (256, 256, 8192, 128, 128, False, True, True): (7, 64, 1, 4), - (256, 256, 8192, 128, 128, True, False, True): (1, 32, 1, 16), - (256, 256, 16384, 16, 16, False, True, True): (3, 128, 3, 2), - (256, 256, 16384, 16, 16, True, False, True): (5, 64, 3, 2), - (256, 256, 16384, 32, 32, False, True, True): (3, 128, 3, 2), - (256, 256, 16384, 32, 32, True, False, True): (1, 128, 3, 2), - (256, 256, 16384, 64, 64, False, True, True): (3, 128, 1, 4), - (256, 256, 16384, 64, 64, True, False, True): (2, 128, 1, 4), - (256, 256, 16384, 128, 128, False, True, True): (7, 128, 1, 4), - (256, 256, 16384, 128, 128, True, False, True): (1, 128, 2, 8), - (256, 256, 32768, 16, 16, False, True, True): (2, 128, 3, 2), - (256, 256, 32768, 16, 16, True, False, True): (1, 128, 3, 2), - (256, 256, 32768, 32, 32, False, True, True): (1, 256, 3, 4), - (256, 256, 32768, 32, 32, True, False, True): (3, 256, 3, 2), - (256, 256, 32768, 64, 64, False, True, True): (1, 256, 1, 4), - (256, 256, 32768, 64, 64, True, False, True): (3, 256, 1, 4), - (256, 256, 32768, 128, 128, False, True, True): (9, 256, 1, 4), - (256, 256, 32768, 128, 128, True, False, True): (2, 256, 1, 4), - (256, 256, 65536, 16, 16, False, True, True): (1, 256, 3, 2), - (256, 256, 65536, 16, 16, True, False, True): (1, 256, 3, 2), - (256, 256, 65536, 32, 32, False, True, True): (2, 512, 3, 2), - (256, 256, 65536, 32, 32, True, False, True): (2, 512, 3, 2), - (256, 256, 65536, 64, 64, False, True, True): (2, 512, 1, 4), - (256, 256, 65536, 64, 64, True, False, True): (1, 512, 1, 4), - (256, 256, 65536, 128, 128, False, True, True): (7, 512, 1, 4), - (256, 256, 65536, 128, 128, True, False, True): (2, 512, 1, 4), - (256, 256, 131072, 16, 16, False, True, True): (1, 512, 3, 2), - (256, 256, 131072, 16, 16, True, False, True): (1, 512, 3, 2), - (256, 256, 131072, 32, 32, False, True, True): (1, 1024, 3, 2), - (256, 256, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), - (256, 256, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), - (256, 256, 131072, 64, 64, True, False, True): (1, 1024, 1, 4), - (256, 256, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), - (256, 256, 131072, 128, 128, True, False, True): (1, 1024, 1, 4), - (384, 384, 256, 16, 16, False, True, True): (1, 8, 5, 2), - (384, 384, 256, 16, 16, True, False, True): (3, 4, 5, 2), - (384, 384, 256, 32, 32, False, True, True): (2, 8, 4, 4), - (384, 384, 256, 32, 32, True, False, True): (1, 4, 6, 2), - (384, 384, 256, 64, 64, False, True, True): (2, 4, 4, 4), - (384, 384, 256, 64, 64, True, False, True): (2, 4, 4, 4), - (384, 384, 512, 16, 16, False, True, True): (1, 8, 4, 2), - (384, 384, 512, 16, 16, True, False, True): (1, 4, 5, 4), - (384, 384, 512, 32, 32, False, True, True): (1, 8, 4, 4), - (384, 384, 512, 32, 32, True, False, True): (3, 8, 5, 2), - (384, 384, 512, 64, 64, False, True, True): (3, 8, 3, 4), - (384, 384, 512, 64, 64, True, False, True): (5, 8, 5, 4), - (384, 384, 1024, 16, 16, False, True, True): (3, 16, 4, 2), - (384, 384, 1024, 16, 16, True, False, True): (1, 8, 4, 4), - (384, 384, 1024, 32, 32, False, True, True): (6, 32, 3, 2), - (384, 384, 
1024, 32, 32, True, False, True): (3, 8, 4, 4), - (384, 384, 1024, 64, 64, False, True, True): (3, 16, 3, 4), - (384, 384, 1024, 64, 64, True, False, True): (2, 16, 4, 4), - (384, 384, 2048, 16, 16, False, True, True): (1, 32, 1, 4), - (384, 384, 2048, 16, 16, True, False, True): (1, 16, 5, 2), - (384, 384, 2048, 32, 32, False, True, True): (1, 32, 1, 8), - (384, 384, 2048, 32, 32, True, False, True): (1, 8, 4, 4), - (384, 384, 2048, 64, 64, False, True, True): (4, 16, 3, 4), - (384, 384, 2048, 64, 64, True, False, True): (1, 16, 3, 8), - (384, 384, 4096, 16, 16, False, True, True): (5, 32, 1, 4), - (384, 384, 4096, 16, 16, True, False, True): (6, 32, 3, 2), - (384, 384, 4096, 32, 32, False, True, True): (1, 32, 1, 8), - (384, 384, 4096, 32, 32, True, False, True): (1, 16, 3, 4), - (384, 384, 4096, 64, 64, False, True, True): (1, 64, 1, 4), - (384, 384, 4096, 64, 64, True, False, True): (2, 32, 3, 4), - (384, 384, 8192, 16, 16, False, True, True): (2, 64, 1, 4), - (384, 384, 8192, 16, 16, True, False, True): (3, 32, 3, 2), - (384, 384, 8192, 32, 32, False, True, True): (5, 64, 1, 8), - (384, 384, 8192, 32, 32, True, False, True): (1, 32, 3, 2), - (384, 384, 8192, 64, 64, False, True, True): (1, 128, 1, 4), - (384, 384, 8192, 64, 64, True, False, True): (3, 64, 3, 4), - (384, 384, 16384, 16, 16, False, True, True): (1, 128, 1, 2), - (384, 384, 16384, 16, 16, True, False, True): (4, 128, 3, 2), - (384, 384, 16384, 32, 32, False, True, True): (3, 128, 1, 4), - (384, 384, 16384, 32, 32, True, False, True): (1, 128, 3, 2), - (384, 384, 16384, 64, 64, False, True, True): (3, 256, 1, 4), - (384, 384, 16384, 64, 64, True, False, True): (2, 128, 3, 4), - (384, 384, 32768, 16, 16, False, True, True): (1, 256, 1, 2), - (384, 384, 32768, 16, 16, True, False, True): (1, 128, 3, 4), - (384, 384, 32768, 32, 32, False, True, True): (1, 256, 1, 2), - (384, 384, 32768, 32, 32, True, False, True): (1, 128, 3, 4), - (384, 384, 32768, 64, 64, False, True, True): (2, 256, 1, 4), - (384, 384, 32768, 64, 64, True, False, True): (1, 256, 3, 4), - (384, 384, 65536, 16, 16, False, True, True): (4, 512, 1, 2), - (384, 384, 65536, 16, 16, True, False, True): (1, 256, 3, 4), - (384, 384, 65536, 32, 32, False, True, True): (1, 512, 1, 2), - (384, 384, 65536, 32, 32, True, False, True): (1, 256, 3, 4), - (384, 384, 65536, 64, 64, False, True, True): (3, 512, 1, 4), - (384, 384, 65536, 64, 64, True, False, True): (3, 256, 3, 4), - (384, 384, 131072, 16, 16, False, True, True): (1, 512, 1, 1), - (384, 384, 131072, 16, 16, True, False, True): (1, 512, 3, 4), - (384, 384, 131072, 32, 32, False, True, True): (1, 512, 1, 4), - (384, 384, 131072, 32, 32, True, False, True): (1, 512, 3, 4), - (384, 384, 131072, 64, 64, False, True, True): (3, 1024, 1, 4), - (384, 384, 131072, 64, 64, True, False, True): (3, 512, 3, 4), - (512, 512, 256, 16, 16, False, True, True): (2, 4, 5, 4), - (512, 512, 256, 16, 16, True, False, True): (3, 4, 5, 4), - (512, 512, 256, 32, 32, False, True, True): (1, 4, 5, 2), - (512, 512, 256, 32, 32, True, False, True): (4, 8, 5, 1), - (512, 512, 256, 64, 64, False, True, True): (4, 4, 5, 4), - (512, 512, 256, 64, 64, True, False, True): (5, 4, 5, 4), - (512, 512, 256, 128, 128, False, True, True): (3, 2, 2, 8), - (512, 512, 256, 128, 128, True, False, True): (2, 2, 2, 8), - (512, 512, 512, 16, 16, False, True, True): (1, 8, 5, 4), - (512, 512, 512, 16, 16, True, False, True): (4, 8, 5, 2), - (512, 512, 512, 32, 32, False, True, True): (1, 16, 4, 1), - (512, 512, 512, 32, 32, True, False, True): (1, 8, 5, 
2), - (512, 512, 512, 64, 64, False, True, True): (4, 8, 5, 4), - (512, 512, 512, 64, 64, True, False, True): (2, 8, 5, 4), - (512, 512, 512, 128, 128, False, True, True): (2, 4, 2, 8), - (512, 512, 512, 128, 128, True, False, True): (1, 4, 2, 8), - (512, 512, 1024, 16, 16, False, True, True): (2, 8, 4, 4), - (512, 512, 1024, 16, 16, True, False, True): (1, 8, 4, 4), - (512, 512, 1024, 32, 32, False, True, True): (3, 16, 4, 2), - (512, 512, 1024, 32, 32, True, False, True): (1, 16, 5, 2), - (512, 512, 1024, 64, 64, False, True, True): (2, 8, 3, 4), - (512, 512, 1024, 64, 64, True, False, True): (2, 16, 3, 4), - (512, 512, 1024, 128, 128, False, True, True): (2, 8, 2, 8), - (512, 512, 1024, 128, 128, True, False, True): (3, 8, 2, 8), - (512, 512, 2048, 16, 16, False, True, True): (4, 16, 3, 2), - (512, 512, 2048, 16, 16, True, False, True): (1, 16, 4, 2), - (512, 512, 2048, 32, 32, False, True, True): (3, 32, 3, 2), - (512, 512, 2048, 32, 32, True, False, True): (2, 32, 3, 2), - (512, 512, 2048, 64, 64, False, True, True): (6, 32, 3, 2), - (512, 512, 2048, 64, 64, True, False, True): (1, 32, 3, 2), - (512, 512, 2048, 128, 128, False, True, True): (4, 16, 2, 8), - (512, 512, 2048, 128, 128, True, False, True): (1, 16, 2, 8), - (512, 512, 4096, 16, 16, False, True, True): (1, 16, 3, 2), - (512, 512, 4096, 16, 16, True, False, True): (4, 32, 3, 2), - (512, 512, 4096, 32, 32, False, True, True): (3, 32, 3, 2), - (512, 512, 4096, 32, 32, True, False, True): (2, 32, 3, 2), - (512, 512, 4096, 64, 64, False, True, True): (1, 32, 3, 4), - (512, 512, 4096, 64, 64, True, False, True): (1, 64, 3, 4), - (512, 512, 4096, 128, 128, False, True, True): (4, 32, 1, 4), - (512, 512, 4096, 128, 128, True, False, True): (4, 32, 2, 8), - (512, 512, 8192, 16, 16, False, True, True): (8, 64, 3, 2), - (512, 512, 8192, 16, 16, True, False, True): (4, 64, 3, 2), - (512, 512, 8192, 32, 32, False, True, True): (3, 64, 3, 2), - (512, 512, 8192, 32, 32, True, False, True): (3, 64, 3, 2), - (512, 512, 8192, 64, 64, False, True, True): (1, 64, 3, 4), - (512, 512, 8192, 64, 64, True, False, True): (7, 64, 3, 4), - (512, 512, 8192, 128, 128, False, True, True): (1, 64, 1, 4), - (512, 512, 8192, 128, 128, True, False, True): (4, 64, 2, 8), - (512, 512, 16384, 16, 16, False, True, True): (1, 64, 3, 2), - (512, 512, 16384, 16, 16, True, False, True): (1, 128, 3, 2), - (512, 512, 16384, 32, 32, False, True, True): (3, 128, 3, 2), - (512, 512, 16384, 32, 32, True, False, True): (1, 128, 3, 2), - (512, 512, 16384, 64, 64, False, True, True): (4, 64, 2, 4), - (512, 512, 16384, 64, 64, True, False, True): (2, 64, 2, 4), - (512, 512, 16384, 128, 128, False, True, True): (4, 128, 1, 4), - (512, 512, 16384, 128, 128, True, False, True): (2, 128, 1, 4), - (512, 512, 32768, 16, 16, False, True, True): (1, 128, 3, 2), - (512, 512, 32768, 16, 16, True, False, True): (1, 128, 3, 2), - (512, 512, 32768, 32, 32, False, True, True): (1, 256, 3, 2), - (512, 512, 32768, 32, 32, True, False, True): (1, 256, 3, 2), - (512, 512, 32768, 64, 64, False, True, True): (1, 256, 3, 4), - (512, 512, 32768, 64, 64, True, False, True): (2, 256, 3, 4), - (512, 512, 32768, 128, 128, False, True, True): (5, 256, 1, 4), - (512, 512, 32768, 128, 128, True, False, True): (4, 256, 1, 4), - (512, 512, 65536, 16, 16, False, True, True): (1, 256, 3, 2), - (512, 512, 65536, 16, 16, True, False, True): (1, 256, 3, 1), - (512, 512, 65536, 32, 32, False, True, True): (1, 512, 3, 2), - (512, 512, 65536, 32, 32, True, False, True): (1, 512, 3, 2), - (512, 512, 65536, 64, 
64, False, True, True): (4, 256, 2, 4), - (512, 512, 65536, 64, 64, True, False, True): (2, 512, 3, 4), - (512, 512, 65536, 128, 128, False, True, True): (6, 512, 1, 4), - (512, 512, 65536, 128, 128, True, False, True): (4, 512, 1, 4), - (512, 512, 131072, 16, 16, False, True, True): (1, 512, 3, 2), - (512, 512, 131072, 16, 16, True, False, True): (1, 512, 3, 1), - (512, 512, 131072, 32, 32, False, True, True): (1, 1024, 3, 2), - (512, 512, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), - (512, 512, 131072, 64, 64, False, True, True): (4, 512, 2, 4), - (512, 512, 131072, 64, 64, True, False, True): (4, 1024, 3, 4), - (512, 512, 131072, 128, 128, False, True, True): (6, 1024, 1, 4), - (512, 512, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), - (768, 768, 256, 16, 16, False, True, True): (1, 8, 4, 1), - (768, 768, 256, 16, 16, True, False, True): (3, 2, 6, 4), - (768, 768, 256, 32, 32, False, True, True): (3, 8, 3, 4), - (768, 768, 256, 32, 32, True, False, True): (1, 4, 4, 2), - (768, 768, 256, 64, 64, False, True, True): (2, 4, 3, 4), - (768, 768, 256, 64, 64, True, False, True): (1, 4, 4, 4), - (768, 768, 256, 128, 128, False, True, True): (2, 2, 3, 8), - (768, 768, 256, 128, 128, True, False, True): (4, 2, 3, 8), - (768, 768, 512, 16, 16, False, True, True): (4, 8, 4, 2), - (768, 768, 512, 16, 16, True, False, True): (4, 8, 6, 2), - (768, 768, 512, 32, 32, False, True, True): (1, 8, 4, 4), - (768, 768, 512, 32, 32, True, False, True): (3, 8, 4, 2), - (768, 768, 512, 64, 64, False, True, True): (1, 8, 3, 4), - (768, 768, 512, 64, 64, True, False, True): (1, 8, 4, 4), - (768, 768, 512, 128, 128, False, True, True): (1, 4, 3, 8), - (768, 768, 512, 128, 128, True, False, True): (4, 4, 3, 8), - (768, 768, 1024, 16, 16, False, True, True): (3, 16, 1, 4), - (768, 768, 1024, 16, 16, True, False, True): (1, 8, 5, 2), - (768, 768, 1024, 32, 32, False, True, True): (3, 16, 1, 8), - (768, 768, 1024, 32, 32, True, False, True): (1, 16, 3, 2), - (768, 768, 1024, 64, 64, False, True, True): (1, 8, 3, 4), - (768, 768, 1024, 64, 64, True, False, True): (2, 8, 3, 8), - (768, 768, 1024, 128, 128, False, True, True): (1, 8, 3, 8), - (768, 768, 1024, 128, 128, True, False, True): (1, 8, 3, 8), - (768, 768, 2048, 16, 16, False, True, True): (2, 16, 1, 2), - (768, 768, 2048, 16, 16, True, False, True): (1, 16, 3, 2), - (768, 768, 2048, 32, 32, False, True, True): (5, 32, 1, 4), - (768, 768, 2048, 32, 32, True, False, True): (3, 8, 3, 4), - (768, 768, 2048, 64, 64, False, True, True): (1, 16, 1, 8), - (768, 768, 2048, 64, 64, True, False, True): (3, 16, 3, 4), - (768, 768, 2048, 128, 128, False, True, True): (2, 16, 3, 8), - (768, 768, 2048, 128, 128, True, False, True): (1, 16, 3, 8), - (768, 768, 4096, 16, 16, False, True, True): (3, 32, 1, 4), - (768, 768, 4096, 16, 16, True, False, True): (2, 32, 3, 1), - (768, 768, 4096, 32, 32, False, True, True): (2, 64, 1, 4), - (768, 768, 4096, 32, 32, True, False, True): (1, 16, 4, 4), - (768, 768, 4096, 64, 64, False, True, True): (3, 64, 3, 4), - (768, 768, 4096, 64, 64, True, False, True): (2, 16, 3, 4), - (768, 768, 4096, 128, 128, False, True, True): (1, 32, 3, 8), - (768, 768, 4096, 128, 128, True, False, True): (4, 32, 3, 8), - (768, 768, 8192, 16, 16, False, True, True): (1, 64, 1, 2), - (768, 768, 8192, 16, 16, True, False, True): (4, 64, 3, 2), - (768, 768, 8192, 32, 32, False, True, True): (1, 64, 1, 8), - (768, 768, 8192, 32, 32, True, False, True): (2, 32, 3, 4), - (768, 768, 8192, 64, 64, False, True, True): (4, 64, 3, 4), - (768, 768, 
8192, 64, 64, True, False, True): (2, 32, 3, 4), - (768, 768, 8192, 128, 128, False, True, True): (2, 64, 3, 8), - (768, 768, 8192, 128, 128, True, False, True): (1, 64, 3, 8), - (768, 768, 16384, 16, 16, False, True, True): (1, 128, 1, 2), - (768, 768, 16384, 16, 16, True, False, True): (1, 64, 4, 4), - (768, 768, 16384, 32, 32, False, True, True): (1, 128, 1, 8), - (768, 768, 16384, 32, 32, True, False, True): (1, 64, 3, 4), - (768, 768, 16384, 64, 64, False, True, True): (4, 128, 3, 4), - (768, 768, 16384, 64, 64, True, False, True): (1, 64, 3, 4), - (768, 768, 16384, 128, 128, False, True, True): (3, 128, 1, 4), - (768, 768, 16384, 128, 128, True, False, True): (3, 128, 2, 4), - (768, 768, 32768, 16, 16, False, True, True): (1, 256, 1, 2), - (768, 768, 32768, 16, 16, True, False, True): (1, 128, 4, 4), - (768, 768, 32768, 32, 32, False, True, True): (1, 128, 1, 2), - (768, 768, 32768, 32, 32, True, False, True): (1, 128, 3, 4), - (768, 768, 32768, 64, 64, False, True, True): (1, 256, 1, 4), - (768, 768, 32768, 64, 64, True, False, True): (2, 128, 3, 4), - (768, 768, 32768, 128, 128, False, True, True): (3, 256, 1, 4), - (768, 768, 32768, 128, 128, True, False, True): (2, 256, 2, 4), - (768, 768, 65536, 16, 16, False, True, True): (4, 512, 1, 2), - (768, 768, 65536, 16, 16, True, False, True): (1, 256, 4, 4), - (768, 768, 65536, 32, 32, False, True, True): (1, 256, 1, 2), - (768, 768, 65536, 32, 32, True, False, True): (1, 256, 3, 4), - (768, 768, 65536, 64, 64, False, True, True): (3, 512, 1, 4), - (768, 768, 65536, 64, 64, True, False, True): (2, 256, 3, 4), - (768, 768, 65536, 128, 128, False, True, True): (3, 512, 1, 4), - (768, 768, 65536, 128, 128, True, False, True): (2, 512, 2, 4), - (768, 768, 131072, 16, 16, False, True, True): (4, 1024, 1, 2), - (768, 768, 131072, 16, 16, True, False, True): (1, 512, 4, 1), - (768, 768, 131072, 32, 32, False, True, True): (1, 512, 1, 2), - (768, 768, 131072, 32, 32, True, False, True): (1, 512, 3, 4), - (768, 768, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), - (768, 768, 131072, 64, 64, True, False, True): (2, 512, 3, 4), - (768, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), - (768, 768, 131072, 128, 128, True, False, True): (1, 1024, 2, 4), - (768, 3072, 256, 16, 16, False, True, True): (3, 8, 6, 1), - (768, 3072, 256, 16, 16, True, False, True): (1, 4, 6, 2), - (768, 3072, 256, 32, 32, False, True, True): (1, 8, 4, 4), - (768, 3072, 256, 32, 32, True, False, True): (3, 4, 6, 4), - (768, 3072, 256, 64, 64, False, True, True): (2, 4, 3, 4), - (768, 3072, 256, 64, 64, True, False, True): (1, 4, 4, 4), - (768, 3072, 256, 128, 128, False, True, True): (2, 2, 3, 8), - (768, 3072, 256, 128, 128, True, False, True): (1, 2, 3, 8), - (768, 3072, 512, 16, 16, False, True, True): (1, 8, 4, 2), - (768, 3072, 512, 16, 16, True, False, True): (1, 8, 5, 2), - (768, 3072, 512, 32, 32, False, True, True): (1, 16, 3, 2), - (768, 3072, 512, 32, 32, True, False, True): (1, 8, 5, 2), - (768, 3072, 512, 64, 64, False, True, True): (1, 8, 3, 4), - (768, 3072, 512, 64, 64, True, False, True): (3, 8, 4, 4), - (768, 3072, 512, 128, 128, False, True, True): (1, 4, 3, 8), - (768, 3072, 512, 128, 128, True, False, True): (2, 4, 3, 8), - (768, 3072, 1024, 16, 16, False, True, True): (1, 16, 1, 4), - (768, 3072, 1024, 16, 16, True, False, True): (5, 4, 4, 4), - (768, 3072, 1024, 32, 32, False, True, True): (3, 8, 3, 4), - (768, 3072, 1024, 32, 32, True, False, True): (1, 8, 4, 4), - (768, 3072, 1024, 64, 64, False, True, True): (2, 16, 3, 4), - (768, 
3072, 1024, 64, 64, True, False, True): (2, 16, 4, 4), - (768, 3072, 1024, 128, 128, False, True, True): (1, 8, 3, 8), - (768, 3072, 1024, 128, 128, True, False, True): (5, 8, 3, 8), - (768, 3072, 2048, 16, 16, False, True, True): (3, 16, 1, 2), - (768, 3072, 2048, 16, 16, True, False, True): (1, 8, 3, 4), - (768, 3072, 2048, 32, 32, False, True, True): (4, 16, 1, 8), - (768, 3072, 2048, 32, 32, True, False, True): (3, 8, 3, 4), - (768, 3072, 2048, 64, 64, False, True, True): (2, 16, 3, 4), - (768, 3072, 2048, 64, 64, True, False, True): (2, 16, 3, 4), - (768, 3072, 2048, 128, 128, False, True, True): (3, 16, 3, 8), - (768, 3072, 2048, 128, 128, True, False, True): (4, 16, 3, 8), - (768, 3072, 4096, 16, 16, False, True, True): (1, 32, 1, 4), - (768, 3072, 4096, 16, 16, True, False, True): (1, 16, 3, 1), - (768, 3072, 4096, 32, 32, False, True, True): (3, 32, 1, 8), - (768, 3072, 4096, 32, 32, True, False, True): (3, 16, 4, 4), - (768, 3072, 4096, 64, 64, False, True, True): (2, 32, 3, 4), - (768, 3072, 4096, 64, 64, True, False, True): (2, 16, 3, 4), - (768, 3072, 4096, 128, 128, False, True, True): (5, 32, 1, 4), - (768, 3072, 4096, 128, 128, True, False, True): (9, 32, 3, 8), - (768, 3072, 8192, 16, 16, False, True, True): (1, 32, 1, 4), - (768, 3072, 8192, 16, 16, True, False, True): (4, 64, 4, 2), - (768, 3072, 8192, 32, 32, False, True, True): (1, 64, 1, 8), - (768, 3072, 8192, 32, 32, True, False, True): (2, 64, 4, 2), - (768, 3072, 8192, 64, 64, False, True, True): (1, 64, 3, 4), - (768, 3072, 8192, 64, 64, True, False, True): (2, 32, 3, 4), - (768, 3072, 8192, 128, 128, False, True, True): (2, 64, 3, 8), - (768, 3072, 8192, 128, 128, True, False, True): (2, 64, 3, 8), - (768, 3072, 16384, 16, 16, False, True, True): (1, 64, 1, 4), - (768, 3072, 16384, 16, 16, True, False, True): (1, 64, 4, 1), - (768, 3072, 16384, 32, 32, False, True, True): (1, 128, 1, 8), - (768, 3072, 16384, 32, 32, True, False, True): (1, 64, 3, 4), - (768, 3072, 16384, 64, 64, False, True, True): (1, 128, 3, 4), - (768, 3072, 16384, 64, 64, True, False, True): (4, 64, 3, 4), - (768, 3072, 16384, 128, 128, False, True, True): (2, 128, 3, 8), - (768, 3072, 16384, 128, 128, True, False, True): (2, 128, 3, 8), - (768, 3072, 32768, 16, 16, False, True, True): (1, 128, 1, 4), - (768, 3072, 32768, 16, 16, True, False, True): (1, 128, 4, 1), - (768, 3072, 32768, 32, 32, False, True, True): (1, 256, 1, 8), - (768, 3072, 32768, 32, 32, True, False, True): (1, 128, 3, 4), - (768, 3072, 32768, 64, 64, False, True, True): (1, 256, 3, 4), - (768, 3072, 32768, 64, 64, True, False, True): (1, 128, 3, 4), - (768, 3072, 32768, 128, 128, False, True, True): (3, 256, 1, 4), - (768, 3072, 32768, 128, 128, True, False, True): (2, 256, 3, 8), - (768, 3072, 50432, 16, 16, False, True, True): (1, 197, 1, 4), - (768, 3072, 50432, 16, 16, True, False, True): (4, 197, 4, 4), - (768, 3072, 50432, 32, 32, False, True, True): (1, 197, 1, 4), - (768, 3072, 50432, 32, 32, True, False, True): (4, 197, 3, 4), - (768, 3072, 50432, 64, 64, False, True, True): (1, 394, 3, 4), - (768, 3072, 50432, 64, 64, True, False, True): (3, 197, 3, 4), - (768, 3072, 50432, 128, 128, False, True, True): (3, 394, 1, 4), - (768, 3072, 50432, 128, 128, True, False, True): (1, 394, 3, 8), - (768, 3072, 65536, 16, 16, False, True, True): (1, 256, 1, 4), - (768, 3072, 65536, 16, 16, True, False, True): (5, 256, 4, 1), - (768, 3072, 65536, 32, 32, False, True, True): (1, 256, 1, 4), - (768, 3072, 65536, 32, 32, True, False, True): (3, 256, 3, 4), - (768, 3072, 65536, 
64, 64, False, True, True): (2, 512, 3, 4), - (768, 3072, 65536, 64, 64, True, False, True): (3, 256, 3, 4), - (768, 3072, 65536, 128, 128, False, True, True): (3, 512, 1, 4), - (768, 3072, 65536, 128, 128, True, False, True): (2, 512, 3, 8), - (768, 3072, 131072, 16, 16, False, True, True): (1, 512, 1, 4), - (768, 3072, 131072, 16, 16, True, False, True): (5, 512, 4, 1), - (768, 3072, 131072, 32, 32, False, True, True): (1, 512, 1, 4), - (768, 3072, 131072, 32, 32, True, False, True): (4, 512, 3, 4), - (768, 3072, 131072, 64, 64, False, True, True): (1, 1024, 3, 4), - (768, 3072, 131072, 64, 64, True, False, True): (1, 512, 3, 4), - (768, 3072, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), - (768, 3072, 131072, 128, 128, True, False, True): (1, 1024, 3, 8), - (1024, 1024, 256, 16, 16, False, True, True): (1, 4, 5, 4), - (1024, 1024, 256, 16, 16, True, False, True): (3, 4, 4, 4), - (1024, 1024, 256, 32, 32, False, True, True): (4, 4, 5, 2), - (1024, 1024, 256, 32, 32, True, False, True): (3, 4, 5, 2), - (1024, 1024, 256, 64, 64, False, True, True): (1, 4, 5, 4), - (1024, 1024, 256, 64, 64, True, False, True): (1, 4, 5, 4), - (1024, 1024, 256, 128, 128, False, True, True): (1, 2, 2, 8), - (1024, 1024, 256, 128, 128, True, False, True): (2, 2, 2, 8), - (1024, 1024, 512, 16, 16, False, True, True): (3, 4, 4, 4), - (1024, 1024, 512, 16, 16, True, False, True): (4, 8, 5, 2), - (1024, 1024, 512, 32, 32, False, True, True): (1, 8, 4, 2), - (1024, 1024, 512, 32, 32, True, False, True): (1, 8, 4, 2), - (1024, 1024, 512, 64, 64, False, True, True): (4, 8, 4, 4), - (1024, 1024, 512, 64, 64, True, False, True): (2, 8, 3, 4), - (1024, 1024, 512, 128, 128, False, True, True): (2, 4, 2, 8), - (1024, 1024, 512, 128, 128, True, False, True): (1, 4, 2, 8), - (1024, 1024, 1024, 16, 16, False, True, True): (3, 8, 4, 4), - (1024, 1024, 1024, 16, 16, True, False, True): (4, 8, 4, 2), - (1024, 1024, 1024, 32, 32, False, True, True): (1, 16, 3, 2), - (1024, 1024, 1024, 32, 32, True, False, True): (1, 16, 3, 2), - (1024, 1024, 1024, 64, 64, False, True, True): (1, 16, 3, 4), - (1024, 1024, 1024, 64, 64, True, False, True): (3, 16, 3, 2), - (1024, 1024, 1024, 128, 128, False, True, True): (1, 8, 2, 8), - (1024, 1024, 1024, 128, 128, True, False, True): (2, 8, 2, 8), - (1024, 1024, 2048, 16, 16, False, True, True): (3, 8, 3, 4), - (1024, 1024, 2048, 16, 16, True, False, True): (3, 8, 3, 2), - (1024, 1024, 2048, 32, 32, False, True, True): (5, 16, 3, 4), - (1024, 1024, 2048, 32, 32, True, False, True): (1, 16, 3, 2), - (1024, 1024, 2048, 64, 64, False, True, True): (6, 16, 4, 4), - (1024, 1024, 2048, 64, 64, True, False, True): (5, 16, 3, 4), - (1024, 1024, 2048, 128, 128, False, True, True): (4, 16, 2, 8), - (1024, 1024, 2048, 128, 128, True, False, True): (4, 16, 2, 8), - (1024, 1024, 4096, 16, 16, False, True, True): (8, 32, 3, 2), - (1024, 1024, 4096, 16, 16, True, False, True): (4, 32, 3, 2), - (1024, 1024, 4096, 32, 32, False, True, True): (2, 32, 3, 4), - (1024, 1024, 4096, 32, 32, True, False, True): (3, 32, 3, 2), - (1024, 1024, 4096, 64, 64, False, True, True): (3, 32, 3, 4), - (1024, 1024, 4096, 64, 64, True, False, True): (1, 32, 3, 4), - (1024, 1024, 4096, 128, 128, False, True, True): (4, 32, 2, 8), - (1024, 1024, 4096, 128, 128, True, False, True): (1, 32, 2, 8), - (1024, 1024, 8192, 16, 16, False, True, True): (4, 64, 3, 2), - (1024, 1024, 8192, 16, 16, True, False, True): (4, 64, 3, 2), - (1024, 1024, 8192, 32, 32, False, True, True): (8, 64, 3, 4), - (1024, 1024, 8192, 32, 32, True, False, 
True): (4, 32, 3, 4), - (1024, 1024, 8192, 64, 64, False, True, True): (4, 64, 3, 4), - (1024, 1024, 8192, 64, 64, True, False, True): (2, 64, 3, 4), - (1024, 1024, 8192, 128, 128, False, True, True): (4, 64, 2, 8), - (1024, 1024, 8192, 128, 128, True, False, True): (4, 64, 1, 4), - (1024, 1024, 16384, 16, 16, False, True, True): (1, 64, 3, 2), - (1024, 1024, 16384, 16, 16, True, False, True): (1, 64, 3, 2), - (1024, 1024, 16384, 32, 32, False, True, True): (1, 128, 3, 2), - (1024, 1024, 16384, 32, 32, True, False, True): (1, 64, 3, 4), - (1024, 1024, 16384, 64, 64, False, True, True): (1, 128, 3, 4), - (1024, 1024, 16384, 64, 64, True, False, True): (1, 128, 3, 4), - (1024, 1024, 16384, 128, 128, False, True, True): (2, 128, 1, 4), - (1024, 1024, 16384, 128, 128, True, False, True): (4, 128, 1, 4), - (1024, 1024, 32768, 16, 16, False, True, True): (1, 128, 3, 2), - (1024, 1024, 32768, 16, 16, True, False, True): (1, 128, 3, 2), - (1024, 1024, 32768, 32, 32, False, True, True): (1, 256, 3, 2), - (1024, 1024, 32768, 32, 32, True, False, True): (1, 128, 3, 4), - (1024, 1024, 32768, 64, 64, False, True, True): (2, 128, 2, 4), - (1024, 1024, 32768, 64, 64, True, False, True): (1, 256, 3, 4), - (1024, 1024, 32768, 128, 128, False, True, True): (2, 256, 1, 4), - (1024, 1024, 32768, 128, 128, True, False, True): (4, 256, 1, 4), - (1024, 1024, 65536, 16, 16, False, True, True): (1, 256, 3, 4), - (1024, 1024, 65536, 16, 16, True, False, True): (1, 256, 3, 4), - (1024, 1024, 65536, 32, 32, False, True, True): (9, 256, 3, 4), - (1024, 1024, 65536, 32, 32, True, False, True): (7, 256, 3, 4), - (1024, 1024, 65536, 64, 64, False, True, True): (2, 256, 2, 4), - (1024, 1024, 65536, 64, 64, True, False, True): (2, 512, 3, 4), - (1024, 1024, 65536, 128, 128, False, True, True): (2, 512, 1, 4), - (1024, 1024, 65536, 128, 128, True, False, True): (4, 512, 1, 4), - (1024, 1024, 131072, 16, 16, False, True, True): (11, 512, 3, 2), - (1024, 1024, 131072, 16, 16, True, False, True): (11, 512, 3, 2), - (1024, 1024, 131072, 32, 32, False, True, True): (4, 512, 3, 4), - (1024, 1024, 131072, 32, 32, True, False, True): (6, 512, 3, 4), - (1024, 1024, 131072, 64, 64, False, True, True): (2, 512, 2, 4), - (1024, 1024, 131072, 64, 64, True, False, True): (2, 1024, 3, 4), - (1024, 1024, 131072, 128, 128, False, True, True): (4, 1024, 1, 4), - (1024, 1024, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), - (1280, 5120, 65792, 16, 16, False, True, True): (1, 257, 1, 4), - (1280, 5120, 65792, 16, 16, True, False, True): (5, 257, 4, 1), - (1280, 5120, 65792, 32, 32, False, True, True): (1, 514, 1, 8), - (1280, 5120, 65792, 32, 32, True, False, True): (2, 257, 3, 4), - (1280, 5120, 65792, 64, 64, False, True, True): (1, 514, 3, 4), - (1280, 5120, 65792, 64, 64, True, False, True): (1, 257, 3, 4), - (1280, 5120, 65792, 128, 128, False, True, True): (1, 514, 3, 8), - (1280, 5120, 65792, 128, 128, True, False, True): (2, 514, 3, 8), - (1536, 1536, 256, 16, 16, False, True, True): (1, 4, 6, 2), - (1536, 1536, 256, 16, 16, True, False, True): (3, 4, 5, 2), - (1536, 1536, 256, 32, 32, False, True, True): (2, 4, 3, 4), - (1536, 1536, 256, 32, 32, True, False, True): (1, 4, 5, 2), - (1536, 1536, 256, 64, 64, False, True, True): (2, 4, 3, 4), - (1536, 1536, 256, 64, 64, True, False, True): (1, 4, 4, 4), - (1536, 1536, 256, 128, 128, False, True, True): (3, 2, 3, 8), - (1536, 1536, 256, 128, 128, True, False, True): (6, 2, 3, 8), - (1536, 1536, 512, 16, 16, False, True, True): (1, 8, 1, 4), - (1536, 1536, 512, 16, 16, True, False, 
True): (3, 4, 5, 2), - (1536, 1536, 512, 32, 32, False, True, True): (1, 8, 1, 8), - (1536, 1536, 512, 32, 32, True, False, True): (1, 4, 4, 4), - (1536, 1536, 512, 64, 64, False, True, True): (3, 8, 5, 4), - (1536, 1536, 512, 64, 64, True, False, True): (3, 8, 3, 4), - (1536, 1536, 512, 128, 128, False, True, True): (2, 4, 3, 8), - (1536, 1536, 512, 128, 128, True, False, True): (3, 4, 3, 8), - (1536, 1536, 1024, 16, 16, False, True, True): (1, 8, 1, 2), - (1536, 1536, 1024, 16, 16, True, False, True): (2, 8, 4, 2), - (1536, 1536, 1024, 32, 32, False, True, True): (8, 16, 1, 4), - (1536, 1536, 1024, 32, 32, True, False, True): (3, 8, 4, 2), - (1536, 1536, 1024, 64, 64, False, True, True): (1, 16, 3, 4), - (1536, 1536, 1024, 64, 64, True, False, True): (3, 8, 3, 4), - (1536, 1536, 1024, 128, 128, False, True, True): (3, 8, 3, 8), - (1536, 1536, 1024, 128, 128, True, False, True): (3, 8, 3, 8), - (1536, 1536, 2048, 16, 16, False, True, True): (1, 16, 1, 4), - (1536, 1536, 2048, 16, 16, True, False, True): (1, 8, 3, 1), - (1536, 1536, 2048, 32, 32, False, True, True): (3, 16, 1, 8), - (1536, 1536, 2048, 32, 32, True, False, True): (3, 8, 4, 4), - (1536, 1536, 2048, 64, 64, False, True, True): (1, 16, 3, 4), - (1536, 1536, 2048, 64, 64, True, False, True): (3, 8, 3, 4), - (1536, 1536, 2048, 128, 128, False, True, True): (4, 16, 1, 4), - (1536, 1536, 2048, 128, 128, True, False, True): (6, 16, 3, 8), - (1536, 1536, 4096, 16, 16, False, True, True): (1, 32, 1, 2), - (1536, 1536, 4096, 16, 16, True, False, True): (4, 32, 4, 2), - (1536, 1536, 4096, 32, 32, False, True, True): (1, 32, 1, 8), - (1536, 1536, 4096, 32, 32, True, False, True): (5, 32, 4, 2), - (1536, 1536, 4096, 64, 64, False, True, True): (2, 32, 3, 4), - (1536, 1536, 4096, 64, 64, True, False, True): (2, 16, 3, 4), - (1536, 1536, 4096, 128, 128, False, True, True): (4, 32, 3, 8), - (1536, 1536, 4096, 128, 128, True, False, True): (4, 32, 3, 8), - (1536, 1536, 8192, 16, 16, False, True, True): (1, 64, 1, 2), - (1536, 1536, 8192, 16, 16, True, False, True): (4, 64, 4, 2), - (1536, 1536, 8192, 32, 32, False, True, True): (2, 64, 1, 8), - (1536, 1536, 8192, 32, 32, True, False, True): (2, 32, 3, 4), - (1536, 1536, 8192, 64, 64, False, True, True): (1, 64, 3, 4), - (1536, 1536, 8192, 64, 64, True, False, True): (2, 32, 3, 4), - (1536, 1536, 8192, 128, 128, False, True, True): (4, 64, 3, 8), - (1536, 1536, 8192, 128, 128, True, False, True): (1, 64, 3, 8), - (1536, 1536, 16384, 16, 16, False, True, True): (1, 128, 1, 2), - (1536, 1536, 16384, 16, 16, True, False, True): (1, 64, 4, 4), - (1536, 1536, 16384, 32, 32, False, True, True): (1, 64, 1, 2), - (1536, 1536, 16384, 32, 32, True, False, True): (1, 64, 3, 4), - (1536, 1536, 16384, 64, 64, False, True, True): (1, 128, 3, 4), - (1536, 1536, 16384, 64, 64, True, False, True): (1, 64, 3, 4), - (1536, 1536, 16384, 128, 128, False, True, True): (1, 128, 1, 4), - (1536, 1536, 16384, 128, 128, True, False, True): (1, 128, 2, 4), - (1536, 1536, 32768, 16, 16, False, True, True): (1, 256, 1, 2), - (1536, 1536, 32768, 16, 16, True, False, True): (1, 128, 3, 2), - (1536, 1536, 32768, 32, 32, False, True, True): (1, 128, 1, 2), - (1536, 1536, 32768, 32, 32, True, False, True): (1, 128, 3, 4), - (1536, 1536, 32768, 64, 64, False, True, True): (1, 256, 3, 4), - (1536, 1536, 32768, 64, 64, True, False, True): (1, 128, 3, 4), - (1536, 1536, 32768, 128, 128, False, True, True): (1, 256, 1, 4), - (1536, 1536, 32768, 128, 128, True, False, True): (2, 256, 2, 4), - (1536, 1536, 65536, 16, 16, False, True, 
True): (2, 512, 1, 2), - (1536, 1536, 65536, 16, 16, True, False, True): (1, 256, 4, 4), - (1536, 1536, 65536, 32, 32, False, True, True): (1, 256, 1, 2), - (1536, 1536, 65536, 32, 32, True, False, True): (1, 256, 3, 4), - (1536, 1536, 65536, 64, 64, False, True, True): (1, 512, 3, 4), - (1536, 1536, 65536, 64, 64, True, False, True): (3, 256, 3, 4), - (1536, 1536, 65536, 128, 128, False, True, True): (3, 512, 1, 4), - (1536, 1536, 65536, 128, 128, True, False, True): (4, 512, 2, 4), - (1536, 1536, 131072, 16, 16, False, True, True): (2, 1024, 1, 2), - (1536, 1536, 131072, 16, 16, True, False, True): (9, 512, 4, 4), - (1536, 1536, 131072, 32, 32, False, True, True): (1, 512, 1, 2), - (1536, 1536, 131072, 32, 32, True, False, True): (5, 512, 3, 4), - (1536, 1536, 131072, 64, 64, False, True, True): (1, 1024, 3, 4), - (1536, 1536, 131072, 64, 64, True, False, True): (2, 512, 3, 4), - (1536, 1536, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), - (1536, 1536, 131072, 128, 128, True, False, True): (1, 1024, 2, 4), - (2048, 2048, 256, 16, 16, False, True, True): (1, 4, 5, 2), - (2048, 2048, 256, 16, 16, True, False, True): (4, 4, 5, 2), - (2048, 2048, 256, 32, 32, False, True, True): (3, 4, 6, 2), - (2048, 2048, 256, 32, 32, True, False, True): (2, 4, 5, 2), - (2048, 2048, 256, 64, 64, False, True, True): (2, 4, 4, 4), - (2048, 2048, 256, 64, 64, True, False, True): (2, 4, 3, 4), - (2048, 2048, 256, 128, 128, False, True, True): (3, 2, 2, 8), - (2048, 2048, 256, 128, 128, True, False, True): (3, 2, 2, 8), - (2048, 2048, 512, 16, 16, False, True, True): (3, 4, 4, 4), - (2048, 2048, 512, 16, 16, True, False, True): (1, 4, 4, 4), - (2048, 2048, 512, 32, 32, False, True, True): (1, 4, 3, 4), - (2048, 2048, 512, 32, 32, True, False, True): (1, 4, 4, 2), - (2048, 2048, 512, 64, 64, False, True, True): (1, 8, 3, 4), - (2048, 2048, 512, 64, 64, True, False, True): (1, 8, 3, 4), - (2048, 2048, 512, 128, 128, False, True, True): (3, 4, 2, 8), - (2048, 2048, 512, 128, 128, True, False, True): (2, 4, 2, 8), - (2048, 2048, 1024, 16, 16, False, True, True): (3, 4, 3, 4), - (2048, 2048, 1024, 16, 16, True, False, True): (4, 8, 3, 2), - (2048, 2048, 1024, 32, 32, False, True, True): (3, 8, 3, 4), - (2048, 2048, 1024, 32, 32, True, False, True): (1, 8, 3, 2), - (2048, 2048, 1024, 64, 64, False, True, True): (1, 8, 3, 4), - (2048, 2048, 1024, 64, 64, True, False, True): (1, 8, 3, 4), - (2048, 2048, 1024, 128, 128, False, True, True): (4, 8, 1, 4), - (2048, 2048, 1024, 128, 128, True, False, True): (2, 8, 1, 4), - (2048, 2048, 2048, 16, 16, False, True, True): (4, 16, 3, 2), - (2048, 2048, 2048, 16, 16, True, False, True): (4, 16, 3, 2), - (2048, 2048, 2048, 32, 32, False, True, True): (1, 16, 3, 2), - (2048, 2048, 2048, 32, 32, True, False, True): (1, 16, 3, 2), - (2048, 2048, 2048, 64, 64, False, True, True): (4, 16, 3, 4), - (2048, 2048, 2048, 64, 64, True, False, True): (4, 16, 3, 4), - (2048, 2048, 2048, 128, 128, False, True, True): (6, 16, 2, 8), - (2048, 2048, 2048, 128, 128, True, False, True): (3, 16, 1, 4), - (2048, 2048, 4096, 16, 16, False, True, True): (4, 32, 4, 2), - (2048, 2048, 4096, 16, 16, True, False, True): (4, 32, 3, 2), - (2048, 2048, 4096, 32, 32, False, True, True): (4, 16, 3, 8), - (2048, 2048, 4096, 32, 32, True, False, True): (4, 16, 3, 8), - (2048, 2048, 4096, 64, 64, False, True, True): (1, 32, 3, 4), - (2048, 2048, 4096, 64, 64, True, False, True): (3, 32, 3, 4), - (2048, 2048, 4096, 128, 128, False, True, True): (2, 32, 1, 4), - (2048, 2048, 4096, 128, 128, True, False, 
True): (2, 32, 1, 4), - (2048, 2048, 8192, 16, 16, False, True, True): (4, 64, 4, 2), - (2048, 2048, 8192, 16, 16, True, False, True): (4, 64, 4, 2), - (2048, 2048, 8192, 32, 32, False, True, True): (4, 32, 4, 8), - (2048, 2048, 8192, 32, 32, True, False, True): (4, 32, 3, 8), - (2048, 2048, 8192, 64, 64, False, True, True): (4, 64, 3, 4), - (2048, 2048, 8192, 64, 64, True, False, True): (4, 64, 3, 4), - (2048, 2048, 8192, 128, 128, False, True, True): (2, 64, 1, 4), - (2048, 2048, 8192, 128, 128, True, False, True): (2, 64, 1, 4), - (2048, 2048, 16384, 16, 16, False, True, True): (4, 64, 3, 2), - (2048, 2048, 16384, 16, 16, True, False, True): (1, 64, 3, 2), - (2048, 2048, 16384, 32, 32, False, True, True): (4, 64, 3, 4), - (2048, 2048, 16384, 32, 32, True, False, True): (4, 64, 3, 4), - (2048, 2048, 16384, 64, 64, False, True, True): (4, 128, 3, 4), - (2048, 2048, 16384, 64, 64, True, False, True): (4, 128, 3, 4), - (2048, 2048, 16384, 128, 128, False, True, True): (2, 128, 1, 4), - (2048, 2048, 16384, 128, 128, True, False, True): (2, 128, 1, 4), - (2048, 2048, 32768, 16, 16, False, True, True): (8, 128, 3, 2), - (2048, 2048, 32768, 16, 16, True, False, True): (8, 128, 3, 4), - (2048, 2048, 32768, 32, 32, False, True, True): (8, 128, 3, 4), - (2048, 2048, 32768, 32, 32, True, False, True): (8, 128, 3, 4), - (2048, 2048, 32768, 64, 64, False, True, True): (1, 128, 2, 4), - (2048, 2048, 32768, 64, 64, True, False, True): (8, 256, 3, 4), - (2048, 2048, 32768, 128, 128, False, True, True): (2, 256, 1, 4), - (2048, 2048, 32768, 128, 128, True, False, True): (2, 256, 1, 4), - (2048, 2048, 65536, 16, 16, False, True, True): (9, 256, 4, 4), - (2048, 2048, 65536, 16, 16, True, False, True): (7, 256, 4, 4), - (2048, 2048, 65536, 32, 32, False, True, True): (7, 256, 3, 4), - (2048, 2048, 65536, 32, 32, True, False, True): (3, 256, 3, 4), - (2048, 2048, 65536, 64, 64, False, True, True): (2, 256, 2, 4), - (2048, 2048, 65536, 64, 64, True, False, True): (6, 512, 3, 4), - (2048, 2048, 65536, 128, 128, False, True, True): (2, 512, 1, 4), - (2048, 2048, 65536, 128, 128, True, False, True): (2, 512, 1, 4), - (2048, 2048, 131072, 16, 16, False, True, True): (9, 512, 4, 4), - (2048, 2048, 131072, 16, 16, True, False, True): (9, 512, 4, 4), - (2048, 2048, 131072, 32, 32, False, True, True): (7, 512, 4, 4), - (2048, 2048, 131072, 32, 32, True, False, True): (3, 512, 3, 4), - (2048, 2048, 131072, 64, 64, False, True, True): (2, 512, 2, 4), - (2048, 2048, 131072, 64, 64, True, False, True): (4, 1024, 3, 4), - (2048, 2048, 131072, 128, 128, False, True, True): (1, 1024, 1, 4), - (2048, 2048, 131072, 128, 128, True, False, True): (2, 1024, 1, 4), - (3072, 768, 256, 16, 16, False, True, True): (6, 4, 1, 4), - (3072, 768, 256, 16, 16, True, False, True): (3, 1, 4, 4), - (3072, 768, 256, 32, 32, False, True, True): (6, 8, 1, 2), - (3072, 768, 256, 32, 32, True, False, True): (1, 2, 4, 4), - (3072, 768, 256, 64, 64, False, True, True): (1, 4, 4, 4), - (3072, 768, 256, 64, 64, True, False, True): (4, 2, 4, 4), - (3072, 768, 256, 128, 128, False, True, True): (1, 2, 3, 8), - (3072, 768, 256, 128, 128, True, False, True): (1, 2, 3, 8), - (3072, 768, 512, 16, 16, False, True, True): (2, 4, 1, 4), - (3072, 768, 512, 16, 16, True, False, True): (1, 4, 4, 1), - (3072, 768, 512, 32, 32, False, True, True): (3, 8, 1, 4), - (3072, 768, 512, 32, 32, True, False, True): (1, 2, 3, 4), - (3072, 768, 512, 64, 64, False, True, True): (1, 8, 1, 4), - (3072, 768, 512, 64, 64, True, False, True): (4, 4, 3, 4), - (3072, 768, 512, 128, 
128, False, True, True): (1, 4, 3, 8), - (3072, 768, 512, 128, 128, True, False, True): (1, 4, 3, 8), - (3072, 768, 1024, 16, 16, False, True, True): (1, 8, 1, 4), - (3072, 768, 1024, 16, 16, True, False, True): (3, 4, 3, 1), - (3072, 768, 1024, 32, 32, False, True, True): (1, 8, 1, 8), - (3072, 768, 1024, 32, 32, True, False, True): (1, 4, 4, 4), - (3072, 768, 1024, 64, 64, False, True, True): (1, 16, 3, 4), - (3072, 768, 1024, 64, 64, True, False, True): (1, 4, 3, 4), - (3072, 768, 1024, 128, 128, False, True, True): (1, 8, 3, 8), - (3072, 768, 1024, 128, 128, True, False, True): (2, 8, 3, 8), - (3072, 768, 2048, 16, 16, False, True, True): (3, 8, 1, 4), - (3072, 768, 2048, 16, 16, True, False, True): (2, 8, 3, 4), - (3072, 768, 2048, 32, 32, False, True, True): (3, 16, 1, 8), - (3072, 768, 2048, 32, 32, True, False, True): (3, 8, 3, 4), - (3072, 768, 2048, 64, 64, False, True, True): (1, 16, 1, 4), - (3072, 768, 2048, 64, 64, True, False, True): (1, 16, 3, 4), - (3072, 768, 2048, 128, 128, False, True, True): (1, 16, 3, 8), - (3072, 768, 2048, 128, 128, True, False, True): (2, 16, 2, 4), - (3072, 768, 4096, 16, 16, False, True, True): (1, 16, 1, 4), - (3072, 768, 4096, 16, 16, True, False, True): (4, 32, 4, 2), - (3072, 768, 4096, 32, 32, False, True, True): (2, 32, 1, 8), - (3072, 768, 4096, 32, 32, True, False, True): (7, 16, 3, 4), - (3072, 768, 4096, 64, 64, False, True, True): (2, 32, 1, 4), - (3072, 768, 4096, 64, 64, True, False, True): (2, 16, 2, 4), - (3072, 768, 4096, 128, 128, False, True, True): (1, 32, 3, 8), - (3072, 768, 4096, 128, 128, True, False, True): (3, 32, 2, 4), - (3072, 768, 8192, 16, 16, False, True, True): (2, 32, 1, 4), - (3072, 768, 8192, 16, 16, True, False, True): (4, 64, 4, 2), - (3072, 768, 8192, 32, 32, False, True, True): (4, 32, 1, 4), - (3072, 768, 8192, 32, 32, True, False, True): (4, 32, 3, 4), - (3072, 768, 8192, 64, 64, False, True, True): (2, 64, 1, 4), - (3072, 768, 8192, 64, 64, True, False, True): (4, 32, 2, 4), - (3072, 768, 8192, 128, 128, False, True, True): (3, 64, 1, 4), - (3072, 768, 8192, 128, 128, True, False, True): (6, 64, 2, 4), - (3072, 768, 16384, 16, 16, False, True, True): (1, 64, 1, 4), - (3072, 768, 16384, 16, 16, True, False, True): (1, 64, 1, 1), - (3072, 768, 16384, 32, 32, False, True, True): (1, 64, 1, 4), - (3072, 768, 16384, 32, 32, True, False, True): (4, 64, 3, 4), - (3072, 768, 16384, 64, 64, False, True, True): (4, 128, 1, 4), - (3072, 768, 16384, 64, 64, True, False, True): (4, 64, 2, 4), - (3072, 768, 16384, 128, 128, False, True, True): (3, 128, 1, 4), - (3072, 768, 16384, 128, 128, True, False, True): (4, 128, 2, 4), - (3072, 768, 32768, 16, 16, False, True, True): (1, 128, 1, 4), - (3072, 768, 32768, 16, 16, True, False, True): (8, 128, 4, 1), - (3072, 768, 32768, 32, 32, False, True, True): (1, 128, 1, 4), - (3072, 768, 32768, 32, 32, True, False, True): (8, 128, 3, 4), - (3072, 768, 32768, 64, 64, False, True, True): (1, 256, 1, 4), - (3072, 768, 32768, 64, 64, True, False, True): (1, 128, 2, 4), - (3072, 768, 32768, 128, 128, False, True, True): (3, 256, 1, 4), - (3072, 768, 32768, 128, 128, True, False, True): (8, 256, 2, 4), - (3072, 768, 50432, 16, 16, False, True, True): (1, 197, 1, 4), - (3072, 768, 50432, 16, 16, True, False, True): (7, 197, 4, 1), - (3072, 768, 50432, 32, 32, False, True, True): (1, 197, 1, 4), - (3072, 768, 50432, 32, 32, True, False, True): (4, 197, 3, 4), - (3072, 768, 50432, 64, 64, False, True, True): (1, 394, 1, 4), - (3072, 768, 50432, 64, 64, True, False, True): (3, 197, 2, 
4), - (3072, 768, 50432, 128, 128, False, True, True): (3, 394, 1, 4), - (3072, 768, 50432, 128, 128, True, False, True): (8, 394, 2, 4), - (3072, 768, 65536, 16, 16, False, True, True): (1, 256, 1, 4), - (3072, 768, 65536, 16, 16, True, False, True): (15, 256, 4, 1), - (3072, 768, 65536, 32, 32, False, True, True): (1, 256, 1, 4), - (3072, 768, 65536, 32, 32, True, False, True): (15, 256, 3, 4), - (3072, 768, 65536, 64, 64, False, True, True): (1, 512, 1, 4), - (3072, 768, 65536, 64, 64, True, False, True): (2, 256, 2, 4), - (3072, 768, 65536, 128, 128, False, True, True): (3, 512, 1, 4), - (3072, 768, 65536, 128, 128, True, False, True): (3, 512, 2, 4), - (3072, 768, 131072, 16, 16, False, True, True): (1, 512, 1, 4), - (3072, 768, 131072, 16, 16, True, False, True): (15, 512, 4, 1), - (3072, 768, 131072, 32, 32, False, True, True): (1, 512, 1, 4), - (3072, 768, 131072, 32, 32, True, False, True): (9, 512, 3, 4), - (3072, 768, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), - (3072, 768, 131072, 64, 64, True, False, True): (3, 512, 2, 4), - (3072, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), - (3072, 768, 131072, 128, 128, True, False, True): (1, 1024, 2, 4), - (3072, 3072, 256, 16, 16, False, True, True): (5, 4, 1, 4), - (3072, 3072, 256, 16, 16, True, False, True): (1, 2, 5, 2), - (3072, 3072, 256, 32, 32, False, True, True): (5, 4, 1, 8), - (3072, 3072, 256, 32, 32, True, False, True): (1, 4, 4, 2), - (3072, 3072, 256, 64, 64, False, True, True): (2, 4, 4, 4), - (3072, 3072, 256, 64, 64, True, False, True): (2, 4, 4, 4), - (3072, 3072, 256, 128, 128, False, True, True): (1, 2, 3, 8), - (3072, 3072, 256, 128, 128, True, False, True): (1, 2, 3, 8), - (3072, 3072, 512, 16, 16, False, True, True): (5, 4, 1, 2), - (3072, 3072, 512, 16, 16, True, False, True): (1, 2, 3, 4), - (3072, 3072, 512, 32, 32, False, True, True): (3, 8, 1, 4), - (3072, 3072, 512, 32, 32, True, False, True): (1, 4, 4, 2), - (3072, 3072, 512, 64, 64, False, True, True): (1, 8, 2, 2), - (3072, 3072, 512, 64, 64, True, False, True): (2, 4, 3, 4), - (3072, 3072, 512, 128, 128, False, True, True): (2, 4, 3, 8), - (3072, 3072, 512, 128, 128, True, False, True): (1, 4, 3, 8), - (3072, 3072, 1024, 16, 16, False, True, True): (1, 8, 1, 4), - (3072, 3072, 1024, 16, 16, True, False, True): (2, 8, 3, 1), - (3072, 3072, 1024, 32, 32, False, True, True): (1, 16, 1, 4), - (3072, 3072, 1024, 32, 32, True, False, True): (1, 4, 4, 4), - (3072, 3072, 1024, 64, 64, False, True, True): (1, 8, 3, 4), - (3072, 3072, 1024, 64, 64, True, False, True): (2, 4, 3, 4), - (3072, 3072, 1024, 128, 128, False, True, True): (1, 8, 1, 4), - (3072, 3072, 1024, 128, 128, True, False, True): (2, 8, 3, 8), - (3072, 3072, 2048, 16, 16, False, True, True): (1, 16, 1, 2), - (3072, 3072, 2048, 16, 16, True, False, True): (2, 16, 4, 2), - (3072, 3072, 2048, 32, 32, False, True, True): (1, 16, 1, 8), - (3072, 3072, 2048, 32, 32, True, False, True): (3, 8, 4, 4), - (3072, 3072, 2048, 64, 64, False, True, True): (3, 16, 3, 4), - (3072, 3072, 2048, 64, 64, True, False, True): (3, 8, 3, 4), - (3072, 3072, 2048, 128, 128, False, True, True): (1, 16, 3, 8), - (3072, 3072, 2048, 128, 128, True, False, True): (5, 16, 3, 8), - (3072, 3072, 4096, 16, 16, False, True, True): (1, 32, 1, 2), - (3072, 3072, 4096, 16, 16, True, False, True): (4, 32, 4, 2), - (3072, 3072, 4096, 32, 32, False, True, True): (1, 32, 1, 8), - (3072, 3072, 4096, 32, 32, True, False, True): (3, 16, 3, 4), - (3072, 3072, 4096, 64, 64, False, True, True): (1, 32, 3, 4), - (3072, 
3072, 4096, 64, 64, True, False, True): (3, 16, 3, 4), - (3072, 3072, 4096, 128, 128, False, True, True): (3, 32, 3, 8), - (3072, 3072, 4096, 128, 128, True, False, True): (3, 32, 3, 8), - (3072, 3072, 8192, 16, 16, False, True, True): (1, 64, 1, 2), - (3072, 3072, 8192, 16, 16, True, False, True): (4, 64, 4, 2), - (3072, 3072, 8192, 32, 32, False, True, True): (1, 64, 1, 8), - (3072, 3072, 8192, 32, 32, True, False, True): (6, 32, 3, 4), - (3072, 3072, 8192, 64, 64, False, True, True): (1, 64, 3, 4), - (3072, 3072, 8192, 64, 64, True, False, True): (2, 32, 3, 4), - (3072, 3072, 8192, 128, 128, False, True, True): (2, 64, 3, 8), - (3072, 3072, 8192, 128, 128, True, False, True): (1, 64, 3, 8), - (3072, 3072, 16384, 16, 16, False, True, True): (1, 128, 1, 2), - (3072, 3072, 16384, 16, 16, True, False, True): (4, 128, 4, 2), - (3072, 3072, 16384, 32, 32, False, True, True): (1, 64, 1, 2), - (3072, 3072, 16384, 32, 32, True, False, True): (4, 64, 3, 4), - (3072, 3072, 16384, 64, 64, False, True, True): (1, 128, 3, 4), - (3072, 3072, 16384, 64, 64, True, False, True): (4, 64, 3, 4), - (3072, 3072, 16384, 128, 128, False, True, True): (1, 128, 1, 4), - (3072, 3072, 16384, 128, 128, True, False, True): (1, 128, 3, 8), - (3072, 3072, 32768, 16, 16, False, True, True): (1, 256, 1, 2), - (3072, 3072, 32768, 16, 16, True, False, True): (8, 128, 4, 4), - (3072, 3072, 32768, 32, 32, False, True, True): (1, 256, 1, 8), - (3072, 3072, 32768, 32, 32, True, False, True): (5, 128, 3, 4), - (3072, 3072, 32768, 64, 64, False, True, True): (1, 256, 3, 4), - (3072, 3072, 32768, 64, 64, True, False, True): (3, 128, 3, 4), - (3072, 3072, 32768, 128, 128, False, True, True): (1, 256, 1, 4), - (3072, 3072, 32768, 128, 128, True, False, True): (3, 256, 2, 4), - (3072, 3072, 65536, 16, 16, False, True, True): (1, 512, 1, 2), - (3072, 3072, 65536, 16, 16, True, False, True): (7, 256, 4, 4), - (3072, 3072, 65536, 32, 32, False, True, True): (1, 256, 1, 2), - (3072, 3072, 65536, 32, 32, True, False, True): (5, 256, 3, 4), - (3072, 3072, 65536, 64, 64, False, True, True): (1, 512, 3, 4), - (3072, 3072, 65536, 64, 64, True, False, True): (3, 256, 3, 4), - (3072, 3072, 65536, 128, 128, False, True, True): (1, 512, 1, 4), - (3072, 3072, 65536, 128, 128, True, False, True): (3, 512, 2, 4), - (3072, 3072, 131072, 16, 16, False, True, True): (1, 1024, 1, 2), - (3072, 3072, 131072, 16, 16, True, False, True): (5, 512, 4, 4), - (3072, 3072, 131072, 32, 32, False, True, True): (1, 512, 1, 2), - (3072, 3072, 131072, 32, 32, True, False, True): (5, 512, 3, 4), - (3072, 3072, 131072, 64, 64, False, True, True): (1, 1024, 3, 4), - (3072, 3072, 131072, 64, 64, True, False, True): (3, 512, 3, 4), - (3072, 3072, 131072, 128, 128, False, True, True): (1, 1024, 1, 4), - (3072, 3072, 131072, 128, 128, True, False, True): (6, 1024, 2, 4), - (4096, 4096, 256, 16, 16, False, True, True): (2, 2, 5, 4), - (4096, 4096, 256, 16, 16, True, False, True): (2, 2, 4, 2), - (4096, 4096, 256, 32, 32, False, True, True): (1, 2, 4, 4), - (4096, 4096, 256, 32, 32, True, False, True): (3, 2, 4, 2), - (4096, 4096, 256, 64, 64, False, True, True): (3, 4, 3, 4), - (4096, 4096, 256, 64, 64, True, False, True): (1, 4, 3, 2), - (4096, 4096, 256, 128, 128, False, True, True): (1, 2, 2, 8), - (4096, 4096, 256, 128, 128, True, False, True): (1, 2, 2, 8), - (4096, 4096, 512, 16, 16, False, True, True): (4, 2, 3, 4), - (4096, 4096, 512, 16, 16, True, False, True): (1, 2, 3, 4), - (4096, 4096, 512, 32, 32, False, True, True): (1, 4, 3, 4), - (4096, 4096, 512, 32, 32, 
True, False, True): (3, 4, 3, 2), - (4096, 4096, 512, 64, 64, False, True, True): (4, 4, 4, 4), - (4096, 4096, 512, 64, 64, True, False, True): (3, 4, 3, 4), - (4096, 4096, 512, 128, 128, False, True, True): (2, 4, 2, 8), - (4096, 4096, 512, 128, 128, True, False, True): (2, 4, 1, 4), - (4096, 4096, 1024, 16, 16, False, True, True): (2, 8, 3, 2), - (4096, 4096, 1024, 16, 16, True, False, True): (2, 8, 3, 2), - (4096, 4096, 1024, 32, 32, False, True, True): (1, 8, 3, 4), - (4096, 4096, 1024, 32, 32, True, False, True): (1, 8, 3, 2), - (4096, 4096, 1024, 64, 64, False, True, True): (1, 8, 3, 4), - (4096, 4096, 1024, 64, 64, True, False, True): (1, 8, 3, 4), - (4096, 4096, 1024, 128, 128, False, True, True): (4, 8, 1, 4), - (4096, 4096, 1024, 128, 128, True, False, True): (2, 8, 2, 8), - (4096, 4096, 2048, 16, 16, False, True, True): (2, 8, 4, 4), - (4096, 4096, 2048, 16, 16, True, False, True): (2, 8, 4, 4), - (4096, 4096, 2048, 32, 32, False, True, True): (4, 8, 3, 8), - (4096, 4096, 2048, 32, 32, True, False, True): (4, 8, 4, 8), - (4096, 4096, 2048, 64, 64, False, True, True): (4, 16, 3, 4), - (4096, 4096, 2048, 64, 64, True, False, True): (4, 16, 3, 4), - (4096, 4096, 2048, 128, 128, False, True, True): (1, 16, 1, 4), - (4096, 4096, 2048, 128, 128, True, False, True): (4, 16, 1, 4), - (4096, 4096, 4096, 16, 16, False, True, True): (4, 32, 4, 4), - (4096, 4096, 4096, 16, 16, True, False, True): (2, 32, 4, 4), - (4096, 4096, 4096, 32, 32, False, True, True): (4, 16, 4, 8), - (4096, 4096, 4096, 32, 32, True, False, True): (4, 16, 4, 8), - (4096, 4096, 4096, 64, 64, False, True, True): (4, 32, 3, 4), - (4096, 4096, 4096, 64, 64, True, False, True): (2, 32, 3, 4), - (4096, 4096, 4096, 128, 128, False, True, True): (2, 32, 1, 4), - (4096, 4096, 4096, 128, 128, True, False, True): (2, 32, 1, 4), - (4096, 4096, 8192, 16, 16, False, True, True): (4, 64, 4, 2), - (4096, 4096, 8192, 16, 16, True, False, True): (4, 64, 4, 2), - (4096, 4096, 8192, 32, 32, False, True, True): (4, 32, 4, 8), - (4096, 4096, 8192, 32, 32, True, False, True): (4, 32, 4, 8), - (4096, 4096, 8192, 64, 64, False, True, True): (4, 64, 3, 4), - (4096, 4096, 8192, 64, 64, True, False, True): (4, 64, 3, 4), - (4096, 4096, 8192, 128, 128, False, True, True): (1, 64, 1, 4), - (4096, 4096, 8192, 128, 128, True, False, True): (1, 64, 1, 4), - (4096, 4096, 16384, 16, 16, False, True, True): (4, 64, 4, 4), - (4096, 4096, 16384, 16, 16, True, False, True): (4, 64, 4, 4), - (4096, 4096, 16384, 32, 32, False, True, True): (4, 64, 4, 8), - (4096, 4096, 16384, 32, 32, True, False, True): (4, 64, 4, 8), - (4096, 4096, 16384, 64, 64, False, True, True): (4, 128, 3, 4), - (4096, 4096, 16384, 64, 64, True, False, True): (4, 128, 3, 4), - (4096, 4096, 16384, 128, 128, False, True, True): (1, 128, 1, 4), - (4096, 4096, 16384, 128, 128, True, False, True): (1, 128, 1, 4), - (4096, 4096, 32768, 16, 16, False, True, True): (8, 128, 4, 4), - (4096, 4096, 32768, 16, 16, True, False, True): (5, 128, 4, 4), - (4096, 4096, 32768, 32, 32, False, True, True): (5, 128, 4, 4), - (4096, 4096, 32768, 32, 32, True, False, True): (3, 128, 4, 8), - (4096, 4096, 32768, 64, 64, False, True, True): (3, 256, 3, 4), - (4096, 4096, 32768, 64, 64, True, False, True): (2, 256, 3, 4), - (4096, 4096, 32768, 128, 128, False, True, True): (1, 256, 1, 4), - (4096, 4096, 32768, 128, 128, True, False, True): (1, 256, 1, 4), - (4096, 4096, 65536, 16, 16, False, True, True): (5, 256, 4, 4), - (4096, 4096, 65536, 16, 16, True, False, True): (5, 256, 4, 4), - (4096, 4096, 65536, 
32, 32, False, True, True): (4, 256, 4, 8), - (4096, 4096, 65536, 32, 32, True, False, True): (4, 256, 4, 8), - (4096, 4096, 65536, 64, 64, False, True, True): (1, 512, 3, 4), - (4096, 4096, 65536, 64, 64, True, False, True): (3, 512, 3, 4), - (4096, 4096, 65536, 128, 128, False, True, True): (1, 512, 1, 4), - (4096, 4096, 65536, 128, 128, True, False, True): (1, 512, 1, 4), - (4096, 4096, 131072, 16, 16, False, True, True): (5, 512, 4, 4), - (4096, 4096, 131072, 16, 16, True, False, True): (5, 512, 4, 4), - (4096, 4096, 131072, 32, 32, False, True, True): (4, 512, 4, 4), - (4096, 4096, 131072, 32, 32, True, False, True): (2, 512, 3, 4), - (4096, 4096, 131072, 64, 64, False, True, True): (1, 1024, 3, 4), - (4096, 4096, 131072, 64, 64, True, False, True): (3, 1024, 3, 4), - (4096, 4096, 131072, 128, 128, False, True, True): (1, 1024, 1, 4), - (4096, 4096, 131072, 128, 128, True, False, True): (1, 1024, 1, 4), - (5120, 1280, 65792, 16, 16, False, True, True): (1, 257, 1, 4), - (5120, 1280, 65792, 16, 16, True, False, True): (11, 257, 4, 1), - (5120, 1280, 65792, 32, 32, False, True, True): (1, 257, 1, 4), - (5120, 1280, 65792, 32, 32, True, False, True): (5, 257, 3, 4), - (5120, 1280, 65792, 64, 64, False, True, True): (1, 514, 1, 4), - (5120, 1280, 65792, 64, 64, True, False, True): (5, 257, 2, 4), - (5120, 1280, 65792, 128, 128, False, True, True): (3, 514, 1, 4), - (5120, 1280, 65792, 128, 128, True, False, True): (7, 514, 2, 4), - (6144, 6144, 256, 16, 16, False, True, True): (1, 2, 1, 4), - (6144, 6144, 256, 16, 16, True, False, True): (3, 1, 4, 4), - (6144, 6144, 256, 32, 32, False, True, True): (3, 2, 1, 8), - (6144, 6144, 256, 32, 32, True, False, True): (1, 1, 4, 4), - (6144, 6144, 256, 64, 64, False, True, True): (4, 2, 3, 4), - (6144, 6144, 256, 64, 64, True, False, True): (3, 2, 4, 4), - (6144, 6144, 256, 128, 128, False, True, True): (2, 2, 3, 8), - (6144, 6144, 256, 128, 128, True, False, True): (1, 2, 3, 8), - (6144, 6144, 512, 16, 16, False, True, True): (4, 4, 1, 4), - (6144, 6144, 512, 16, 16, True, False, True): (3, 2, 3, 1), - (6144, 6144, 512, 32, 32, False, True, True): (1, 8, 1, 4), - (6144, 6144, 512, 32, 32, True, False, True): (1, 2, 3, 2), - (6144, 6144, 512, 64, 64, False, True, True): (2, 4, 3, 4), - (6144, 6144, 512, 64, 64, True, False, True): (2, 2, 3, 4), - (6144, 6144, 512, 128, 128, False, True, True): (1, 4, 3, 8), - (6144, 6144, 512, 128, 128, True, False, True): (1, 4, 3, 8), - (6144, 6144, 1024, 16, 16, False, True, True): (1, 8, 1, 2), - (6144, 6144, 1024, 16, 16, True, False, True): (4, 8, 4, 4), - (6144, 6144, 1024, 32, 32, False, True, True): (1, 8, 4, 2), - (6144, 6144, 1024, 32, 32, True, False, True): (1, 8, 4, 2), - (6144, 6144, 1024, 64, 64, False, True, True): (4, 8, 3, 4), - (6144, 6144, 1024, 64, 64, True, False, True): (1, 4, 3, 4), - (6144, 6144, 1024, 128, 128, False, True, True): (2, 8, 3, 8), - (6144, 6144, 1024, 128, 128, True, False, True): (1, 8, 3, 8), - (6144, 6144, 2048, 16, 16, False, True, True): (4, 4, 1, 4), - (6144, 6144, 2048, 16, 16, True, False, True): (2, 8, 4, 4), - (6144, 6144, 2048, 32, 32, False, True, True): (1, 16, 4, 2), - (6144, 6144, 2048, 32, 32, True, False, True): (4, 8, 4, 8), - (6144, 6144, 2048, 64, 64, False, True, True): (4, 16, 3, 4), - (6144, 6144, 2048, 64, 64, True, False, True): (2, 8, 3, 4), - (6144, 6144, 2048, 128, 128, False, True, True): (1, 16, 3, 8), - (6144, 6144, 2048, 128, 128, True, False, True): (4, 16, 3, 8), - (6144, 6144, 4096, 16, 16, False, True, True): (4, 8, 1, 4), - (6144, 6144, 
4096, 16, 16, True, False, True): (4, 32, 4, 2), - (6144, 6144, 4096, 32, 32, False, True, True): (4, 16, 1, 2), - (6144, 6144, 4096, 32, 32, True, False, True): (2, 8, 3, 8), - (6144, 6144, 4096, 64, 64, False, True, True): (4, 32, 3, 4), - (6144, 6144, 4096, 64, 64, True, False, True): (4, 16, 3, 4), - (6144, 6144, 4096, 128, 128, False, True, True): (4, 32, 3, 8), - (6144, 6144, 4096, 128, 128, True, False, True): (4, 32, 3, 8), - (6144, 6144, 8192, 16, 16, False, True, True): (2, 16, 1, 2), - (6144, 6144, 8192, 16, 16, True, False, True): (4, 64, 4, 2), - (6144, 6144, 8192, 32, 32, False, True, True): (4, 32, 1, 2), - (6144, 6144, 8192, 32, 32, True, False, True): (4, 32, 4, 8), - (6144, 6144, 8192, 64, 64, False, True, True): (4, 64, 3, 4), - (6144, 6144, 8192, 64, 64, True, False, True): (4, 32, 3, 4), - (6144, 6144, 8192, 128, 128, False, True, True): (4, 64, 3, 8), - (6144, 6144, 8192, 128, 128, True, False, True): (4, 64, 3, 8), - (6144, 6144, 16384, 16, 16, False, True, True): (2, 32, 1, 2), - (6144, 6144, 16384, 16, 16, True, False, True): (4, 64, 4, 4), - (6144, 6144, 16384, 32, 32, False, True, True): (4, 64, 1, 2), - (6144, 6144, 16384, 32, 32, True, False, True): (4, 64, 3, 2), - (6144, 6144, 16384, 64, 64, False, True, True): (4, 128, 3, 4), - (6144, 6144, 16384, 64, 64, True, False, True): (2, 32, 3, 8), - (6144, 6144, 16384, 128, 128, False, True, True): (4, 128, 3, 8), - (6144, 6144, 16384, 128, 128, True, False, True): (4, 128, 3, 8), - (6144, 6144, 32768, 16, 16, False, True, True): (2, 64, 1, 2), - (6144, 6144, 32768, 16, 16, True, False, True): (3, 128, 4, 4), - (6144, 6144, 32768, 32, 32, False, True, True): (4, 128, 1, 2), - (6144, 6144, 32768, 32, 32, True, False, True): (3, 128, 3, 4), - (6144, 6144, 32768, 64, 64, False, True, True): (4, 256, 3, 4), - (6144, 6144, 32768, 64, 64, True, False, True): (2, 64, 3, 8), - (6144, 6144, 32768, 128, 128, False, True, True): (4, 256, 3, 8), - (6144, 6144, 32768, 128, 128, True, False, True): (4, 256, 3, 8), - (6144, 6144, 65536, 16, 16, False, True, True): (2, 128, 1, 2), - (6144, 6144, 65536, 16, 16, True, False, True): (4, 256, 4, 4), - (6144, 6144, 65536, 32, 32, False, True, True): (4, 256, 1, 2), - (6144, 6144, 65536, 32, 32, True, False, True): (4, 256, 3, 4), - (6144, 6144, 65536, 64, 64, False, True, True): (4, 512, 3, 4), - (6144, 6144, 65536, 64, 64, True, False, True): (2, 128, 3, 8), - (6144, 6144, 65536, 128, 128, False, True, True): (4, 512, 3, 8), - (6144, 6144, 65536, 128, 128, True, False, True): (4, 512, 3, 8), - (6144, 6144, 131072, 16, 16, False, True, True): (2, 256, 1, 2), - (6144, 6144, 131072, 16, 16, True, False, True): (5, 512, 4, 1), - (6144, 6144, 131072, 32, 32, False, True, True): (4, 512, 1, 2), - (6144, 6144, 131072, 32, 32, True, False, True): (4, 512, 3, 2), - (6144, 6144, 131072, 64, 64, False, True, True): (4, 1024, 3, 4), - (6144, 6144, 131072, 64, 64, True, False, True): (2, 256, 3, 8), - (6144, 6144, 131072, 128, 128, False, True, True): (4, 1024, 3, 8), - (6144, 6144, 131072, 128, 128, True, False, True): (4, 1024, 3, 8), - (8192, 8192, 256, 16, 16, False, True, True): (1, 1, 3, 4), - (8192, 8192, 256, 16, 16, True, False, True): (4, 1, 3, 4), - (8192, 8192, 256, 32, 32, False, True, True): (1, 2, 3, 4), - (8192, 8192, 256, 32, 32, True, False, True): (1, 2, 3, 4), - (8192, 8192, 256, 64, 64, False, True, True): (6, 2, 3, 8), - (8192, 8192, 256, 64, 64, True, False, True): (4, 2, 3, 8), - (8192, 8192, 256, 128, 128, False, True, True): (1, 2, 1, 4), - (8192, 8192, 256, 128, 128, 
True, False, True): (1, 2, 1, 4), - (8192, 8192, 512, 16, 16, False, True, True): (4, 4, 3, 2), - (8192, 8192, 512, 16, 16, True, False, True): (4, 4, 3, 4), - (8192, 8192, 512, 32, 32, False, True, True): (1, 4, 3, 4), - (8192, 8192, 512, 32, 32, True, False, True): (3, 4, 3, 2), - (8192, 8192, 512, 64, 64, False, True, True): (1, 4, 3, 4), - (8192, 8192, 512, 64, 64, True, False, True): (1, 4, 3, 4), - (8192, 8192, 512, 128, 128, False, True, True): (4, 4, 2, 8), - (8192, 8192, 512, 128, 128, True, False, True): (4, 4, 2, 8), - (8192, 8192, 1024, 16, 16, False, True, True): (4, 8, 4, 4), - (8192, 8192, 1024, 16, 16, True, False, True): (2, 8, 4, 4), - (8192, 8192, 1024, 32, 32, False, True, True): (2, 4, 4, 8), - (8192, 8192, 1024, 32, 32, True, False, True): (1, 4, 3, 4), - (8192, 8192, 1024, 64, 64, False, True, True): (4, 8, 3, 4), - (8192, 8192, 1024, 64, 64, True, False, True): (2, 8, 3, 4), - (8192, 8192, 1024, 128, 128, False, True, True): (4, 8, 1, 4), - (8192, 8192, 1024, 128, 128, True, False, True): (4, 8, 1, 4), - (8192, 8192, 2048, 16, 16, False, True, True): (2, 8, 4, 4), - (8192, 8192, 2048, 16, 16, True, False, True): (2, 8, 4, 4), - (8192, 8192, 2048, 32, 32, False, True, True): (2, 8, 4, 8), - (8192, 8192, 2048, 32, 32, True, False, True): (2, 8, 4, 8), - (8192, 8192, 2048, 64, 64, False, True, True): (4, 8, 2, 4), - (8192, 8192, 2048, 64, 64, True, False, True): (4, 16, 3, 4), - (8192, 8192, 2048, 128, 128, False, True, True): (4, 16, 1, 4), - (8192, 8192, 2048, 128, 128, True, False, True): (4, 16, 1, 4), - (8192, 8192, 4096, 16, 16, False, True, True): (4, 16, 4, 4), - (8192, 8192, 4096, 16, 16, True, False, True): (4, 32, 4, 2), - (8192, 8192, 4096, 32, 32, False, True, True): (2, 16, 4, 8), - (8192, 8192, 4096, 32, 32, True, False, True): (2, 16, 4, 8), - (8192, 8192, 4096, 64, 64, False, True, True): (4, 32, 3, 4), - (8192, 8192, 4096, 64, 64, True, False, True): (4, 16, 2, 4), - (8192, 8192, 4096, 128, 128, False, True, True): (4, 32, 1, 4), - (8192, 8192, 4096, 128, 128, True, False, True): (4, 32, 1, 4), - (8192, 8192, 8192, 16, 16, False, True, True): (4, 64, 4, 2), - (8192, 8192, 8192, 16, 16, True, False, True): (4, 64, 4, 2), - (8192, 8192, 8192, 32, 32, False, True, True): (2, 32, 4, 8), - (8192, 8192, 8192, 32, 32, True, False, True): (2, 32, 4, 8), - (8192, 8192, 8192, 64, 64, False, True, True): (4, 32, 3, 8), - (8192, 8192, 8192, 64, 64, True, False, True): (4, 32, 2, 4), - (8192, 8192, 8192, 128, 128, False, True, True): (4, 64, 1, 4), - (8192, 8192, 8192, 128, 128, True, False, True): (4, 64, 1, 4), - (8192, 8192, 16384, 16, 16, False, True, True): (4, 64, 4, 4), - (8192, 8192, 16384, 16, 16, True, False, True): (4, 64, 4, 4), - (8192, 8192, 16384, 32, 32, False, True, True): (4, 64, 3, 4), - (8192, 8192, 16384, 32, 32, True, False, True): (4, 64, 4, 8), - (8192, 8192, 16384, 64, 64, False, True, True): (4, 64, 2, 4), - (8192, 8192, 16384, 64, 64, True, False, True): (4, 64, 2, 4), - (8192, 8192, 16384, 128, 128, False, True, True): (4, 128, 1, 4), - (8192, 8192, 16384, 128, 128, True, False, True): (4, 128, 1, 4), - (8192, 8192, 32768, 16, 16, False, True, True): (3, 128, 4, 4), - (8192, 8192, 32768, 16, 16, True, False, True): (3, 128, 4, 4), - (8192, 8192, 32768, 32, 32, False, True, True): (2, 128, 4, 8), - (8192, 8192, 32768, 32, 32, True, False, True): (2, 128, 4, 8), - (8192, 8192, 32768, 64, 64, False, True, True): (2, 128, 2, 4), - (8192, 8192, 32768, 64, 64, True, False, True): (2, 128, 2, 4), - (8192, 8192, 32768, 128, 128, False, True, 
True): (4, 256, 1, 4), - (8192, 8192, 32768, 128, 128, True, False, True): (4, 256, 1, 4), - (8192, 8192, 65536, 16, 16, False, True, True): (3, 256, 4, 4), - (8192, 8192, 65536, 16, 16, True, False, True): (3, 256, 4, 4), - (8192, 8192, 65536, 32, 32, False, True, True): (2, 256, 3, 4), - (8192, 8192, 65536, 32, 32, True, False, True): (2, 256, 3, 4), - (8192, 8192, 65536, 64, 64, False, True, True): (2, 256, 2, 4), - (8192, 8192, 65536, 64, 64, True, False, True): (2, 256, 3, 8), - (8192, 8192, 65536, 128, 128, False, True, True): (4, 512, 1, 4), - (8192, 8192, 65536, 128, 128, True, False, True): (4, 512, 1, 4), - (8192, 8192, 131072, 16, 16, False, True, True): (3, 512, 4, 4), - (8192, 8192, 131072, 16, 16, True, False, True): (3, 512, 4, 4), - (8192, 8192, 131072, 32, 32, False, True, True): (2, 512, 4, 4), - (8192, 8192, 131072, 32, 32, True, False, True): (2, 512, 3, 4), - (8192, 8192, 131072, 64, 64, False, True, True): (4, 512, 2, 4), - (8192, 8192, 131072, 64, 64, True, False, True): (2, 512, 2, 4), - (8192, 8192, 131072, 128, 128, False, True, True): (4, 1024, 1, 4), - (8192, 8192, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), - (16384, 16384, 256, 16, 16, False, True, True): (2, 2, 6, 4), - (16384, 16384, 256, 16, 16, True, False, True): (2, 2, 6, 4), - (16384, 16384, 256, 32, 32, False, True, True): (4, 2, 3, 2), - (16384, 16384, 256, 32, 32, True, False, True): (4, 2, 3, 2), - (16384, 16384, 256, 64, 64, False, True, True): (2, 2, 4, 4), - (16384, 16384, 256, 64, 64, True, False, True): (4, 2, 3, 8), - (16384, 16384, 256, 128, 128, False, True, True): (4, 2, 2, 8), - (16384, 16384, 256, 128, 128, True, False, True): (4, 2, 2, 8), - (16384, 16384, 512, 16, 16, False, True, True): (1, 2, 4, 4), - (16384, 16384, 512, 16, 16, True, False, True): (1, 2, 4, 4), - (16384, 16384, 512, 32, 32, False, True, True): (2, 2, 4, 8), - (16384, 16384, 512, 32, 32, True, False, True): (2, 2, 4, 8), - (16384, 16384, 512, 64, 64, False, True, True): (4, 4, 3, 4), - (16384, 16384, 512, 64, 64, True, False, True): (4, 4, 3, 4), - (16384, 16384, 512, 128, 128, False, True, True): (4, 4, 2, 8), - (16384, 16384, 512, 128, 128, True, False, True): (4, 4, 2, 8), - (16384, 16384, 1024, 16, 16, False, True, True): (3, 4, 4, 4), - (16384, 16384, 1024, 16, 16, True, False, True): (2, 8, 4, 4), - (16384, 16384, 1024, 32, 32, False, True, True): (2, 4, 4, 8), - (16384, 16384, 1024, 32, 32, True, False, True): (1, 4, 4, 8), - (16384, 16384, 1024, 64, 64, False, True, True): (2, 8, 3, 4), - (16384, 16384, 1024, 64, 64, True, False, True): (2, 8, 3, 4), - (16384, 16384, 1024, 128, 128, False, True, True): (4, 8, 1, 4), - (16384, 16384, 1024, 128, 128, True, False, True): (4, 8, 1, 4), - (16384, 16384, 2048, 16, 16, False, True, True): (2, 8, 4, 4), - (16384, 16384, 2048, 16, 16, True, False, True): (2, 8, 4, 4), - (16384, 16384, 2048, 32, 32, False, True, True): (1, 8, 4, 8), - (16384, 16384, 2048, 32, 32, True, False, True): (2, 8, 4, 8), - (16384, 16384, 2048, 64, 64, False, True, True): (2, 8, 2, 4), - (16384, 16384, 2048, 64, 64, True, False, True): (2, 8, 2, 4), - (16384, 16384, 2048, 128, 128, False, True, True): (4, 16, 1, 4), - (16384, 16384, 2048, 128, 128, True, False, True): (4, 16, 1, 4), - (16384, 16384, 4096, 16, 16, False, True, True): (2, 16, 4, 4), - (16384, 16384, 4096, 16, 16, True, False, True): (2, 16, 4, 4), - (16384, 16384, 4096, 32, 32, False, True, True): (1, 8, 3, 8), - (16384, 16384, 4096, 32, 32, True, False, True): (2, 16, 3, 4), - (16384, 16384, 4096, 64, 64, False, True, 
True): (2, 16, 2, 4), - (16384, 16384, 4096, 64, 64, True, False, True): (2, 16, 2, 4), - (16384, 16384, 4096, 128, 128, False, True, True): (4, 32, 1, 4), - (16384, 16384, 4096, 128, 128, True, False, True): (4, 32, 1, 4), - (16384, 16384, 8192, 16, 16, False, True, True): (4, 64, 4, 2), - (16384, 16384, 8192, 16, 16, True, False, True): (4, 64, 4, 2), - (16384, 16384, 8192, 32, 32, False, True, True): (2, 32, 4, 8), - (16384, 16384, 8192, 32, 32, True, False, True): (2, 32, 3, 4), - (16384, 16384, 8192, 64, 64, False, True, True): (2, 32, 4, 8), - (16384, 16384, 8192, 64, 64, True, False, True): (2, 32, 3, 8), - (16384, 16384, 8192, 128, 128, False, True, True): (4, 64, 1, 4), - (16384, 16384, 8192, 128, 128, True, False, True): (4, 64, 1, 4), - (16384, 16384, 16384, 16, 16, False, True, True): (1, 64, 4, 4), - (16384, 16384, 16384, 16, 16, True, False, True): (1, 64, 4, 4), - (16384, 16384, 16384, 32, 32, False, True, True): (1, 64, 3, 8), - (16384, 16384, 16384, 32, 32, True, False, True): (1, 64, 3, 4), - (16384, 16384, 16384, 64, 64, False, True, True): (1, 64, 2, 4), - (16384, 16384, 16384, 64, 64, True, False, True): (1, 64, 4, 8), - (16384, 16384, 16384, 128, 128, False, True, True): (4, 128, 1, 4), - (16384, 16384, 16384, 128, 128, True, False, True): (4, 128, 1, 4), - (16384, 16384, 32768, 16, 16, False, True, True): (1, 128, 4, 4), - (16384, 16384, 32768, 16, 16, True, False, True): (1, 128, 4, 4), - (16384, 16384, 32768, 32, 32, False, True, True): (1, 128, 4, 2), - (16384, 16384, 32768, 32, 32, True, False, True): (1, 128, 3, 8), - (16384, 16384, 32768, 64, 64, False, True, True): (2, 128, 2, 4), - (16384, 16384, 32768, 64, 64, True, False, True): (1, 128, 3, 8), - (16384, 16384, 32768, 128, 128, False, True, True): (4, 256, 1, 4), - (16384, 16384, 32768, 128, 128, True, False, True): (4, 256, 1, 4), - (16384, 16384, 65536, 16, 16, False, True, True): (1, 256, 4, 4), - (16384, 16384, 65536, 16, 16, True, False, True): (1, 256, 4, 4), - (16384, 16384, 65536, 32, 32, False, True, True): (1, 256, 3, 4), - (16384, 16384, 65536, 32, 32, True, False, True): (1, 256, 3, 4), - (16384, 16384, 65536, 64, 64, False, True, True): (1, 256, 2, 4), - (16384, 16384, 65536, 64, 64, True, False, True): (2, 256, 2, 4), - (16384, 16384, 65536, 128, 128, False, True, True): (4, 512, 1, 4), - (16384, 16384, 65536, 128, 128, True, False, True): (4, 512, 1, 4), - (16384, 16384, 131072, 16, 16, False, True, True): (2, 512, 4, 4), - (16384, 16384, 131072, 16, 16, True, False, True): (1, 512, 4, 4), - (16384, 16384, 131072, 32, 32, False, True, True): (1, 512, 4, 8), - (16384, 16384, 131072, 32, 32, True, False, True): (1, 512, 3, 4), - (16384, 16384, 131072, 64, 64, False, True, True): (2, 512, 2, 4), - (16384, 16384, 131072, 64, 64, True, False, True): (1, 512, 2, 4), - (16384, 16384, 131072, 128, 128, False, True, True): (4, 1024, 1, 4), - (16384, 16384, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), - }, - ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.bfloat16, 0.56)): { - (192, 192, 256, 64, 64, False, True, True): (3, 4, 3, 4), - (192, 192, 256, 64, 64, True, False, True): (1, 4, 4, 4), - (192, 192, 512, 64, 64, False, True, True): (2, 8, 3, 4), - (192, 192, 512, 64, 64, True, False, True): (2, 8, 3, 4), - (192, 192, 1024, 64, 64, False, True, True): (1, 16, 3, 4), - (192, 192, 1024, 64, 64, True, False, True): (1, 16, 5, 4), - (192, 192, 2048, 64, 64, False, True, True): (3, 32, 3, 4), - (192, 192, 2048, 64, 64, True, False, True): (5, 32, 3, 4), - (192, 192, 4096, 64, 64, 
False, True, True): (1, 64, 4, 4), - (192, 192, 4096, 64, 64, True, False, True): (2, 32, 3, 4), - (192, 192, 8192, 64, 64, False, True, True): (1, 128, 2, 4), - (192, 192, 8192, 64, 64, True, False, True): (1, 64, 3, 4), - (192, 192, 16384, 64, 64, False, True, True): (1, 256, 1, 4), - (192, 192, 16384, 64, 64, True, False, True): (1, 64, 3, 4), - (192, 192, 32768, 64, 64, False, True, True): (2, 512, 1, 2), - (192, 192, 32768, 64, 64, True, False, True): (2, 256, 2, 4), - (192, 192, 65536, 64, 64, False, True, True): (3, 512, 1, 4), - (192, 192, 65536, 64, 64, True, False, True): (1, 512, 2, 4), - (192, 192, 131072, 64, 64, False, True, True): (5, 1024, 1, 4), - (192, 192, 131072, 64, 64, True, False, True): (4, 512, 2, 4), - (384, 384, 256, 128, 128, False, True, True): (3, 2, 3, 8), - (384, 384, 256, 128, 128, True, False, True): (1, 2, 3, 8), - (384, 384, 512, 128, 128, False, True, True): (4, 4, 3, 8), - (384, 384, 512, 128, 128, True, False, True): (3, 4, 3, 8), - (384, 384, 1024, 128, 128, False, True, True): (1, 8, 3, 8), - (384, 384, 1024, 128, 128, True, False, True): (2, 8, 3, 8), - (384, 384, 2048, 128, 128, False, True, True): (5, 16, 3, 8), - (384, 384, 2048, 128, 128, True, False, True): (5, 16, 3, 8), - (384, 384, 4096, 128, 128, False, True, True): (3, 32, 3, 8), - (384, 384, 4096, 128, 128, True, False, True): (6, 32, 3, 8), - (384, 384, 8192, 128, 128, False, True, True): (2, 64, 3, 8), - (384, 384, 8192, 128, 128, True, False, True): (4, 32, 2, 8), - (384, 384, 16384, 128, 128, False, True, True): (2, 128, 3, 8), - (384, 384, 16384, 128, 128, True, False, True): (5, 128, 2, 4), - (384, 384, 32768, 128, 128, False, True, True): (2, 256, 3, 8), - (384, 384, 32768, 128, 128, True, False, True): (3, 256, 2, 4), - (384, 384, 65536, 128, 128, False, True, True): (3, 512, 1, 4), - (384, 384, 65536, 128, 128, True, False, True): (1, 512, 2, 4), - (384, 384, 131072, 128, 128, False, True, True): (3, 1024, 1, 4), - (384, 384, 131072, 128, 128, True, False, True): (1, 1024, 2, 4), - }, - ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.float16, 0.5)): { - (16, 16, 16, 16, 16, False, False, False): (1, 1, 1, 1), - (16, 16, 16, 16, 16, False, False, True): (1, 1, 2, 2), - (16, 16, 16, 16, 16, False, True, False): (1, 1, 1, 1), - (16, 16, 16, 16, 16, False, True, True): (1, 1, 1, 8), - (16, 16, 16, 16, 16, True, False, False): (3, 1, 3, 4), - (16, 16, 16, 16, 16, True, False, True): (1, 1, 2, 1), - (16, 16, 32, 16, 16, False, False, False): (1, 2, 1, 8), - (16, 16, 32, 16, 16, False, False, True): (1, 2, 1, 2), - (16, 16, 32, 16, 16, False, True, False): (2, 1, 1, 4), - (16, 16, 32, 16, 16, False, True, True): (1, 2, 1, 4), - (16, 16, 32, 16, 16, True, False, False): (1, 1, 1, 4), - (16, 16, 32, 16, 16, True, False, True): (1, 2, 1, 2), - (16, 16, 64, 16, 16, False, False, False): (1, 4, 1, 1), - (16, 16, 64, 16, 16, False, False, True): (1, 2, 2, 4), - (16, 16, 64, 16, 16, False, True, False): (1, 4, 1, 4), - (16, 16, 64, 16, 16, False, True, True): (1, 2, 1, 4), - (16, 16, 64, 16, 16, True, False, False): (1, 4, 1, 2), - (16, 16, 64, 16, 16, True, False, True): (1, 1, 1, 2), - (16, 32, 16, 16, 16, False, False, False): (1, 1, 2, 4), - (16, 32, 16, 16, 16, False, False, True): (1, 1, 1, 4), - (16, 32, 16, 16, 16, False, True, False): (1, 1, 1, 2), - (16, 32, 16, 16, 16, False, True, True): (1, 1, 1, 2), - (16, 32, 16, 16, 16, True, False, False): (1, 1, 2, 16), - (16, 32, 16, 16, 16, True, False, True): (1, 1, 1, 4), - (16, 32, 16, 16, 32, False, False, False): (2, 1, 1, 8), 
- (16, 32, 16, 16, 32, False, False, True): (2, 1, 1, 8), - (16, 32, 16, 16, 32, False, True, False): (1, 1, 2, 1), - (16, 32, 16, 16, 32, False, True, True): (1, 1, 1, 4), - (16, 32, 16, 16, 32, True, False, False): (2, 1, 1, 8), - (16, 32, 16, 16, 32, True, False, True): (1, 1, 2, 4), - (16, 32, 32, 16, 16, False, False, False): (1, 1, 1, 16), - (16, 32, 32, 16, 16, False, False, True): (1, 2, 1, 2), - (16, 32, 32, 16, 16, False, True, False): (1, 2, 1, 8), - (16, 32, 32, 16, 16, False, True, True): (3, 2, 1, 4), - (16, 32, 32, 16, 16, True, False, False): (1, 2, 1, 4), - (16, 32, 32, 16, 16, True, False, True): (1, 2, 1, 2), - (16, 32, 32, 16, 32, False, False, False): (1, 2, 1, 2), - (16, 32, 32, 16, 32, False, False, True): (1, 1, 1, 4), - (16, 32, 32, 16, 32, False, True, False): (1, 1, 2, 4), - (16, 32, 32, 16, 32, False, True, True): (1, 2, 1, 2), - (16, 32, 32, 16, 32, True, False, False): (1, 2, 1, 2), - (16, 32, 32, 16, 32, True, False, True): (1, 2, 1, 16), - (16, 32, 64, 16, 16, False, False, False): (1, 4, 1, 4), - (16, 32, 64, 16, 16, False, False, True): (2, 4, 1, 4), - (16, 32, 64, 16, 16, False, True, False): (1, 4, 1, 4), - (16, 32, 64, 16, 16, False, True, True): (1, 4, 1, 4), - (16, 32, 64, 16, 16, True, False, False): (3, 4, 1, 2), - (16, 32, 64, 16, 16, True, False, True): (1, 4, 1, 1), - (16, 32, 64, 16, 32, False, False, False): (1, 4, 1, 16), - (16, 32, 64, 16, 32, False, False, True): (1, 2, 1, 2), - (16, 32, 64, 16, 32, False, True, False): (1, 4, 2, 2), - (16, 32, 64, 16, 32, False, True, True): (1, 4, 1, 8), - (16, 32, 64, 16, 32, True, False, False): (1, 4, 1, 8), - (16, 32, 64, 16, 32, True, False, True): (1, 2, 1, 4), - (16, 64, 16, 16, 32, False, False, False): (1, 1, 1, 2), - (16, 64, 16, 16, 32, False, False, True): (1, 1, 1, 4), - (16, 64, 16, 16, 32, False, True, False): (2, 1, 2, 4), - (16, 64, 16, 16, 32, False, True, True): (1, 1, 1, 4), - (16, 64, 16, 16, 32, True, False, False): (1, 1, 1, 4), - (16, 64, 16, 16, 32, True, False, True): (1, 1, 1, 4), - (16, 64, 32, 16, 32, False, False, False): (1, 2, 1, 2), - (16, 64, 32, 16, 32, False, False, True): (1, 1, 1, 4), - (16, 64, 32, 16, 32, False, True, False): (1, 1, 1, 4), - (16, 64, 32, 16, 32, False, True, True): (1, 2, 3, 2), - (16, 64, 32, 16, 32, True, False, False): (1, 1, 1, 4), - (16, 64, 32, 16, 32, True, False, True): (1, 1, 2, 4), - (16, 64, 64, 16, 32, False, False, False): (1, 4, 1, 8), - (16, 64, 64, 16, 32, False, False, True): (1, 4, 1, 4), - (16, 64, 64, 16, 32, False, True, False): (1, 4, 1, 1), - (16, 64, 64, 16, 32, False, True, True): (2, 4, 1, 4), - (16, 64, 64, 16, 32, True, False, False): (1, 4, 1, 4), - (16, 64, 64, 16, 32, True, False, True): (1, 4, 1, 4), - (32, 16, 16, 16, 16, False, False, False): (2, 1, 2, 4), - (32, 16, 16, 16, 16, False, False, True): (2, 1, 1, 2), - (32, 16, 16, 16, 16, False, True, False): (1, 1, 2, 4), - (32, 16, 16, 16, 16, False, True, True): (1, 1, 1, 2), - (32, 16, 16, 16, 16, True, False, False): (1, 1, 1, 4), - (32, 16, 16, 16, 16, True, False, True): (2, 1, 1, 2), - (32, 16, 32, 16, 16, False, False, False): (1, 1, 1, 4), - (32, 16, 32, 16, 16, False, False, True): (1, 1, 1, 4), - (32, 16, 32, 16, 16, False, True, False): (1, 2, 1, 4), - (32, 16, 32, 16, 16, False, True, True): (2, 2, 1, 4), - (32, 16, 32, 16, 16, True, False, False): (2, 1, 1, 4), - (32, 16, 32, 16, 16, True, False, True): (2, 2, 1, 2), - (32, 16, 64, 16, 16, False, False, False): (1, 4, 1, 2), - (32, 16, 64, 16, 16, False, False, True): (1, 4, 1, 4), - (32, 16, 64, 16, 16, 
False, True, False): (1, 2, 1, 4), - (32, 16, 64, 16, 16, False, True, True): (1, 4, 1, 2), - (32, 16, 64, 16, 16, True, False, False): (1, 4, 2, 8), - (32, 16, 64, 16, 16, True, False, True): (1, 4, 1, 1), - (32, 32, 16, 16, 16, False, False, False): (1, 1, 1, 4), - (32, 32, 16, 16, 16, False, False, True): (2, 1, 1, 4), - (32, 32, 16, 16, 16, False, True, False): (1, 1, 2, 4), - (32, 32, 16, 16, 16, False, True, True): (1, 1, 2, 2), - (32, 32, 16, 16, 16, True, False, False): (1, 1, 1, 8), - (32, 32, 16, 16, 16, True, False, True): (1, 1, 1, 4), - (32, 32, 16, 16, 32, False, False, False): (1, 1, 3, 2), - (32, 32, 16, 16, 32, False, False, True): (2, 1, 1, 4), - (32, 32, 16, 16, 32, False, True, False): (3, 1, 1, 4), - (32, 32, 16, 16, 32, False, True, True): (1, 1, 1, 4), - (32, 32, 16, 16, 32, True, False, False): (2, 1, 1, 8), - (32, 32, 16, 16, 32, True, False, True): (1, 1, 3, 2), - (32, 32, 16, 32, 32, False, False, False): (1, 1, 1, 2), - (32, 32, 16, 32, 32, False, False, True): (2, 1, 1, 8), - (32, 32, 16, 32, 32, False, True, False): (1, 1, 1, 2), - (32, 32, 16, 32, 32, False, True, True): (1, 1, 1, 8), - (32, 32, 16, 32, 32, True, False, False): (1, 1, 2, 4), - (32, 32, 16, 32, 32, True, False, True): (1, 1, 1, 2), - (32, 32, 32, 16, 16, False, False, False): (1, 1, 1, 4), - (32, 32, 32, 16, 16, False, False, True): (1, 2, 1, 4), - (32, 32, 32, 16, 16, False, True, False): (1, 2, 1, 4), - (32, 32, 32, 16, 16, False, True, True): (1, 2, 1, 2), - (32, 32, 32, 16, 16, True, False, False): (1, 2, 1, 4), - (32, 32, 32, 16, 16, True, False, True): (1, 2, 1, 4), - (32, 32, 32, 16, 32, False, False, False): (1, 2, 1, 4), - (32, 32, 32, 16, 32, False, False, True): (1, 2, 1, 2), - (32, 32, 32, 16, 32, False, True, False): (1, 2, 1, 4), - (32, 32, 32, 16, 32, False, True, True): (1, 2, 1, 2), - (32, 32, 32, 16, 32, True, False, False): (1, 2, 1, 1), - (32, 32, 32, 16, 32, True, False, True): (1, 2, 1, 2), - (32, 32, 32, 32, 32, False, False, False): (1, 1, 1, 4), - (32, 32, 32, 32, 32, False, False, True): (2, 1, 1, 4), - (32, 32, 32, 32, 32, False, True, False): (1, 1, 1, 8), - (32, 32, 32, 32, 32, False, True, True): (1, 1, 1, 8), - (32, 32, 32, 32, 32, True, False, False): (1, 1, 3, 4), - (32, 32, 32, 32, 32, True, False, True): (1, 1, 1, 8), - (32, 32, 64, 16, 16, False, False, False): (1, 4, 1, 4), - (32, 32, 64, 16, 16, False, False, True): (1, 4, 1, 2), - (32, 32, 64, 16, 16, False, True, False): (1, 1, 1, 4), - (32, 32, 64, 16, 16, False, True, True): (1, 4, 1, 4), - (32, 32, 64, 16, 16, True, False, False): (1, 4, 1, 8), - (32, 32, 64, 16, 16, True, False, True): (1, 4, 1, 2), - (32, 32, 64, 16, 32, False, False, False): (1, 1, 1, 4), - (32, 32, 64, 16, 32, False, False, True): (1, 4, 1, 4), - (32, 32, 64, 16, 32, False, True, False): (1, 1, 1, 4), - (32, 32, 64, 16, 32, False, True, True): (1, 4, 1, 4), - (32, 32, 64, 16, 32, True, False, False): (2, 2, 1, 8), - (32, 32, 64, 16, 32, True, False, True): (1, 2, 1, 2), - (32, 32, 64, 32, 32, False, False, False): (1, 2, 1, 4), - (32, 32, 64, 32, 32, False, False, True): (1, 2, 1, 1), - (32, 32, 64, 32, 32, False, True, False): (1, 2, 2, 8), - (32, 32, 64, 32, 32, False, True, True): (1, 1, 1, 4), - (32, 32, 64, 32, 32, True, False, False): (1, 2, 1, 4), - (32, 32, 64, 32, 32, True, False, True): (2, 2, 1, 4), - (32, 64, 16, 16, 32, False, False, False): (1, 1, 1, 8), - (32, 64, 16, 16, 32, False, False, True): (1, 1, 1, 4), - (32, 64, 16, 16, 32, False, True, False): (2, 1, 1, 4), - (32, 64, 16, 16, 32, False, True, True): (1, 1, 
1, 4), - (32, 64, 16, 16, 32, True, False, False): (1, 1, 2, 4), - (32, 64, 16, 16, 32, True, False, True): (1, 1, 2, 2), - (32, 64, 16, 32, 32, False, False, False): (1, 1, 1, 8), - (32, 64, 16, 32, 32, False, False, True): (2, 1, 1, 4), - (32, 64, 16, 32, 32, False, True, False): (1, 1, 1, 4), - (32, 64, 16, 32, 32, False, True, True): (1, 1, 2, 2), - (32, 64, 16, 32, 32, True, False, False): (1, 1, 1, 2), - (32, 64, 16, 32, 32, True, False, True): (2, 1, 2, 4), - (32, 64, 32, 16, 32, False, False, False): (1, 1, 1, 4), - (32, 64, 32, 16, 32, False, False, True): (1, 2, 1, 2), - (32, 64, 32, 16, 32, False, True, False): (1, 2, 3, 4), - (32, 64, 32, 16, 32, False, True, True): (2, 2, 1, 4), - (32, 64, 32, 16, 32, True, False, False): (1, 1, 1, 4), - (32, 64, 32, 16, 32, True, False, True): (1, 2, 2, 1), - (32, 64, 32, 32, 32, False, False, False): (1, 1, 1, 8), - (32, 64, 32, 32, 32, False, False, True): (1, 1, 1, 4), - (32, 64, 32, 32, 32, False, True, False): (1, 1, 2, 4), - (32, 64, 32, 32, 32, False, True, True): (1, 1, 1, 4), - (32, 64, 32, 32, 32, True, False, False): (2, 1, 1, 2), - (32, 64, 32, 32, 32, True, False, True): (1, 1, 1, 4), - (32, 64, 64, 16, 32, False, False, False): (1, 4, 2, 1), - (32, 64, 64, 16, 32, False, False, True): (3, 4, 1, 4), - (32, 64, 64, 16, 32, False, True, False): (1, 1, 1, 8), - (32, 64, 64, 16, 32, False, True, True): (1, 4, 1, 4), - (32, 64, 64, 16, 32, True, False, False): (1, 4, 1, 4), - (32, 64, 64, 16, 32, True, False, True): (2, 2, 3, 4), - (32, 64, 64, 32, 32, False, False, False): (1, 2, 1, 4), - (32, 64, 64, 32, 32, False, False, True): (1, 2, 1, 4), - (32, 64, 64, 32, 32, False, True, False): (1, 2, 2, 8), - (32, 64, 64, 32, 32, False, True, True): (1, 2, 1, 4), - (32, 64, 64, 32, 32, True, False, False): (1, 2, 2, 4), - (32, 64, 64, 32, 32, True, False, True): (1, 2, 1, 4), - (64, 32, 16, 32, 32, False, False, False): (1, 1, 1, 1), - (64, 32, 16, 32, 32, False, False, True): (1, 1, 2, 4), - (64, 32, 16, 32, 32, False, True, False): (2, 1, 1, 8), - (64, 32, 16, 32, 32, False, True, True): (1, 1, 1, 4), - (64, 32, 16, 32, 32, True, False, False): (2, 1, 1, 2), - (64, 32, 16, 32, 32, True, False, True): (1, 1, 1, 4), - (64, 32, 32, 32, 32, False, False, False): (3, 1, 1, 4), - (64, 32, 32, 32, 32, False, False, True): (1, 1, 1, 4), - (64, 32, 32, 32, 32, False, True, False): (1, 1, 1, 8), - (64, 32, 32, 32, 32, False, True, True): (1, 1, 1, 2), - (64, 32, 32, 32, 32, True, False, False): (1, 1, 1, 2), - (64, 32, 32, 32, 32, True, False, True): (1, 1, 1, 4), - (64, 32, 64, 32, 32, False, False, False): (1, 2, 1, 2), - (64, 32, 64, 32, 32, False, False, True): (3, 2, 1, 4), - (64, 32, 64, 32, 32, False, True, False): (1, 1, 1, 1), - (64, 32, 64, 32, 32, False, True, True): (1, 2, 1, 4), - (64, 32, 64, 32, 32, True, False, False): (1, 1, 3, 4), - (64, 32, 64, 32, 32, True, False, True): (1, 2, 2, 4), - (64, 64, 16, 32, 32, False, False, False): (1, 1, 2, 2), - (64, 64, 16, 32, 32, False, False, True): (1, 1, 3, 2), - (64, 64, 16, 32, 32, False, True, False): (1, 1, 1, 8), - (64, 64, 16, 32, 32, False, True, True): (1, 1, 2, 4), - (64, 64, 16, 32, 32, True, False, False): (1, 1, 2, 4), - (64, 64, 16, 32, 32, True, False, True): (2, 1, 2, 4), - (64, 64, 32, 32, 32, False, False, False): (1, 1, 2, 8), - (64, 64, 32, 32, 32, False, False, True): (1, 1, 2, 4), - (64, 64, 32, 32, 32, False, True, False): (1, 1, 1, 4), - (64, 64, 32, 32, 32, False, True, True): (1, 1, 1, 4), - (64, 64, 32, 32, 32, True, False, False): (1, 1, 1, 4), - (64, 64, 32, 32, 
32, True, False, True): (2, 1, 2, 4), - (64, 64, 64, 32, 32, False, False, False): (1, 2, 1, 4), - (64, 64, 64, 32, 32, False, False, True): (1, 2, 1, 4), - (64, 64, 64, 32, 32, False, True, False): (1, 2, 1, 4), - (64, 64, 64, 32, 32, False, True, True): (3, 2, 1, 4), - (64, 64, 64, 32, 32, True, False, False): (1, 2, 1, 8), - (64, 64, 64, 32, 32, True, False, True): (1, 2, 3, 4), - (192, 192, 256, 16, 16, False, True, True): (1, 8, 4, 2), - (192, 192, 256, 16, 16, True, False, True): (1, 4, 4, 4), - (192, 192, 256, 32, 32, False, True, True): (2, 8, 5, 4), - (192, 192, 256, 32, 32, True, False, True): (2, 8, 5, 1), - (192, 192, 512, 16, 16, False, True, True): (3, 8, 4, 4), - (192, 192, 512, 16, 16, True, False, True): (5, 8, 5, 4), - (192, 192, 512, 32, 32, False, True, True): (1, 16, 5, 4), - (192, 192, 512, 32, 32, True, False, True): (1, 8, 6, 2), - (192, 192, 1024, 16, 16, False, True, True): (1, 16, 4, 4), - (192, 192, 1024, 16, 16, True, False, True): (3, 16, 5, 2), - (192, 192, 1024, 32, 32, False, True, True): (3, 16, 4, 4), - (192, 192, 1024, 32, 32, True, False, True): (1, 16, 5, 4), - (192, 192, 2048, 16, 16, False, True, True): (2, 16, 3, 4), - (192, 192, 2048, 16, 16, True, False, True): (1, 16, 4, 4), - (192, 192, 2048, 32, 32, False, True, True): (1, 32, 3, 4), - (192, 192, 2048, 32, 32, True, False, True): (3, 16, 4, 4), - (192, 192, 4096, 16, 16, False, True, True): (1, 64, 1, 4), - (192, 192, 4096, 16, 16, True, False, True): (1, 16, 3, 4), - (192, 192, 4096, 32, 32, False, True, True): (1, 128, 1, 4), - (192, 192, 4096, 32, 32, True, False, True): (2, 32, 4, 2), - (192, 192, 8192, 16, 16, False, True, True): (1, 64, 1, 4), - (192, 192, 8192, 16, 16, True, False, True): (2, 64, 3, 2), - (192, 192, 8192, 32, 32, False, True, True): (1, 128, 1, 4), - (192, 192, 8192, 32, 32, True, False, True): (4, 32, 3, 4), - (192, 192, 16384, 16, 16, False, True, True): (1, 128, 1, 4), - (192, 192, 16384, 16, 16, True, False, True): (1, 64, 3, 2), - (192, 192, 16384, 32, 32, False, True, True): (1, 128, 1, 4), - (192, 192, 16384, 32, 32, True, False, True): (1, 64, 3, 4), - (192, 192, 32768, 16, 16, False, True, True): (2, 256, 1, 2), - (192, 192, 32768, 16, 16, True, False, True): (1, 128, 3, 2), - (192, 192, 32768, 32, 32, False, True, True): (2, 256, 1, 4), - (192, 192, 32768, 32, 32, True, False, True): (1, 256, 3, 2), - (192, 192, 65536, 16, 16, False, True, True): (2, 512, 1, 2), - (192, 192, 65536, 16, 16, True, False, True): (1, 256, 3, 2), - (192, 192, 65536, 32, 32, False, True, True): (2, 512, 1, 4), - (192, 192, 65536, 32, 32, True, False, True): (2, 256, 3, 4), - (192, 192, 131072, 16, 16, False, True, True): (4, 1024, 1, 2), - (192, 192, 131072, 16, 16, True, False, True): (3, 512, 3, 2), - (192, 192, 131072, 32, 32, False, True, True): (1, 1024, 1, 4), - (192, 192, 131072, 32, 32, True, False, True): (3, 512, 3, 4), - (256, 256, 256, 16, 16, False, True, True): (4, 8, 6, 2), - (256, 256, 256, 16, 16, True, False, True): (5, 16, 5, 1), - (256, 256, 256, 32, 32, False, True, True): (1, 8, 7, 4), - (256, 256, 256, 32, 32, True, False, True): (1, 8, 5, 4), - (256, 256, 256, 64, 64, False, True, True): (1, 4, 5, 4), - (256, 256, 256, 64, 64, True, False, True): (2, 4, 3, 4), - (256, 256, 256, 128, 128, False, True, True): (1, 2, 2, 8), - (256, 256, 256, 128, 128, True, False, True): (1, 2, 2, 8), - (256, 256, 512, 16, 16, False, True, True): (4, 8, 4, 4), - (256, 256, 512, 16, 16, True, False, True): (4, 8, 6, 2), - (256, 256, 512, 32, 32, False, True, True): (3, 8, 5, 4), - 
(256, 256, 512, 32, 32, True, False, True): (2, 8, 5, 4), - (256, 256, 512, 64, 64, False, True, True): (2, 8, 4, 4), - (256, 256, 512, 64, 64, True, False, True): (1, 8, 7, 4), - (256, 256, 512, 128, 128, False, True, True): (2, 4, 2, 8), - (256, 256, 512, 128, 128, True, False, True): (5, 4, 2, 8), - (256, 256, 1024, 16, 16, False, True, True): (1, 8, 4, 4), - (256, 256, 1024, 16, 16, True, False, True): (1, 16, 4, 2), - (256, 256, 1024, 32, 32, False, True, True): (5, 32, 5, 1), - (256, 256, 1024, 32, 32, True, False, True): (1, 16, 4, 2), - (256, 256, 1024, 64, 64, False, True, True): (1, 16, 4, 4), - (256, 256, 1024, 64, 64, True, False, True): (2, 16, 3, 4), - (256, 256, 1024, 128, 128, False, True, True): (9, 8, 2, 8), - (256, 256, 1024, 128, 128, True, False, True): (1, 8, 2, 8), - (256, 256, 2048, 16, 16, False, True, True): (6, 32, 5, 2), - (256, 256, 2048, 16, 16, True, False, True): (2, 32, 4, 2), - (256, 256, 2048, 32, 32, False, True, True): (1, 32, 3, 2), - (256, 256, 2048, 32, 32, True, False, True): (1, 32, 3, 2), - (256, 256, 2048, 64, 64, False, True, True): (2, 32, 4, 4), - (256, 256, 2048, 64, 64, True, False, True): (2, 16, 4, 4), - (256, 256, 2048, 128, 128, False, True, True): (3, 16, 2, 8), - (256, 256, 2048, 128, 128, True, False, True): (4, 16, 2, 8), - (256, 256, 4096, 16, 16, False, True, True): (1, 32, 3, 4), - (256, 256, 4096, 16, 16, True, False, True): (3, 16, 3, 2), - (256, 256, 4096, 32, 32, False, True, True): (3, 32, 3, 2), - (256, 256, 4096, 32, 32, True, False, True): (1, 32, 3, 2), - (256, 256, 4096, 64, 64, False, True, True): (2, 32, 3, 4), - (256, 256, 4096, 64, 64, True, False, True): (2, 32, 3, 4), - (256, 256, 4096, 128, 128, False, True, True): (5, 32, 2, 8), - (256, 256, 4096, 128, 128, True, False, True): (1, 32, 2, 8), - (256, 256, 8192, 16, 16, False, True, True): (8, 32, 3, 4), - (256, 256, 8192, 16, 16, True, False, True): (1, 32, 3, 2), - (256, 256, 8192, 32, 32, False, True, True): (3, 64, 3, 4), - (256, 256, 8192, 32, 32, True, False, True): (2, 128, 1, 2), - (256, 256, 8192, 64, 64, False, True, True): (7, 128, 1, 4), - (256, 256, 8192, 64, 64, True, False, True): (4, 128, 1, 4), - (256, 256, 8192, 128, 128, False, True, True): (2, 64, 1, 4), - (256, 256, 8192, 128, 128, True, False, True): (4, 64, 1, 4), - (256, 256, 16384, 16, 16, False, True, True): (4, 128, 3, 2), - (256, 256, 16384, 16, 16, True, False, True): (5, 64, 3, 2), - (256, 256, 16384, 32, 32, False, True, True): (5, 128, 3, 2), - (256, 256, 16384, 32, 32, True, False, True): (5, 128, 3, 2), - (256, 256, 16384, 64, 64, False, True, True): (1, 256, 1, 4), - (256, 256, 16384, 64, 64, True, False, True): (5, 128, 3, 4), - (256, 256, 16384, 128, 128, False, True, True): (11, 128, 2, 8), - (256, 256, 16384, 128, 128, True, False, True): (3, 128, 1, 4), - (256, 256, 32768, 16, 16, False, True, True): (1, 128, 3, 4), - (256, 256, 32768, 16, 16, True, False, True): (2, 128, 3, 2), - (256, 256, 32768, 32, 32, False, True, True): (4, 256, 3, 2), - (256, 256, 32768, 32, 32, True, False, True): (1, 256, 3, 2), - (256, 256, 32768, 64, 64, False, True, True): (2, 256, 1, 4), - (256, 256, 32768, 64, 64, True, False, True): (2, 256, 1, 4), - (256, 256, 32768, 128, 128, False, True, True): (3, 256, 1, 4), - (256, 256, 32768, 128, 128, True, False, True): (2, 256, 1, 4), - (256, 256, 50432, 16, 16, False, True, True): (4, 197, 1, 4), - (256, 256, 50432, 16, 16, True, False, True): (4, 197, 3, 2), - (256, 256, 50432, 32, 32, False, True, True): (1, 394, 1, 2), - (256, 256, 50432, 32, 32, 
True, False, True): (4, 197, 3, 4), - (256, 256, 50432, 64, 64, False, True, True): (6, 394, 1, 4), - (256, 256, 50432, 64, 64, True, False, True): (4, 394, 2, 4), - (256, 256, 50432, 128, 128, False, True, True): (3, 394, 1, 4), - (256, 256, 50432, 128, 128, True, False, True): (1, 394, 2, 4), - (256, 256, 65536, 16, 16, False, True, True): (1, 256, 3, 2), - (256, 256, 65536, 16, 16, True, False, True): (1, 256, 3, 2), - (256, 256, 65536, 32, 32, False, True, True): (1, 512, 3, 2), - (256, 256, 65536, 32, 32, True, False, True): (4, 512, 3, 2), - (256, 256, 65536, 64, 64, False, True, True): (2, 512, 1, 4), - (256, 256, 65536, 64, 64, True, False, True): (5, 512, 1, 4), - (256, 256, 65536, 128, 128, False, True, True): (3, 512, 1, 4), - (256, 256, 65536, 128, 128, True, False, True): (1, 512, 1, 4), - (256, 256, 65792, 16, 16, False, True, True): (2, 257, 1, 4), - (256, 256, 65792, 16, 16, True, False, True): (1, 257, 3, 2), - (256, 256, 65792, 32, 32, False, True, True): (2, 257, 1, 4), - (256, 256, 65792, 32, 32, True, False, True): (1, 257, 3, 4), - (256, 256, 65792, 64, 64, False, True, True): (2, 514, 1, 4), - (256, 256, 65792, 64, 64, True, False, True): (2, 514, 2, 4), - (256, 256, 65792, 128, 128, False, True, True): (3, 514, 1, 4), - (256, 256, 65792, 128, 128, True, False, True): (1, 514, 2, 4), - (256, 256, 131072, 16, 16, False, True, True): (1, 512, 3, 1), - (256, 256, 131072, 16, 16, True, False, True): (1, 512, 3, 2), - (256, 256, 131072, 32, 32, False, True, True): (2, 1024, 3, 2), - (256, 256, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), - (256, 256, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), - (256, 256, 131072, 64, 64, True, False, True): (1, 1024, 1, 4), - (256, 256, 131072, 128, 128, False, True, True): (7, 1024, 1, 4), - (256, 256, 131072, 128, 128, True, False, True): (1, 1024, 1, 4), - (384, 384, 256, 16, 16, False, True, True): (3, 16, 4, 1), - (384, 384, 256, 16, 16, True, False, True): (2, 4, 6, 2), - (384, 384, 256, 32, 32, False, True, True): (1, 8, 4, 4), - (384, 384, 256, 32, 32, True, False, True): (1, 4, 5, 2), - (384, 384, 256, 64, 64, False, True, True): (3, 4, 3, 4), - (384, 384, 256, 64, 64, True, False, True): (4, 4, 5, 4), - (384, 384, 512, 16, 16, False, True, True): (1, 16, 4, 1), - (384, 384, 512, 16, 16, True, False, True): (1, 8, 5, 2), - (384, 384, 512, 32, 32, False, True, True): (4, 16, 4, 2), - (384, 384, 512, 32, 32, True, False, True): (1, 8, 5, 2), - (384, 384, 512, 64, 64, False, True, True): (2, 8, 3, 4), - (384, 384, 512, 64, 64, True, False, True): (1, 8, 4, 4), - (384, 384, 1024, 16, 16, False, True, True): (1, 16, 4, 2), - (384, 384, 1024, 16, 16, True, False, True): (7, 8, 5, 2), - (384, 384, 1024, 32, 32, False, True, True): (2, 16, 3, 4), - (384, 384, 1024, 32, 32, True, False, True): (1, 16, 4, 2), - (384, 384, 1024, 64, 64, False, True, True): (6, 16, 3, 4), - (384, 384, 1024, 64, 64, True, False, True): (4, 16, 4, 4), - (384, 384, 2048, 16, 16, False, True, True): (1, 32, 1, 4), - (384, 384, 2048, 16, 16, True, False, True): (1, 16, 3, 2), - (384, 384, 2048, 32, 32, False, True, True): (1, 32, 1, 8), - (384, 384, 2048, 32, 32, True, False, True): (1, 8, 4, 4), - (384, 384, 2048, 64, 64, False, True, True): (2, 32, 1, 8), - (384, 384, 2048, 64, 64, True, False, True): (3, 16, 3, 4), - (384, 384, 4096, 16, 16, False, True, True): (5, 32, 1, 4), - (384, 384, 4096, 16, 16, True, False, True): (1, 32, 3, 2), - (384, 384, 4096, 32, 32, False, True, True): (1, 32, 1, 8), - (384, 384, 4096, 32, 32, True, False, 
True): (2, 16, 4, 4), - (384, 384, 4096, 64, 64, False, True, True): (1, 64, 1, 4), - (384, 384, 4096, 64, 64, True, False, True): (2, 32, 3, 4), - (384, 384, 8192, 16, 16, False, True, True): (2, 64, 1, 4), - (384, 384, 8192, 16, 16, True, False, True): (3, 32, 3, 2), - (384, 384, 8192, 32, 32, False, True, True): (4, 128, 1, 4), - (384, 384, 8192, 32, 32, True, False, True): (1, 32, 3, 2), - (384, 384, 8192, 64, 64, False, True, True): (1, 128, 1, 4), - (384, 384, 8192, 64, 64, True, False, True): (1, 64, 3, 4), - (384, 384, 16384, 16, 16, False, True, True): (1, 128, 1, 2), - (384, 384, 16384, 16, 16, True, False, True): (1, 64, 3, 2), - (384, 384, 16384, 32, 32, False, True, True): (1, 128, 1, 4), - (384, 384, 16384, 32, 32, True, False, True): (1, 64, 3, 4), - (384, 384, 16384, 64, 64, False, True, True): (5, 128, 3, 4), - (384, 384, 16384, 64, 64, True, False, True): (1, 128, 3, 4), - (384, 384, 32768, 16, 16, False, True, True): (2, 256, 1, 2), - (384, 384, 32768, 16, 16, True, False, True): (1, 128, 3, 4), - (384, 384, 32768, 32, 32, False, True, True): (1, 256, 1, 2), - (384, 384, 32768, 32, 32, True, False, True): (2, 128, 3, 4), - (384, 384, 32768, 64, 64, False, True, True): (3, 256, 1, 4), - (384, 384, 32768, 64, 64, True, False, True): (2, 256, 3, 4), - (384, 384, 65536, 16, 16, False, True, True): (2, 128, 1, 4), - (384, 384, 65536, 16, 16, True, False, True): (1, 256, 3, 4), - (384, 384, 65536, 32, 32, False, True, True): (1, 512, 1, 2), - (384, 384, 65536, 32, 32, True, False, True): (1, 256, 3, 4), - (384, 384, 65536, 64, 64, False, True, True): (3, 512, 1, 4), - (384, 384, 65536, 64, 64, True, False, True): (3, 256, 3, 4), - (384, 384, 131072, 16, 16, False, True, True): (2, 256, 1, 2), - (384, 384, 131072, 16, 16, True, False, True): (1, 512, 3, 4), - (384, 384, 131072, 32, 32, False, True, True): (1, 512, 1, 2), - (384, 384, 131072, 32, 32, True, False, True): (1, 512, 3, 4), - (384, 384, 131072, 64, 64, False, True, True): (3, 1024, 1, 4), - (384, 384, 131072, 64, 64, True, False, True): (3, 512, 3, 4), - (512, 512, 256, 16, 16, False, True, True): (1, 8, 5, 1), - (512, 512, 256, 16, 16, True, False, True): (2, 16, 5, 1), - (512, 512, 256, 32, 32, False, True, True): (2, 8, 5, 2), - (512, 512, 256, 32, 32, True, False, True): (4, 4, 5, 2), - (512, 512, 256, 64, 64, False, True, True): (1, 4, 5, 4), - (512, 512, 256, 64, 64, True, False, True): (3, 4, 5, 4), - (512, 512, 256, 128, 128, False, True, True): (1, 2, 2, 8), - (512, 512, 256, 128, 128, True, False, True): (1, 2, 2, 8), - (512, 512, 512, 16, 16, False, True, True): (1, 8, 4, 4), - (512, 512, 512, 16, 16, True, False, True): (4, 16, 5, 1), - (512, 512, 512, 32, 32, False, True, True): (4, 8, 5, 2), - (512, 512, 512, 32, 32, True, False, True): (7, 16, 4, 1), - (512, 512, 512, 64, 64, False, True, True): (3, 8, 5, 4), - (512, 512, 512, 64, 64, True, False, True): (1, 8, 4, 4), - (512, 512, 512, 128, 128, False, True, True): (4, 4, 2, 8), - (512, 512, 512, 128, 128, True, False, True): (4, 4, 2, 8), - (512, 512, 1024, 16, 16, False, True, True): (2, 8, 4, 4), - (512, 512, 1024, 16, 16, True, False, True): (2, 16, 4, 2), - (512, 512, 1024, 32, 32, False, True, True): (3, 16, 4, 2), - (512, 512, 1024, 32, 32, True, False, True): (3, 16, 3, 2), - (512, 512, 1024, 64, 64, False, True, True): (5, 8, 5, 4), - (512, 512, 1024, 64, 64, True, False, True): (4, 16, 3, 4), - (512, 512, 1024, 128, 128, False, True, True): (6, 8, 2, 8), - (512, 512, 1024, 128, 128, True, False, True): (4, 8, 2, 8), - (512, 512, 2048, 16, 16, 
False, True, True): (2, 16, 3, 4), - (512, 512, 2048, 16, 16, True, False, True): (1, 16, 4, 2), - (512, 512, 2048, 32, 32, False, True, True): (2, 32, 3, 2), - (512, 512, 2048, 32, 32, True, False, True): (2, 32, 3, 2), - (512, 512, 2048, 64, 64, False, True, True): (1, 32, 3, 4), - (512, 512, 2048, 64, 64, True, False, True): (1, 32, 3, 2), - (512, 512, 2048, 128, 128, False, True, True): (3, 16, 2, 8), - (512, 512, 2048, 128, 128, True, False, True): (1, 16, 2, 8), - (512, 512, 4096, 16, 16, False, True, True): (4, 32, 3, 2), - (512, 512, 4096, 16, 16, True, False, True): (1, 32, 3, 2), - (512, 512, 4096, 32, 32, False, True, True): (3, 32, 3, 2), - (512, 512, 4096, 32, 32, True, False, True): (3, 32, 3, 2), - (512, 512, 4096, 64, 64, False, True, True): (1, 32, 3, 4), - (512, 512, 4096, 64, 64, True, False, True): (1, 64, 1, 4), - (512, 512, 4096, 128, 128, False, True, True): (7, 32, 2, 8), - (512, 512, 4096, 128, 128, True, False, True): (1, 32, 2, 8), - (512, 512, 8192, 16, 16, False, True, True): (4, 64, 3, 2), - (512, 512, 8192, 16, 16, True, False, True): (1, 64, 3, 2), - (512, 512, 8192, 32, 32, False, True, True): (3, 64, 3, 2), - (512, 512, 8192, 32, 32, True, False, True): (1, 64, 3, 2), - (512, 512, 8192, 64, 64, False, True, True): (1, 64, 3, 4), - (512, 512, 8192, 64, 64, True, False, True): (1, 64, 3, 4), - (512, 512, 8192, 128, 128, False, True, True): (7, 64, 2, 8), - (512, 512, 8192, 128, 128, True, False, True): (1, 64, 1, 4), - (512, 512, 16384, 16, 16, False, True, True): (1, 128, 3, 2), - (512, 512, 16384, 16, 16, True, False, True): (1, 64, 3, 2), - (512, 512, 16384, 32, 32, False, True, True): (1, 128, 3, 2), - (512, 512, 16384, 32, 32, True, False, True): (1, 128, 3, 2), - (512, 512, 16384, 64, 64, False, True, True): (1, 128, 3, 4), - (512, 512, 16384, 64, 64, True, False, True): (4, 128, 3, 4), - (512, 512, 16384, 128, 128, False, True, True): (5, 128, 2, 8), - (512, 512, 16384, 128, 128, True, False, True): (2, 128, 1, 4), - (512, 512, 32768, 16, 16, False, True, True): (1, 128, 3, 4), - (512, 512, 32768, 16, 16, True, False, True): (1, 128, 3, 2), - (512, 512, 32768, 32, 32, False, True, True): (1, 256, 3, 2), - (512, 512, 32768, 32, 32, True, False, True): (1, 256, 3, 2), - (512, 512, 32768, 64, 64, False, True, True): (1, 256, 3, 4), - (512, 512, 32768, 64, 64, True, False, True): (1, 256, 3, 4), - (512, 512, 32768, 128, 128, False, True, True): (5, 256, 1, 4), - (512, 512, 32768, 128, 128, True, False, True): (1, 256, 1, 4), - (512, 512, 50432, 16, 16, False, True, True): (4, 197, 1, 4), - (512, 512, 50432, 16, 16, True, False, True): (4, 197, 3, 2), - (512, 512, 50432, 32, 32, False, True, True): (2, 197, 1, 4), - (512, 512, 50432, 32, 32, True, False, True): (4, 197, 3, 4), - (512, 512, 50432, 64, 64, False, True, True): (2, 394, 1, 4), - (512, 512, 50432, 64, 64, True, False, True): (4, 197, 2, 4), - (512, 512, 50432, 128, 128, False, True, True): (5, 394, 1, 4), - (512, 512, 50432, 128, 128, True, False, True): (6, 394, 2, 4), - (512, 512, 65536, 16, 16, False, True, True): (1, 256, 3, 2), - (512, 512, 65536, 16, 16, True, False, True): (1, 256, 3, 1), - (512, 512, 65536, 32, 32, False, True, True): (1, 512, 3, 2), - (512, 512, 65536, 32, 32, True, False, True): (1, 512, 3, 2), - (512, 512, 65536, 64, 64, False, True, True): (2, 256, 2, 4), - (512, 512, 65536, 64, 64, True, False, True): (1, 512, 3, 4), - (512, 512, 65536, 128, 128, False, True, True): (7, 512, 1, 4), - (512, 512, 65536, 128, 128, True, False, True): (5, 512, 1, 4), - (512, 512, 
65792, 16, 16, False, True, True): (2, 257, 1, 4), - (512, 512, 65792, 16, 16, True, False, True): (1, 257, 3, 4), - (512, 512, 65792, 32, 32, False, True, True): (2, 257, 1, 4), - (512, 512, 65792, 32, 32, True, False, True): (1, 257, 3, 4), - (512, 512, 65792, 64, 64, False, True, True): (4, 514, 1, 4), - (512, 512, 65792, 64, 64, True, False, True): (4, 257, 2, 4), - (512, 512, 65792, 128, 128, False, True, True): (5, 514, 1, 4), - (512, 512, 65792, 128, 128, True, False, True): (4, 514, 2, 4), - (512, 512, 131072, 16, 16, False, True, True): (1, 512, 3, 1), - (512, 512, 131072, 16, 16, True, False, True): (1, 512, 3, 1), - (512, 512, 131072, 32, 32, False, True, True): (1, 1024, 3, 2), - (512, 512, 131072, 32, 32, True, False, True): (1, 1024, 3, 2), - (512, 512, 131072, 64, 64, False, True, True): (4, 512, 2, 4), - (512, 512, 131072, 64, 64, True, False, True): (2, 512, 2, 4), - (512, 512, 131072, 128, 128, False, True, True): (5, 1024, 1, 4), - (512, 512, 131072, 128, 128, True, False, True): (4, 1024, 1, 4), - (768, 768, 256, 16, 16, False, True, True): (1, 8, 4, 1), - (768, 768, 256, 16, 16, True, False, True): (3, 2, 5, 2), - (768, 768, 256, 32, 32, False, True, True): (1, 8, 4, 2), - (768, 768, 256, 32, 32, True, False, True): (2, 4, 6, 2), - (768, 768, 256, 64, 64, False, True, True): (3, 4, 3, 4), - (768, 768, 256, 64, 64, True, False, True): (2, 4, 4, 4), - (768, 768, 256, 128, 128, False, True, True): (1, 2, 3, 8), - (768, 768, 256, 128, 128, True, False, True): (2, 2, 3, 8), - (768, 768, 512, 16, 16, False, True, True): (1, 8, 4, 2), - (768, 768, 512, 16, 16, True, False, True): (2, 8, 5, 2), - (768, 768, 512, 32, 32, False, True, True): (1, 16, 1, 4), - (768, 768, 512, 32, 32, True, False, True): (3, 8, 5, 2), - (768, 768, 512, 64, 64, False, True, True): (4, 8, 3, 4), - (768, 768, 512, 64, 64, True, False, True): (2, 8, 4, 4), - (768, 768, 512, 128, 128, False, True, True): (1, 4, 3, 8), - (768, 768, 512, 128, 128, True, False, True): (3, 4, 3, 8), - (768, 768, 1024, 16, 16, False, True, True): (1, 16, 1, 4), - (768, 768, 1024, 16, 16, True, False, True): (1, 8, 5, 2), - (768, 768, 1024, 32, 32, False, True, True): (1, 16, 1, 8), - (768, 768, 1024, 32, 32, True, False, True): (1, 4, 4, 4), - (768, 768, 1024, 64, 64, False, True, True): (2, 16, 1, 8), - (768, 768, 1024, 64, 64, True, False, True): (1, 8, 3, 8), - (768, 768, 1024, 128, 128, False, True, True): (1, 8, 3, 8), - (768, 768, 1024, 128, 128, True, False, True): (3, 8, 3, 8), - (768, 768, 2048, 16, 16, False, True, True): (6, 16, 1, 2), - (768, 768, 2048, 16, 16, True, False, True): (2, 16, 4, 2), - (768, 768, 2048, 32, 32, False, True, True): (3, 32, 1, 4), - (768, 768, 2048, 32, 32, True, False, True): (6, 8, 3, 4), - (768, 768, 2048, 64, 64, False, True, True): (2, 32, 2, 2), - (768, 768, 2048, 64, 64, True, False, True): (1, 16, 4, 4), - (768, 768, 2048, 128, 128, False, True, True): (2, 16, 3, 8), - (768, 768, 2048, 128, 128, True, False, True): (4, 16, 3, 8), - (768, 768, 4096, 16, 16, False, True, True): (1, 32, 1, 4), - (768, 768, 4096, 16, 16, True, False, True): (2, 16, 3, 2), - (768, 768, 4096, 32, 32, False, True, True): (3, 32, 1, 8), - (768, 768, 4096, 32, 32, True, False, True): (1, 16, 4, 4), - (768, 768, 4096, 64, 64, False, True, True): (1, 64, 2, 4), - (768, 768, 4096, 64, 64, True, False, True): (1, 8, 3, 8), - (768, 768, 4096, 128, 128, False, True, True): (1, 32, 3, 8), - (768, 768, 4096, 128, 128, True, False, True): (2, 32, 3, 8), - (768, 768, 8192, 16, 16, False, True, True): (1, 64, 1, 2), 
- [elided: several hundred additional deleted rows of this tuned-kernel lookup table, all of the same form: an 8-tuple key (three problem sizes, two block sizes, three boolean flags) mapping to a 4-tuple of integer kernel meta-parameters, with the two leading sizes ranging from 768 to 32768, the third size from 256 to 131072, and block sizes in {16, 32, 64, 128}] - (32768, 32768, 256, 32, 32, 
True, False, True): (1, 1, 4, 8), - (32768, 32768, 256, 64, 64, False, True, True): (2, 2, 3, 4), - (32768, 32768, 256, 64, 64, True, False, True): (1, 1, 3, 8), - (32768, 32768, 256, 128, 128, False, True, True): (2, 2, 3, 8), - (32768, 32768, 256, 128, 128, True, False, True): (2, 2, 3, 8), - (32768, 32768, 512, 16, 16, False, True, True): (2, 2, 1, 4), - (32768, 32768, 512, 16, 16, True, False, True): (2, 2, 4, 2), - (32768, 32768, 512, 32, 32, False, True, True): (1, 2, 3, 4), - (32768, 32768, 512, 32, 32, True, False, True): (1, 2, 4, 8), - (32768, 32768, 512, 64, 64, False, True, True): (4, 4, 3, 4), - (32768, 32768, 512, 64, 64, True, False, True): (1, 2, 3, 4), - (32768, 32768, 512, 128, 128, False, True, True): (4, 4, 3, 8), - (32768, 32768, 512, 128, 128, True, False, True): (4, 4, 3, 8), - (32768, 32768, 1024, 16, 16, False, True, True): (2, 4, 1, 1), - (32768, 32768, 1024, 16, 16, True, False, True): (1, 4, 4, 2), - (32768, 32768, 1024, 32, 32, False, True, True): (2, 4, 1, 4), - (32768, 32768, 1024, 32, 32, True, False, True): (1, 4, 3, 4), - (32768, 32768, 1024, 64, 64, False, True, True): (4, 8, 3, 4), - (32768, 32768, 1024, 64, 64, True, False, True): (1, 4, 3, 4), - (32768, 32768, 1024, 128, 128, False, True, True): (4, 8, 3, 8), - (32768, 32768, 1024, 128, 128, True, False, True): (4, 8, 3, 8), - (32768, 32768, 2048, 16, 16, False, True, True): (1, 8, 1, 4), - (32768, 32768, 2048, 16, 16, True, False, True): (1, 8, 4, 4), - (32768, 32768, 2048, 32, 32, False, True, True): (2, 8, 1, 4), - (32768, 32768, 2048, 32, 32, True, False, True): (1, 8, 3, 4), - (32768, 32768, 2048, 64, 64, False, True, True): (4, 16, 3, 4), - (32768, 32768, 2048, 64, 64, True, False, True): (1, 8, 3, 4), - (32768, 32768, 2048, 128, 128, False, True, True): (4, 16, 3, 8), - (32768, 32768, 2048, 128, 128, True, False, True): (2, 16, 3, 8), - (32768, 32768, 4096, 16, 16, False, True, True): (1, 16, 1, 4), - (32768, 32768, 4096, 16, 16, True, False, True): (1, 16, 4, 4), - (32768, 32768, 4096, 32, 32, False, True, True): (2, 16, 1, 4), - (32768, 32768, 4096, 32, 32, True, False, True): (1, 16, 3, 4), - (32768, 32768, 4096, 64, 64, False, True, True): (2, 32, 3, 4), - (32768, 32768, 4096, 64, 64, True, False, True): (1, 16, 3, 4), - (32768, 32768, 4096, 128, 128, False, True, True): (4, 32, 3, 8), - (32768, 32768, 4096, 128, 128, True, False, True): (4, 32, 3, 8), - (32768, 32768, 8192, 16, 16, False, True, True): (1, 32, 1, 4), - (32768, 32768, 8192, 16, 16, True, False, True): (2, 64, 4, 1), - (32768, 32768, 8192, 32, 32, False, True, True): (2, 32, 1, 4), - (32768, 32768, 8192, 32, 32, True, False, True): (1, 32, 3, 4), - (32768, 32768, 8192, 64, 64, False, True, True): (2, 64, 3, 4), - (32768, 32768, 8192, 64, 64, True, False, True): (1, 32, 3, 4), - (32768, 32768, 8192, 128, 128, False, True, True): (4, 64, 3, 8), - (32768, 32768, 8192, 128, 128, True, False, True): (2, 64, 3, 8), - (32768, 32768, 16384, 16, 16, False, True, True): (1, 64, 1, 4), - (32768, 32768, 16384, 16, 16, True, False, True): (1, 64, 4, 1), - (32768, 32768, 16384, 32, 32, False, True, True): (2, 64, 1, 4), - (32768, 32768, 16384, 32, 32, True, False, True): (1, 64, 3, 4), - (32768, 32768, 16384, 64, 64, False, True, True): (2, 128, 3, 4), - (32768, 32768, 16384, 64, 64, True, False, True): (1, 64, 3, 4), - (32768, 32768, 16384, 128, 128, False, True, True): (4, 128, 3, 8), - (32768, 32768, 16384, 128, 128, True, False, True): (2, 128, 3, 8), - (32768, 32768, 32768, 16, 16, False, True, True): (1, 128, 1, 4), - (32768, 32768, 
32768, 16, 16, True, False, True): (1, 128, 4, 1), - (32768, 32768, 32768, 32, 32, False, True, True): (2, 128, 1, 4), - (32768, 32768, 32768, 32, 32, True, False, True): (1, 128, 3, 4), - (32768, 32768, 32768, 64, 64, False, True, True): (2, 256, 3, 4), - (32768, 32768, 32768, 64, 64, True, False, True): (1, 128, 3, 4), - (32768, 32768, 32768, 128, 128, False, True, True): (2, 256, 3, 8), - (32768, 32768, 32768, 128, 128, True, False, True): (4, 256, 3, 8), - (32768, 32768, 65536, 16, 16, False, True, True): (1, 256, 1, 4), - (32768, 32768, 65536, 16, 16, True, False, True): (1, 256, 4, 1), - (32768, 32768, 65536, 32, 32, False, True, True): (1, 256, 3, 4), - (32768, 32768, 65536, 32, 32, True, False, True): (1, 256, 3, 4), - (32768, 32768, 65536, 64, 64, False, True, True): (1, 512, 3, 4), - (32768, 32768, 65536, 64, 64, True, False, True): (1, 256, 3, 4), - (32768, 32768, 65536, 128, 128, False, True, True): (4, 512, 1, 4), - (32768, 32768, 65536, 128, 128, True, False, True): (2, 512, 3, 8), - }, - ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.float16, 0.56)): { - (192, 192, 256, 64, 64, False, True, True): (1, 4, 3, 4), - (192, 192, 256, 64, 64, True, False, True): (1, 4, 3, 4), - (192, 192, 512, 64, 64, False, True, True): (1, 8, 5, 4), - (192, 192, 512, 64, 64, True, False, True): (1, 8, 3, 4), - (192, 192, 1024, 64, 64, False, True, True): (1, 16, 3, 2), - (192, 192, 1024, 64, 64, True, False, True): (1, 16, 3, 4), - (192, 192, 2048, 64, 64, False, True, True): (1, 32, 5, 4), - (192, 192, 2048, 64, 64, True, False, True): (4, 32, 5, 4), - (192, 192, 4096, 64, 64, False, True, True): (1, 64, 1, 8), - (192, 192, 4096, 64, 64, True, False, True): (1, 32, 3, 4), - (192, 192, 8192, 64, 64, False, True, True): (4, 128, 1, 4), - (192, 192, 8192, 64, 64, True, False, True): (3, 64, 3, 4), - (192, 192, 16384, 64, 64, False, True, True): (1, 256, 1, 4), - (192, 192, 16384, 64, 64, True, False, True): (3, 64, 2, 4), - (192, 192, 32768, 64, 64, False, True, True): (1, 512, 1, 2), - (192, 192, 32768, 64, 64, True, False, True): (2, 256, 2, 4), - (192, 192, 65536, 64, 64, False, True, True): (1, 512, 1, 4), - (192, 192, 65536, 64, 64, True, False, True): (2, 512, 2, 4), - (192, 192, 131072, 64, 64, False, True, True): (1, 1024, 1, 4), - (192, 192, 131072, 64, 64, True, False, True): (1, 512, 3, 4), - (384, 384, 256, 128, 128, False, True, True): (3, 2, 3, 8), - (384, 384, 256, 128, 128, True, False, True): (5, 2, 3, 8), - (384, 384, 512, 128, 128, False, True, True): (4, 4, 3, 8), - (384, 384, 512, 128, 128, True, False, True): (1, 4, 3, 8), - (384, 384, 1024, 128, 128, False, True, True): (1, 8, 3, 8), - (384, 384, 1024, 128, 128, True, False, True): (1, 8, 2, 8), - (384, 384, 2048, 128, 128, False, True, True): (3, 16, 3, 8), - (384, 384, 2048, 128, 128, True, False, True): (1, 16, 3, 8), - (384, 384, 4096, 128, 128, False, True, True): (3, 32, 3, 8), - (384, 384, 4096, 128, 128, True, False, True): (3, 32, 3, 8), - (384, 384, 8192, 128, 128, False, True, True): (2, 64, 3, 8), - (384, 384, 8192, 128, 128, True, False, True): (2, 64, 2, 4), - (384, 384, 16384, 128, 128, False, True, True): (1, 128, 2, 8), - (384, 384, 16384, 128, 128, True, False, True): (3, 128, 2, 4), - (384, 384, 32768, 128, 128, False, True, True): (2, 256, 3, 8), - (384, 384, 32768, 128, 128, True, False, True): (1, 256, 2, 4), - (384, 384, 65536, 128, 128, False, True, True): (7, 512, 1, 4), - (384, 384, 65536, 128, 128, True, False, True): (3, 512, 2, 4), - (384, 384, 131072, 128, 128, False, True, True): (5, 
1024, 1, 4), - (384, 384, 131072, 128, 128, True, False, True): (1, 1024, 2, 4), - }, - ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.float32, 0.5)): { - (16, 16, 16, 16, 16, False, False, False): (2, 1, 1, 16), - (16, 16, 16, 16, 16, False, False, True): (1, 1, 2, 4), - (16, 16, 16, 16, 16, False, True, False): (1, 1, 2, 16), - (16, 16, 16, 16, 16, False, True, True): (2, 1, 2, 8), - (16, 16, 16, 16, 16, True, False, False): (1, 1, 1, 2), - (16, 16, 16, 16, 16, True, False, True): (2, 1, 1, 4), - (16, 16, 32, 16, 16, False, False, False): (1, 1, 1, 2), - (16, 16, 32, 16, 16, False, False, True): (1, 1, 2, 8), - (16, 16, 32, 16, 16, False, True, False): (1, 2, 1, 4), - (16, 16, 32, 16, 16, False, True, True): (1, 2, 2, 4), - (16, 16, 32, 16, 16, True, False, False): (1, 1, 2, 4), - (16, 16, 32, 16, 16, True, False, True): (1, 2, 2, 4), - (16, 16, 64, 16, 16, False, False, False): (1, 4, 1, 4), - (16, 16, 64, 16, 16, False, False, True): (2, 2, 1, 4), - (16, 16, 64, 16, 16, False, True, False): (1, 4, 1, 4), - (16, 16, 64, 16, 16, False, True, True): (1, 4, 1, 8), - (16, 16, 64, 16, 16, True, False, False): (1, 2, 1, 4), - (16, 16, 64, 16, 16, True, False, True): (1, 4, 2, 8), - (16, 32, 16, 16, 16, False, False, False): (1, 1, 2, 8), - (16, 32, 16, 16, 16, False, False, True): (2, 1, 1, 4), - (16, 32, 16, 16, 16, False, True, False): (1, 1, 1, 4), - (16, 32, 16, 16, 16, False, True, True): (1, 1, 1, 4), - (16, 32, 16, 16, 16, True, False, False): (1, 1, 1, 4), - (16, 32, 16, 16, 16, True, False, True): (1, 1, 2, 8), - (16, 32, 16, 16, 32, False, False, False): (1, 1, 2, 4), - (16, 32, 16, 16, 32, False, False, True): (2, 1, 2, 2), - (16, 32, 16, 16, 32, False, True, False): (1, 1, 1, 8), - (16, 32, 16, 16, 32, False, True, True): (1, 1, 1, 2), - (16, 32, 16, 16, 32, True, False, False): (3, 1, 1, 4), - (16, 32, 16, 16, 32, True, False, True): (1, 1, 1, 4), - (16, 32, 32, 16, 16, False, False, False): (1, 2, 1, 4), - (16, 32, 32, 16, 16, False, False, True): (2, 2, 1, 4), - (16, 32, 32, 16, 16, False, True, False): (1, 2, 1, 2), - (16, 32, 32, 16, 16, False, True, True): (1, 2, 1, 4), - (16, 32, 32, 16, 16, True, False, False): (1, 2, 1, 4), - (16, 32, 32, 16, 16, True, False, True): (1, 2, 1, 4), - (16, 32, 32, 16, 32, False, False, False): (1, 1, 2, 4), - (16, 32, 32, 16, 32, False, False, True): (1, 2, 1, 4), - (16, 32, 32, 16, 32, False, True, False): (1, 2, 2, 8), - (16, 32, 32, 16, 32, False, True, True): (1, 2, 1, 1), - (16, 32, 32, 16, 32, True, False, False): (1, 2, 1, 2), - (16, 32, 32, 16, 32, True, False, True): (1, 2, 1, 4), - (16, 32, 64, 16, 16, False, False, False): (1, 2, 1, 4), - (16, 32, 64, 16, 16, False, False, True): (2, 4, 1, 4), - (16, 32, 64, 16, 16, False, True, False): (1, 4, 2, 4), - (16, 32, 64, 16, 16, False, True, True): (1, 4, 1, 4), - (16, 32, 64, 16, 16, True, False, False): (1, 2, 2, 8), - (16, 32, 64, 16, 16, True, False, True): (1, 4, 1, 2), - (16, 32, 64, 16, 32, False, False, False): (1, 4, 1, 4), - (16, 32, 64, 16, 32, False, False, True): (1, 4, 3, 4), - (16, 32, 64, 16, 32, False, True, False): (1, 2, 1, 4), - (16, 32, 64, 16, 32, False, True, True): (1, 4, 1, 4), - (16, 32, 64, 16, 32, True, False, False): (1, 2, 1, 8), - (16, 32, 64, 16, 32, True, False, True): (1, 2, 1, 4), - (16, 64, 16, 16, 32, False, False, False): (1, 1, 1, 2), - (16, 64, 16, 16, 32, False, False, True): (1, 1, 1, 8), - (16, 64, 16, 16, 32, False, True, False): (1, 1, 1, 8), - (16, 64, 16, 16, 32, False, True, True): (1, 1, 1, 4), - (16, 64, 16, 16, 32, True, False, 
False): (1, 1, 1, 8), - (16, 64, 16, 16, 32, True, False, True): (1, 1, 1, 4), - (16, 64, 32, 16, 32, False, False, False): (1, 2, 1, 4), - (16, 64, 32, 16, 32, False, False, True): (1, 1, 1, 4), - (16, 64, 32, 16, 32, False, True, False): (1, 2, 1, 1), - (16, 64, 32, 16, 32, False, True, True): (1, 2, 1, 8), - (16, 64, 32, 16, 32, True, False, False): (2, 2, 1, 4), - (16, 64, 32, 16, 32, True, False, True): (2, 2, 1, 4), - (16, 64, 64, 16, 32, False, False, False): (1, 2, 1, 4), - (16, 64, 64, 16, 32, False, False, True): (1, 4, 1, 4), - (16, 64, 64, 16, 32, False, True, False): (1, 4, 1, 4), - (16, 64, 64, 16, 32, False, True, True): (1, 4, 1, 4), - (16, 64, 64, 16, 32, True, False, False): (1, 4, 1, 2), - (16, 64, 64, 16, 32, True, False, True): (3, 4, 1, 4), - (32, 16, 16, 16, 16, False, False, False): (1, 1, 2, 4), - (32, 16, 16, 16, 16, False, False, True): (1, 1, 1, 2), - (32, 16, 16, 16, 16, False, True, False): (1, 1, 2, 4), - (32, 16, 16, 16, 16, False, True, True): (1, 1, 2, 4), - (32, 16, 16, 16, 16, True, False, False): (1, 1, 3, 8), - (32, 16, 16, 16, 16, True, False, True): (1, 1, 2, 4), - (32, 16, 32, 16, 16, False, False, False): (1, 2, 1, 4), - (32, 16, 32, 16, 16, False, False, True): (1, 2, 3, 4), - (32, 16, 32, 16, 16, False, True, False): (1, 1, 1, 8), - (32, 16, 32, 16, 16, False, True, True): (1, 2, 1, 4), - (32, 16, 32, 16, 16, True, False, False): (1, 1, 1, 2), - (32, 16, 32, 16, 16, True, False, True): (1, 1, 1, 4), - (32, 16, 64, 16, 16, False, False, False): (1, 4, 1, 4), - (32, 16, 64, 16, 16, False, False, True): (3, 4, 1, 4), - (32, 16, 64, 16, 16, False, True, False): (1, 4, 1, 1), - (32, 16, 64, 16, 16, False, True, True): (1, 4, 1, 4), - (32, 16, 64, 16, 16, True, False, False): (1, 4, 1, 4), - (32, 16, 64, 16, 16, True, False, True): (1, 4, 1, 4), - (32, 32, 16, 16, 16, False, False, False): (1, 1, 1, 2), - (32, 32, 16, 16, 16, False, False, True): (2, 1, 1, 4), - (32, 32, 16, 16, 16, False, True, False): (1, 1, 1, 2), - (32, 32, 16, 16, 16, False, True, True): (2, 1, 1, 4), - (32, 32, 16, 16, 16, True, False, False): (3, 1, 2, 4), - (32, 32, 16, 16, 16, True, False, True): (1, 1, 2, 4), - (32, 32, 16, 16, 32, False, False, False): (2, 1, 1, 2), - (32, 32, 16, 16, 32, False, False, True): (1, 1, 1, 4), - (32, 32, 16, 16, 32, False, True, False): (1, 1, 1, 4), - (32, 32, 16, 16, 32, False, True, True): (1, 1, 1, 8), - (32, 32, 16, 16, 32, True, False, False): (1, 1, 1, 8), - (32, 32, 16, 16, 32, True, False, True): (1, 1, 1, 4), - (32, 32, 16, 32, 32, False, False, False): (2, 1, 1, 4), - (32, 32, 16, 32, 32, False, False, True): (1, 1, 2, 4), - (32, 32, 16, 32, 32, False, True, False): (2, 1, 1, 1), - (32, 32, 16, 32, 32, False, True, True): (2, 1, 2, 4), - (32, 32, 16, 32, 32, True, False, False): (1, 1, 1, 8), - (32, 32, 16, 32, 32, True, False, True): (1, 1, 1, 4), - (32, 32, 32, 16, 16, False, False, False): (1, 1, 1, 4), - (32, 32, 32, 16, 16, False, False, True): (1, 2, 1, 2), - (32, 32, 32, 16, 16, False, True, False): (2, 2, 1, 4), - (32, 32, 32, 16, 16, False, True, True): (1, 2, 2, 4), - (32, 32, 32, 16, 16, True, False, False): (1, 2, 1, 4), - (32, 32, 32, 16, 16, True, False, True): (2, 2, 1, 4), - (32, 32, 32, 16, 32, False, False, False): (1, 2, 1, 4), - (32, 32, 32, 16, 32, False, False, True): (1, 2, 1, 4), - (32, 32, 32, 16, 32, False, True, False): (1, 2, 1, 4), - (32, 32, 32, 16, 32, False, True, True): (1, 2, 1, 4), - (32, 32, 32, 16, 32, True, False, False): (2, 1, 1, 2), - (32, 32, 32, 16, 32, True, False, True): (2, 2, 2, 4), - 
(32, 32, 32, 32, 32, False, False, False): (1, 1, 1, 4), - (32, 32, 32, 32, 32, False, False, True): (1, 1, 1, 2), - (32, 32, 32, 32, 32, False, True, False): (1, 1, 1, 4), - (32, 32, 32, 32, 32, False, True, True): (1, 1, 2, 2), - (32, 32, 32, 32, 32, True, False, False): (1, 1, 1, 2), - (32, 32, 32, 32, 32, True, False, True): (1, 1, 2, 1), - (32, 32, 64, 16, 16, False, False, False): (2, 4, 1, 4), - (32, 32, 64, 16, 16, False, False, True): (1, 4, 2, 4), - (32, 32, 64, 16, 16, False, True, False): (1, 4, 1, 4), - (32, 32, 64, 16, 16, False, True, True): (1, 4, 1, 4), - (32, 32, 64, 16, 16, True, False, False): (1, 2, 1, 4), - (32, 32, 64, 16, 16, True, False, True): (2, 4, 1, 4), - (32, 32, 64, 16, 32, False, False, False): (1, 4, 1, 8), - (32, 32, 64, 16, 32, False, False, True): (1, 4, 1, 4), - (32, 32, 64, 16, 32, False, True, False): (1, 4, 1, 4), - (32, 32, 64, 16, 32, False, True, True): (2, 4, 1, 4), - (32, 32, 64, 16, 32, True, False, False): (1, 2, 2, 4), - (32, 32, 64, 16, 32, True, False, True): (2, 4, 1, 4), - (32, 32, 64, 32, 32, False, False, False): (2, 2, 1, 4), - (32, 32, 64, 32, 32, False, False, True): (1, 1, 1, 4), - (32, 32, 64, 32, 32, False, True, False): (1, 1, 1, 8), - (32, 32, 64, 32, 32, False, True, True): (2, 1, 1, 4), - (32, 32, 64, 32, 32, True, False, False): (1, 1, 1, 4), - (32, 32, 64, 32, 32, True, False, True): (1, 2, 1, 1), - (32, 64, 16, 16, 32, False, False, False): (1, 1, 2, 2), - (32, 64, 16, 16, 32, False, False, True): (2, 1, 1, 4), - (32, 64, 16, 16, 32, False, True, False): (1, 1, 1, 8), - (32, 64, 16, 16, 32, False, True, True): (1, 1, 3, 4), - (32, 64, 16, 16, 32, True, False, False): (1, 1, 1, 2), - (32, 64, 16, 16, 32, True, False, True): (1, 1, 2, 4), - (32, 64, 16, 32, 32, False, False, False): (1, 1, 1, 2), - (32, 64, 16, 32, 32, False, False, True): (1, 1, 3, 4), - (32, 64, 16, 32, 32, False, True, False): (1, 1, 2, 4), - (32, 64, 16, 32, 32, False, True, True): (1, 1, 1, 8), - (32, 64, 16, 32, 32, True, False, False): (1, 1, 2, 4), - (32, 64, 16, 32, 32, True, False, True): (1, 1, 1, 8), - (32, 64, 32, 16, 32, False, False, False): (1, 2, 1, 4), - (32, 64, 32, 16, 32, False, False, True): (1, 2, 3, 4), - (32, 64, 32, 16, 32, False, True, False): (1, 2, 1, 8), - (32, 64, 32, 16, 32, False, True, True): (3, 2, 1, 4), - (32, 64, 32, 16, 32, True, False, False): (1, 1, 1, 8), - (32, 64, 32, 16, 32, True, False, True): (1, 2, 1, 4), - (32, 64, 32, 32, 32, False, False, False): (1, 1, 1, 1), - (32, 64, 32, 32, 32, False, False, True): (1, 1, 1, 4), - (32, 64, 32, 32, 32, False, True, False): (1, 1, 1, 4), - (32, 64, 32, 32, 32, False, True, True): (1, 1, 1, 4), - (32, 64, 32, 32, 32, True, False, False): (1, 1, 1, 4), - (32, 64, 32, 32, 32, True, False, True): (1, 1, 2, 8), - (32, 64, 64, 16, 32, False, False, False): (2, 4, 1, 4), - (32, 64, 64, 16, 32, False, False, True): (1, 4, 1, 4), - (32, 64, 64, 16, 32, False, True, False): (1, 4, 1, 4), - (32, 64, 64, 16, 32, False, True, True): (2, 4, 1, 4), - (32, 64, 64, 16, 32, True, False, False): (1, 4, 1, 4), - (32, 64, 64, 16, 32, True, False, True): (1, 4, 1, 4), - (32, 64, 64, 32, 32, False, False, False): (2, 2, 1, 4), - (32, 64, 64, 32, 32, False, False, True): (1, 2, 1, 8), - (32, 64, 64, 32, 32, False, True, False): (1, 2, 1, 4), - (32, 64, 64, 32, 32, False, True, True): (1, 2, 1, 4), - (32, 64, 64, 32, 32, True, False, False): (2, 2, 1, 4), - (32, 64, 64, 32, 32, True, False, True): (1, 2, 3, 8), - (64, 32, 16, 32, 32, False, False, False): (1, 1, 1, 4), - (64, 32, 16, 32, 32, 
False, False, True): (3, 1, 2, 4), - (64, 32, 16, 32, 32, False, True, False): (2, 1, 1, 2), - (64, 32, 16, 32, 32, False, True, True): (1, 1, 1, 8), - (64, 32, 16, 32, 32, True, False, False): (1, 1, 1, 2), - (64, 32, 16, 32, 32, True, False, True): (1, 1, 1, 4), - (64, 32, 32, 32, 32, False, False, False): (1, 1, 1, 4), - (64, 32, 32, 32, 32, False, False, True): (1, 1, 2, 8), - (64, 32, 32, 32, 32, False, True, False): (1, 1, 1, 8), - (64, 32, 32, 32, 32, False, True, True): (1, 1, 1, 4), - (64, 32, 32, 32, 32, True, False, False): (1, 1, 2, 4), - (64, 32, 32, 32, 32, True, False, True): (1, 1, 3, 8), - (64, 32, 64, 32, 32, False, False, False): (1, 2, 1, 4), - (64, 32, 64, 32, 32, False, False, True): (2, 2, 1, 4), - (64, 32, 64, 32, 32, False, True, False): (1, 1, 1, 4), - (64, 32, 64, 32, 32, False, True, True): (1, 2, 1, 8), - (64, 32, 64, 32, 32, True, False, False): (2, 2, 1, 4), - (64, 32, 64, 32, 32, True, False, True): (1, 2, 1, 8), - (64, 64, 16, 32, 32, False, False, False): (1, 1, 2, 8), - (64, 64, 16, 32, 32, False, False, True): (2, 1, 2, 4), - (64, 64, 16, 32, 32, False, True, False): (1, 1, 1, 2), - (64, 64, 16, 32, 32, False, True, True): (1, 1, 2, 4), - (64, 64, 16, 32, 32, True, False, False): (1, 1, 1, 2), - (64, 64, 16, 32, 32, True, False, True): (1, 1, 2, 4), - (64, 64, 32, 32, 32, False, False, False): (1, 1, 1, 4), - (64, 64, 32, 32, 32, False, False, True): (2, 1, 1, 4), - (64, 64, 32, 32, 32, False, True, False): (1, 1, 1, 8), - (64, 64, 32, 32, 32, False, True, True): (2, 1, 1, 4), - (64, 64, 32, 32, 32, True, False, False): (1, 1, 1, 4), - (64, 64, 32, 32, 32, True, False, True): (1, 1, 1, 8), - (64, 64, 64, 32, 32, False, False, False): (2, 2, 1, 4), - (64, 64, 64, 32, 32, False, False, True): (1, 2, 1, 4), - (64, 64, 64, 32, 32, False, True, False): (1, 2, 1, 4), - (64, 64, 64, 32, 32, False, True, True): (2, 2, 1, 4), - (64, 64, 64, 32, 32, True, False, False): (1, 1, 1, 8), - (64, 64, 64, 32, 32, True, False, True): (1, 2, 2, 4), - (192, 192, 256, 16, 16, False, True, True): (1, 16, 3, 2), - (192, 192, 256, 16, 16, True, False, True): (1, 8, 5, 4), - (192, 192, 256, 32, 32, False, True, True): (2, 8, 4, 4), - (192, 192, 256, 32, 32, True, False, True): (1, 8, 5, 4), - (192, 192, 512, 16, 16, False, True, True): (2, 16, 3, 4), - (192, 192, 512, 16, 16, True, False, True): (1, 16, 5, 4), - (192, 192, 512, 32, 32, False, True, True): (1, 16, 3, 4), - (192, 192, 512, 32, 32, True, False, True): (2, 16, 3, 4), - (192, 192, 1024, 16, 16, False, True, True): (3, 16, 3, 4), - (192, 192, 1024, 16, 16, True, False, True): (2, 8, 3, 4), - (192, 192, 1024, 32, 32, False, True, True): (3, 32, 1, 4), - (192, 192, 1024, 32, 32, True, False, True): (3, 16, 3, 4), - (192, 192, 2048, 16, 16, False, True, True): (1, 32, 3, 4), - (192, 192, 2048, 16, 16, True, False, True): (2, 16, 3, 4), - (192, 192, 2048, 32, 32, False, True, True): (1, 64, 1, 4), - (192, 192, 2048, 32, 32, True, False, True): (1, 64, 2, 4), - (192, 192, 4096, 16, 16, False, True, True): (1, 64, 2, 4), - (192, 192, 4096, 16, 16, True, False, True): (1, 32, 3, 4), - (192, 192, 4096, 32, 32, False, True, True): (3, 128, 2, 4), - (192, 192, 4096, 32, 32, True, False, True): (1, 128, 2, 4), - (192, 192, 8192, 16, 16, False, True, True): (2, 64, 3, 4), - (192, 192, 8192, 16, 16, True, False, True): (1, 64, 3, 4), - (192, 192, 8192, 32, 32, False, True, True): (3, 128, 3, 4), - (192, 192, 8192, 32, 32, True, False, True): (1, 128, 2, 4), - (192, 192, 16384, 16, 16, False, True, True): (1, 256, 3, 2), - (192, 
192, 16384, 16, 16, True, False, True): (1, 256, 3, 2), - (192, 192, 16384, 32, 32, False, True, True): (2, 256, 3, 4), - (192, 192, 16384, 32, 32, True, False, True): (2, 256, 3, 4), - (192, 192, 32768, 16, 16, False, True, True): (2, 512, 3, 2), - (192, 192, 32768, 16, 16, True, False, True): (1, 128, 3, 4), - (192, 192, 32768, 32, 32, False, True, True): (2, 512, 3, 4), - (192, 192, 32768, 32, 32, True, False, True): (2, 512, 3, 4), - (192, 192, 65536, 16, 16, False, True, True): (2, 1024, 3, 2), - (192, 192, 65536, 16, 16, True, False, True): (1, 256, 3, 4), - (192, 192, 65536, 32, 32, False, True, True): (2, 1024, 3, 4), - (192, 192, 65536, 32, 32, True, False, True): (2, 1024, 3, 4), - (192, 192, 131072, 16, 16, False, True, True): (2, 512, 3, 4), - (192, 192, 131072, 16, 16, True, False, True): (1, 512, 3, 4), - (192, 192, 131072, 32, 32, False, True, True): (2, 1024, 3, 4), - (192, 192, 131072, 32, 32, True, False, True): (2, 1024, 3, 4), - (256, 256, 256, 16, 16, False, True, True): (1, 16, 3, 4), - (256, 256, 256, 16, 16, True, False, True): (2, 16, 1, 4), - (256, 256, 256, 32, 32, False, True, True): (1, 8, 4, 8), - (256, 256, 256, 32, 32, True, False, True): (4, 8, 4, 4), - (256, 256, 256, 64, 64, False, True, True): (1, 4, 4, 8), - (256, 256, 256, 64, 64, True, False, True): (1, 4, 3, 8), - (256, 256, 256, 128, 128, False, True, True): (7, 2, 1, 32), - (256, 256, 256, 128, 128, True, False, True): (3, 2, 1, 32), - (256, 256, 512, 16, 16, False, True, True): (1, 16, 5, 4), - (256, 256, 512, 16, 16, True, False, True): (1, 16, 3, 2), - (256, 256, 512, 32, 32, False, True, True): (4, 16, 4, 4), - (256, 256, 512, 32, 32, True, False, True): (4, 16, 3, 4), - (256, 256, 512, 64, 64, False, True, True): (1, 8, 3, 8), - (256, 256, 512, 64, 64, True, False, True): (1, 8, 3, 8), - (256, 256, 512, 128, 128, False, True, True): (1, 4, 1, 32), - (256, 256, 512, 128, 128, True, False, True): (3, 4, 1, 32), - (256, 256, 1024, 16, 16, False, True, True): (3, 32, 5, 2), - (256, 256, 1024, 16, 16, True, False, True): (2, 32, 5, 2), - (256, 256, 1024, 32, 32, False, True, True): (1, 32, 4, 4), - (256, 256, 1024, 32, 32, True, False, True): (1, 32, 5, 4), - (256, 256, 1024, 64, 64, False, True, True): (4, 16, 3, 8), - (256, 256, 1024, 64, 64, True, False, True): (1, 16, 3, 8), - (256, 256, 1024, 128, 128, False, True, True): (1, 8, 1, 32), - (256, 256, 1024, 128, 128, True, False, True): (3, 8, 1, 32), - (256, 256, 2048, 16, 16, False, True, True): (3, 32, 3, 4), - (256, 256, 2048, 16, 16, True, False, True): (1, 64, 3, 2), - (256, 256, 2048, 32, 32, False, True, True): (1, 64, 3, 4), - (256, 256, 2048, 32, 32, True, False, True): (1, 64, 3, 4), - (256, 256, 2048, 64, 64, False, True, True): (2, 32, 1, 8), - (256, 256, 2048, 64, 64, True, False, True): (2, 32, 1, 8), - (256, 256, 2048, 128, 128, False, True, True): (4, 16, 1, 32), - (256, 256, 2048, 128, 128, True, False, True): (4, 16, 1, 32), - (256, 256, 4096, 16, 16, False, True, True): (1, 32, 2, 4), - (256, 256, 4096, 16, 16, True, False, True): (1, 32, 3, 4), - (256, 256, 4096, 32, 32, False, True, True): (1, 128, 2, 4), - (256, 256, 4096, 32, 32, True, False, True): (1, 128, 2, 4), - (256, 256, 4096, 64, 64, False, True, True): (2, 64, 4, 8), - (256, 256, 4096, 64, 64, True, False, True): (3, 64, 2, 8), - (256, 256, 4096, 128, 128, False, True, True): (3, 32, 1, 32), - (256, 256, 4096, 128, 128, True, False, True): (2, 32, 1, 32), - (256, 256, 8192, 16, 16, False, True, True): (1, 64, 3, 4), - (256, 256, 8192, 16, 16, True, False, True): 
(2, 128, 3, 2), - (256, 256, 8192, 32, 32, False, True, True): (3, 128, 3, 4), - (256, 256, 8192, 32, 32, True, False, True): (1, 128, 3, 4), - (256, 256, 8192, 64, 64, False, True, True): (3, 128, 1, 4), - (256, 256, 8192, 64, 64, True, False, True): (4, 128, 2, 8), - (256, 256, 8192, 128, 128, False, True, True): (6, 64, 1, 32), - (256, 256, 8192, 128, 128, True, False, True): (2, 64, 1, 32), - (256, 256, 16384, 16, 16, False, True, True): (4, 128, 3, 4), - (256, 256, 16384, 16, 16, True, False, True): (3, 128, 3, 4), - (256, 256, 16384, 32, 32, False, True, True): (4, 256, 3, 4), - (256, 256, 16384, 32, 32, True, False, True): (2, 256, 3, 4), - (256, 256, 16384, 64, 64, False, True, True): (3, 256, 1, 4), - (256, 256, 16384, 64, 64, True, False, True): (2, 256, 2, 4), - (256, 256, 16384, 128, 128, False, True, True): (1, 128, 1, 32), - (256, 256, 16384, 128, 128, True, False, True): (3, 128, 1, 32), - (256, 256, 32768, 16, 16, False, True, True): (1, 256, 3, 4), - (256, 256, 32768, 16, 16, True, False, True): (2, 128, 3, 4), - (256, 256, 32768, 32, 32, False, True, True): (2, 512, 3, 4), - (256, 256, 32768, 32, 32, True, False, True): (4, 512, 3, 4), - (256, 256, 32768, 64, 64, False, True, True): (1, 512, 1, 8), - (256, 256, 32768, 64, 64, True, False, True): (1, 512, 2, 4), - (256, 256, 32768, 128, 128, False, True, True): (1, 256, 1, 32), - (256, 256, 32768, 128, 128, True, False, True): (1, 256, 1, 32), - (256, 256, 65536, 16, 16, False, True, True): (2, 512, 3, 4), - (256, 256, 65536, 16, 16, True, False, True): (1, 256, 3, 4), - (256, 256, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), - (256, 256, 65536, 32, 32, True, False, True): (2, 1024, 3, 4), - (256, 256, 65536, 64, 64, False, True, True): (1, 1024, 2, 4), - (256, 256, 65536, 64, 64, True, False, True): (1, 1024, 2, 4), - (256, 256, 65536, 128, 128, False, True, True): (1, 512, 1, 32), - (256, 256, 65536, 128, 128, True, False, True): (2, 512, 1, 32), - (256, 256, 131072, 16, 16, False, True, True): (1, 1024, 3, 4), - (256, 256, 131072, 16, 16, True, False, True): (1, 512, 3, 4), - (256, 256, 131072, 32, 32, False, True, True): (1, 2048, 3, 4), - (256, 256, 131072, 32, 32, True, False, True): (1, 2048, 3, 4), - (256, 256, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), - (256, 256, 131072, 64, 64, True, False, True): (1, 2048, 2, 4), - (256, 256, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), - (256, 256, 131072, 128, 128, True, False, True): (4, 1024, 1, 32), - (384, 384, 256, 16, 16, False, True, True): (1, 8, 3, 4), - (384, 384, 256, 16, 16, True, False, True): (1, 8, 3, 4), - (384, 384, 256, 32, 32, False, True, True): (2, 8, 3, 8), - (384, 384, 256, 32, 32, True, False, True): (1, 8, 3, 4), - (384, 384, 256, 64, 64, False, True, True): (1, 4, 4, 8), - (384, 384, 256, 64, 64, True, False, True): (2, 4, 3, 8), - (384, 384, 512, 16, 16, False, True, True): (3, 16, 3, 2), - (384, 384, 512, 16, 16, True, False, True): (3, 16, 3, 2), - (384, 384, 512, 32, 32, False, True, True): (2, 8, 3, 4), - (384, 384, 512, 32, 32, True, False, True): (1, 8, 3, 4), - (384, 384, 512, 64, 64, False, True, True): (2, 8, 3, 8), - (384, 384, 512, 64, 64, True, False, True): (2, 8, 4, 8), - (384, 384, 1024, 16, 16, False, True, True): (3, 16, 3, 2), - (384, 384, 1024, 16, 16, True, False, True): (4, 32, 3, 2), - (384, 384, 1024, 32, 32, False, True, True): (1, 32, 3, 4), - (384, 384, 1024, 32, 32, True, False, True): (2, 16, 3, 4), - (384, 384, 1024, 64, 64, False, True, True): (2, 16, 3, 8), - (384, 384, 1024, 64, 64, True, 
False, True): (4, 16, 4, 8), - (384, 384, 2048, 16, 16, False, True, True): (3, 16, 3, 4), - (384, 384, 2048, 16, 16, True, False, True): (1, 32, 3, 4), - (384, 384, 2048, 32, 32, False, True, True): (3, 64, 2, 4), - (384, 384, 2048, 32, 32, True, False, True): (1, 64, 3, 4), - (384, 384, 2048, 64, 64, False, True, True): (4, 32, 4, 8), - (384, 384, 2048, 64, 64, True, False, True): (5, 32, 4, 8), - (384, 384, 4096, 16, 16, False, True, True): (1, 32, 3, 4), - (384, 384, 4096, 16, 16, True, False, True): (3, 32, 3, 4), - (384, 384, 4096, 32, 32, False, True, True): (2, 64, 3, 4), - (384, 384, 4096, 32, 32, True, False, True): (2, 64, 3, 4), - (384, 384, 4096, 64, 64, False, True, True): (2, 64, 3, 8), - (384, 384, 4096, 64, 64, True, False, True): (2, 64, 3, 8), - (384, 384, 8192, 16, 16, False, True, True): (1, 128, 3, 2), - (384, 384, 8192, 16, 16, True, False, True): (1, 128, 3, 2), - (384, 384, 8192, 32, 32, False, True, True): (1, 128, 3, 4), - (384, 384, 8192, 32, 32, True, False, True): (1, 128, 3, 4), - (384, 384, 8192, 64, 64, False, True, True): (3, 128, 3, 4), - (384, 384, 8192, 64, 64, True, False, True): (2, 128, 3, 4), - (384, 384, 16384, 16, 16, False, True, True): (1, 256, 3, 2), - (384, 384, 16384, 16, 16, True, False, True): (1, 64, 3, 4), - (384, 384, 16384, 32, 32, False, True, True): (2, 256, 3, 4), - (384, 384, 16384, 32, 32, True, False, True): (4, 256, 3, 4), - (384, 384, 16384, 64, 64, False, True, True): (2, 256, 3, 4), - (384, 384, 16384, 64, 64, True, False, True): (1, 256, 3, 4), - (384, 384, 32768, 16, 16, False, True, True): (1, 128, 3, 4), - (384, 384, 32768, 16, 16, True, False, True): (1, 128, 3, 4), - (384, 384, 32768, 32, 32, False, True, True): (1, 512, 3, 4), - (384, 384, 32768, 32, 32, True, False, True): (1, 512, 2, 4), - (384, 384, 32768, 64, 64, False, True, True): (1, 512, 3, 4), - (384, 384, 32768, 64, 64, True, False, True): (1, 512, 3, 4), - (384, 384, 65536, 16, 16, False, True, True): (1, 256, 3, 4), - (384, 384, 65536, 16, 16, True, False, True): (1, 256, 3, 4), - (384, 384, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), - (384, 384, 65536, 32, 32, True, False, True): (1, 512, 3, 4), - (384, 384, 65536, 64, 64, False, True, True): (1, 1024, 3, 4), - (384, 384, 65536, 64, 64, True, False, True): (1, 1024, 3, 4), - (384, 384, 131072, 16, 16, False, True, True): (1, 512, 3, 4), - (384, 384, 131072, 16, 16, True, False, True): (1, 512, 3, 4), - (384, 384, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), - (384, 384, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), - (384, 384, 131072, 64, 64, False, True, True): (1, 2048, 3, 4), - (384, 384, 131072, 64, 64, True, False, True): (1, 2048, 3, 4), - (512, 512, 256, 16, 16, False, True, True): (1, 8, 4, 4), - (512, 512, 256, 16, 16, True, False, True): (1, 8, 3, 2), - (512, 512, 256, 32, 32, False, True, True): (4, 8, 3, 4), - (512, 512, 256, 32, 32, True, False, True): (4, 8, 3, 4), - (512, 512, 256, 64, 64, False, True, True): (3, 4, 3, 8), - (512, 512, 256, 64, 64, True, False, True): (5, 4, 3, 8), - (512, 512, 256, 128, 128, False, True, True): (1, 2, 1, 32), - (512, 512, 256, 128, 128, True, False, True): (3, 2, 1, 32), - (512, 512, 512, 16, 16, False, True, True): (2, 16, 3, 2), - (512, 512, 512, 16, 16, True, False, True): (1, 8, 4, 4), - (512, 512, 512, 32, 32, False, True, True): (3, 16, 3, 4), - (512, 512, 512, 32, 32, True, False, True): (5, 16, 2, 4), - (512, 512, 512, 64, 64, False, True, True): (1, 8, 3, 8), - (512, 512, 512, 64, 64, True, False, True): (3, 8, 3, 8), - 
(512, 512, 512, 128, 128, False, True, True): (1, 4, 1, 32), - (512, 512, 512, 128, 128, True, False, True): (3, 4, 1, 16), - (512, 512, 1024, 16, 16, False, True, True): (1, 16, 3, 4), - (512, 512, 1024, 16, 16, True, False, True): (3, 16, 3, 4), - (512, 512, 1024, 32, 32, False, True, True): (3, 32, 3, 4), - (512, 512, 1024, 32, 32, True, False, True): (3, 32, 2, 4), - (512, 512, 1024, 64, 64, False, True, True): (1, 16, 3, 8), - (512, 512, 1024, 64, 64, True, False, True): (4, 16, 3, 8), - (512, 512, 1024, 128, 128, False, True, True): (4, 8, 1, 32), - (512, 512, 1024, 128, 128, True, False, True): (4, 8, 1, 32), - (512, 512, 2048, 16, 16, False, True, True): (5, 16, 3, 4), - (512, 512, 2048, 16, 16, True, False, True): (5, 16, 3, 4), - (512, 512, 2048, 32, 32, False, True, True): (1, 32, 3, 4), - (512, 512, 2048, 32, 32, True, False, True): (1, 32, 4, 4), - (512, 512, 2048, 64, 64, False, True, True): (4, 32, 3, 8), - (512, 512, 2048, 64, 64, True, False, True): (4, 32, 3, 8), - (512, 512, 2048, 128, 128, False, True, True): (3, 16, 1, 32), - (512, 512, 2048, 128, 128, True, False, True): (3, 16, 1, 32), - (512, 512, 4096, 16, 16, False, True, True): (4, 32, 3, 4), - (512, 512, 4096, 16, 16, True, False, True): (4, 64, 3, 2), - (512, 512, 4096, 32, 32, False, True, True): (3, 64, 3, 4), - (512, 512, 4096, 32, 32, True, False, True): (3, 64, 3, 4), - (512, 512, 4096, 64, 64, False, True, True): (4, 64, 2, 4), - (512, 512, 4096, 64, 64, True, False, True): (1, 64, 2, 4), - (512, 512, 4096, 128, 128, False, True, True): (1, 32, 1, 32), - (512, 512, 4096, 128, 128, True, False, True): (1, 32, 1, 32), - (512, 512, 8192, 16, 16, False, True, True): (1, 64, 3, 4), - (512, 512, 8192, 16, 16, True, False, True): (4, 64, 3, 4), - (512, 512, 8192, 32, 32, False, True, True): (2, 128, 3, 4), - (512, 512, 8192, 32, 32, True, False, True): (3, 128, 3, 4), - (512, 512, 8192, 64, 64, False, True, True): (1, 128, 2, 4), - (512, 512, 8192, 64, 64, True, False, True): (1, 128, 2, 4), - (512, 512, 8192, 128, 128, False, True, True): (6, 64, 1, 32), - (512, 512, 8192, 128, 128, True, False, True): (4, 64, 1, 32), - (512, 512, 16384, 16, 16, False, True, True): (1, 128, 3, 4), - (512, 512, 16384, 16, 16, True, False, True): (1, 64, 3, 4), - (512, 512, 16384, 32, 32, False, True, True): (1, 256, 3, 4), - (512, 512, 16384, 32, 32, True, False, True): (4, 256, 3, 4), - (512, 512, 16384, 64, 64, False, True, True): (1, 256, 2, 4), - (512, 512, 16384, 64, 64, True, False, True): (1, 256, 2, 4), - (512, 512, 16384, 128, 128, False, True, True): (1, 128, 1, 32), - (512, 512, 16384, 128, 128, True, False, True): (2, 128, 1, 32), - (512, 512, 32768, 16, 16, False, True, True): (1, 256, 3, 4), - (512, 512, 32768, 16, 16, True, False, True): (1, 128, 3, 4), - (512, 512, 32768, 32, 32, False, True, True): (1, 512, 3, 4), - (512, 512, 32768, 32, 32, True, False, True): (1, 512, 3, 4), - (512, 512, 32768, 64, 64, False, True, True): (1, 512, 2, 4), - (512, 512, 32768, 64, 64, True, False, True): (2, 512, 2, 4), - (512, 512, 32768, 128, 128, False, True, True): (1, 256, 1, 32), - (512, 512, 32768, 128, 128, True, False, True): (2, 256, 1, 32), - (512, 512, 65536, 16, 16, False, True, True): (1, 512, 3, 4), - (512, 512, 65536, 16, 16, True, False, True): (1, 256, 3, 4), - (512, 512, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), - (512, 512, 65536, 32, 32, True, False, True): (1, 1024, 3, 4), - (512, 512, 65536, 64, 64, False, True, True): (1, 1024, 2, 4), - (512, 512, 65536, 64, 64, True, False, True): (1, 1024, 2, 
4), - (512, 512, 65536, 128, 128, False, True, True): (1, 512, 1, 32), - (512, 512, 65536, 128, 128, True, False, True): (4, 512, 1, 32), - (512, 512, 131072, 16, 16, False, True, True): (1, 512, 3, 4), - (512, 512, 131072, 16, 16, True, False, True): (1, 512, 3, 4), - (512, 512, 131072, 32, 32, False, True, True): (1, 2048, 3, 4), - (512, 512, 131072, 32, 32, True, False, True): (1, 2048, 3, 4), - (512, 512, 131072, 64, 64, False, True, True): (1, 2048, 2, 4), - (512, 512, 131072, 64, 64, True, False, True): (1, 2048, 2, 4), - (512, 512, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), - (512, 512, 131072, 128, 128, True, False, True): (2, 1024, 1, 32), - (768, 768, 256, 16, 16, False, True, True): (1, 4, 5, 4), - (768, 768, 256, 16, 16, True, False, True): (3, 8, 3, 2), - (768, 768, 256, 32, 32, False, True, True): (2, 4, 3, 4), - (768, 768, 256, 32, 32, True, False, True): (3, 8, 4, 4), - (768, 768, 256, 64, 64, False, True, True): (1, 4, 4, 8), - (768, 768, 256, 64, 64, True, False, True): (3, 4, 3, 8), - (768, 768, 256, 128, 128, False, True, True): (3, 2, 1, 32), - (768, 768, 256, 128, 128, True, False, True): (2, 2, 2, 32), - (768, 768, 512, 16, 16, False, True, True): (2, 4, 5, 4), - (768, 768, 512, 16, 16, True, False, True): (2, 4, 4, 4), - (768, 768, 512, 32, 32, False, True, True): (1, 8, 3, 4), - (768, 768, 512, 32, 32, True, False, True): (3, 8, 4, 4), - (768, 768, 512, 64, 64, False, True, True): (2, 8, 3, 8), - (768, 768, 512, 64, 64, True, False, True): (5, 8, 3, 8), - (768, 768, 512, 128, 128, False, True, True): (2, 4, 1, 32), - (768, 768, 512, 128, 128, True, False, True): (2, 4, 2, 32), - (768, 768, 1024, 16, 16, False, True, True): (2, 16, 4, 2), - (768, 768, 1024, 16, 16, True, False, True): (4, 32, 3, 1), - (768, 768, 1024, 32, 32, False, True, True): (1, 32, 2, 4), - (768, 768, 1024, 32, 32, True, False, True): (1, 16, 5, 4), - (768, 768, 1024, 64, 64, False, True, True): (2, 16, 3, 8), - (768, 768, 1024, 64, 64, True, False, True): (2, 16, 3, 8), - (768, 768, 1024, 128, 128, False, True, True): (1, 8, 2, 32), - (768, 768, 1024, 128, 128, True, False, True): (1, 8, 1, 32), - (768, 768, 2048, 16, 16, False, True, True): (1, 16, 3, 4), - (768, 768, 2048, 16, 16, True, False, True): (1, 16, 3, 4), - (768, 768, 2048, 32, 32, False, True, True): (1, 32, 3, 4), - (768, 768, 2048, 32, 32, True, False, True): (5, 32, 3, 4), - (768, 768, 2048, 64, 64, False, True, True): (1, 32, 3, 8), - (768, 768, 2048, 64, 64, True, False, True): (1, 32, 3, 4), - (768, 768, 2048, 128, 128, False, True, True): (3, 16, 1, 32), - (768, 768, 2048, 128, 128, True, False, True): (4, 16, 1, 32), - (768, 768, 4096, 16, 16, False, True, True): (1, 64, 3, 2), - (768, 768, 4096, 16, 16, True, False, True): (3, 64, 3, 2), - (768, 768, 4096, 32, 32, False, True, True): (1, 64, 3, 4), - (768, 768, 4096, 32, 32, True, False, True): (1, 64, 3, 4), - (768, 768, 4096, 64, 64, False, True, True): (4, 64, 3, 4), - (768, 768, 4096, 64, 64, True, False, True): (4, 64, 3, 4), - (768, 768, 4096, 128, 128, False, True, True): (1, 32, 1, 32), - (768, 768, 4096, 128, 128, True, False, True): (1, 32, 2, 32), - (768, 768, 8192, 16, 16, False, True, True): (1, 128, 3, 2), - (768, 768, 8192, 16, 16, True, False, True): (2, 32, 3, 4), - (768, 768, 8192, 32, 32, False, True, True): (2, 128, 3, 4), - (768, 768, 8192, 32, 32, True, False, True): (1, 128, 2, 4), - (768, 768, 8192, 64, 64, False, True, True): (1, 128, 3, 4), - (768, 768, 8192, 64, 64, True, False, True): (2, 128, 3, 4), - (768, 768, 8192, 128, 128, 
False, True, True): (1, 64, 1, 32), - (768, 768, 8192, 128, 128, True, False, True): (2, 64, 1, 32), - (768, 768, 16384, 16, 16, False, True, True): (3, 64, 3, 4), - (768, 768, 16384, 16, 16, True, False, True): (1, 64, 3, 4), - (768, 768, 16384, 32, 32, False, True, True): (2, 256, 3, 4), - (768, 768, 16384, 32, 32, True, False, True): (4, 256, 2, 4), - (768, 768, 16384, 64, 64, False, True, True): (1, 256, 3, 4), - (768, 768, 16384, 64, 64, True, False, True): (1, 256, 3, 4), - (768, 768, 16384, 128, 128, False, True, True): (1, 128, 1, 32), - (768, 768, 16384, 128, 128, True, False, True): (2, 128, 1, 32), - (768, 768, 32768, 16, 16, False, True, True): (1, 128, 3, 4), - (768, 768, 32768, 16, 16, True, False, True): (2, 128, 3, 4), - (768, 768, 32768, 32, 32, False, True, True): (2, 256, 3, 4), - (768, 768, 32768, 32, 32, True, False, True): (1, 256, 3, 4), - (768, 768, 32768, 64, 64, False, True, True): (1, 512, 3, 4), - (768, 768, 32768, 64, 64, True, False, True): (1, 512, 3, 4), - (768, 768, 32768, 128, 128, False, True, True): (1, 256, 1, 32), - (768, 768, 32768, 128, 128, True, False, True): (1, 256, 1, 32), - (768, 768, 50432, 16, 16, False, True, True): (1, 197, 3, 4), - (768, 768, 50432, 32, 32, False, True, True): (1, 394, 3, 4), - (768, 768, 50432, 64, 64, False, True, True): (1, 788, 3, 4), - (768, 768, 50432, 128, 128, False, True, True): (3, 394, 1, 32), - (768, 768, 65536, 16, 16, False, True, True): (1, 256, 3, 4), - (768, 768, 65536, 16, 16, True, False, True): (1, 256, 3, 4), - (768, 768, 65536, 32, 32, False, True, True): (1, 512, 3, 4), - (768, 768, 65536, 32, 32, True, False, True): (1, 512, 3, 4), - (768, 768, 65536, 64, 64, False, True, True): (1, 1024, 3, 4), - (768, 768, 65536, 64, 64, True, False, True): (1, 1024, 3, 4), - (768, 768, 65536, 128, 128, False, True, True): (1, 512, 1, 32), - (768, 768, 65536, 128, 128, True, False, True): (1, 512, 1, 32), - (768, 768, 131072, 16, 16, False, True, True): (1, 512, 3, 4), - (768, 768, 131072, 16, 16, True, False, True): (1, 512, 3, 4), - (768, 768, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), - (768, 768, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), - (768, 768, 131072, 64, 64, False, True, True): (1, 2048, 3, 4), - (768, 768, 131072, 64, 64, True, False, True): (1, 2048, 3, 4), - (768, 768, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), - (768, 768, 131072, 128, 128, True, False, True): (1, 1024, 1, 32), - (768, 3072, 256, 16, 16, False, True, True): (1, 2, 4, 4), - (768, 3072, 256, 16, 16, True, False, True): (1, 4, 3, 4), - (768, 3072, 256, 32, 32, False, True, True): (1, 4, 3, 4), - (768, 3072, 256, 32, 32, True, False, True): (3, 4, 3, 4), - (768, 3072, 256, 64, 64, False, True, True): (1, 4, 3, 8), - (768, 3072, 256, 64, 64, True, False, True): (1, 4, 3, 8), - (768, 3072, 256, 128, 128, False, True, True): (2, 2, 2, 32), - (768, 3072, 256, 128, 128, True, False, True): (2, 2, 1, 32), - (768, 3072, 512, 16, 16, False, True, True): (2, 4, 3, 4), - (768, 3072, 512, 16, 16, True, False, True): (1, 8, 3, 2), - (768, 3072, 512, 32, 32, False, True, True): (3, 8, 4, 4), - (768, 3072, 512, 32, 32, True, False, True): (3, 8, 3, 4), - (768, 3072, 512, 64, 64, False, True, True): (1, 8, 4, 8), - (768, 3072, 512, 64, 64, True, False, True): (1, 8, 3, 8), - (768, 3072, 512, 128, 128, False, True, True): (1, 4, 2, 32), - (768, 3072, 512, 128, 128, True, False, True): (1, 4, 1, 32), - (768, 3072, 1024, 16, 16, False, True, True): (4, 16, 3, 2), - (768, 3072, 1024, 16, 16, True, False, True): (4, 16, 
3, 2), - (768, 3072, 1024, 32, 32, False, True, True): (4, 16, 5, 4), - (768, 3072, 1024, 32, 32, True, False, True): (4, 16, 5, 4), - (768, 3072, 1024, 64, 64, False, True, True): (2, 16, 3, 8), - (768, 3072, 1024, 64, 64, True, False, True): (2, 16, 3, 8), - (768, 3072, 1024, 128, 128, False, True, True): (1, 8, 1, 32), - (768, 3072, 1024, 128, 128, True, False, True): (1, 8, 1, 32), - (768, 3072, 2048, 16, 16, False, True, True): (2, 16, 3, 4), - (768, 3072, 2048, 16, 16, True, False, True): (2, 16, 3, 4), - (768, 3072, 2048, 32, 32, False, True, True): (4, 32, 5, 4), - (768, 3072, 2048, 32, 32, True, False, True): (2, 32, 3, 4), - (768, 3072, 2048, 64, 64, False, True, True): (2, 32, 3, 8), - (768, 3072, 2048, 64, 64, True, False, True): (2, 32, 3, 8), - (768, 3072, 2048, 128, 128, False, True, True): (1, 16, 1, 32), - (768, 3072, 2048, 128, 128, True, False, True): (2, 16, 1, 32), - (768, 3072, 4096, 16, 16, False, True, True): (1, 32, 5, 4), - (768, 3072, 4096, 16, 16, True, False, True): (3, 64, 3, 2), - (768, 3072, 4096, 32, 32, False, True, True): (5, 64, 3, 4), - (768, 3072, 4096, 32, 32, True, False, True): (5, 64, 3, 4), - (768, 3072, 4096, 64, 64, False, True, True): (1, 64, 3, 8), - (768, 3072, 4096, 64, 64, True, False, True): (5, 64, 3, 4), - (768, 3072, 4096, 128, 128, False, True, True): (1, 32, 1, 32), - (768, 3072, 4096, 128, 128, True, False, True): (1, 32, 1, 32), - (768, 3072, 8192, 16, 16, False, True, True): (1, 128, 3, 2), - (768, 3072, 8192, 16, 16, True, False, True): (1, 128, 3, 2), - (768, 3072, 8192, 32, 32, False, True, True): (1, 128, 3, 4), - (768, 3072, 8192, 32, 32, True, False, True): (1, 64, 3, 4), - (768, 3072, 8192, 64, 64, False, True, True): (3, 128, 3, 4), - (768, 3072, 8192, 64, 64, True, False, True): (3, 128, 3, 4), - (768, 3072, 8192, 128, 128, False, True, True): (4, 64, 2, 32), - (768, 3072, 8192, 128, 128, True, False, True): (2, 64, 1, 32), - (768, 3072, 16384, 16, 16, False, True, True): (1, 256, 2, 2), - (768, 3072, 16384, 16, 16, True, False, True): (1, 64, 3, 4), - (768, 3072, 16384, 32, 32, False, True, True): (8, 128, 3, 4), - (768, 3072, 16384, 32, 32, True, False, True): (1, 128, 3, 4), - (768, 3072, 16384, 64, 64, False, True, True): (1, 256, 3, 4), - (768, 3072, 16384, 64, 64, True, False, True): (3, 256, 3, 4), - (768, 3072, 16384, 128, 128, False, True, True): (3, 128, 1, 32), - (768, 3072, 16384, 128, 128, True, False, True): (2, 128, 2, 32), - (768, 3072, 32768, 16, 16, False, True, True): (1, 512, 3, 1), - (768, 3072, 32768, 16, 16, True, False, True): (1, 128, 3, 4), - (768, 3072, 32768, 32, 32, False, True, True): (1, 256, 3, 4), - (768, 3072, 32768, 32, 32, True, False, True): (1, 256, 3, 4), - (768, 3072, 32768, 64, 64, False, True, True): (2, 512, 3, 4), - (768, 3072, 32768, 64, 64, True, False, True): (1, 512, 3, 4), - (768, 3072, 32768, 128, 128, False, True, True): (1, 256, 1, 32), - (768, 3072, 32768, 128, 128, True, False, True): (2, 256, 2, 32), - (768, 3072, 50432, 16, 16, False, True, True): (1, 197, 3, 4), - (768, 3072, 50432, 16, 16, True, False, True): (1, 197, 3, 4), - (768, 3072, 50432, 32, 32, False, True, True): (1, 788, 2, 4), - (768, 3072, 50432, 32, 32, True, False, True): (1, 394, 3, 4), - (768, 3072, 50432, 64, 64, False, True, True): (1, 788, 3, 4), - (768, 3072, 50432, 64, 64, True, False, True): (2, 788, 3, 4), - (768, 3072, 50432, 128, 128, False, True, True): (1, 394, 1, 32), - (768, 3072, 50432, 128, 128, True, False, True): (2, 394, 2, 32), - (768, 3072, 65536, 16, 16, False, True, True): (1, 
1024, 3, 1), - (768, 3072, 65536, 16, 16, True, False, True): (1, 256, 3, 4), - (768, 3072, 65536, 32, 32, False, True, True): (1, 512, 3, 4), - (768, 3072, 65536, 32, 32, True, False, True): (1, 512, 3, 4), - (768, 3072, 65536, 64, 64, False, True, True): (2, 1024, 3, 4), - (768, 3072, 65536, 64, 64, True, False, True): (5, 1024, 3, 4), - (768, 3072, 65536, 128, 128, False, True, True): (1, 512, 1, 32), - (768, 3072, 65536, 128, 128, True, False, True): (2, 512, 2, 32), - (768, 3072, 131072, 16, 16, False, True, True): (1, 2048, 3, 1), - (768, 3072, 131072, 16, 16, True, False, True): (1, 512, 3, 4), - (768, 3072, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), - (768, 3072, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), - (768, 3072, 131072, 64, 64, False, True, True): (1, 2048, 3, 4), - (768, 3072, 131072, 64, 64, True, False, True): (2, 2048, 3, 4), - (768, 3072, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), - (768, 3072, 131072, 128, 128, True, False, True): (1, 1024, 2, 32), - (1024, 1024, 256, 16, 16, False, True, True): (4, 8, 3, 2), - (1024, 1024, 256, 16, 16, True, False, True): (2, 8, 3, 2), - (1024, 1024, 256, 32, 32, False, True, True): (1, 8, 3, 4), - (1024, 1024, 256, 32, 32, True, False, True): (1, 8, 3, 4), - (1024, 1024, 256, 64, 64, False, True, True): (1, 4, 3, 8), - (1024, 1024, 256, 64, 64, True, False, True): (2, 4, 3, 8), - (1024, 1024, 256, 128, 128, False, True, True): (3, 2, 1, 32), - (1024, 1024, 256, 128, 128, True, False, True): (5, 2, 1, 32), - (1024, 1024, 512, 16, 16, False, True, True): (3, 8, 3, 4), - (1024, 1024, 512, 16, 16, True, False, True): (3, 8, 3, 4), - (1024, 1024, 512, 32, 32, False, True, True): (1, 16, 3, 4), - (1024, 1024, 512, 32, 32, True, False, True): (3, 16, 3, 4), - (1024, 1024, 512, 64, 64, False, True, True): (6, 8, 3, 8), - (1024, 1024, 512, 64, 64, True, False, True): (8, 8, 3, 8), - (1024, 1024, 512, 128, 128, False, True, True): (1, 4, 1, 32), - (1024, 1024, 512, 128, 128, True, False, True): (1, 4, 1, 32), - (1024, 1024, 1024, 16, 16, False, True, True): (4, 8, 3, 4), - (1024, 1024, 1024, 16, 16, True, False, True): (1, 8, 3, 4), - (1024, 1024, 1024, 32, 32, False, True, True): (4, 16, 4, 4), - (1024, 1024, 1024, 32, 32, True, False, True): (5, 16, 3, 4), - (1024, 1024, 1024, 64, 64, False, True, True): (6, 16, 3, 8), - (1024, 1024, 1024, 64, 64, True, False, True): (3, 16, 2, 4), - (1024, 1024, 1024, 128, 128, False, True, True): (1, 8, 1, 32), - (1024, 1024, 1024, 128, 128, True, False, True): (2, 8, 1, 32), - (1024, 1024, 2048, 16, 16, False, True, True): (4, 16, 3, 4), - (1024, 1024, 2048, 16, 16, True, False, True): (1, 16, 3, 4), - (1024, 1024, 2048, 32, 32, False, True, True): (1, 32, 3, 4), - (1024, 1024, 2048, 32, 32, True, False, True): (2, 32, 3, 4), - (1024, 1024, 2048, 64, 64, False, True, True): (4, 32, 2, 4), - (1024, 1024, 2048, 64, 64, True, False, True): (8, 32, 2, 4), - (1024, 1024, 2048, 128, 128, False, True, True): (1, 16, 1, 32), - (1024, 1024, 2048, 128, 128, True, False, True): (1, 16, 1, 32), - (1024, 1024, 4096, 16, 16, False, True, True): (4, 32, 3, 4), - (1024, 1024, 4096, 16, 16, True, False, True): (1, 64, 3, 2), - (1024, 1024, 4096, 32, 32, False, True, True): (1, 64, 3, 4), - (1024, 1024, 4096, 32, 32, True, False, True): (1, 64, 3, 4), - (1024, 1024, 4096, 64, 64, False, True, True): (2, 64, 2, 4), - (1024, 1024, 4096, 64, 64, True, False, True): (2, 64, 2, 4), - (1024, 1024, 4096, 128, 128, False, True, True): (1, 32, 1, 32), - (1024, 1024, 4096, 128, 128, True, False, 
True): (4, 32, 1, 32), - (1024, 1024, 8192, 16, 16, False, True, True): (1, 128, 3, 1), - (1024, 1024, 8192, 16, 16, True, False, True): (1, 128, 3, 1), - (1024, 1024, 8192, 32, 32, False, True, True): (1, 128, 3, 4), - (1024, 1024, 8192, 32, 32, True, False, True): (1, 128, 3, 4), - (1024, 1024, 8192, 64, 64, False, True, True): (2, 128, 2, 4), - (1024, 1024, 8192, 64, 64, True, False, True): (2, 128, 2, 4), - (1024, 1024, 8192, 128, 128, False, True, True): (1, 64, 1, 32), - (1024, 1024, 8192, 128, 128, True, False, True): (4, 64, 1, 32), - (1024, 1024, 16384, 16, 16, False, True, True): (1, 128, 2, 4), - (1024, 1024, 16384, 16, 16, True, False, True): (4, 256, 3, 1), - (1024, 1024, 16384, 32, 32, False, True, True): (1, 256, 3, 4), - (1024, 1024, 16384, 32, 32, True, False, True): (1, 256, 3, 4), - (1024, 1024, 16384, 64, 64, False, True, True): (1, 256, 2, 4), - (1024, 1024, 16384, 64, 64, True, False, True): (1, 256, 2, 4), - (1024, 1024, 16384, 128, 128, False, True, True): (1, 128, 1, 32), - (1024, 1024, 16384, 128, 128, True, False, True): (4, 128, 1, 32), - (1024, 1024, 32768, 16, 16, False, True, True): (1, 256, 2, 4), - (1024, 1024, 32768, 16, 16, True, False, True): (4, 512, 3, 1), - (1024, 1024, 32768, 32, 32, False, True, True): (1, 512, 3, 4), - (1024, 1024, 32768, 32, 32, True, False, True): (1, 512, 3, 4), - (1024, 1024, 32768, 64, 64, False, True, True): (1, 512, 2, 4), - (1024, 1024, 32768, 64, 64, True, False, True): (1, 512, 2, 4), - (1024, 1024, 32768, 128, 128, False, True, True): (1, 256, 1, 32), - (1024, 1024, 32768, 128, 128, True, False, True): (1, 256, 1, 32), - (1024, 1024, 65536, 16, 16, False, True, True): (1, 512, 2, 4), - (1024, 1024, 65536, 16, 16, True, False, True): (1, 1024, 3, 1), - (1024, 1024, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), - (1024, 1024, 65536, 32, 32, True, False, True): (1, 512, 3, 4), - (1024, 1024, 65536, 64, 64, False, True, True): (1, 1024, 2, 4), - (1024, 1024, 65536, 64, 64, True, False, True): (1, 1024, 2, 4), - (1024, 1024, 65536, 128, 128, False, True, True): (1, 512, 1, 32), - (1024, 1024, 65536, 128, 128, True, False, True): (1, 512, 1, 32), - (1024, 1024, 131072, 16, 16, False, True, True): (4, 2048, 3, 1), - (1024, 1024, 131072, 16, 16, True, False, True): (4, 2048, 3, 1), - (1024, 1024, 131072, 32, 32, False, True, True): (1, 2048, 3, 4), - (1024, 1024, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), - (1024, 1024, 131072, 64, 64, False, True, True): (1, 2048, 2, 4), - (1024, 1024, 131072, 64, 64, True, False, True): (1, 2048, 2, 4), - (1024, 1024, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), - (1024, 1024, 131072, 128, 128, True, False, True): (1, 1024, 1, 32), - (1280, 5120, 65792, 16, 16, False, True, True): (1, 1028, 3, 1), - (1280, 5120, 65792, 16, 16, True, False, True): (1, 257, 3, 4), - (1280, 5120, 65792, 32, 32, False, True, True): (1, 514, 3, 4), - (1280, 5120, 65792, 32, 32, True, False, True): (1, 514, 3, 4), - (1280, 5120, 65792, 64, 64, False, True, True): (2, 1028, 3, 4), - (1280, 5120, 65792, 64, 64, True, False, True): (1, 1028, 3, 4), - (1280, 5120, 65792, 128, 128, False, True, True): (2, 514, 2, 32), - (1280, 5120, 65792, 128, 128, True, False, True): (1, 514, 2, 32), - (1536, 1536, 256, 16, 16, False, True, True): (5, 4, 3, 2), - (1536, 1536, 256, 16, 16, True, False, True): (2, 2, 3, 4), - (1536, 1536, 256, 32, 32, False, True, True): (1, 8, 2, 4), - (1536, 1536, 256, 32, 32, True, False, True): (2, 4, 3, 4), - (1536, 1536, 256, 64, 64, False, True, True): (1, 4, 3, 8), - 
(1536, 1536, 256, 64, 64, True, False, True): (2, 4, 3, 8), - (1536, 1536, 256, 128, 128, False, True, True): (1, 2, 1, 32), - (1536, 1536, 256, 128, 128, True, False, True): (2, 2, 2, 32), - (1536, 1536, 512, 16, 16, False, True, True): (1, 8, 3, 2), - (1536, 1536, 512, 16, 16, True, False, True): (1, 8, 3, 2), - (1536, 1536, 512, 32, 32, False, True, True): (1, 16, 3, 4), - (1536, 1536, 512, 32, 32, True, False, True): (1, 16, 3, 4), - (1536, 1536, 512, 64, 64, False, True, True): (3, 8, 3, 8), - (1536, 1536, 512, 64, 64, True, False, True): (3, 8, 3, 8), - (1536, 1536, 512, 128, 128, False, True, True): (1, 4, 1, 32), - (1536, 1536, 512, 128, 128, True, False, True): (2, 4, 2, 32), - (1536, 1536, 1024, 16, 16, False, True, True): (2, 8, 3, 4), - (1536, 1536, 1024, 16, 16, True, False, True): (2, 8, 3, 4), - (1536, 1536, 1024, 32, 32, False, True, True): (1, 16, 3, 4), - (1536, 1536, 1024, 32, 32, True, False, True): (1, 16, 3, 4), - (1536, 1536, 1024, 64, 64, False, True, True): (2, 16, 3, 8), - (1536, 1536, 1024, 64, 64, True, False, True): (2, 16, 3, 8), - (1536, 1536, 1024, 128, 128, False, True, True): (3, 8, 1, 32), - (1536, 1536, 1024, 128, 128, True, False, True): (1, 8, 2, 32), - (1536, 1536, 2048, 16, 16, False, True, True): (1, 32, 3, 2), - (1536, 1536, 2048, 16, 16, True, False, True): (1, 32, 3, 2), - (1536, 1536, 2048, 32, 32, False, True, True): (3, 32, 2, 4), - (1536, 1536, 2048, 32, 32, True, False, True): (4, 32, 3, 4), - (1536, 1536, 2048, 64, 64, False, True, True): (1, 32, 3, 4), - (1536, 1536, 2048, 64, 64, True, False, True): (1, 32, 3, 4), - (1536, 1536, 2048, 128, 128, False, True, True): (1, 16, 1, 32), - (1536, 1536, 2048, 128, 128, True, False, True): (2, 16, 1, 32), - (1536, 1536, 4096, 16, 16, False, True, True): (1, 64, 3, 2), - (1536, 1536, 4096, 16, 16, True, False, True): (1, 16, 3, 4), - (1536, 1536, 4096, 32, 32, False, True, True): (1, 64, 2, 4), - (1536, 1536, 4096, 32, 32, True, False, True): (1, 64, 2, 4), - (1536, 1536, 4096, 64, 64, False, True, True): (1, 64, 3, 4), - (1536, 1536, 4096, 64, 64, True, False, True): (1, 64, 3, 4), - (1536, 1536, 4096, 128, 128, False, True, True): (1, 32, 1, 32), - (1536, 1536, 4096, 128, 128, True, False, True): (4, 32, 2, 32), - (1536, 1536, 8192, 16, 16, False, True, True): (1, 32, 3, 4), - (1536, 1536, 8192, 16, 16, True, False, True): (5, 32, 3, 4), - (1536, 1536, 8192, 32, 32, False, True, True): (1, 128, 2, 4), - (1536, 1536, 8192, 32, 32, True, False, True): (1, 128, 2, 4), - (1536, 1536, 8192, 64, 64, False, True, True): (1, 128, 3, 4), - (1536, 1536, 8192, 64, 64, True, False, True): (1, 128, 3, 4), - (1536, 1536, 8192, 128, 128, False, True, True): (1, 64, 1, 32), - (1536, 1536, 8192, 128, 128, True, False, True): (4, 64, 2, 32), - (1536, 1536, 16384, 16, 16, False, True, True): (1, 64, 3, 4), - (1536, 1536, 16384, 16, 16, True, False, True): (1, 64, 3, 4), - (1536, 1536, 16384, 32, 32, False, True, True): (1, 256, 2, 4), - (1536, 1536, 16384, 32, 32, True, False, True): (1, 128, 3, 4), - (1536, 1536, 16384, 64, 64, False, True, True): (1, 256, 3, 4), - (1536, 1536, 16384, 64, 64, True, False, True): (3, 256, 3, 4), - (1536, 1536, 16384, 128, 128, False, True, True): (1, 128, 1, 32), - (1536, 1536, 16384, 128, 128, True, False, True): (4, 128, 2, 32), - (1536, 1536, 32768, 16, 16, False, True, True): (1, 128, 3, 4), - (1536, 1536, 32768, 16, 16, True, False, True): (1, 128, 3, 4), - (1536, 1536, 32768, 32, 32, False, True, True): (1, 256, 3, 4), - (1536, 1536, 32768, 32, 32, True, False, True): (1, 256, 
3, 4), - (1536, 1536, 32768, 64, 64, False, True, True): (1, 512, 3, 4), - (1536, 1536, 32768, 64, 64, True, False, True): (1, 512, 3, 4), - (1536, 1536, 32768, 128, 128, False, True, True): (1, 256, 1, 32), - (1536, 1536, 32768, 128, 128, True, False, True): (4, 256, 2, 32), - (1536, 1536, 65536, 16, 16, False, True, True): (5, 256, 3, 4), - (1536, 1536, 65536, 16, 16, True, False, True): (1, 256, 3, 4), - (1536, 1536, 65536, 32, 32, False, True, True): (1, 512, 3, 4), - (1536, 1536, 65536, 32, 32, True, False, True): (1, 512, 3, 4), - (1536, 1536, 65536, 64, 64, False, True, True): (1, 1024, 3, 4), - (1536, 1536, 65536, 64, 64, True, False, True): (1, 1024, 3, 4), - (1536, 1536, 65536, 128, 128, False, True, True): (1, 512, 1, 32), - (1536, 1536, 65536, 128, 128, True, False, True): (4, 512, 2, 32), - (1536, 1536, 131072, 16, 16, False, True, True): (3, 512, 3, 4), - (1536, 1536, 131072, 16, 16, True, False, True): (1, 512, 3, 4), - (1536, 1536, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), - (1536, 1536, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), - (1536, 1536, 131072, 64, 64, False, True, True): (1, 2048, 3, 4), - (1536, 1536, 131072, 64, 64, True, False, True): (1, 2048, 3, 4), - (1536, 1536, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), - (1536, 1536, 131072, 128, 128, True, False, True): (4, 1024, 2, 32), - (2048, 2048, 256, 16, 16, False, True, True): (1, 4, 3, 4), - (2048, 2048, 256, 16, 16, True, False, True): (1, 4, 3, 4), - (2048, 2048, 256, 32, 32, False, True, True): (3, 8, 3, 4), - (2048, 2048, 256, 32, 32, True, False, True): (3, 8, 3, 4), - (2048, 2048, 256, 64, 64, False, True, True): (4, 4, 4, 8), - (2048, 2048, 256, 64, 64, True, False, True): (8, 4, 4, 8), - (2048, 2048, 256, 128, 128, False, True, True): (3, 2, 1, 32), - (2048, 2048, 256, 128, 128, True, False, True): (3, 2, 1, 32), - (2048, 2048, 512, 16, 16, False, True, True): (4, 8, 3, 2), - (2048, 2048, 512, 16, 16, True, False, True): (4, 8, 3, 2), - (2048, 2048, 512, 32, 32, False, True, True): (3, 8, 3, 4), - (2048, 2048, 512, 32, 32, True, False, True): (1, 16, 2, 4), - (2048, 2048, 512, 64, 64, False, True, True): (4, 8, 2, 4), - (2048, 2048, 512, 64, 64, True, False, True): (4, 8, 2, 4), - (2048, 2048, 512, 128, 128, False, True, True): (1, 4, 1, 32), - (2048, 2048, 512, 128, 128, True, False, True): (4, 4, 1, 32), - (2048, 2048, 1024, 16, 16, False, True, True): (4, 8, 3, 4), - (2048, 2048, 1024, 16, 16, True, False, True): (4, 8, 3, 4), - (2048, 2048, 1024, 32, 32, False, True, True): (4, 16, 3, 4), - (2048, 2048, 1024, 32, 32, True, False, True): (1, 16, 3, 4), - (2048, 2048, 1024, 64, 64, False, True, True): (2, 16, 2, 4), - (2048, 2048, 1024, 64, 64, True, False, True): (2, 16, 2, 4), - (2048, 2048, 1024, 128, 128, False, True, True): (8, 8, 1, 32), - (2048, 2048, 1024, 128, 128, True, False, True): (4, 8, 1, 32), - (2048, 2048, 2048, 16, 16, False, True, True): (4, 32, 3, 1), - (2048, 2048, 2048, 16, 16, True, False, True): (3, 32, 3, 2), - (2048, 2048, 2048, 32, 32, False, True, True): (1, 32, 3, 4), - (2048, 2048, 2048, 32, 32, True, False, True): (1, 32, 3, 4), - (2048, 2048, 2048, 64, 64, False, True, True): (2, 32, 2, 4), - (2048, 2048, 2048, 64, 64, True, False, True): (2, 32, 2, 4), - (2048, 2048, 2048, 128, 128, False, True, True): (6, 16, 1, 32), - (2048, 2048, 2048, 128, 128, True, False, True): (4, 16, 1, 32), - (2048, 2048, 4096, 16, 16, False, True, True): (4, 64, 3, 1), - (2048, 2048, 4096, 16, 16, True, False, True): (1, 64, 3, 1), - (2048, 2048, 4096, 32, 32, 
False, True, True): (1, 64, 3, 4), - (2048, 2048, 4096, 32, 32, True, False, True): (4, 64, 3, 4), - (2048, 2048, 4096, 64, 64, False, True, True): (2, 64, 2, 4), - (2048, 2048, 4096, 64, 64, True, False, True): (2, 64, 2, 4), - (2048, 2048, 4096, 128, 128, False, True, True): (4, 32, 1, 32), - (2048, 2048, 4096, 128, 128, True, False, True): (4, 32, 1, 32), - (2048, 2048, 8192, 16, 16, False, True, True): (4, 128, 3, 1), - (2048, 2048, 8192, 16, 16, True, False, True): (1, 128, 3, 1), - (2048, 2048, 8192, 32, 32, False, True, True): (4, 128, 3, 4), - (2048, 2048, 8192, 32, 32, True, False, True): (4, 64, 3, 4), - (2048, 2048, 8192, 64, 64, False, True, True): (1, 128, 2, 4), - (2048, 2048, 8192, 64, 64, True, False, True): (2, 128, 2, 4), - (2048, 2048, 8192, 128, 128, False, True, True): (1, 64, 1, 32), - (2048, 2048, 8192, 128, 128, True, False, True): (4, 64, 1, 32), - (2048, 2048, 16384, 16, 16, False, True, True): (4, 256, 3, 1), - (2048, 2048, 16384, 16, 16, True, False, True): (1, 256, 3, 1), - (2048, 2048, 16384, 32, 32, False, True, True): (1, 256, 3, 4), - (2048, 2048, 16384, 32, 32, True, False, True): (1, 128, 3, 4), - (2048, 2048, 16384, 64, 64, False, True, True): (1, 256, 2, 4), - (2048, 2048, 16384, 64, 64, True, False, True): (1, 256, 2, 4), - (2048, 2048, 16384, 128, 128, False, True, True): (1, 128, 1, 32), - (2048, 2048, 16384, 128, 128, True, False, True): (4, 128, 1, 32), - (2048, 2048, 32768, 16, 16, False, True, True): (8, 512, 3, 1), - (2048, 2048, 32768, 16, 16, True, False, True): (1, 512, 3, 1), - (2048, 2048, 32768, 32, 32, False, True, True): (1, 512, 3, 4), - (2048, 2048, 32768, 32, 32, True, False, True): (1, 256, 3, 4), - (2048, 2048, 32768, 64, 64, False, True, True): (1, 512, 2, 4), - (2048, 2048, 32768, 64, 64, True, False, True): (1, 512, 2, 4), - (2048, 2048, 32768, 128, 128, False, True, True): (1, 256, 1, 32), - (2048, 2048, 32768, 128, 128, True, False, True): (4, 256, 1, 32), - (2048, 2048, 65536, 16, 16, False, True, True): (4, 1024, 3, 1), - (2048, 2048, 65536, 16, 16, True, False, True): (1, 1024, 3, 1), - (2048, 2048, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), - (2048, 2048, 65536, 32, 32, True, False, True): (1, 512, 3, 4), - (2048, 2048, 65536, 64, 64, False, True, True): (1, 1024, 2, 4), - (2048, 2048, 65536, 64, 64, True, False, True): (1, 1024, 2, 4), - (2048, 2048, 65536, 128, 128, False, True, True): (1, 512, 1, 32), - (2048, 2048, 65536, 128, 128, True, False, True): (4, 512, 1, 32), - (2048, 2048, 131072, 16, 16, False, True, True): (4, 2048, 3, 1), - (2048, 2048, 131072, 16, 16, True, False, True): (1, 2048, 3, 1), - (2048, 2048, 131072, 32, 32, False, True, True): (1, 2048, 3, 4), - (2048, 2048, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), - (2048, 2048, 131072, 64, 64, False, True, True): (1, 2048, 2, 4), - (2048, 2048, 131072, 64, 64, True, False, True): (1, 2048, 2, 4), - (2048, 2048, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), - (2048, 2048, 131072, 128, 128, True, False, True): (4, 1024, 1, 32), - (3072, 768, 256, 16, 16, False, True, True): (4, 4, 3, 2), - (3072, 768, 256, 16, 16, True, False, True): (1, 2, 6, 4), - (3072, 768, 256, 32, 32, False, True, True): (1, 4, 6, 4), - (3072, 768, 256, 32, 32, True, False, True): (5, 4, 3, 4), - (3072, 768, 256, 64, 64, False, True, True): (4, 4, 3, 8), - (3072, 768, 256, 64, 64, True, False, True): (4, 4, 3, 8), - (3072, 768, 256, 128, 128, False, True, True): (1, 2, 1, 32), - (3072, 768, 256, 128, 128, True, False, True): (5, 2, 1, 32), - (3072, 768, 512, 
16, 16, False, True, True): (4, 4, 3, 4), - (3072, 768, 512, 16, 16, True, False, True): (1, 4, 3, 4), - (3072, 768, 512, 32, 32, False, True, True): (3, 8, 3, 4), - (3072, 768, 512, 32, 32, True, False, True): (3, 8, 3, 4), - (3072, 768, 512, 64, 64, False, True, True): (2, 8, 3, 8), - (3072, 768, 512, 64, 64, True, False, True): (2, 8, 3, 8), - (3072, 768, 512, 128, 128, False, True, True): (1, 4, 2, 32), - (3072, 768, 512, 128, 128, True, False, True): (1, 4, 1, 32), - (3072, 768, 1024, 16, 16, False, True, True): (1, 16, 3, 2), - (3072, 768, 1024, 16, 16, True, False, True): (3, 16, 3, 2), - (3072, 768, 1024, 32, 32, False, True, True): (1, 16, 3, 4), - (3072, 768, 1024, 32, 32, True, False, True): (3, 16, 3, 4), - (3072, 768, 1024, 64, 64, False, True, True): (4, 16, 3, 8), - (3072, 768, 1024, 64, 64, True, False, True): (4, 16, 3, 4), - (3072, 768, 1024, 128, 128, False, True, True): (5, 8, 1, 32), - (3072, 768, 1024, 128, 128, True, False, True): (5, 8, 1, 32), - (3072, 768, 2048, 16, 16, False, True, True): (4, 32, 3, 2), - (3072, 768, 2048, 16, 16, True, False, True): (1, 32, 3, 2), - (3072, 768, 2048, 32, 32, False, True, True): (1, 32, 3, 4), - (3072, 768, 2048, 32, 32, True, False, True): (1, 32, 2, 4), - (3072, 768, 2048, 64, 64, False, True, True): (2, 32, 3, 4), - (3072, 768, 2048, 64, 64, True, False, True): (4, 32, 3, 4), - (3072, 768, 2048, 128, 128, False, True, True): (1, 16, 1, 32), - (3072, 768, 2048, 128, 128, True, False, True): (1, 16, 1, 32), - (3072, 768, 4096, 16, 16, False, True, True): (3, 64, 3, 2), - (3072, 768, 4096, 16, 16, True, False, True): (1, 64, 3, 2), - (3072, 768, 4096, 32, 32, False, True, True): (1, 64, 3, 4), - (3072, 768, 4096, 32, 32, True, False, True): (1, 32, 3, 4), - (3072, 768, 4096, 64, 64, False, True, True): (2, 64, 3, 4), - (3072, 768, 4096, 64, 64, True, False, True): (2, 64, 3, 4), - (3072, 768, 4096, 128, 128, False, True, True): (1, 32, 1, 32), - (3072, 768, 4096, 128, 128, True, False, True): (1, 32, 1, 32), - (3072, 768, 8192, 16, 16, False, True, True): (4, 128, 3, 1), - (3072, 768, 8192, 16, 16, True, False, True): (1, 32, 3, 4), - (3072, 768, 8192, 32, 32, False, True, True): (1, 64, 3, 4), - (3072, 768, 8192, 32, 32, True, False, True): (1, 64, 3, 4), - (3072, 768, 8192, 64, 64, False, True, True): (2, 128, 3, 4), - (3072, 768, 8192, 64, 64, True, False, True): (2, 128, 3, 4), - (3072, 768, 8192, 128, 128, False, True, True): (1, 64, 1, 32), - (3072, 768, 8192, 128, 128, True, False, True): (1, 64, 1, 32), - (3072, 768, 16384, 16, 16, False, True, True): (4, 256, 3, 1), - (3072, 768, 16384, 16, 16, True, False, True): (1, 64, 3, 4), - (3072, 768, 16384, 32, 32, False, True, True): (1, 128, 3, 4), - (3072, 768, 16384, 32, 32, True, False, True): (1, 128, 3, 4), - (3072, 768, 16384, 64, 64, False, True, True): (2, 256, 3, 4), - (3072, 768, 16384, 64, 64, True, False, True): (2, 256, 3, 4), - (3072, 768, 16384, 128, 128, False, True, True): (1, 128, 1, 32), - (3072, 768, 16384, 128, 128, True, False, True): (1, 128, 1, 32), - (3072, 768, 32768, 16, 16, False, True, True): (4, 512, 3, 1), - (3072, 768, 32768, 16, 16, True, False, True): (1, 128, 3, 4), - (3072, 768, 32768, 32, 32, False, True, True): (1, 256, 3, 4), - (3072, 768, 32768, 32, 32, True, False, True): (1, 256, 3, 4), - (3072, 768, 32768, 64, 64, False, True, True): (2, 512, 3, 4), - (3072, 768, 32768, 64, 64, True, False, True): (2, 512, 3, 4), - (3072, 768, 32768, 128, 128, False, True, True): (1, 256, 1, 32), - (3072, 768, 32768, 128, 128, True, False, True): (1, 
256, 1, 32), - (3072, 768, 50432, 16, 16, False, True, True): (4, 788, 3, 1), - (3072, 768, 50432, 16, 16, True, False, True): (1, 197, 3, 4), - (3072, 768, 50432, 32, 32, False, True, True): (1, 394, 3, 4), - (3072, 768, 50432, 32, 32, True, False, True): (1, 394, 3, 4), - (3072, 768, 50432, 64, 64, False, True, True): (1, 788, 3, 4), - (3072, 768, 50432, 64, 64, True, False, True): (2, 788, 3, 4), - (3072, 768, 50432, 128, 128, False, True, True): (1, 394, 1, 32), - (3072, 768, 50432, 128, 128, True, False, True): (1, 394, 1, 32), - (3072, 768, 65536, 16, 16, False, True, True): (4, 1024, 3, 1), - (3072, 768, 65536, 16, 16, True, False, True): (1, 256, 3, 4), - (3072, 768, 65536, 32, 32, False, True, True): (1, 512, 3, 4), - (3072, 768, 65536, 32, 32, True, False, True): (1, 512, 3, 4), - (3072, 768, 65536, 64, 64, False, True, True): (2, 1024, 3, 4), - (3072, 768, 65536, 64, 64, True, False, True): (2, 1024, 3, 4), - (3072, 768, 65536, 128, 128, False, True, True): (1, 512, 1, 32), - (3072, 768, 65536, 128, 128, True, False, True): (1, 512, 1, 32), - (3072, 768, 131072, 16, 16, False, True, True): (4, 2048, 3, 1), - (3072, 768, 131072, 16, 16, True, False, True): (1, 512, 3, 4), - (3072, 768, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), - (3072, 768, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), - (3072, 768, 131072, 64, 64, False, True, True): (2, 2048, 3, 4), - (3072, 768, 131072, 64, 64, True, False, True): (2, 2048, 3, 4), - (3072, 768, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), - (3072, 768, 131072, 128, 128, True, False, True): (1, 1024, 1, 32), - (3072, 3072, 256, 16, 16, False, True, True): (1, 4, 5, 2), - (3072, 3072, 256, 16, 16, True, False, True): (1, 4, 3, 2), - (3072, 3072, 256, 32, 32, False, True, True): (1, 4, 4, 4), - (3072, 3072, 256, 32, 32, True, False, True): (1, 4, 3, 4), - (3072, 3072, 256, 64, 64, False, True, True): (2, 4, 3, 8), - (3072, 3072, 256, 64, 64, True, False, True): (2, 4, 3, 8), - (3072, 3072, 256, 128, 128, False, True, True): (6, 2, 1, 32), - (3072, 3072, 256, 128, 128, True, False, True): (8, 2, 2, 32), - (3072, 3072, 512, 16, 16, False, True, True): (2, 4, 3, 4), - (3072, 3072, 512, 16, 16, True, False, True): (2, 4, 3, 4), - (3072, 3072, 512, 32, 32, False, True, True): (2, 8, 3, 4), - (3072, 3072, 512, 32, 32, True, False, True): (2, 8, 3, 4), - (3072, 3072, 512, 64, 64, False, True, True): (2, 8, 3, 8), - (3072, 3072, 512, 64, 64, True, False, True): (2, 8, 3, 8), - (3072, 3072, 512, 128, 128, False, True, True): (5, 4, 1, 32), - (3072, 3072, 512, 128, 128, True, False, True): (5, 4, 2, 32), - (3072, 3072, 1024, 16, 16, False, True, True): (1, 16, 3, 2), - (3072, 3072, 1024, 16, 16, True, False, True): (1, 16, 3, 2), - (3072, 3072, 1024, 32, 32, False, True, True): (2, 16, 3, 4), - (3072, 3072, 1024, 32, 32, True, False, True): (1, 16, 3, 4), - (3072, 3072, 1024, 64, 64, False, True, True): (1, 16, 3, 4), - (3072, 3072, 1024, 64, 64, True, False, True): (1, 16, 3, 4), - (3072, 3072, 1024, 128, 128, False, True, True): (1, 8, 1, 32), - (3072, 3072, 1024, 128, 128, True, False, True): (3, 8, 2, 32), - (3072, 3072, 2048, 16, 16, False, True, True): (1, 32, 3, 2), - (3072, 3072, 2048, 16, 16, True, False, True): (1, 16, 2, 4), - (3072, 3072, 2048, 32, 32, False, True, True): (1, 32, 2, 4), - (3072, 3072, 2048, 32, 32, True, False, True): (1, 32, 3, 4), - (3072, 3072, 2048, 64, 64, False, True, True): (1, 32, 3, 4), - (3072, 3072, 2048, 64, 64, True, False, True): (1, 32, 3, 4), - (3072, 3072, 2048, 128, 128, False, 
True, True): (1, 16, 1, 32), - (3072, 3072, 2048, 128, 128, True, False, True): (4, 16, 2, 32), - (3072, 3072, 4096, 16, 16, False, True, True): (2, 16, 3, 4), - (3072, 3072, 4096, 16, 16, True, False, True): (2, 16, 3, 4), - (3072, 3072, 4096, 32, 32, False, True, True): (1, 64, 2, 4), - (3072, 3072, 4096, 32, 32, True, False, True): (1, 32, 3, 4), - (3072, 3072, 4096, 64, 64, False, True, True): (1, 64, 3, 4), - (3072, 3072, 4096, 64, 64, True, False, True): (1, 64, 3, 4), - (3072, 3072, 4096, 128, 128, False, True, True): (1, 32, 1, 32), - (3072, 3072, 4096, 128, 128, True, False, True): (2, 32, 2, 32), - (3072, 3072, 8192, 16, 16, False, True, True): (2, 32, 3, 4), - (3072, 3072, 8192, 16, 16, True, False, True): (2, 32, 3, 4), - (3072, 3072, 8192, 32, 32, False, True, True): (1, 64, 3, 4), - (3072, 3072, 8192, 32, 32, True, False, True): (1, 64, 3, 4), - (3072, 3072, 8192, 64, 64, False, True, True): (1, 128, 3, 4), - (3072, 3072, 8192, 64, 64, True, False, True): (1, 128, 3, 4), - (3072, 3072, 8192, 128, 128, False, True, True): (1, 64, 1, 32), - (3072, 3072, 8192, 128, 128, True, False, True): (4, 64, 2, 32), - (3072, 3072, 16384, 16, 16, False, True, True): (2, 64, 3, 4), - (3072, 3072, 16384, 16, 16, True, False, True): (1, 64, 3, 4), - (3072, 3072, 16384, 32, 32, False, True, True): (1, 128, 3, 4), - (3072, 3072, 16384, 32, 32, True, False, True): (1, 128, 3, 4), - (3072, 3072, 16384, 64, 64, False, True, True): (1, 256, 3, 4), - (3072, 3072, 16384, 64, 64, True, False, True): (1, 256, 3, 4), - (3072, 3072, 16384, 128, 128, False, True, True): (1, 128, 1, 32), - (3072, 3072, 16384, 128, 128, True, False, True): (4, 128, 2, 32), - (3072, 3072, 32768, 16, 16, False, True, True): (3, 128, 3, 4), - (3072, 3072, 32768, 16, 16, True, False, True): (1, 128, 3, 4), - (3072, 3072, 32768, 32, 32, False, True, True): (1, 256, 3, 4), - (3072, 3072, 32768, 32, 32, True, False, True): (1, 256, 3, 4), - (3072, 3072, 32768, 64, 64, False, True, True): (1, 512, 3, 4), - (3072, 3072, 32768, 64, 64, True, False, True): (1, 512, 3, 4), - (3072, 3072, 32768, 128, 128, False, True, True): (1, 256, 1, 32), - (3072, 3072, 32768, 128, 128, True, False, True): (4, 256, 2, 32), - (3072, 3072, 65536, 16, 16, False, True, True): (5, 256, 3, 4), - (3072, 3072, 65536, 16, 16, True, False, True): (2, 256, 3, 4), - (3072, 3072, 65536, 32, 32, False, True, True): (1, 512, 3, 4), - (3072, 3072, 65536, 32, 32, True, False, True): (1, 512, 3, 4), - (3072, 3072, 65536, 64, 64, False, True, True): (1, 1024, 3, 4), - (3072, 3072, 65536, 64, 64, True, False, True): (1, 1024, 3, 4), - (3072, 3072, 65536, 128, 128, False, True, True): (1, 512, 1, 32), - (3072, 3072, 65536, 128, 128, True, False, True): (4, 512, 2, 32), - (3072, 3072, 131072, 16, 16, False, True, True): (5, 512, 3, 4), - (3072, 3072, 131072, 16, 16, True, False, True): (1, 512, 3, 4), - (3072, 3072, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), - (3072, 3072, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), - (3072, 3072, 131072, 64, 64, False, True, True): (1, 2048, 3, 4), - (3072, 3072, 131072, 64, 64, True, False, True): (1, 2048, 3, 4), - (3072, 3072, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), - (3072, 3072, 131072, 128, 128, True, False, True): (4, 1024, 2, 32), - (4096, 4096, 256, 16, 16, False, True, True): (1, 4, 3, 2), - (4096, 4096, 256, 16, 16, True, False, True): (1, 2, 3, 4), - (4096, 4096, 256, 32, 32, False, True, True): (4, 4, 4, 4), - (4096, 4096, 256, 32, 32, True, False, True): (4, 4, 4, 4), - (4096, 4096, 256, 
64, 64, False, True, True): (1, 4, 3, 8), - (4096, 4096, 256, 64, 64, True, False, True): (4, 4, 2, 4), - (4096, 4096, 256, 128, 128, False, True, True): (1, 2, 1, 32), - (4096, 4096, 256, 128, 128, True, False, True): (3, 2, 1, 32), - (4096, 4096, 512, 16, 16, False, True, True): (1, 4, 3, 4), - (4096, 4096, 512, 16, 16, True, False, True): (5, 8, 3, 2), - (4096, 4096, 512, 32, 32, False, True, True): (4, 8, 3, 4), - (4096, 4096, 512, 32, 32, True, False, True): (4, 8, 3, 4), - (4096, 4096, 512, 64, 64, False, True, True): (1, 8, 2, 4), - (4096, 4096, 512, 64, 64, True, False, True): (1, 8, 2, 4), - (4096, 4096, 512, 128, 128, False, True, True): (4, 4, 1, 32), - (4096, 4096, 512, 128, 128, True, False, True): (4, 4, 1, 32), - (4096, 4096, 1024, 16, 16, False, True, True): (1, 8, 3, 4), - (4096, 4096, 1024, 16, 16, True, False, True): (1, 8, 3, 4), - (4096, 4096, 1024, 32, 32, False, True, True): (1, 16, 3, 4), - (4096, 4096, 1024, 32, 32, True, False, True): (1, 16, 3, 4), - (4096, 4096, 1024, 64, 64, False, True, True): (4, 16, 2, 4), - (4096, 4096, 1024, 64, 64, True, False, True): (4, 16, 2, 4), - (4096, 4096, 1024, 128, 128, False, True, True): (4, 8, 1, 32), - (4096, 4096, 1024, 128, 128, True, False, True): (4, 8, 1, 32), - (4096, 4096, 2048, 16, 16, False, True, True): (1, 32, 3, 1), - (4096, 4096, 2048, 16, 16, True, False, True): (6, 8, 3, 4), - (4096, 4096, 2048, 32, 32, False, True, True): (1, 32, 3, 4), - (4096, 4096, 2048, 32, 32, True, False, True): (1, 32, 3, 4), - (4096, 4096, 2048, 64, 64, False, True, True): (4, 32, 2, 4), - (4096, 4096, 2048, 64, 64, True, False, True): (4, 32, 2, 4), - (4096, 4096, 2048, 128, 128, False, True, True): (4, 16, 1, 32), - (4096, 4096, 2048, 128, 128, True, False, True): (4, 16, 1, 32), - (4096, 4096, 4096, 16, 16, False, True, True): (1, 16, 3, 4), - (4096, 4096, 4096, 16, 16, True, False, True): (1, 64, 3, 1), - (4096, 4096, 4096, 32, 32, False, True, True): (1, 64, 3, 4), - (4096, 4096, 4096, 32, 32, True, False, True): (1, 32, 3, 4), - (4096, 4096, 4096, 64, 64, False, True, True): (4, 64, 2, 4), - (4096, 4096, 4096, 64, 64, True, False, True): (4, 64, 2, 4), - (4096, 4096, 4096, 128, 128, False, True, True): (4, 32, 1, 32), - (4096, 4096, 4096, 128, 128, True, False, True): (4, 32, 1, 32), - (4096, 4096, 8192, 16, 16, False, True, True): (4, 128, 3, 1), - (4096, 4096, 8192, 16, 16, True, False, True): (1, 128, 3, 1), - (4096, 4096, 8192, 32, 32, False, True, True): (1, 128, 3, 4), - (4096, 4096, 8192, 32, 32, True, False, True): (1, 64, 3, 4), - (4096, 4096, 8192, 64, 64, False, True, True): (4, 128, 2, 4), - (4096, 4096, 8192, 64, 64, True, False, True): (4, 128, 2, 4), - (4096, 4096, 8192, 128, 128, False, True, True): (4, 64, 1, 32), - (4096, 4096, 8192, 128, 128, True, False, True): (4, 64, 1, 32), - (4096, 4096, 16384, 16, 16, False, True, True): (1, 64, 3, 4), - (4096, 4096, 16384, 16, 16, True, False, True): (1, 256, 3, 1), - (4096, 4096, 16384, 32, 32, False, True, True): (1, 256, 3, 4), - (4096, 4096, 16384, 32, 32, True, False, True): (1, 128, 3, 4), - (4096, 4096, 16384, 64, 64, False, True, True): (4, 256, 2, 4), - (4096, 4096, 16384, 64, 64, True, False, True): (4, 256, 2, 4), - (4096, 4096, 16384, 128, 128, False, True, True): (4, 128, 1, 32), - (4096, 4096, 16384, 128, 128, True, False, True): (4, 128, 1, 32), - (4096, 4096, 32768, 16, 16, False, True, True): (1, 128, 3, 4), - (4096, 4096, 32768, 16, 16, True, False, True): (1, 512, 3, 1), - (4096, 4096, 32768, 32, 32, False, True, True): (1, 512, 3, 4), - (4096, 4096, 
32768, 32, 32, True, False, True): (1, 256, 3, 4), - (4096, 4096, 32768, 64, 64, False, True, True): (4, 512, 2, 4), - (4096, 4096, 32768, 64, 64, True, False, True): (4, 512, 2, 4), - (4096, 4096, 32768, 128, 128, False, True, True): (4, 256, 1, 32), - (4096, 4096, 32768, 128, 128, True, False, True): (4, 256, 1, 32), - (4096, 4096, 65536, 16, 16, False, True, True): (1, 256, 3, 4), - (4096, 4096, 65536, 16, 16, True, False, True): (1, 1024, 3, 1), - (4096, 4096, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), - (4096, 4096, 65536, 32, 32, True, False, True): (1, 512, 3, 4), - (4096, 4096, 65536, 64, 64, False, True, True): (4, 1024, 2, 4), - (4096, 4096, 65536, 64, 64, True, False, True): (2, 1024, 2, 4), - (4096, 4096, 65536, 128, 128, False, True, True): (4, 512, 1, 32), - (4096, 4096, 65536, 128, 128, True, False, True): (4, 512, 1, 32), - (4096, 4096, 131072, 16, 16, False, True, True): (2, 2048, 3, 1), - (4096, 4096, 131072, 16, 16, True, False, True): (1, 2048, 3, 1), - (4096, 4096, 131072, 32, 32, False, True, True): (2, 2048, 3, 4), - (4096, 4096, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), - (4096, 4096, 131072, 64, 64, False, True, True): (2, 2048, 2, 4), - (4096, 4096, 131072, 64, 64, True, False, True): (2, 2048, 2, 4), - (4096, 4096, 131072, 128, 128, False, True, True): (4, 1024, 1, 32), - (4096, 4096, 131072, 128, 128, True, False, True): (4, 1024, 1, 32), - (5120, 1280, 65792, 16, 16, False, True, True): (2, 1028, 3, 1), - (5120, 1280, 65792, 16, 16, True, False, True): (1, 257, 3, 4), - (5120, 1280, 65792, 32, 32, False, True, True): (1, 514, 3, 4), - (5120, 1280, 65792, 32, 32, True, False, True): (1, 514, 3, 4), - (5120, 1280, 65792, 64, 64, False, True, True): (1, 1028, 3, 4), - (5120, 1280, 65792, 64, 64, True, False, True): (5, 1028, 3, 4), - (5120, 1280, 65792, 128, 128, False, True, True): (1, 514, 1, 32), - (5120, 1280, 65792, 128, 128, True, False, True): (4, 514, 2, 32), - (6144, 6144, 256, 16, 16, False, True, True): (2, 2, 3, 4), - (6144, 6144, 256, 16, 16, True, False, True): (2, 2, 3, 4), - (6144, 6144, 256, 32, 32, False, True, True): (2, 4, 3, 4), - (6144, 6144, 256, 32, 32, True, False, True): (2, 4, 3, 4), - (6144, 6144, 256, 64, 64, False, True, True): (1, 4, 3, 4), - (6144, 6144, 256, 64, 64, True, False, True): (1, 4, 3, 4), - (6144, 6144, 256, 128, 128, False, True, True): (1, 2, 1, 32), - (6144, 6144, 256, 128, 128, True, False, True): (5, 2, 2, 32), - (6144, 6144, 512, 16, 16, False, True, True): (4, 8, 3, 2), - (6144, 6144, 512, 16, 16, True, False, True): (4, 8, 3, 2), - (6144, 6144, 512, 32, 32, False, True, True): (2, 8, 3, 4), - (6144, 6144, 512, 32, 32, True, False, True): (2, 8, 3, 4), - (6144, 6144, 512, 64, 64, False, True, True): (1, 8, 3, 4), - (6144, 6144, 512, 64, 64, True, False, True): (1, 8, 3, 4), - (6144, 6144, 512, 128, 128, False, True, True): (1, 4, 1, 32), - (6144, 6144, 512, 128, 128, True, False, True): (4, 4, 2, 32), - (6144, 6144, 1024, 16, 16, False, True, True): (4, 16, 3, 2), - (6144, 6144, 1024, 16, 16, True, False, True): (4, 4, 3, 4), - (6144, 6144, 1024, 32, 32, False, True, True): (1, 16, 3, 4), - (6144, 6144, 1024, 32, 32, True, False, True): (1, 16, 3, 4), - (6144, 6144, 1024, 64, 64, False, True, True): (1, 16, 3, 4), - (6144, 6144, 1024, 64, 64, True, False, True): (1, 16, 3, 4), - (6144, 6144, 1024, 128, 128, False, True, True): (1, 8, 1, 32), - (6144, 6144, 1024, 128, 128, True, False, True): (4, 8, 2, 32), - (6144, 6144, 2048, 16, 16, False, True, True): (1, 8, 3, 4), - (6144, 6144, 2048, 16, 
16, True, False, True): (4, 8, 3, 4), - (6144, 6144, 2048, 32, 32, False, True, True): (1, 16, 3, 4), - (6144, 6144, 2048, 32, 32, True, False, True): (1, 16, 3, 4), - (6144, 6144, 2048, 64, 64, False, True, True): (1, 32, 3, 4), - (6144, 6144, 2048, 64, 64, True, False, True): (3, 32, 3, 4), - (6144, 6144, 2048, 128, 128, False, True, True): (1, 16, 1, 32), - (6144, 6144, 2048, 128, 128, True, False, True): (1, 16, 2, 32), - (6144, 6144, 4096, 16, 16, False, True, True): (3, 16, 3, 4), - (6144, 6144, 4096, 16, 16, True, False, True): (4, 16, 3, 4), - (6144, 6144, 4096, 32, 32, False, True, True): (1, 32, 3, 4), - (6144, 6144, 4096, 32, 32, True, False, True): (1, 32, 3, 4), - (6144, 6144, 4096, 64, 64, False, True, True): (1, 64, 3, 4), - (6144, 6144, 4096, 64, 64, True, False, True): (1, 64, 3, 4), - (6144, 6144, 4096, 128, 128, False, True, True): (1, 32, 1, 32), - (6144, 6144, 4096, 128, 128, True, False, True): (4, 32, 2, 32), - (6144, 6144, 8192, 16, 16, False, True, True): (1, 32, 3, 4), - (6144, 6144, 8192, 16, 16, True, False, True): (4, 32, 3, 4), - (6144, 6144, 8192, 32, 32, False, True, True): (1, 64, 3, 4), - (6144, 6144, 8192, 32, 32, True, False, True): (1, 64, 3, 4), - (6144, 6144, 8192, 64, 64, False, True, True): (1, 128, 3, 4), - (6144, 6144, 8192, 64, 64, True, False, True): (1, 128, 3, 4), - (6144, 6144, 8192, 128, 128, False, True, True): (1, 64, 1, 32), - (6144, 6144, 8192, 128, 128, True, False, True): (4, 64, 2, 32), - (6144, 6144, 16384, 16, 16, False, True, True): (1, 64, 3, 4), - (6144, 6144, 16384, 16, 16, True, False, True): (4, 64, 3, 4), - (6144, 6144, 16384, 32, 32, False, True, True): (1, 128, 3, 4), - (6144, 6144, 16384, 32, 32, True, False, True): (1, 128, 3, 4), - (6144, 6144, 16384, 64, 64, False, True, True): (1, 256, 3, 4), - (6144, 6144, 16384, 64, 64, True, False, True): (1, 256, 3, 4), - (6144, 6144, 16384, 128, 128, False, True, True): (1, 128, 1, 32), - (6144, 6144, 16384, 128, 128, True, False, True): (4, 128, 2, 32), - (6144, 6144, 32768, 16, 16, False, True, True): (1, 128, 3, 4), - (6144, 6144, 32768, 16, 16, True, False, True): (4, 128, 3, 4), - (6144, 6144, 32768, 32, 32, False, True, True): (1, 256, 3, 4), - (6144, 6144, 32768, 32, 32, True, False, True): (1, 256, 3, 4), - (6144, 6144, 32768, 64, 64, False, True, True): (1, 512, 3, 4), - (6144, 6144, 32768, 64, 64, True, False, True): (1, 512, 3, 4), - (6144, 6144, 32768, 128, 128, False, True, True): (1, 256, 1, 32), - (6144, 6144, 32768, 128, 128, True, False, True): (4, 256, 2, 32), - (6144, 6144, 65536, 16, 16, False, True, True): (1, 256, 3, 4), - (6144, 6144, 65536, 16, 16, True, False, True): (2, 256, 3, 4), - (6144, 6144, 65536, 32, 32, False, True, True): (1, 512, 3, 4), - (6144, 6144, 65536, 32, 32, True, False, True): (1, 512, 3, 4), - (6144, 6144, 65536, 64, 64, False, True, True): (1, 1024, 3, 4), - (6144, 6144, 65536, 64, 64, True, False, True): (1, 1024, 3, 4), - (6144, 6144, 65536, 128, 128, False, True, True): (1, 512, 1, 32), - (6144, 6144, 65536, 128, 128, True, False, True): (4, 512, 2, 32), - (6144, 6144, 131072, 16, 16, False, True, True): (1, 512, 3, 4), - (6144, 6144, 131072, 16, 16, True, False, True): (2, 512, 3, 4), - (6144, 6144, 131072, 32, 32, False, True, True): (1, 1024, 3, 4), - (6144, 6144, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), - (6144, 6144, 131072, 64, 64, False, True, True): (1, 2048, 3, 4), - (6144, 6144, 131072, 64, 64, True, False, True): (1, 2048, 3, 4), - (6144, 6144, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), - (6144, 
6144, 131072, 128, 128, True, False, True): (4, 1024, 2, 32), - (8192, 8192, 256, 16, 16, False, True, True): (2, 2, 4, 4), - (8192, 8192, 256, 16, 16, True, False, True): (1, 1, 3, 4), - (8192, 8192, 256, 32, 32, False, True, True): (2, 4, 3, 4), - (8192, 8192, 256, 32, 32, True, False, True): (2, 4, 3, 4), - (8192, 8192, 256, 64, 64, False, True, True): (4, 4, 2, 4), - (8192, 8192, 256, 64, 64, True, False, True): (4, 4, 2, 4), - (8192, 8192, 256, 128, 128, False, True, True): (1, 2, 1, 32), - (8192, 8192, 256, 128, 128, True, False, True): (4, 2, 1, 32), - (8192, 8192, 512, 16, 16, False, True, True): (1, 4, 3, 4), - (8192, 8192, 512, 16, 16, True, False, True): (3, 4, 3, 4), - (8192, 8192, 512, 32, 32, False, True, True): (1, 8, 3, 4), - (8192, 8192, 512, 32, 32, True, False, True): (6, 8, 3, 4), - (8192, 8192, 512, 64, 64, False, True, True): (4, 8, 2, 4), - (8192, 8192, 512, 64, 64, True, False, True): (4, 8, 2, 4), - (8192, 8192, 512, 128, 128, False, True, True): (4, 4, 1, 32), - (8192, 8192, 512, 128, 128, True, False, True): (4, 4, 1, 32), - (8192, 8192, 1024, 16, 16, False, True, True): (1, 4, 3, 4), - (8192, 8192, 1024, 16, 16, True, False, True): (1, 32, 3, 1), - (8192, 8192, 1024, 32, 32, False, True, True): (1, 16, 3, 4), - (8192, 8192, 1024, 32, 32, True, False, True): (1, 16, 3, 4), - (8192, 8192, 1024, 64, 64, False, True, True): (4, 16, 2, 4), - (8192, 8192, 1024, 64, 64, True, False, True): (4, 16, 2, 4), - (8192, 8192, 1024, 128, 128, False, True, True): (4, 8, 1, 32), - (8192, 8192, 1024, 128, 128, True, False, True): (4, 8, 1, 32), - (8192, 8192, 2048, 16, 16, False, True, True): (4, 8, 3, 4), - (8192, 8192, 2048, 16, 16, True, False, True): (1, 32, 3, 1), - (8192, 8192, 2048, 32, 32, False, True, True): (1, 32, 3, 4), - (8192, 8192, 2048, 32, 32, True, False, True): (1, 16, 4, 4), - (8192, 8192, 2048, 64, 64, False, True, True): (4, 32, 2, 4), - (8192, 8192, 2048, 64, 64, True, False, True): (4, 32, 2, 4), - (8192, 8192, 2048, 128, 128, False, True, True): (4, 16, 1, 32), - (8192, 8192, 2048, 128, 128, True, False, True): (4, 16, 1, 32), - (8192, 8192, 4096, 16, 16, False, True, True): (3, 16, 3, 4), - (8192, 8192, 4096, 16, 16, True, False, True): (2, 64, 3, 1), - (8192, 8192, 4096, 32, 32, False, True, True): (1, 64, 3, 4), - (8192, 8192, 4096, 32, 32, True, False, True): (1, 32, 3, 4), - (8192, 8192, 4096, 64, 64, False, True, True): (4, 64, 2, 4), - (8192, 8192, 4096, 64, 64, True, False, True): (2, 64, 2, 4), - (8192, 8192, 4096, 128, 128, False, True, True): (4, 32, 1, 32), - (8192, 8192, 4096, 128, 128, True, False, True): (4, 32, 1, 32), - (8192, 8192, 8192, 16, 16, False, True, True): (2, 128, 3, 1), - (8192, 8192, 8192, 16, 16, True, False, True): (2, 128, 3, 1), - (8192, 8192, 8192, 32, 32, False, True, True): (1, 128, 3, 4), - (8192, 8192, 8192, 32, 32, True, False, True): (1, 64, 3, 4), - (8192, 8192, 8192, 64, 64, False, True, True): (4, 128, 2, 4), - (8192, 8192, 8192, 64, 64, True, False, True): (2, 128, 2, 4), - (8192, 8192, 8192, 128, 128, False, True, True): (4, 64, 1, 32), - (8192, 8192, 8192, 128, 128, True, False, True): (4, 64, 1, 32), - (8192, 8192, 16384, 16, 16, False, True, True): (1, 64, 3, 4), - (8192, 8192, 16384, 16, 16, True, False, True): (1, 256, 3, 1), - (8192, 8192, 16384, 32, 32, False, True, True): (1, 256, 3, 4), - (8192, 8192, 16384, 32, 32, True, False, True): (1, 128, 3, 4), - (8192, 8192, 16384, 64, 64, False, True, True): (2, 256, 2, 4), - (8192, 8192, 16384, 64, 64, True, False, True): (2, 256, 2, 4), - (8192, 8192, 
16384, 128, 128, False, True, True): (4, 128, 1, 32), - (8192, 8192, 16384, 128, 128, True, False, True): (4, 128, 1, 32), - (8192, 8192, 32768, 16, 16, False, True, True): (1, 512, 3, 1), - (8192, 8192, 32768, 16, 16, True, False, True): (1, 512, 3, 1), - (8192, 8192, 32768, 32, 32, False, True, True): (1, 512, 3, 4), - (8192, 8192, 32768, 32, 32, True, False, True): (1, 256, 3, 4), - (8192, 8192, 32768, 64, 64, False, True, True): (2, 512, 2, 4), - (8192, 8192, 32768, 64, 64, True, False, True): (2, 512, 2, 4), - (8192, 8192, 32768, 128, 128, False, True, True): (4, 256, 1, 32), - (8192, 8192, 32768, 128, 128, True, False, True): (4, 256, 1, 32), - (8192, 8192, 65536, 16, 16, False, True, True): (1, 256, 3, 4), - (8192, 8192, 65536, 16, 16, True, False, True): (1, 1024, 3, 1), - (8192, 8192, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), - (8192, 8192, 65536, 32, 32, True, False, True): (1, 512, 3, 4), - (8192, 8192, 65536, 64, 64, False, True, True): (4, 1024, 2, 4), - (8192, 8192, 65536, 64, 64, True, False, True): (2, 1024, 2, 4), - (8192, 8192, 65536, 128, 128, False, True, True): (4, 512, 1, 32), - (8192, 8192, 65536, 128, 128, True, False, True): (4, 512, 1, 32), - (8192, 8192, 131072, 16, 16, False, True, True): (1, 2048, 3, 1), - (8192, 8192, 131072, 16, 16, True, False, True): (2, 2048, 3, 1), - (8192, 8192, 131072, 32, 32, False, True, True): (4, 2048, 3, 4), - (8192, 8192, 131072, 32, 32, True, False, True): (1, 1024, 3, 4), - (8192, 8192, 131072, 64, 64, False, True, True): (2, 2048, 2, 4), - (8192, 8192, 131072, 64, 64, True, False, True): (2, 2048, 2, 4), - (8192, 8192, 131072, 128, 128, False, True, True): (4, 1024, 1, 32), - (8192, 8192, 131072, 128, 128, True, False, True): (4, 1024, 1, 32), - (16384, 16384, 256, 16, 16, False, True, True): (1, 2, 3, 4), - (16384, 16384, 256, 16, 16, True, False, True): (1, 2, 3, 4), - (16384, 16384, 256, 32, 32, False, True, True): (1, 4, 3, 4), - (16384, 16384, 256, 32, 32, True, False, True): (1, 4, 3, 4), - (16384, 16384, 256, 64, 64, False, True, True): (2, 4, 2, 4), - (16384, 16384, 256, 64, 64, True, False, True): (2, 4, 2, 4), - (16384, 16384, 256, 128, 128, False, True, True): (2, 2, 1, 32), - (16384, 16384, 256, 128, 128, True, False, True): (2, 2, 1, 32), - (16384, 16384, 512, 16, 16, False, True, True): (1, 2, 3, 4), - (16384, 16384, 512, 16, 16, True, False, True): (5, 2, 3, 4), - (16384, 16384, 512, 32, 32, False, True, True): (1, 8, 3, 4), - (16384, 16384, 512, 32, 32, True, False, True): (1, 4, 3, 4), - (16384, 16384, 512, 64, 64, False, True, True): (4, 8, 2, 4), - (16384, 16384, 512, 64, 64, True, False, True): (4, 8, 2, 4), - (16384, 16384, 512, 128, 128, False, True, True): (4, 4, 1, 32), - (16384, 16384, 512, 128, 128, True, False, True): (4, 4, 1, 32), - (16384, 16384, 1024, 16, 16, False, True, True): (1, 4, 3, 4), - (16384, 16384, 1024, 16, 16, True, False, True): (2, 16, 3, 1), - (16384, 16384, 1024, 32, 32, False, True, True): (1, 16, 3, 4), - (16384, 16384, 1024, 32, 32, True, False, True): (1, 8, 3, 4), - (16384, 16384, 1024, 64, 64, False, True, True): (4, 16, 2, 4), - (16384, 16384, 1024, 64, 64, True, False, True): (4, 16, 2, 4), - (16384, 16384, 1024, 128, 128, False, True, True): (4, 8, 1, 32), - (16384, 16384, 1024, 128, 128, True, False, True): (4, 8, 1, 32), - (16384, 16384, 2048, 16, 16, False, True, True): (1, 8, 3, 4), - (16384, 16384, 2048, 16, 16, True, False, True): (2, 32, 3, 1), - (16384, 16384, 2048, 32, 32, False, True, True): (1, 32, 3, 4), - (16384, 16384, 2048, 32, 32, True, False, 
True): (1, 16, 3, 4), - (16384, 16384, 2048, 64, 64, False, True, True): (4, 32, 2, 4), - (16384, 16384, 2048, 64, 64, True, False, True): (2, 32, 2, 4), - (16384, 16384, 2048, 128, 128, False, True, True): (4, 16, 1, 32), - (16384, 16384, 2048, 128, 128, True, False, True): (4, 16, 1, 32), - (16384, 16384, 4096, 16, 16, False, True, True): (1, 16, 3, 4), - (16384, 16384, 4096, 16, 16, True, False, True): (2, 64, 3, 1), - (16384, 16384, 4096, 32, 32, False, True, True): (1, 64, 3, 4), - (16384, 16384, 4096, 32, 32, True, False, True): (1, 32, 3, 4), - (16384, 16384, 4096, 64, 64, False, True, True): (4, 64, 2, 4), - (16384, 16384, 4096, 64, 64, True, False, True): (2, 64, 2, 4), - (16384, 16384, 4096, 128, 128, False, True, True): (4, 32, 1, 32), - (16384, 16384, 4096, 128, 128, True, False, True): (4, 32, 1, 32), - (16384, 16384, 8192, 16, 16, False, True, True): (1, 128, 3, 1), - (16384, 16384, 8192, 16, 16, True, False, True): (2, 128, 3, 1), - (16384, 16384, 8192, 32, 32, False, True, True): (1, 128, 3, 4), - (16384, 16384, 8192, 32, 32, True, False, True): (1, 64, 3, 4), - (16384, 16384, 8192, 64, 64, False, True, True): (2, 128, 2, 4), - (16384, 16384, 8192, 64, 64, True, False, True): (2, 128, 2, 4), - (16384, 16384, 8192, 128, 128, False, True, True): (4, 64, 1, 32), - (16384, 16384, 8192, 128, 128, True, False, True): (4, 64, 1, 32), - (16384, 16384, 16384, 16, 16, False, True, True): (1, 64, 3, 4), - (16384, 16384, 16384, 16, 16, True, False, True): (2, 256, 3, 1), - (16384, 16384, 16384, 32, 32, False, True, True): (1, 256, 3, 4), - (16384, 16384, 16384, 32, 32, True, False, True): (1, 128, 3, 4), - (16384, 16384, 16384, 64, 64, False, True, True): (2, 256, 2, 4), - (16384, 16384, 16384, 64, 64, True, False, True): (2, 256, 2, 4), - (16384, 16384, 16384, 128, 128, False, True, True): (4, 128, 1, 32), - (16384, 16384, 16384, 128, 128, True, False, True): (4, 128, 1, 32), - (16384, 16384, 32768, 16, 16, False, True, True): (1, 512, 3, 1), - (16384, 16384, 32768, 16, 16, True, False, True): (1, 128, 3, 4), - (16384, 16384, 32768, 32, 32, False, True, True): (2, 512, 3, 4), - (16384, 16384, 32768, 32, 32, True, False, True): (1, 256, 4, 4), - (16384, 16384, 32768, 64, 64, False, True, True): (2, 512, 2, 4), - (16384, 16384, 32768, 64, 64, True, False, True): (2, 512, 2, 4), - (16384, 16384, 32768, 128, 128, False, True, True): (4, 256, 1, 32), - (16384, 16384, 32768, 128, 128, True, False, True): (4, 256, 1, 32), - (16384, 16384, 65536, 16, 16, False, True, True): (1, 256, 3, 4), - (16384, 16384, 65536, 16, 16, True, False, True): (1, 1024, 3, 1), - (16384, 16384, 65536, 32, 32, False, True, True): (1, 1024, 3, 4), - (16384, 16384, 65536, 32, 32, True, False, True): (1, 512, 4, 4), - (16384, 16384, 65536, 64, 64, False, True, True): (2, 1024, 2, 4), - (16384, 16384, 65536, 64, 64, True, False, True): (2, 1024, 2, 4), - (16384, 16384, 65536, 128, 128, False, True, True): (4, 512, 1, 32), - (16384, 16384, 65536, 128, 128, True, False, True): (4, 512, 1, 32), - (16384, 16384, 131072, 16, 16, False, True, True): (1, 1024, 4, 4), - (16384, 16384, 131072, 16, 16, True, False, True): (2, 2048, 3, 1), - (16384, 16384, 131072, 32, 32, False, True, True): (1, 1024, 2, 4), - (16384, 16384, 131072, 32, 32, True, False, True): (1, 1024, 2, 4), - (16384, 16384, 131072, 64, 64, False, True, True): (4, 2048, 2, 4), - (16384, 16384, 131072, 64, 64, True, False, True): (2, 2048, 2, 4), - (16384, 16384, 131072, 128, 128, False, True, True): (4, 1024, 1, 32), - (16384, 16384, 131072, 128, 128, True, 
False, True): (4, 1024, 1, 32), - }, - ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.float32, 0.56)): { - (192, 192, 256, 64, 64, False, True, True): (1, 4, 3, 8), - (192, 192, 256, 64, 64, True, False, True): (1, 4, 3, 8), - (192, 192, 512, 64, 64, False, True, True): (2, 8, 3, 8), - (192, 192, 512, 64, 64, True, False, True): (5, 8, 3, 8), - (192, 192, 1024, 64, 64, False, True, True): (2, 16, 4, 8), - (192, 192, 1024, 64, 64, True, False, True): (1, 16, 3, 8), - (192, 192, 2048, 64, 64, False, True, True): (3, 32, 3, 8), - (192, 192, 2048, 64, 64, True, False, True): (5, 32, 5, 8), - (192, 192, 4096, 64, 64, False, True, True): (3, 64, 2, 8), - (192, 192, 4096, 64, 64, True, False, True): (1, 64, 3, 8), - (192, 192, 8192, 64, 64, False, True, True): (3, 128, 3, 8), - (192, 192, 8192, 64, 64, True, False, True): (6, 128, 3, 4), - (192, 192, 16384, 64, 64, False, True, True): (1, 256, 1, 8), - (192, 192, 16384, 64, 64, True, False, True): (1, 256, 3, 4), - (192, 192, 32768, 64, 64, False, True, True): (1, 512, 1, 8), - (192, 192, 32768, 64, 64, True, False, True): (1, 512, 3, 4), - (192, 192, 65536, 64, 64, False, True, True): (1, 1024, 1, 8), - (192, 192, 65536, 64, 64, True, False, True): (1, 1024, 3, 4), - (192, 192, 131072, 64, 64, False, True, True): (1, 2048, 1, 8), - (192, 192, 131072, 64, 64, True, False, True): (3, 2048, 1, 4), - (384, 384, 256, 128, 128, False, True, True): (1, 2, 1, 32), - (384, 384, 256, 128, 128, True, False, True): (1, 2, 1, 32), - (384, 384, 512, 128, 128, False, True, True): (1, 4, 1, 32), - (384, 384, 512, 128, 128, True, False, True): (2, 4, 1, 32), - (384, 384, 1024, 128, 128, False, True, True): (1, 8, 1, 32), - (384, 384, 1024, 128, 128, True, False, True): (4, 8, 1, 32), - (384, 384, 2048, 128, 128, False, True, True): (1, 16, 1, 32), - (384, 384, 2048, 128, 128, True, False, True): (1, 16, 1, 32), - (384, 384, 4096, 128, 128, False, True, True): (1, 32, 1, 32), - (384, 384, 4096, 128, 128, True, False, True): (2, 32, 2, 32), - (384, 384, 8192, 128, 128, False, True, True): (1, 64, 1, 32), - (384, 384, 8192, 128, 128, True, False, True): (1, 64, 2, 32), - (384, 384, 16384, 128, 128, False, True, True): (1, 128, 1, 32), - (384, 384, 16384, 128, 128, True, False, True): (4, 128, 1, 32), - (384, 384, 32768, 128, 128, False, True, True): (3, 256, 1, 32), - (384, 384, 32768, 128, 128, True, False, True): (3, 256, 1, 32), - (384, 384, 65536, 128, 128, False, True, True): (3, 512, 1, 32), - (384, 384, 65536, 128, 128, True, False, True): (3, 512, 1, 32), - (384, 384, 131072, 128, 128, False, True, True): (1, 1024, 1, 32), - (384, 384, 131072, 128, 128, True, False, True): (3, 1024, 1, 32), - }, - ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.int8, 0.5)): { - (1280, 5120, 65792, 32, 32, False, True, True): (1, 1028, 1, 8), - (1280, 5120, 65792, 32, 32, True, False, True): (1, 514, 3, 2), - (1280, 5120, 65792, 64, 64, False, True, True): (2, 514, 1, 4), - (1280, 5120, 65792, 64, 64, True, False, True): (1, 514, 3, 2), - (1280, 5120, 65792, 128, 128, False, True, True): (2, 514, 1, 8), - (1280, 5120, 65792, 128, 128, True, False, True): (1, 514, 2, 4), - (1280, 5120, 65792, 256, 256, False, True, True): (1, 257, 1, 32), - (1280, 5120, 65792, 256, 256, True, False, True): (1, 257, 1, 32), - (5120, 1280, 65792, 32, 32, False, True, True): (3, 1028, 1, 8), - (5120, 1280, 65792, 32, 32, True, False, True): (1, 514, 1, 2), - (5120, 1280, 65792, 64, 64, False, True, True): (1, 514, 1, 4), - (5120, 1280, 65792, 64, 64, True, False, True): (2, 514, 
2, 2), - (5120, 1280, 65792, 128, 128, False, True, True): (2, 514, 1, 8), - (5120, 1280, 65792, 128, 128, True, False, True): (2, 514, 2, 4), - (5120, 1280, 65792, 256, 256, False, True, True): (1, 257, 1, 32), - (5120, 1280, 65792, 256, 256, True, False, True): (1, 257, 1, 32), - }, - ("scatter_mm", "NVIDIA A100-SXM4-80GB", (0, torch.bfloat16, 0.5)): { - (256, 256, 256, 16, 16): (1, 1, 16, 16, 1, 2), - (256, 256, 256, 32, 32): (1, 1, 16, 16, 1, 4), - (256, 256, 256, 64, 64): (1, 1, 16, 16, 1, 1), - (256, 256, 256, 128, 128): (2, 4, 16, 64, 1, 4), - (256, 256, 512, 16, 16): (1, 1, 16, 16, 1, 4), - (256, 256, 512, 32, 32): (1, 1, 16, 32, 1, 4), - (256, 256, 512, 64, 64): (1, 1, 16, 32, 1, 1), - (256, 256, 512, 128, 128): (1, 1, 32, 32, 1, 4), - (256, 256, 1024, 16, 16): (1, 1, 16, 16, 1, 4), - (256, 256, 1024, 32, 32): (1, 2, 16, 32, 1, 1), - (256, 256, 1024, 64, 64): (1, 1, 32, 32, 1, 2), - (256, 256, 1024, 128, 128): (1, 1, 32, 64, 1, 4), - (256, 256, 2048, 16, 16): (1, 1, 16, 64, 1, 8), - (256, 256, 2048, 32, 32): (2, 1, 32, 64, 1, 2), - (256, 256, 2048, 64, 64): (1, 1, 32, 32, 1, 1), - (256, 256, 2048, 128, 128): (1, 1, 64, 64, 1, 4), - (256, 256, 4096, 16, 16): (1, 1, 16, 64, 1, 1), - (256, 256, 4096, 32, 32): (2, 2, 32, 64, 1, 2), - (256, 256, 4096, 64, 64): (1, 1, 32, 128, 1, 4), - (256, 256, 4096, 128, 128): (1, 1, 64, 64, 1, 4), - (256, 256, 8192, 16, 16): (1, 2, 16, 64, 1, 2), - (256, 256, 8192, 32, 32): (1, 1, 32, 64, 1, 2), - (256, 256, 8192, 64, 64): (1, 1, 32, 64, 1, 2), - (256, 256, 8192, 128, 128): (1, 1, 64, 64, 1, 4), - (256, 256, 16384, 16, 16): (1, 1, 16, 64, 1, 2), - (256, 256, 16384, 32, 32): (1, 1, 32, 64, 1, 2), - (256, 256, 16384, 64, 64): (1, 1, 64, 64, 1, 2), - (256, 256, 16384, 128, 128): (2, 16, 64, 64, 1, 4), - (256, 256, 32768, 16, 16): (1, 1, 16, 128, 1, 2), - (256, 256, 32768, 32, 32): (1, 1, 32, 64, 1, 2), - (256, 256, 32768, 64, 64): (1, 1, 64, 64, 1, 2), - (256, 256, 32768, 128, 128): (2, 32, 64, 64, 1, 4), - (256, 256, 65536, 16, 16): (1, 1, 16, 64, 1, 1), - (256, 256, 65536, 32, 32): (1, 1, 32, 64, 1, 2), - (256, 256, 65536, 64, 64): (1, 1, 64, 32, 1, 1), - (256, 256, 65536, 128, 128): (2, 32, 64, 64, 1, 4), - (256, 256, 131072, 16, 16): (1, 1, 16, 64, 1, 1), - (256, 256, 131072, 32, 32): (1, 1, 32, 64, 1, 2), - (256, 256, 131072, 64, 64): (4, 1, 64, 32, 1, 1), - (256, 256, 131072, 128, 128): (2, 64, 64, 64, 1, 4), - (512, 512, 256, 16, 16): (1, 1, 16, 16, 1, 2), - (512, 512, 256, 32, 32): (1, 1, 16, 32, 1, 1), - (512, 512, 256, 64, 64): (1, 2, 16, 32, 1, 1), - (512, 512, 256, 128, 128): (2, 16, 64, 16, 2, 4), - (512, 512, 512, 16, 16): (1, 1, 16, 16, 1, 4), - (512, 512, 512, 32, 32): (1, 1, 16, 32, 1, 1), - (512, 512, 512, 64, 64): (1, 1, 32, 32, 1, 2), - (512, 512, 512, 128, 128): (2, 8, 32, 64, 1, 4), - (512, 512, 1024, 16, 16): (1, 1, 16, 64, 1, 8), - (512, 512, 1024, 32, 32): (1, 1, 32, 32, 3, 1), - (512, 512, 1024, 64, 64): (1, 4, 32, 64, 1, 2), - (512, 512, 1024, 128, 128): (1, 4, 64, 64, 1, 4), - (512, 512, 2048, 16, 16): (1, 1, 16, 64, 1, 2), - (512, 512, 2048, 32, 32): (1, 1, 32, 64, 1, 2), - (512, 512, 2048, 64, 64): (1, 1, 64, 64, 3, 4), - (512, 512, 2048, 128, 128): (1, 1, 64, 64, 1, 4), - (512, 512, 4096, 16, 16): (1, 1, 16, 64, 1, 2), - (512, 512, 4096, 32, 32): (2, 64, 32, 64, 1, 2), - (512, 512, 4096, 64, 64): (1, 1, 64, 64, 3, 4), - (512, 512, 4096, 128, 128): (1, 1, 64, 64, 1, 4), - (512, 512, 8192, 16, 16): (1, 2, 16, 128, 1, 2), - (512, 512, 8192, 32, 32): (1, 1, 32, 64, 1, 2), - (512, 512, 8192, 64, 64): (1, 1, 64, 64, 1, 2), - 
(512, 512, 8192, 128, 128): (1, 1, 64, 64, 1, 4), - (512, 512, 16384, 16, 16): (1, 2, 16, 128, 1, 2), - (512, 512, 16384, 32, 32): (1, 1, 32, 64, 1, 2), - (512, 512, 16384, 64, 64): (1, 1, 64, 64, 3, 2), - (512, 512, 16384, 128, 128): (2, 1, 64, 64, 1, 4), - (512, 512, 32768, 16, 16): (1, 2, 16, 128, 1, 2), - (512, 512, 32768, 32, 32): (1, 1, 32, 64, 1, 2), - (512, 512, 32768, 64, 64): (1, 1, 64, 64, 3, 4), - (512, 512, 32768, 128, 128): (2, 1, 64, 64, 1, 4), - (512, 512, 65536, 16, 16): (1, 2, 16, 128, 1, 2), - (512, 512, 65536, 32, 32): (1, 1, 32, 64, 1, 2), - (512, 512, 65536, 64, 64): (1, 1, 64, 64, 3, 4), - (512, 512, 65536, 128, 128): (2, 1, 64, 64, 1, 4), - (512, 512, 131072, 16, 16): (1, 1, 16, 64, 1, 1), - (512, 512, 131072, 32, 32): (1, 1, 32, 64, 1, 2), - (512, 512, 131072, 64, 64): (1, 1, 64, 64, 3, 4), - (512, 512, 131072, 128, 128): (2, 4, 64, 64, 1, 4), - (1024, 1024, 256, 16, 16): (1, 1, 16, 16, 1, 4), - (1024, 1024, 256, 32, 32): (2, 16, 32, 16, 3, 4), - (1024, 1024, 256, 64, 64): (1, 4, 32, 32, 1, 2), - (1024, 1024, 256, 128, 128): (1, 4, 128, 16, 3, 16), - (1024, 1024, 512, 16, 16): (1, 1, 16, 64, 1, 2), - (1024, 1024, 512, 32, 32): (2, 2, 32, 64, 1, 2), - (1024, 1024, 512, 64, 64): (2, 8, 64, 64, 3, 4), - (1024, 1024, 512, 128, 128): (1, 4, 64, 64, 1, 8), - (1024, 1024, 1024, 16, 16): (1, 1, 16, 64, 1, 2), - (1024, 1024, 1024, 32, 32): (1, 1, 32, 64, 1, 2), - (1024, 1024, 1024, 64, 64): (1, 8, 64, 64, 3, 4), - (1024, 1024, 1024, 128, 128): (1, 8, 64, 64, 1, 4), - (1024, 1024, 2048, 16, 16): (1, 2, 16, 64, 1, 2), - (1024, 1024, 2048, 32, 32): (1, 1, 32, 64, 1, 2), - (1024, 1024, 2048, 64, 64): (2, 16, 64, 64, 2, 2), - (1024, 1024, 2048, 128, 128): (2, 32, 64, 64, 1, 4), - (1024, 1024, 4096, 16, 16): (2, 16, 16, 128, 1, 2), - (1024, 1024, 4096, 32, 32): (1, 16, 32, 64, 3, 2), - (1024, 1024, 4096, 64, 64): (1, 1, 64, 64, 3, 4), - (1024, 1024, 4096, 128, 128): (2, 64, 128, 64, 1, 4), - (1024, 1024, 8192, 16, 16): (2, 16, 16, 128, 1, 2), - (1024, 1024, 8192, 32, 32): (1, 16, 32, 64, 3, 2), - (1024, 1024, 8192, 64, 64): (1, 1, 64, 64, 3, 4), - (1024, 1024, 8192, 128, 128): (2, 1, 64, 64, 1, 4), - (1024, 1024, 16384, 16, 16): (1, 2, 16, 128, 1, 2), - (1024, 1024, 16384, 32, 32): (1, 16, 32, 64, 3, 2), - (1024, 1024, 16384, 64, 64): (1, 1, 64, 64, 3, 4), - (1024, 1024, 16384, 128, 128): (2, 16, 128, 64, 1, 4), - (1024, 1024, 32768, 16, 16): (1, 1, 16, 128, 1, 2), - (1024, 1024, 32768, 32, 32): (1, 1, 32, 128, 1, 2), - (1024, 1024, 32768, 64, 64): (1, 32, 64, 32, 2, 1), - (1024, 1024, 32768, 128, 128): (2, 8, 128, 64, 1, 4), - (1024, 1024, 65536, 16, 16): (3, 2, 16, 128, 1, 2), - (1024, 1024, 65536, 32, 32): (1, 1, 32, 128, 1, 2), - (1024, 1024, 65536, 64, 64): (2, 4, 64, 32, 2, 1), - (1024, 1024, 65536, 128, 128): (2, 8, 128, 64, 1, 4), - (1024, 1024, 131072, 16, 16): (2, 1, 16, 128, 1, 2), - (1024, 1024, 131072, 32, 32): (1, 1, 32, 128, 1, 2), - (1024, 1024, 131072, 64, 64): (1, 4, 64, 32, 2, 1), - (1024, 1024, 131072, 128, 128): (4, 1, 128, 64, 1, 4), - (2048, 2048, 256, 16, 16): (1, 1, 16, 64, 1, 8), - (2048, 2048, 256, 32, 32): (1, 1, 32, 32, 3, 1), - (2048, 2048, 256, 64, 64): (1, 1, 32, 32, 2, 1), - (2048, 2048, 256, 128, 128): (1, 4, 64, 64, 1, 8), - (2048, 2048, 512, 16, 16): (1, 2, 16, 64, 1, 2), - (2048, 2048, 512, 32, 32): (1, 2, 32, 64, 1, 4), - (2048, 2048, 512, 64, 64): (1, 4, 64, 64, 1, 8), - (2048, 2048, 512, 128, 128): (1, 4, 64, 64, 1, 4), - (2048, 2048, 1024, 16, 16): (1, 2, 16, 128, 1, 2), - (2048, 2048, 1024, 32, 32): (1, 1, 32, 64, 1, 2), - (2048, 2048, 
1024, 64, 64): (1, 8, 64, 64, 1, 4), - (2048, 2048, 1024, 128, 128): (1, 8, 128, 64, 1, 4), - (2048, 2048, 2048, 16, 16): (3, 4, 16, 128, 1, 2), - (2048, 2048, 2048, 32, 32): (1, 16, 32, 64, 5, 2), - (2048, 2048, 2048, 64, 64): (1, 1, 64, 64, 3, 4), - (2048, 2048, 2048, 128, 128): (1, 8, 128, 64, 1, 4), - (2048, 2048, 4096, 16, 16): (1, 2, 16, 128, 1, 2), - (2048, 2048, 4096, 32, 32): (1, 8, 32, 64, 3, 2), - (2048, 2048, 4096, 64, 64): (1, 1, 64, 64, 3, 4), - (2048, 2048, 4096, 128, 128): (1, 8, 128, 64, 1, 4), - (2048, 2048, 8192, 16, 16): (2, 4, 16, 128, 1, 2), - (2048, 2048, 8192, 32, 32): (1, 4, 32, 128, 3, 2), - (2048, 2048, 8192, 64, 64): (1, 8, 64, 64, 3, 2), - (2048, 2048, 8192, 128, 128): (1, 8, 128, 64, 1, 4), - (2048, 2048, 16384, 16, 16): (1, 2, 16, 128, 1, 2), - (2048, 2048, 16384, 32, 32): (1, 4, 32, 128, 3, 2), - (2048, 2048, 16384, 64, 64): (1, 8, 64, 64, 3, 2), - (2048, 2048, 16384, 128, 128): (1, 4, 128, 64, 1, 4), - (2048, 2048, 32768, 16, 16): (3, 2, 16, 128, 1, 2), - (2048, 2048, 32768, 32, 32): (1, 1, 32, 128, 3, 2), - (2048, 2048, 32768, 64, 64): (1, 1, 64, 64, 3, 2), - (2048, 2048, 32768, 128, 128): (1, 4, 128, 64, 1, 4), - (2048, 2048, 65536, 16, 16): (1, 2, 16, 128, 1, 2), - (2048, 2048, 65536, 32, 32): (1, 4, 32, 128, 1, 2), - (2048, 2048, 65536, 64, 64): (1, 1, 64, 64, 3, 2), - (2048, 2048, 65536, 128, 128): (1, 2, 128, 64, 1, 4), - (2048, 2048, 131072, 16, 16): (4, 2, 16, 128, 1, 2), - (2048, 2048, 131072, 32, 32): (1, 1, 32, 128, 3, 2), - (2048, 2048, 131072, 64, 64): (1, 1, 64, 64, 3, 2), - (2048, 2048, 131072, 128, 128): (1, 2, 128, 64, 1, 4), - (4096, 4096, 256, 16, 16): (1, 1, 16, 64, 1, 2), - (4096, 4096, 256, 32, 32): (1, 1, 32, 64, 3, 4), - (4096, 4096, 256, 64, 64): (1, 1, 64, 64, 3, 4), - (4096, 4096, 256, 128, 128): (3, 4, 128, 32, 1, 4), - (4096, 4096, 512, 16, 16): (1, 2, 16, 128, 1, 2), - (4096, 4096, 512, 32, 32): (1, 2, 32, 64, 3, 2), - (4096, 4096, 512, 64, 64): (1, 4, 64, 64, 1, 4), - (4096, 4096, 512, 128, 128): (1, 4, 128, 64, 1, 4), - (4096, 4096, 1024, 16, 16): (1, 2, 16, 128, 1, 2), - (4096, 4096, 1024, 32, 32): (1, 8, 32, 64, 3, 2), - (4096, 4096, 1024, 64, 64): (1, 4, 64, 64, 1, 4), - (4096, 4096, 1024, 128, 128): (2, 4, 128, 64, 1, 4), - (4096, 4096, 2048, 16, 16): (1, 1, 16, 128, 1, 2), - (4096, 4096, 2048, 32, 32): (1, 4, 32, 128, 1, 4), - (4096, 4096, 2048, 64, 64): (1, 1, 64, 64, 3, 4), - (4096, 4096, 2048, 128, 128): (1, 16, 128, 64, 1, 4), - (4096, 4096, 4096, 16, 16): (1, 1, 16, 64, 3, 1), - (4096, 4096, 4096, 32, 32): (1, 4, 32, 64, 3, 2), - (4096, 4096, 4096, 64, 64): (1, 1, 64, 64, 3, 4), - (4096, 4096, 4096, 128, 128): (5, 1, 128, 64, 1, 4), - (4096, 4096, 8192, 16, 16): (1, 1, 16, 128, 1, 2), - (4096, 4096, 8192, 32, 32): (1, 1, 32, 128, 3, 2), - (4096, 4096, 8192, 64, 64): (1, 1, 64, 64, 3, 4), - (4096, 4096, 8192, 128, 128): (2, 1, 128, 64, 1, 4), - (4096, 4096, 16384, 16, 16): (1, 1, 16, 128, 1, 2), - (4096, 4096, 16384, 32, 32): (1, 1, 32, 128, 3, 2), - (4096, 4096, 16384, 64, 64): (1, 1, 64, 64, 4, 4), - (4096, 4096, 16384, 128, 128): (2, 1, 128, 64, 1, 4), - (4096, 4096, 32768, 16, 16): (3, 1, 16, 128, 1, 2), - (4096, 4096, 32768, 32, 32): (1, 1, 32, 128, 3, 2), - (4096, 4096, 32768, 64, 64): (1, 1, 64, 64, 3, 4), - (4096, 4096, 32768, 128, 128): (2, 1, 128, 64, 1, 4), - (4096, 4096, 65536, 16, 16): (2, 2, 16, 128, 1, 2), - (4096, 4096, 65536, 32, 32): (1, 1, 32, 128, 4, 2), - (4096, 4096, 65536, 64, 64): (1, 1, 64, 64, 4, 4), - (4096, 4096, 65536, 128, 128): (2, 1, 128, 64, 1, 4), - (4096, 4096, 131072, 16, 16): (2, 
1, 16, 128, 1, 2), - (4096, 4096, 131072, 32, 32): (1, 1, 32, 128, 3, 2), - (4096, 4096, 131072, 64, 64): (1, 1, 64, 64, 3, 4), - (4096, 4096, 131072, 128, 128): (2, 1, 128, 64, 1, 4), - (8192, 8192, 256, 16, 16): (1, 2, 16, 64, 1, 2), - (8192, 8192, 256, 32, 32): (1, 1, 32, 64, 1, 2), - (8192, 8192, 256, 64, 64): (1, 2, 64, 64, 1, 4), - (8192, 8192, 256, 128, 128): (3, 16, 128, 16, 1, 2), - (8192, 8192, 512, 16, 16): (1, 2, 16, 128, 1, 2), - (8192, 8192, 512, 32, 32): (1, 4, 32, 64, 3, 2), - (8192, 8192, 512, 64, 64): (2, 8, 64, 64, 4, 4), - (8192, 8192, 512, 128, 128): (1, 8, 128, 64, 1, 4), - (8192, 8192, 1024, 16, 16): (4, 2, 16, 128, 1, 2), - (8192, 8192, 1024, 32, 32): (1, 8, 32, 128, 1, 2), - (8192, 8192, 1024, 64, 64): (1, 16, 64, 64, 3, 2), - (8192, 8192, 1024, 128, 128): (2, 16, 128, 64, 2, 4), - (8192, 8192, 2048, 16, 16): (2, 1, 16, 64, 4, 1), - (8192, 8192, 2048, 32, 32): (1, 16, 32, 64, 5, 2), - (8192, 8192, 2048, 64, 64): (1, 16, 64, 64, 3, 2), - (8192, 8192, 2048, 128, 128): (2, 16, 128, 64, 2, 4), - (8192, 8192, 4096, 16, 16): (1, 1, 16, 64, 4, 1), - (8192, 8192, 4096, 32, 32): (1, 16, 32, 64, 5, 2), - (8192, 8192, 4096, 64, 64): (1, 16, 64, 64, 3, 2), - (8192, 8192, 4096, 128, 128): (2, 64, 128, 64, 2, 4), - (8192, 8192, 8192, 16, 16): (1, 1, 16, 64, 4, 1), - (8192, 8192, 8192, 32, 32): (1, 8, 32, 128, 5, 4), - (8192, 8192, 8192, 64, 64): (1, 8, 64, 64, 3, 2), - (8192, 8192, 8192, 128, 128): (2, 8, 128, 64, 1, 4), - (8192, 8192, 16384, 16, 16): (1, 1, 16, 64, 4, 1), - (8192, 8192, 16384, 32, 32): (1, 8, 32, 64, 5, 2), - (8192, 8192, 16384, 64, 64): (1, 8, 64, 64, 3, 2), - (8192, 8192, 16384, 128, 128): (1, 8, 128, 64, 1, 4), - (8192, 8192, 32768, 16, 16): (1, 1, 16, 64, 4, 1), - (8192, 8192, 32768, 32, 32): (1, 8, 32, 64, 5, 2), - (8192, 8192, 32768, 64, 64): (3, 8, 64, 64, 3, 2), - (8192, 8192, 32768, 128, 128): (2, 8, 128, 64, 1, 4), - (8192, 8192, 65536, 16, 16): (1, 1, 16, 64, 4, 1), - (8192, 8192, 65536, 32, 32): (5, 4, 32, 64, 3, 2), - (8192, 8192, 65536, 64, 64): (1, 8, 64, 64, 3, 2), - (8192, 8192, 65536, 128, 128): (2, 8, 128, 64, 1, 4), - (8192, 8192, 131072, 16, 16): (2, 1, 16, 64, 4, 1), - (8192, 8192, 131072, 32, 32): (1, 4, 32, 64, 5, 2), - (8192, 8192, 131072, 64, 64): (1, 4, 64, 128, 3, 4), - (8192, 8192, 131072, 128, 128): (2, 8, 128, 64, 1, 4), - (16384, 16384, 256, 16, 16): (1, 2, 16, 128, 1, 2), - (16384, 16384, 256, 32, 32): (1, 4, 32, 64, 3, 2), - (16384, 16384, 256, 64, 64): (2, 4, 64, 64, 4, 4), - (16384, 16384, 256, 128, 128): (1, 4, 128, 64, 1, 16), - (16384, 16384, 512, 16, 16): (1, 2, 16, 128, 3, 2), - (16384, 16384, 512, 32, 32): (1, 4, 32, 128, 5, 4), - (16384, 16384, 512, 64, 64): (1, 8, 64, 64, 3, 2), - (16384, 16384, 512, 128, 128): (2, 8, 128, 64, 1, 4), - (16384, 16384, 1024, 16, 16): (1, 2, 16, 128, 1, 2), - (16384, 16384, 1024, 32, 32): (1, 8, 32, 64, 5, 2), - (16384, 16384, 1024, 64, 64): (1, 16, 64, 64, 3, 2), - (16384, 16384, 1024, 128, 128): (5, 16, 128, 64, 2, 4), - (16384, 16384, 2048, 16, 16): (1, 2, 16, 128, 1, 2), - (16384, 16384, 2048, 32, 32): (1, 8, 32, 64, 5, 2), - (16384, 16384, 2048, 64, 64): (1, 16, 64, 64, 3, 2), - (16384, 16384, 2048, 128, 128): (4, 32, 128, 64, 2, 4), - (16384, 16384, 4096, 16, 16): (3, 2, 16, 128, 1, 2), - (16384, 16384, 4096, 32, 32): (1, 4, 32, 64, 5, 2), - (16384, 16384, 4096, 64, 64): (2, 16, 64, 64, 3, 2), - (16384, 16384, 4096, 128, 128): (3, 32, 128, 64, 2, 4), - (16384, 16384, 8192, 16, 16): (1, 2, 16, 128, 1, 2), - (16384, 16384, 8192, 32, 32): (1, 4, 32, 64, 5, 2), - (16384, 16384, 8192, 
64, 64): (4, 8, 64, 64, 3, 2), - (16384, 16384, 8192, 128, 128): (5, 8, 128, 64, 1, 4), - (16384, 16384, 16384, 16, 16): (1, 2, 16, 128, 1, 2), - (16384, 16384, 16384, 32, 32): (1, 4, 32, 64, 5, 2), - (16384, 16384, 16384, 64, 64): (2, 4, 64, 128, 3, 4), - (16384, 16384, 16384, 128, 128): (4, 8, 128, 64, 1, 4), - (16384, 16384, 32768, 16, 16): (4, 2, 16, 128, 1, 2), - (16384, 16384, 32768, 32, 32): (1, 4, 32, 64, 5, 2), - (16384, 16384, 32768, 64, 64): (1, 8, 64, 64, 3, 2), - (16384, 16384, 32768, 128, 128): (2, 512, 128, 64, 2, 4), - (16384, 16384, 65536, 16, 16): (3, 2, 16, 128, 1, 2), - (16384, 16384, 65536, 32, 32): (1, 4, 32, 64, 5, 2), - (16384, 16384, 65536, 64, 64): (1, 4, 64, 128, 3, 4), - (16384, 16384, 65536, 128, 128): (2, 1024, 128, 64, 2, 4), - (16384, 16384, 131072, 16, 16): (1, 2, 16, 128, 1, 2), - (16384, 16384, 131072, 32, 32): (1, 4, 32, 64, 5, 2), - (16384, 16384, 131072, 64, 64): (3, 4, 64, 128, 3, 4), - (16384, 16384, 131072, 128, 128): (4, 2048, 128, 64, 2, 4), - }, - ("scatter_mm", "NVIDIA A100-SXM4-80GB", (0, torch.float16, 0.5)): { - (256, 256, 256, 16, 16): (5, 4, 16, 16, 1, 4), - (256, 256, 256, 32, 32): (5, 2, 32, 16, 1, 4), - (256, 256, 256, 64, 64): (4, 1, 32, 32, 1, 8), - (256, 256, 256, 128, 128): (2, 1, 32, 32, 1, 4), - (256, 256, 512, 16, 16): (2, 2, 16, 32, 1, 4), - (256, 256, 512, 32, 32): (4, 8, 32, 32, 1, 8), - (256, 256, 512, 64, 64): (4, 8, 32, 64, 1, 4), - (256, 256, 512, 128, 128): (4, 8, 32, 64, 1, 4), - (256, 256, 1024, 16, 16): (4, 2, 16, 64, 1, 2), - (256, 256, 1024, 32, 32): (4, 16, 32, 64, 1, 2), - (256, 256, 1024, 64, 64): (4, 16, 32, 64, 1, 4), - (256, 256, 1024, 128, 128): (4, 16, 64, 64, 1, 8), - (256, 256, 2048, 16, 16): (2, 16, 16, 64, 1, 8), - (256, 256, 2048, 32, 32): (4, 16, 32, 64, 1, 2), - (256, 256, 2048, 64, 64): (4, 16, 32, 64, 1, 4), - (256, 256, 2048, 128, 128): (4, 16, 64, 64, 1, 4), - (256, 256, 4096, 16, 16): (4, 32, 16, 64, 1, 1), - (256, 256, 4096, 32, 32): (2, 64, 32, 64, 1, 2), - (256, 256, 4096, 64, 64): (4, 64, 64, 64, 1, 4), - (256, 256, 4096, 128, 128): (4, 32, 64, 64, 1, 4), - (256, 256, 8192, 16, 16): (4, 64, 16, 64, 1, 1), - (256, 256, 8192, 32, 32): (4, 128, 32, 64, 1, 2), - (256, 256, 8192, 64, 64): (4, 64, 64, 64, 1, 4), - (256, 256, 8192, 128, 128): (4, 64, 64, 64, 1, 4), - (256, 256, 16384, 16, 16): (4, 128, 16, 64, 1, 1), - (256, 256, 16384, 32, 32): (2, 128, 32, 64, 1, 2), - (256, 256, 16384, 64, 64): (4, 32, 32, 128, 1, 4), - (256, 256, 16384, 128, 128): (4, 16, 64, 64, 1, 4), - (256, 256, 32768, 16, 16): (4, 64, 16, 64, 1, 1), - (256, 256, 32768, 32, 32): (2, 256, 32, 64, 1, 2), - (256, 256, 32768, 64, 64): (4, 32, 32, 128, 1, 4), - (256, 256, 32768, 128, 128): (4, 32, 64, 64, 1, 4), - (256, 256, 65536, 16, 16): (4, 128, 16, 64, 1, 1), - (256, 256, 65536, 32, 32): (4, 1, 32, 64, 1, 2), - (256, 256, 65536, 64, 64): (2, 1, 64, 64, 1, 2), - (256, 256, 65536, 128, 128): (4, 32, 64, 64, 1, 4), - (256, 256, 131072, 16, 16): (4, 64, 16, 64, 1, 1), - (256, 256, 131072, 32, 32): (2, 1, 32, 64, 1, 2), - (256, 256, 131072, 64, 64): (4, 32, 32, 128, 1, 4), - (256, 256, 131072, 128, 128): (4, 32, 64, 64, 1, 4), - (512, 512, 256, 16, 16): (4, 16, 16, 16, 1, 4), - (512, 512, 256, 32, 32): (2, 4, 32, 16, 1, 4), - (512, 512, 256, 64, 64): (2, 16, 64, 16, 3, 8), - (512, 512, 256, 128, 128): (4, 16, 64, 16, 1, 4), - (512, 512, 512, 16, 16): (1, 1, 16, 64, 1, 8), - (512, 512, 512, 32, 32): (2, 4, 16, 32, 1, 1), - (512, 512, 512, 64, 64): (2, 1, 32, 32, 1, 2), - (512, 512, 512, 128, 128): (4, 8, 32, 64, 1, 4), - (512, 512, 
1024, 16, 16): (2, 8, 16, 64, 1, 8), - (512, 512, 1024, 32, 32): (4, 16, 32, 64, 1, 2), - (512, 512, 1024, 64, 64): (4, 16, 64, 64, 1, 4), - (512, 512, 1024, 128, 128): (2, 8, 64, 64, 1, 4), - (512, 512, 2048, 16, 16): (4, 16, 16, 64, 1, 4), - (512, 512, 2048, 32, 32): (4, 16, 32, 64, 1, 2), - (512, 512, 2048, 64, 64): (4, 16, 64, 64, 1, 8), - (512, 512, 2048, 128, 128): (4, 16, 64, 64, 1, 4), - (512, 512, 4096, 16, 16): (4, 32, 16, 128, 1, 2), - (512, 512, 4096, 32, 32): (4, 32, 32, 64, 1, 2), - (512, 512, 4096, 64, 64): (4, 32, 64, 64, 1, 4), - (512, 512, 4096, 128, 128): (4, 32, 64, 64, 1, 4), - (512, 512, 8192, 16, 16): (2, 32, 16, 128, 1, 2), - (512, 512, 8192, 32, 32): (4, 64, 32, 64, 1, 2), - (512, 512, 8192, 64, 64): (4, 128, 64, 64, 1, 2), - (512, 512, 8192, 128, 128): (4, 64, 64, 64, 1, 4), - (512, 512, 16384, 16, 16): (4, 32, 16, 64, 1, 1), - (512, 512, 16384, 32, 32): (4, 64, 32, 64, 1, 2), - (512, 512, 16384, 64, 64): (4, 16, 64, 64, 1, 4), - (512, 512, 16384, 128, 128): (4, 32, 64, 64, 1, 4), - (512, 512, 32768, 16, 16): (7, 16, 16, 128, 1, 2), - (512, 512, 32768, 32, 32): (4, 64, 32, 64, 1, 2), - (512, 512, 32768, 64, 64): (2, 32, 64, 64, 3, 2), - (512, 512, 32768, 128, 128): (2, 32, 64, 64, 1, 4), - (512, 512, 65536, 16, 16): (2, 32, 16, 64, 1, 1), - (512, 512, 65536, 32, 32): (4, 64, 32, 64, 1, 2), - (512, 512, 65536, 64, 64): (3, 32, 64, 64, 3, 2), - (512, 512, 65536, 128, 128): (4, 16, 64, 64, 1, 4), - (512, 512, 131072, 16, 16): (3, 32, 16, 128, 1, 2), - (512, 512, 131072, 32, 32): (4, 64, 32, 64, 1, 2), - (512, 512, 131072, 64, 64): (2, 32, 64, 64, 3, 2), - (512, 512, 131072, 128, 128): (3, 1, 64, 64, 1, 4), - (1024, 1024, 256, 16, 16): (4, 16, 16, 16, 1, 4), - (1024, 1024, 256, 32, 32): (4, 16, 32, 16, 1, 4), - (1024, 1024, 256, 64, 64): (4, 4, 64, 32, 1, 16), - (1024, 1024, 256, 128, 128): (4, 16, 64, 16, 1, 8), - (1024, 1024, 512, 16, 16): (2, 8, 16, 64, 1, 8), - (1024, 1024, 512, 32, 32): (3, 2, 32, 64, 1, 2), - (1024, 1024, 512, 64, 64): (4, 8, 32, 64, 1, 8), - (1024, 1024, 512, 128, 128): (4, 8, 64, 64, 1, 8), - (1024, 1024, 1024, 16, 16): (2, 2, 16, 64, 1, 2), - (1024, 1024, 1024, 32, 32): (2, 8, 32, 64, 1, 2), - (1024, 1024, 1024, 64, 64): (2, 8, 32, 128, 1, 4), - (1024, 1024, 1024, 128, 128): (2, 8, 64, 64, 1, 4), - (1024, 1024, 2048, 16, 16): (2, 16, 16, 128, 3, 2), - (1024, 1024, 2048, 32, 32): (4, 32, 32, 64, 1, 2), - (1024, 1024, 2048, 64, 64): (4, 16, 64, 64, 1, 4), - (1024, 1024, 2048, 128, 128): (4, 32, 64, 64, 1, 4), - (1024, 1024, 4096, 16, 16): (4, 16, 16, 128, 1, 2), - (1024, 1024, 4096, 32, 32): (3, 32, 32, 64, 1, 2), - (1024, 1024, 4096, 64, 64): (4, 32, 64, 64, 1, 4), - (1024, 1024, 4096, 128, 128): (4, 32, 64, 64, 1, 4), - (1024, 1024, 8192, 16, 16): (5, 16, 16, 128, 1, 2), - (1024, 1024, 8192, 32, 32): (2, 32, 32, 64, 3, 2), - (1024, 1024, 8192, 64, 64): (1, 16, 64, 64, 3, 2), - (1024, 1024, 8192, 128, 128): (4, 32, 64, 64, 1, 4), - (1024, 1024, 16384, 16, 16): (4, 16, 16, 128, 1, 2), - (1024, 1024, 16384, 32, 32): (1, 32, 32, 64, 3, 2), - (1024, 1024, 16384, 64, 64): (4, 16, 64, 64, 3, 2), - (1024, 1024, 16384, 128, 128): (4, 32, 128, 64, 1, 4), - (1024, 1024, 32768, 16, 16): (3, 16, 16, 128, 1, 2), - (1024, 1024, 32768, 32, 32): (1, 8, 32, 64, 3, 2), - (1024, 1024, 32768, 64, 64): (4, 16, 64, 64, 3, 2), - (1024, 1024, 32768, 128, 128): (4, 8, 128, 64, 2, 4), - (1024, 1024, 65536, 16, 16): (1, 2, 16, 128, 1, 2), - (1024, 1024, 65536, 32, 32): (2, 4, 32, 64, 3, 2), - (1024, 1024, 65536, 64, 64): (5, 16, 64, 64, 3, 2), - (1024, 1024, 65536, 128, 
128): (5, 8, 128, 64, 2, 4), - (1024, 1024, 131072, 16, 16): (5, 2, 16, 128, 1, 2), - (1024, 1024, 131072, 32, 32): (1, 2, 32, 64, 3, 2), - (1024, 1024, 131072, 64, 64): (5, 16, 64, 64, 3, 2), - (1024, 1024, 131072, 128, 128): (2, 1, 128, 64, 2, 4), - (2048, 2048, 256, 16, 16): (4, 4, 16, 64, 1, 8), - (2048, 2048, 256, 32, 32): (4, 8, 32, 32, 1, 8), - (2048, 2048, 256, 64, 64): (4, 16, 64, 16, 1, 8), - (2048, 2048, 256, 128, 128): (4, 4, 128, 32, 3, 8), - (2048, 2048, 512, 16, 16): (2, 2, 16, 64, 1, 2), - (2048, 2048, 512, 32, 32): (2, 4, 32, 64, 3, 2), - (2048, 2048, 512, 64, 64): (4, 4, 64, 64, 1, 8), - (2048, 2048, 512, 128, 128): (4, 8, 64, 64, 1, 4), - (2048, 2048, 1024, 16, 16): (1, 8, 16, 64, 1, 2), - (2048, 2048, 1024, 32, 32): (2, 16, 32, 64, 3, 2), - (2048, 2048, 1024, 64, 64): (4, 8, 64, 64, 1, 4), - (2048, 2048, 1024, 128, 128): (4, 8, 128, 64, 1, 4), - (2048, 2048, 2048, 16, 16): (5, 4, 16, 128, 1, 2), - (2048, 2048, 2048, 32, 32): (1, 16, 32, 64, 3, 2), - (2048, 2048, 2048, 64, 64): (2, 8, 64, 64, 1, 4), - (2048, 2048, 2048, 128, 128): (2, 8, 128, 64, 1, 4), - (2048, 2048, 4096, 16, 16): (4, 2, 16, 128, 1, 2), - (2048, 2048, 4096, 32, 32): (2, 16, 32, 64, 3, 2), - (2048, 2048, 4096, 64, 64): (2, 8, 64, 64, 3, 2), - (2048, 2048, 4096, 128, 128): (4, 8, 128, 64, 1, 4), - (2048, 2048, 8192, 16, 16): (5, 4, 16, 128, 1, 2), - (2048, 2048, 8192, 32, 32): (2, 8, 32, 64, 3, 2), - (2048, 2048, 8192, 64, 64): (4, 8, 64, 64, 3, 2), - (2048, 2048, 8192, 128, 128): (4, 8, 128, 64, 1, 4), - (2048, 2048, 16384, 16, 16): (3, 2, 16, 128, 1, 2), - (2048, 2048, 16384, 32, 32): (2, 4, 32, 128, 3, 2), - (2048, 2048, 16384, 64, 64): (4, 8, 64, 64, 3, 2), - (2048, 2048, 16384, 128, 128): (4, 4, 128, 64, 1, 4), - (2048, 2048, 32768, 16, 16): (3, 2, 16, 128, 1, 2), - (2048, 2048, 32768, 32, 32): (3, 4, 32, 128, 3, 2), - (2048, 2048, 32768, 64, 64): (6, 4, 64, 64, 3, 2), - (2048, 2048, 32768, 128, 128): (3, 4, 128, 64, 1, 4), - (2048, 2048, 65536, 16, 16): (6, 2, 16, 128, 1, 2), - (2048, 2048, 65536, 32, 32): (1, 2, 32, 128, 1, 2), - (2048, 2048, 65536, 64, 64): (5, 4, 64, 64, 3, 2), - (2048, 2048, 65536, 128, 128): (5, 1, 128, 64, 2, 4), - (2048, 2048, 131072, 16, 16): (3, 2, 16, 128, 1, 2), - (2048, 2048, 131072, 32, 32): (2, 1, 32, 128, 3, 2), - (2048, 2048, 131072, 64, 64): (4, 1, 64, 64, 3, 2), - (2048, 2048, 131072, 128, 128): (3, 1, 128, 64, 2, 4), - (4096, 4096, 256, 16, 16): (5, 8, 16, 32, 1, 4), - (4096, 4096, 256, 32, 32): (4, 16, 32, 16, 2, 4), - (4096, 4096, 256, 64, 64): (2, 1, 64, 64, 3, 4), - (4096, 4096, 256, 128, 128): (4, 4, 128, 32, 1, 4), - (4096, 4096, 512, 16, 16): (4, 2, 16, 128, 1, 2), - (4096, 4096, 512, 32, 32): (4, 8, 32, 64, 1, 2), - (4096, 4096, 512, 64, 64): (4, 4, 64, 64, 1, 4), - (4096, 4096, 512, 128, 128): (4, 8, 128, 64, 2, 4), - (4096, 4096, 1024, 16, 16): (1, 2, 16, 128, 1, 2), - (4096, 4096, 1024, 32, 32): (6, 8, 32, 64, 3, 2), - (4096, 4096, 1024, 64, 64): (2, 16, 64, 64, 4, 4), - (4096, 4096, 1024, 128, 128): (2, 4, 128, 64, 2, 4), - (4096, 4096, 2048, 16, 16): (3, 1, 16, 128, 1, 2), - (4096, 4096, 2048, 32, 32): (1, 4, 32, 64, 5, 2), - (4096, 4096, 2048, 64, 64): (3, 16, 64, 64, 3, 2), - (4096, 4096, 2048, 128, 128): (4, 32, 128, 64, 2, 4), - (4096, 4096, 4096, 16, 16): (1, 2, 16, 128, 1, 2), - (4096, 4096, 4096, 32, 32): (1, 4, 32, 64, 3, 2), - (4096, 4096, 4096, 64, 64): (1, 1, 64, 64, 4, 4), - (4096, 4096, 4096, 128, 128): (2, 1, 128, 128, 1, 8), - (4096, 4096, 8192, 16, 16): (3, 1, 16, 128, 1, 2), - (4096, 4096, 8192, 32, 32): (2, 2, 32, 64, 5, 2), - 
(4096, 4096, 8192, 64, 64): (4, 16, 64, 64, 3, 2), - (4096, 4096, 8192, 128, 128): (4, 16, 128, 64, 2, 4), - (4096, 4096, 16384, 16, 16): (1, 2, 16, 128, 1, 2), - (4096, 4096, 16384, 32, 32): (4, 2, 32, 64, 5, 2), - (4096, 4096, 16384, 64, 64): (4, 16, 64, 64, 3, 2), - (4096, 4096, 16384, 128, 128): (4, 16, 128, 64, 2, 4), - (4096, 4096, 32768, 16, 16): (3, 1, 16, 128, 1, 2), - (4096, 4096, 32768, 32, 32): (3, 1, 32, 128, 1, 4), - (4096, 4096, 32768, 64, 64): (3, 1, 64, 64, 3, 4), - (4096, 4096, 32768, 128, 128): (5, 16, 128, 64, 2, 4), - (4096, 4096, 65536, 16, 16): (5, 1, 16, 128, 1, 2), - (4096, 4096, 65536, 32, 32): (5, 1, 32, 128, 1, 4), - (4096, 4096, 65536, 64, 64): (1, 1, 64, 64, 3, 4), - (4096, 4096, 65536, 128, 128): (3, 16, 128, 64, 2, 4), - (4096, 4096, 131072, 16, 16): (3, 1, 16, 128, 1, 2), - (4096, 4096, 131072, 32, 32): (3, 1, 32, 128, 3, 2), - (4096, 4096, 131072, 64, 64): (2, 1, 64, 64, 3, 4), - (4096, 4096, 131072, 128, 128): (1, 1, 128, 64, 1, 4), - (8192, 8192, 256, 16, 16): (4, 16, 16, 16, 1, 4), - (8192, 8192, 256, 32, 32): (1, 16, 32, 16, 4, 4), - (8192, 8192, 256, 64, 64): (4, 16, 64, 16, 3, 8), - (8192, 8192, 256, 128, 128): (4, 16, 128, 16, 1, 2), - (8192, 8192, 512, 16, 16): (2, 8, 16, 64, 1, 4), - (8192, 8192, 512, 32, 32): (4, 8, 32, 64, 3, 2), - (8192, 8192, 512, 64, 64): (2, 8, 64, 64, 4, 4), - (8192, 8192, 512, 128, 128): (4, 8, 128, 64, 2, 4), - (8192, 8192, 1024, 16, 16): (4, 16, 16, 64, 1, 8), - (8192, 8192, 1024, 32, 32): (2, 8, 32, 64, 5, 2), - (8192, 8192, 1024, 64, 64): (1, 16, 64, 64, 3, 2), - (8192, 8192, 1024, 128, 128): (5, 16, 128, 64, 2, 4), - (8192, 8192, 2048, 16, 16): (7, 2, 16, 128, 1, 2), - (8192, 8192, 2048, 32, 32): (1, 16, 32, 64, 5, 2), - (8192, 8192, 2048, 64, 64): (4, 16, 64, 64, 3, 2), - (8192, 8192, 2048, 128, 128): (6, 16, 128, 64, 2, 4), - (8192, 8192, 4096, 16, 16): (4, 2, 16, 128, 1, 2), - (8192, 8192, 4096, 32, 32): (2, 8, 32, 64, 5, 2), - (8192, 8192, 4096, 64, 64): (3, 16, 64, 64, 3, 2), - (8192, 8192, 4096, 128, 128): (3, 64, 128, 64, 2, 4), - (8192, 8192, 8192, 16, 16): (4, 2, 16, 128, 1, 2), - (8192, 8192, 8192, 32, 32): (1, 4, 32, 128, 5, 4), - (8192, 8192, 8192, 64, 64): (4, 4, 64, 64, 1, 4), - (8192, 8192, 8192, 128, 128): (2, 2, 128, 128, 3, 8), - (8192, 8192, 16384, 16, 16): (1, 2, 16, 128, 1, 2), - (8192, 8192, 16384, 32, 32): (4, 8, 32, 64, 5, 2), - (8192, 8192, 16384, 64, 64): (5, 8, 64, 64, 3, 2), - (8192, 8192, 16384, 128, 128): (3, 16, 128, 64, 2, 4), - (8192, 8192, 32768, 16, 16): (7, 2, 16, 128, 1, 2), - (8192, 8192, 32768, 32, 32): (3, 4, 32, 64, 3, 2), - (8192, 8192, 32768, 64, 64): (2, 8, 64, 64, 3, 2), - (8192, 8192, 32768, 128, 128): (6, 16, 128, 64, 2, 4), - (8192, 8192, 65536, 16, 16): (9, 2, 16, 128, 1, 2), - (8192, 8192, 65536, 32, 32): (7, 4, 32, 64, 5, 2), - (8192, 8192, 65536, 64, 64): (4, 8, 64, 64, 3, 2), - (8192, 8192, 65536, 128, 128): (3, 16, 128, 64, 2, 4), - (8192, 8192, 131072, 16, 16): (9, 2, 16, 128, 1, 2), - (8192, 8192, 131072, 32, 32): (1, 8, 32, 64, 5, 2), - (8192, 8192, 131072, 64, 64): (1, 8, 64, 64, 3, 2), - (8192, 8192, 131072, 128, 128): (4, 16, 128, 64, 2, 4), - (16384, 16384, 256, 16, 16): (5, 16, 16, 16, 1, 4), - (16384, 16384, 256, 32, 32): (4, 16, 32, 16, 4, 4), - (16384, 16384, 256, 64, 64): (4, 16, 64, 16, 3, 8), - (16384, 16384, 256, 128, 128): (4, 16, 128, 16, 1, 2), - (16384, 16384, 512, 16, 16): (2, 8, 16, 64, 1, 4), - (16384, 16384, 512, 32, 32): (1, 4, 32, 64, 5, 2), - (16384, 16384, 512, 64, 64): (4, 8, 64, 64, 1, 4), - (16384, 16384, 512, 128, 128): (3, 8, 128, 
64, 2, 4), - (16384, 16384, 1024, 16, 16): (4, 2, 16, 128, 1, 2), - (16384, 16384, 1024, 32, 32): (4, 8, 32, 64, 5, 2), - (16384, 16384, 1024, 64, 64): (6, 16, 64, 64, 3, 2), - (16384, 16384, 1024, 128, 128): (3, 16, 128, 64, 2, 4), - (16384, 16384, 2048, 16, 16): (3, 2, 16, 128, 1, 2), - (16384, 16384, 2048, 32, 32): (1, 8, 32, 64, 5, 2), - (16384, 16384, 2048, 64, 64): (5, 16, 64, 64, 3, 2), - (16384, 16384, 2048, 128, 128): (2, 32, 128, 64, 2, 4), - (16384, 16384, 4096, 16, 16): (2, 2, 16, 128, 1, 2), - (16384, 16384, 4096, 32, 32): (1, 4, 32, 64, 3, 2), - (16384, 16384, 4096, 64, 64): (2, 8, 64, 64, 3, 2), - (16384, 16384, 4096, 128, 128): (3, 16, 128, 64, 2, 4), - (16384, 16384, 8192, 16, 16): (3, 2, 16, 128, 1, 2), - (16384, 16384, 8192, 32, 32): (2, 4, 32, 64, 5, 2), - (16384, 16384, 8192, 64, 64): (4, 8, 64, 64, 3, 2), - (16384, 16384, 8192, 128, 128): (8, 32, 128, 64, 2, 4), - (16384, 16384, 16384, 16, 16): (1, 2, 16, 256, 1, 4), - (16384, 16384, 16384, 32, 32): (1, 4, 32, 128, 3, 4), - (16384, 16384, 16384, 64, 64): (5, 4, 64, 64, 1, 4), - (16384, 16384, 16384, 128, 128): (4, 8, 128, 64, 2, 4), - (16384, 16384, 32768, 16, 16): (2, 2, 16, 128, 1, 2), - (16384, 16384, 32768, 32, 32): (1, 4, 32, 64, 3, 2), - (16384, 16384, 32768, 64, 64): (5, 4, 64, 64, 1, 4), - (16384, 16384, 32768, 128, 128): (5, 8, 128, 64, 2, 4), - (16384, 16384, 65536, 16, 16): (8, 2, 16, 128, 1, 2), - (16384, 16384, 65536, 32, 32): (6, 4, 32, 64, 5, 2), - (16384, 16384, 65536, 64, 64): (2, 4, 64, 64, 1, 4), - (16384, 16384, 65536, 128, 128): (4, 8, 128, 64, 2, 4), - (16384, 16384, 131072, 16, 16): (3, 1, 16, 128, 1, 2), - (16384, 16384, 131072, 32, 32): (1, 4, 32, 64, 3, 2), - (16384, 16384, 131072, 64, 64): (4, 4, 64, 64, 1, 4), - (16384, 16384, 131072, 128, 128): (1, 8, 128, 64, 2, 4), - (32768, 32768, 256, 16, 16): (4, 16, 16, 16, 1, 4), - (32768, 32768, 512, 16, 16): (4, 2, 16, 128, 1, 2), - (32768, 32768, 1024, 16, 16): (3, 2, 16, 128, 1, 2), - (32768, 32768, 2048, 16, 16): (4, 2, 16, 128, 1, 2), - (32768, 32768, 4096, 16, 16): (5, 4, 16, 64, 1, 1), - (32768, 32768, 8192, 16, 16): (4, 4, 16, 64, 1, 1), - (32768, 32768, 16384, 16, 16): (4, 4, 16, 64, 1, 1), - (32768, 32768, 32768, 16, 16): (5, 4, 16, 64, 1, 1), - }, - ("scatter_mm", "NVIDIA A100-SXM4-80GB", (0, torch.float32, 0.5)): { - (256, 256, 256, 16, 16): (1, 1, 16, 16, 1, 8), - (256, 256, 256, 32, 32): (1, 1, 16, 16, 1, 4), - (256, 256, 256, 64, 64): (1, 1, 16, 16, 1, 4), - (256, 256, 256, 128, 128): (1, 1, 16, 16, 1, 1), - (256, 256, 512, 16, 16): (1, 1, 16, 16, 1, 4), - (256, 256, 512, 32, 32): (1, 16, 16, 16, 1, 1), - (256, 256, 512, 64, 64): (1, 1, 16, 16, 1, 1), - (256, 256, 512, 128, 128): (1, 1, 32, 32, 1, 4), - (256, 256, 1024, 16, 16): (1, 1, 16, 32, 1, 2), - (256, 256, 1024, 32, 32): (1, 4, 16, 16, 1, 1), - (256, 256, 1024, 64, 64): (1, 1, 32, 32, 1, 4), - (256, 256, 1024, 128, 128): (1, 1, 32, 32, 1, 4), - (256, 256, 2048, 16, 16): (1, 2, 16, 32, 1, 2), - (256, 256, 2048, 32, 32): (1, 1, 16, 32, 1, 2), - (256, 256, 2048, 64, 64): (2, 1, 16, 32, 1, 2), - (256, 256, 2048, 128, 128): (1, 1, 16, 16, 1, 1), - (256, 256, 4096, 16, 16): (1, 1, 16, 32, 1, 2), - (256, 256, 4096, 32, 32): (1, 1, 16, 32, 1, 2), - (256, 256, 4096, 64, 64): (1, 1, 32, 32, 1, 4), - (256, 256, 4096, 128, 128): (3, 1, 32, 64, 1, 4), - (256, 256, 8192, 16, 16): (1, 32, 16, 64, 1, 2), - (256, 256, 8192, 32, 32): (1, 1, 32, 64, 1, 4), - (256, 256, 8192, 64, 64): (1, 1, 32, 64, 1, 4), - (256, 256, 8192, 128, 128): (2, 1, 64, 32, 1, 4), - (256, 256, 16384, 16, 16): (1, 1, 16, 
64, 1, 2), - (256, 256, 16384, 32, 32): (1, 1, 32, 64, 1, 4), - (256, 256, 16384, 64, 64): (1, 128, 64, 64, 1, 4), - (256, 256, 16384, 128, 128): (2, 1, 64, 32, 1, 4), - (256, 256, 32768, 16, 16): (2, 128, 16, 64, 1, 1), - (256, 256, 32768, 32, 32): (1, 1, 32, 64, 1, 4), - (256, 256, 32768, 64, 64): (1, 128, 64, 64, 1, 4), - (256, 256, 32768, 128, 128): (2, 1, 64, 64, 1, 4), - (256, 256, 65536, 16, 16): (1, 1, 16, 64, 1, 2), - (256, 256, 65536, 32, 32): (1, 1, 32, 64, 1, 4), - (256, 256, 65536, 64, 64): (2, 1, 64, 64, 1, 4), - (256, 256, 65536, 128, 128): (1, 1, 128, 32, 1, 4), - (256, 256, 131072, 16, 16): (3, 128, 16, 64, 1, 1), - (256, 256, 131072, 32, 32): (1, 1, 32, 64, 1, 4), - (256, 256, 131072, 64, 64): (2, 1, 64, 64, 1, 4), - (256, 256, 131072, 128, 128): (1, 8192, 64, 16, 1, 4), - (512, 512, 256, 16, 16): (1, 2, 16, 16, 1, 1), - (512, 512, 256, 32, 32): (1, 4, 16, 16, 1, 1), - (512, 512, 256, 64, 64): (1, 16, 16, 16, 1, 1), - (512, 512, 256, 128, 128): (1, 1, 16, 32, 1, 4), - (512, 512, 512, 16, 16): (1, 8, 16, 32, 1, 2), - (512, 512, 512, 32, 32): (1, 8, 16, 32, 1, 2), - (512, 512, 512, 64, 64): (1, 2, 16, 32, 1, 2), - (512, 512, 512, 128, 128): (1, 1, 32, 32, 1, 4), - (512, 512, 1024, 16, 16): (1, 1, 16, 32, 1, 2), - (512, 512, 1024, 32, 32): (1, 1, 16, 32, 1, 2), - (512, 512, 1024, 64, 64): (1, 1, 16, 32, 1, 2), - (512, 512, 1024, 128, 128): (1, 1, 64, 32, 1, 4), - (512, 512, 2048, 16, 16): (1, 16, 16, 64, 1, 2), - (512, 512, 2048, 32, 32): (1, 1, 32, 32, 1, 4), - (512, 512, 2048, 64, 64): (1, 1, 32, 32, 1, 4), - (512, 512, 2048, 128, 128): (2, 1, 32, 32, 1, 4), - (512, 512, 4096, 16, 16): (2, 64, 16, 64, 1, 1), - (512, 512, 4096, 32, 32): (1, 64, 32, 64, 1, 4), - (512, 512, 4096, 64, 64): (1, 1, 32, 32, 1, 4), - (512, 512, 4096, 128, 128): (1, 1, 64, 32, 1, 4), - (512, 512, 8192, 16, 16): (2, 64, 16, 64, 1, 1), - (512, 512, 8192, 32, 32): (1, 256, 32, 32, 1, 1), - (512, 512, 8192, 64, 64): (1, 64, 64, 64, 1, 4), - (512, 512, 8192, 128, 128): (2, 1, 64, 32, 1, 8), - (512, 512, 16384, 16, 16): (2, 64, 16, 64, 1, 1), - (512, 512, 16384, 32, 32): (1, 128, 32, 32, 1, 1), - (512, 512, 16384, 64, 64): (1, 64, 64, 64, 1, 4), - (512, 512, 16384, 128, 128): (3, 1, 64, 32, 1, 8), - (512, 512, 32768, 16, 16): (2, 64, 16, 64, 1, 1), - (512, 512, 32768, 32, 32): (1, 128, 32, 32, 1, 1), - (512, 512, 32768, 64, 64): (1, 64, 64, 64, 1, 4), - (512, 512, 32768, 128, 128): (2, 1, 64, 32, 1, 8), - (512, 512, 65536, 16, 16): (2, 32, 16, 64, 1, 1), - (512, 512, 65536, 32, 32): (1, 128, 32, 32, 1, 1), - (512, 512, 65536, 64, 64): (1, 64, 64, 64, 1, 4), - (512, 512, 65536, 128, 128): (2, 1, 64, 32, 1, 8), - (512, 512, 131072, 16, 16): (2, 32, 16, 64, 1, 1), - (512, 512, 131072, 32, 32): (1, 128, 32, 32, 1, 1), - (512, 512, 131072, 64, 64): (3, 64, 64, 64, 1, 4), - (512, 512, 131072, 128, 128): (1, 8192, 64, 16, 1, 4), - (1024, 1024, 256, 16, 16): (1, 4, 16, 32, 1, 2), - (1024, 1024, 256, 32, 32): (1, 4, 16, 32, 1, 2), - (1024, 1024, 256, 64, 64): (1, 1, 16, 32, 1, 2), - (1024, 1024, 256, 128, 128): (1, 1, 16, 16, 1, 1), - (1024, 1024, 512, 16, 16): (1, 8, 16, 32, 1, 2), - (1024, 1024, 512, 32, 32): (1, 8, 16, 32, 1, 1), - (1024, 1024, 512, 64, 64): (1, 8, 32, 32, 1, 4), - (1024, 1024, 512, 128, 128): (2, 1, 32, 32, 1, 4), - (1024, 1024, 1024, 16, 16): (1, 16, 16, 32, 1, 2), - (1024, 1024, 1024, 32, 32): (1, 16, 32, 64, 1, 4), - (1024, 1024, 1024, 64, 64): (1, 16, 32, 64, 1, 4), - (1024, 1024, 1024, 128, 128): (1, 1, 32, 32, 1, 4), - (1024, 1024, 2048, 16, 16): (2, 32, 16, 64, 1, 1), - (1024, 1024, 
2048, 32, 32): (1, 32, 32, 64, 1, 4), - (1024, 1024, 2048, 64, 64): (1, 32, 64, 64, 1, 4), - (1024, 1024, 2048, 128, 128): (1, 1, 32, 64, 1, 4), - (1024, 1024, 4096, 16, 16): (2, 16, 16, 64, 1, 1), - (1024, 1024, 4096, 32, 32): (1, 64, 32, 32, 1, 1), - (1024, 1024, 4096, 64, 64): (1, 64, 64, 64, 1, 4), - (1024, 1024, 4096, 128, 128): (2, 64, 64, 32, 1, 8), - (1024, 1024, 8192, 16, 16): (2, 16, 16, 64, 1, 1), - (1024, 1024, 8192, 32, 32): (1, 64, 32, 32, 1, 1), - (1024, 1024, 8192, 64, 64): (1, 64, 64, 64, 1, 4), - (1024, 1024, 8192, 128, 128): (4, 1, 32, 64, 1, 4), - (1024, 1024, 16384, 16, 16): (2, 16, 16, 64, 1, 1), - (1024, 1024, 16384, 32, 32): (1, 64, 32, 32, 1, 1), - (1024, 1024, 16384, 64, 64): (1, 32, 64, 64, 1, 4), - (1024, 1024, 16384, 128, 128): (2, 64, 64, 32, 1, 4), - (1024, 1024, 32768, 16, 16): (2, 16, 16, 64, 1, 1), - (1024, 1024, 32768, 32, 32): (1, 64, 32, 32, 1, 1), - (1024, 1024, 32768, 64, 64): (1, 32, 64, 64, 1, 4), - (1024, 1024, 32768, 128, 128): (4, 1, 32, 64, 1, 4), - (1024, 1024, 65536, 16, 16): (2, 16, 16, 64, 1, 1), - (1024, 1024, 65536, 32, 32): (1, 32, 32, 32, 1, 1), - (1024, 1024, 65536, 64, 64): (2, 32, 64, 64, 1, 4), - (1024, 1024, 65536, 128, 128): (4, 1, 64, 32, 1, 4), - (1024, 1024, 131072, 16, 16): (2, 16, 16, 64, 1, 1), - (1024, 1024, 131072, 32, 32): (1, 32, 32, 32, 1, 1), - (1024, 1024, 131072, 64, 64): (1, 16, 64, 64, 1, 4), - (1024, 1024, 131072, 128, 128): (1, 8192, 64, 16, 1, 4), - (2048, 2048, 256, 16, 16): (1, 4, 16, 32, 1, 2), - (2048, 2048, 256, 32, 32): (1, 8, 16, 32, 1, 1), - (2048, 2048, 256, 64, 64): (1, 8, 32, 32, 1, 4), - (2048, 2048, 256, 128, 128): (1, 4, 64, 64, 1, 8), - (2048, 2048, 512, 16, 16): (2, 8, 16, 32, 1, 2), - (2048, 2048, 512, 32, 32): (2, 8, 32, 64, 1, 4), - (2048, 2048, 512, 64, 64): (2, 4, 64, 64, 1, 4), - (2048, 2048, 512, 128, 128): (1, 8, 32, 64, 1, 4), - (2048, 2048, 1024, 16, 16): (2, 16, 16, 64, 3, 1), - (2048, 2048, 1024, 32, 32): (1, 32, 32, 32, 1, 1), - (2048, 2048, 1024, 64, 64): (1, 16, 64, 64, 1, 4), - (2048, 2048, 1024, 128, 128): (2, 4, 64, 64, 1, 8), - (2048, 2048, 2048, 16, 16): (2, 16, 16, 64, 1, 1), - (2048, 2048, 2048, 32, 32): (1, 32, 32, 32, 1, 1), - (2048, 2048, 2048, 64, 64): (1, 16, 64, 64, 1, 4), - (2048, 2048, 2048, 128, 128): (2, 32, 32, 64, 1, 4), - (2048, 2048, 4096, 16, 16): (3, 2, 16, 64, 1, 1), - (2048, 2048, 4096, 32, 32): (3, 4, 32, 32, 1, 1), - (2048, 2048, 4096, 64, 64): (1, 16, 64, 64, 1, 4), - (2048, 2048, 4096, 128, 128): (2, 32, 64, 32, 1, 4), - (2048, 2048, 8192, 16, 16): (3, 4, 16, 64, 1, 1), - (2048, 2048, 8192, 32, 32): (2, 4, 32, 32, 1, 1), - (2048, 2048, 8192, 64, 64): (2, 32, 64, 32, 1, 2), - (2048, 2048, 8192, 128, 128): (4, 1, 32, 64, 1, 4), - (2048, 2048, 16384, 16, 16): (3, 4, 16, 64, 1, 1), - (2048, 2048, 16384, 32, 32): (1, 4, 32, 32, 1, 1), - (2048, 2048, 16384, 64, 64): (2, 8, 64, 32, 1, 2), - (2048, 2048, 16384, 128, 128): (2, 8, 64, 32, 1, 4), - (2048, 2048, 32768, 16, 16): (2, 4, 16, 64, 1, 1), - (2048, 2048, 32768, 32, 32): (2, 8, 32, 32, 1, 1), - (2048, 2048, 32768, 64, 64): (1, 16, 64, 32, 1, 2), - (2048, 2048, 32768, 128, 128): (4, 1, 32, 64, 1, 4), - (2048, 2048, 65536, 16, 16): (3, 4, 16, 64, 1, 1), - (2048, 2048, 65536, 32, 32): (1, 8, 32, 32, 1, 1), - (2048, 2048, 65536, 64, 64): (1, 8, 64, 32, 1, 2), - (2048, 2048, 65536, 128, 128): (4, 1, 64, 32, 1, 4), - (2048, 2048, 131072, 16, 16): (2, 4, 16, 64, 1, 1), - (2048, 2048, 131072, 32, 32): (1, 8, 32, 32, 1, 1), - (2048, 2048, 131072, 64, 64): (3, 1, 64, 32, 1, 2), - (2048, 2048, 131072, 128, 128): (1, 
8192, 128, 16, 1, 8), - (4096, 4096, 256, 16, 16): (2, 4, 16, 32, 1, 2), - (4096, 4096, 256, 32, 32): (1, 4, 32, 64, 1, 4), - (4096, 4096, 256, 64, 64): (1, 4, 64, 64, 1, 4), - (4096, 4096, 256, 128, 128): (1, 4, 32, 64, 1, 4), - (4096, 4096, 512, 16, 16): (2, 8, 16, 64, 3, 1), - (4096, 4096, 512, 32, 32): (2, 16, 32, 32, 1, 1), - (4096, 4096, 512, 64, 64): (1, 8, 64, 64, 1, 4), - (4096, 4096, 512, 128, 128): (1, 8, 32, 64, 1, 4), - (4096, 4096, 1024, 16, 16): (1, 8, 16, 64, 3, 1), - (4096, 4096, 1024, 32, 32): (1, 16, 32, 32, 1, 1), - (4096, 4096, 1024, 64, 64): (1, 16, 64, 32, 1, 2), - (4096, 4096, 1024, 128, 128): (1, 16, 32, 64, 1, 4), - (4096, 4096, 2048, 16, 16): (1, 16, 16, 64, 3, 1), - (4096, 4096, 2048, 32, 32): (1, 16, 32, 32, 1, 1), - (4096, 4096, 2048, 64, 64): (3, 16, 64, 32, 1, 2), - (4096, 4096, 2048, 128, 128): (4, 8, 32, 64, 1, 4), - (4096, 4096, 4096, 16, 16): (1, 8, 16, 64, 3, 1), - (4096, 4096, 4096, 32, 32): (1, 1, 32, 32, 1, 1), - (4096, 4096, 4096, 64, 64): (2, 16, 64, 32, 1, 2), - (4096, 4096, 4096, 128, 128): (4, 8, 32, 64, 1, 4), - (4096, 4096, 8192, 16, 16): (1, 8, 16, 64, 3, 1), - (4096, 4096, 8192, 32, 32): (2, 1, 32, 32, 1, 1), - (4096, 4096, 8192, 64, 64): (1, 16, 64, 32, 1, 2), - (4096, 4096, 8192, 128, 128): (2, 1, 32, 64, 1, 4), - (4096, 4096, 16384, 16, 16): (1, 8, 16, 64, 3, 1), - (4096, 4096, 16384, 32, 32): (1, 1, 32, 32, 1, 1), - (4096, 4096, 16384, 64, 64): (2, 8, 64, 32, 1, 2), - (4096, 4096, 16384, 128, 128): (2, 1, 32, 64, 1, 4), - (4096, 4096, 32768, 16, 16): (1, 8, 16, 64, 3, 1), - (4096, 4096, 32768, 32, 32): (1, 1, 32, 32, 1, 1), - (4096, 4096, 32768, 64, 64): (1, 8, 64, 32, 1, 2), - (4096, 4096, 32768, 128, 128): (2, 1, 32, 64, 1, 4), - (4096, 4096, 65536, 16, 16): (1, 8, 16, 64, 3, 1), - (4096, 4096, 65536, 32, 32): (3, 1, 32, 32, 1, 1), - (4096, 4096, 65536, 64, 64): (3, 4, 64, 32, 1, 2), - (4096, 4096, 65536, 128, 128): (2, 1, 32, 64, 1, 4), - (4096, 4096, 131072, 16, 16): (1, 8, 16, 64, 3, 1), - (4096, 4096, 131072, 32, 32): (1, 1, 32, 32, 1, 1), - (4096, 4096, 131072, 64, 64): (2, 8, 64, 32, 1, 2), - (4096, 4096, 131072, 128, 128): (1, 8192, 128, 16, 1, 8), - (8192, 8192, 256, 16, 16): (2, 4, 16, 64, 3, 1), - (8192, 8192, 256, 32, 32): (1, 8, 32, 32, 1, 1), - (8192, 8192, 256, 64, 64): (1, 4, 64, 64, 1, 4), - (8192, 8192, 256, 128, 128): (1, 4, 32, 64, 1, 4), - (8192, 8192, 512, 16, 16): (1, 4, 16, 64, 3, 1), - (8192, 8192, 512, 32, 32): (1, 16, 32, 32, 1, 1), - (8192, 8192, 512, 64, 64): (2, 4, 64, 64, 1, 4), - (8192, 8192, 512, 128, 128): (2, 1, 32, 64, 1, 4), - (8192, 8192, 1024, 16, 16): (3, 8, 16, 64, 3, 1), - (8192, 8192, 1024, 32, 32): (1, 16, 32, 32, 1, 1), - (8192, 8192, 1024, 64, 64): (1, 8, 64, 32, 1, 2), - (8192, 8192, 1024, 128, 128): (2, 4, 32, 64, 1, 4), - (8192, 8192, 2048, 16, 16): (1, 8, 16, 64, 3, 1), - (8192, 8192, 2048, 32, 32): (1, 16, 32, 32, 1, 1), - (8192, 8192, 2048, 64, 64): (2, 8, 64, 32, 1, 2), - (8192, 8192, 2048, 128, 128): (4, 1, 32, 64, 1, 4), - (8192, 8192, 4096, 16, 16): (1, 8, 16, 64, 3, 1), - (8192, 8192, 4096, 32, 32): (1, 16, 32, 32, 1, 1), - (8192, 8192, 4096, 64, 64): (1, 4, 64, 32, 1, 2), - (8192, 8192, 4096, 128, 128): (3, 1, 32, 64, 1, 4), - (8192, 8192, 8192, 16, 16): (1, 8, 16, 64, 3, 1), - (8192, 8192, 8192, 32, 32): (1, 8, 32, 32, 1, 1), - (8192, 8192, 8192, 64, 64): (1, 8, 64, 32, 1, 2), - (8192, 8192, 8192, 128, 128): (4, 1, 32, 64, 1, 4), - (8192, 8192, 16384, 16, 16): (3, 4, 16, 64, 3, 1), - (8192, 8192, 16384, 32, 32): (1, 8, 32, 32, 1, 1), - (8192, 8192, 16384, 64, 64): (2, 2, 64, 32, 
1, 2), - (8192, 8192, 16384, 128, 128): (7, 1, 32, 64, 1, 4), - (8192, 8192, 32768, 16, 16): (1, 4, 16, 64, 3, 1), - (8192, 8192, 32768, 32, 32): (1, 8, 32, 32, 1, 1), - (8192, 8192, 32768, 64, 64): (3, 2, 64, 32, 1, 2), - (8192, 8192, 32768, 128, 128): (6, 1, 32, 64, 1, 4), - (8192, 8192, 65536, 16, 16): (1, 4, 16, 64, 3, 1), - (8192, 8192, 65536, 32, 32): (4, 8, 32, 32, 1, 1), - (8192, 8192, 65536, 64, 64): (1, 2, 64, 32, 1, 2), - (8192, 8192, 65536, 128, 128): (4, 1, 32, 64, 1, 4), - (8192, 8192, 131072, 16, 16): (1, 4, 16, 64, 3, 1), - (8192, 8192, 131072, 32, 32): (1, 8, 32, 32, 1, 1), - (8192, 8192, 131072, 64, 64): (5, 4, 64, 32, 1, 2), - (8192, 8192, 131072, 128, 128): (1, 4096, 128, 16, 1, 8), - (16384, 16384, 256, 16, 16): (1, 4, 16, 64, 3, 1), - (16384, 16384, 256, 32, 32): (1, 8, 32, 32, 1, 1), - (16384, 16384, 256, 64, 64): (1, 4, 64, 32, 1, 2), - (16384, 16384, 256, 128, 128): (1, 4, 32, 64, 1, 4), - (16384, 16384, 512, 16, 16): (1, 8, 16, 64, 3, 1), - (16384, 16384, 512, 32, 32): (1, 16, 32, 32, 1, 1), - (16384, 16384, 512, 64, 64): (1, 4, 64, 32, 1, 2), - (16384, 16384, 512, 128, 128): (3, 1, 32, 64, 1, 4), - (16384, 16384, 1024, 16, 16): (1, 8, 16, 64, 3, 1), - (16384, 16384, 1024, 32, 32): (1, 16, 32, 32, 1, 1), - (16384, 16384, 1024, 64, 64): (2, 4, 64, 32, 1, 2), - (16384, 16384, 1024, 128, 128): (1, 2, 32, 64, 1, 4), - (16384, 16384, 2048, 16, 16): (1, 4, 16, 64, 3, 1), - (16384, 16384, 2048, 32, 32): (1, 16, 32, 32, 1, 1), - (16384, 16384, 2048, 64, 64): (3, 4, 64, 32, 1, 2), - (16384, 16384, 2048, 128, 128): (2, 1, 32, 64, 1, 4), - (16384, 16384, 4096, 16, 16): (4, 8, 16, 64, 3, 1), - (16384, 16384, 4096, 32, 32): (5, 16, 32, 32, 1, 1), - (16384, 16384, 4096, 64, 64): (3, 2, 64, 32, 1, 2), - (16384, 16384, 4096, 128, 128): (2, 1, 32, 64, 1, 4), - (16384, 16384, 8192, 16, 16): (1, 4, 16, 64, 3, 1), - (16384, 16384, 8192, 32, 32): (1, 4, 32, 32, 1, 1), - (16384, 16384, 8192, 64, 64): (1, 2, 64, 32, 1, 2), - (16384, 16384, 8192, 128, 128): (2, 1, 32, 64, 1, 4), - (16384, 16384, 16384, 16, 16): (1, 8, 16, 64, 3, 1), - (16384, 16384, 16384, 32, 32): (1, 4, 32, 32, 1, 1), - (16384, 16384, 16384, 64, 64): (1, 2, 64, 32, 1, 2), - (16384, 16384, 16384, 128, 128): (3, 1, 32, 64, 1, 4), - (16384, 16384, 32768, 16, 16): (1, 4, 16, 64, 3, 1), - (16384, 16384, 32768, 32, 32): (1, 2, 32, 32, 1, 1), - (16384, 16384, 32768, 64, 64): (3, 2, 64, 32, 1, 2), - (16384, 16384, 32768, 128, 128): (3, 1, 32, 64, 1, 4), - (16384, 16384, 65536, 16, 16): (1, 8, 16, 64, 3, 1), - (16384, 16384, 65536, 32, 32): (1, 4, 32, 32, 1, 1), - (16384, 16384, 65536, 64, 64): (4, 4, 64, 32, 1, 2), - (16384, 16384, 65536, 128, 128): (5, 1, 32, 64, 1, 4), - (16384, 16384, 131072, 16, 16): (1, 2, 16, 64, 3, 1), - (16384, 16384, 131072, 32, 32): (1, 4, 32, 32, 1, 1), - (16384, 16384, 131072, 64, 64): (1, 2, 64, 32, 1, 2), - (16384, 16384, 131072, 128, 128): (1, 4096, 128, 16, 1, 8), - }, - # END GENERATED DATA -} - -if __name__ == "__main__": - for M, K, N in [(14336, 4096, 16), (4096, 14336, 16), (14336, 4096, 16), (4096, 14336, 16)]: - optimize_bsr_dense_addmm( - M, - K, - N, - 64, - 64, - beta=0, - alpha=1, - sparsity=0.9, - dtype=torch.bfloat16, - opname="bsr_dense_addmm", - verbose=True, - ) diff --git a/torchao/prototype/sparsity/superblock/bsr_triton_ops.py b/torchao/prototype/sparsity/superblock/bsr_triton_ops.py index 28100b2bb9..f4cdaed79b 100644 --- a/torchao/prototype/sparsity/superblock/bsr_triton_ops.py +++ b/torchao/prototype/sparsity/superblock/bsr_triton_ops.py @@ -10,7 +10,7 @@ from 
torch._dynamo.utils import warn_once from torch.utils._triton import has_triton -from ._triton_ops_meta import get_meta +from torch.sparse._triton_ops_meta import get_meta TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE = int( @@ -257,483 +257,7 @@ def as1Dbatch(tensor): return tensor -def scatter_mm(blocks, others, indices_data, *, accumulators=None): - """Scattered matrix multiplication of tensors. - - A scattered matrix multiplication is defined as a series of matrix - multiplications applied to input tensors according to the input - and output mappings specified by indices data. - - The following indices data formats are supported for defining a - scattered matrix multiplication operation (:attr:`indices_data[0]` - holds the name of the indices data format as specified below): - - - ``"scatter_mm"`` - matrix multiplications scattered in batches - of tensors. - - If :attr:`blocks` is a :math:`(* \times M \times K) tensor, - :attr:`others` is a :math:`(* \times K \times N)` tensor, - :attr:`accumulators` is a :math:`(* \times M \times N)` tensor, - and :attr:`indices = indices_data['indices']` is a :math:`(* - \times 3)` tensor, then the operation is equivalent to the - following code:: - - c_offsets, pq = indices_data[1:] - for r in range(len(c_offsets) - 1): - for g in range(c_offsets[r], c_offsets[r + 1]): - p, q = pq[g] - accumulators[r] += blocks[p] @ others[q] - - - ``"bsr_strided_mm"`` - matrix multiplications scattered in - batches of tensors and a tensor. - - If :attr:`blocks` is a :math:`(Ms \times Ks) tensor, - :attr:`others` is a :math:`(* \times K \times N)` tensor, - :attr:`accumulators` is a :math:`(* \times M \times N)` tensor, then - the operation is equivalent to the following code:: - - c_indices, r_offsets, p_offsets, q_offsets, meta = indices_data[1:] - for b in range(nbatches): - for i, r in enumerate(r_offsets): - r0, r1 = divmod(r, N) - acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns] - for g in range(c_indices[i], c_indices[i+1]): - p = p_offsets[g] - q0, q1 = divmod(q_offsets[g], N) - acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns] - - where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are - integer multiples of ``Ms`` and ``Ks``, respectively. - - - ``"bsr_strided_mm_compressed"`` - matrix multiplications - scattered in batches of tensors and a tensor. A memory and - processor efficient version of ``"bsr_strided_mm"`` format. If - :attr:`blocks` is a :math:`(Ms \times Ks) tensor, :attr:`others` - is a :math:`(* \times K \times N)` tensor, :attr:`accumulators` - is a :math:`(* \times M \times N)` tensor, then the operation is - equivalent to the following code:: - - c_indices, r_offsets, q_offsets, meta = indices_data[1:] - for b in range(nbatches): - for r in r_offsets: - m = (r // N) // Ms - n = (r % N) // Ns - r0, r1 = divmod(r, N) - c0, c1 = c_indices[m], c_indices[m + 1] - acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns] - for i, p in enumerate(range(c0, c1)): - q = q_offsets[n * c1 + (SPLIT_N - n) * c0 + i] - q0, q1 = divmod(q, N) - acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns] - - where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are - integer multiples of ``Ms`` and ``Ks``, respectively. - - Notice that the order of ``r_offsets`` items can be arbitrary; - this property enables defining swizzle operators via - rearrangements of ``r_offsets`` items.. - - Auxilary functions are provided for pre-computing - :attr:`indices_data`. 
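# A minimal dense reference for the "scatter_mm" indices format described
# above (illustrative sketch only; scatter_mm_reference and its argument
# names are hypothetical and not part of this patch). It follows the
# docstring's pseudocode: blocks is (P, Ms, Ks), others is (Q, Ks, Ns),
# c_offsets is (R + 1,) and pq is (G, 2).
import torch

def scatter_mm_reference(blocks, others, c_offsets, pq):
    R = c_offsets.numel() - 1
    Ms, Ns = blocks.shape[-2], others.shape[-1]
    accumulators = torch.zeros(
        R, Ms, Ns, dtype=blocks.dtype, device=blocks.device
    )
    for r in range(R):
        # accumulate every (p, q) product assigned to output block r
        for g in range(int(c_offsets[r]), int(c_offsets[r + 1])):
            p, q = pq[g].tolist()
            accumulators[r] += blocks[p] @ others[q]
    return accumulators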
For example, - :func:`bsr_scatter_mm_indices_data` is used to define indices data - for matrix multiplication of BSR and strided tensors. - - Parameters - ---------- - blocks (Tensor): a 3-D tensor of first matrices to be multiplied - - others (Tensor): a tensor of second matrices to be multiplied. If - ``indices_data[0]=="scatter_mm"``, the tensor is a 1-D batch - tensor of second input matrices to be multiplied. Otherwise, the - second input matrices are slices of the :attr:`others` tensor. - indices_data (tuple): a format data that defines the inputs and - outputs of scattered matrix multiplications. - - Keyword arguments - ----------------- - - accumulators (Tensor, optional): a tensor of matrix product - accumulators. If ``indices_data[0]=="scatter_mm"``, the tensor - is a 1-D batch tensor of output matrices. Otherwise, output - matrices are slices of the :attr:`accumulators` tensor. - """ - indices_format = indices_data[0] - - assert blocks.ndim == 3 - _P, Ms, Ks = blocks.shape - - if indices_format == "scatter_mm": - c_offsets, pq = indices_data[1:] - - assert others.ndim == 3 - _Q, Ks_, Ns = others.shape - assert Ks == Ks_ - - if accumulators is None: - R = c_offsets.shape[0] - 1 - accumulators = torch.zeros( - (R, Ms, Ns), dtype=blocks.dtype, device=blocks.device - ) - else: - R, Ms_, Ns_ = accumulators.shape - assert Ms_ == Ms - assert Ns_ == Ns - - if Ms % 16 or Ks % 16 or Ns % 16 or _scatter_mm2 is None: - for r in range(c_offsets.shape[0] - 1): - g0 = c_offsets[r] - g1 = c_offsets[r + 1] - for g in range(g0, g1): - p, q = pq[g] - accumulators[r] += blocks[p] @ others[q] - else: - _scatter_mm2(blocks, others, c_offsets, pq, accumulators) - return accumulators - - elif indices_format == "bsr_strided_mm": - others_shape = others.shape - others = as1Dbatch(others) - - B, K, N = others.shape - assert K % Ks == 0 - - c_indices, r_offsets, p_offsets, q_offsets, meta = indices_data[1:] - SPLIT_N = meta["SPLIT_N"] - - if accumulators is None: - M = Ms + (r_offsets.max().item() + 1) // N - accumulators = torch.zeros( - (*others_shape[:-2], M, N), dtype=blocks.dtype, device=blocks.device - ) - else: - M, N_ = accumulators.shape[-2:] - assert N_ == N - - accumulators_shape = accumulators.shape - accumulators = as1Dbatch(accumulators) - - Ns = N // SPLIT_N - - if Ms % 16 or Ks % 16 or Ns % 16 or _scatter_mm6 is None: - accumulators.zero_() - for b in range(B): - for r in range(r_offsets.shape[0]): - r_ = r_offsets[r].item() - g0 = c_indices[r].item() - g1 = c_indices[r + 1].item() - r0, r1 = divmod(r_, N) - acc = accumulators[b, r0 : r0 + Ms, r1 : r1 + Ns] - for g in range(g0, g1): - p, q = p_offsets[g], q_offsets[g] - q0, q1 = divmod(q.item(), N) - acc += blocks[p] @ others[b, q0 : q0 + Ks, q1 : q1 + Ns] - else: - _scatter_mm6( - blocks, - others, - c_indices, - r_offsets, - p_offsets, - q_offsets, - meta, - accumulators, - ) - return accumulators.view(accumulators_shape) - - elif indices_format == "bsr_strided_mm_compressed": - others_shape = others.shape - others = as1Dbatch(others) - - B, K, N = others.shape - assert K % Ks == 0 - - c_indices, r_offsets, q_offsets, meta = indices_data[1:] - SPLIT_N = meta["SPLIT_N"] - - if accumulators is None: - M = Ms + (r_offsets.max().item() + 1) // N - accumulators = torch.zeros( - (*others_shape[:-2], M, N), dtype=blocks.dtype, device=blocks.device - ) - else: - M, N_ = accumulators.shape[-2:] - assert N_ == N - - accumulators_shape = accumulators.shape - accumulators = as1Dbatch(accumulators) - - Ns = N // SPLIT_N - - if Ms % 16 or Ks % 16 or Ns 
% 16 or _scatter_mm6 is None: - for b in range(B): - for j in range(len(r_offsets)): - r0, r1 = divmod(r_offsets[j].item(), N) - m = r0 // Ms - n = r1 // Ns - c0 = c_indices[m].item() - c1 = c_indices[m + 1].item() - acc = accumulators[b, r0 : r0 + Ms, r1 : r1 + Ns] - for i, p in enumerate(range(c0, c1)): - q = q_offsets[n * c1 + (SPLIT_N - n) * c0 + i].item() - q0, q1 = divmod(q, N) - acc += blocks[p] @ others[b, q0 : q0 + Ks, q1 : q1 + Ns] - else: - p_offsets = torch.empty( - (0,), dtype=q_offsets.dtype, device=q_offsets.device - ) - _scatter_mm6( - blocks, - others, - c_indices, - r_offsets, - p_offsets, - q_offsets, - meta, - accumulators, - ) - return accumulators.view(accumulators_shape) - - else: - raise NotImplementedError(indices_format) - - -def scatter_mm_meta( - M, - K, - N, - Ms, - Ks, - GROUP_SIZE=None, - TILE_M=None, - TILE_N=None, - SPLIT_N=None, - num_warps=None, - num_stages=None, - **extra, -): - if {TILE_M, TILE_N, SPLIT_N, num_warps, num_stages, GROUP_SIZE} == {None}: - device_name = torch.cuda.get_device_name() - meta = get_meta( - "scatter_mm", - (M, K, N, Ms, Ks), - device_name, - version=(0, torch.float16, 0.5), - ) - if meta is not None: - meta.update(**extra) - return meta - # The following parameters are optimized for the performance - # equilibrium points of bsr-dense and dense-dense matrix - # multiplications when using GPU card NVIDIA GeForce RTX 2060 - # SUPER. For points far from the performance equilibrium - # points as well as for other GPU cards, the optimal - # parameters are likely different from what specified below. - if (M, K, N) == (256,) * 3: - if (Ms, Ks) == (16, 16): - SPLIT_N = 1 - TILE_M = 16 - TILE_N = 16 - GROUP_SIZE = 4 - num_stages = 1 - num_warps = 4 # noqa: E225,E231,E702 - elif (Ms, Ks) == (32, 32): - SPLIT_N = 2 - TILE_M = 32 - TILE_N = 16 - GROUP_SIZE = 4 - num_stages = 1 - num_warps = 4 # noqa: E225,E231,E702 - elif (Ms, Ks) == (64, 64): - SPLIT_N = 1 - TILE_M = 32 - TILE_N = 32 - GROUP_SIZE = 4 - num_stages = 1 - num_warps = 4 # noqa: E225,E231,E702 - elif (Ms, Ks) == (128, 128): - SPLIT_N = 1 - TILE_M = 32 - TILE_N = 32 - GROUP_SIZE = 2 - num_stages = 1 - num_warps = 4 # noqa: E225,E231,E702 - elif (M, K, N) == (512,) * 3: - if (Ms, Ks) == (16, 16): - SPLIT_N = 8 - TILE_M = 16 - TILE_N = 64 - GROUP_SIZE = 2 - num_stages = 1 - num_warps = 2 # noqa: E225,E231,E702 - elif (Ms, Ks) == (32, 32): - SPLIT_N = 8 - TILE_M = 32 - TILE_N = 64 - GROUP_SIZE = 4 - num_stages = 1 - num_warps = 2 # noqa: E225,E231,E702 - elif (Ms, Ks) == (64, 64): - SPLIT_N = 4 - TILE_M = 32 - TILE_N = 128 - GROUP_SIZE = 4 - num_stages = 1 - num_warps = 4 # noqa: E225,E231,E702 - elif (Ms, Ks) == (128, 128): - SPLIT_N = 8 - TILE_M = 64 - TILE_N = 64 - GROUP_SIZE = 4 - num_stages = 1 - num_warps = 4 # noqa: E225,E231,E702 - elif (M, K, N) == (1024,) * 3: - if (Ms, Ks) == (16, 16): - SPLIT_N = 4 - TILE_M = 16 - TILE_N = 128 - GROUP_SIZE = 2 - num_stages = 1 - num_warps = 1 # noqa: E225,E231,E702 - elif (Ms, Ks) == (32, 32): - SPLIT_N = 8 - TILE_M = 32 - TILE_N = 64 - GROUP_SIZE = 2 - num_stages = 1 - num_warps = 1 # noqa: E225,E231,E702 - elif (Ms, Ks) == (64, 64): - SPLIT_N = 16 - TILE_M = 64 - TILE_N = 64 - GROUP_SIZE = 4 - num_stages = 1 - num_warps = 2 # noqa: E225,E231,E702 - elif (Ms, Ks) == (128, 128): - SPLIT_N = 16 - TILE_M = 64 - TILE_N = 64 - GROUP_SIZE = 4 - num_stages = 1 - num_warps = 4 # noqa: E225,E231,E702 - elif (Ms, Ks) == (256, 256): - SPLIT_N = 16 - TILE_M = 64 - TILE_N = 64 - GROUP_SIZE = 2 - num_stages = 1 - num_warps = 4 # noqa: 
E225,E231,E702 - elif (M, K, N) == (2048,) * 3: - if (Ms, Ks) == (16, 16): - SPLIT_N = 4 - TILE_M = 16 - TILE_N = 128 - GROUP_SIZE = 8 - num_stages = 1 - num_warps = 1 # noqa: E225,E231,E702 - elif (Ms, Ks) == (32, 32): - SPLIT_N = 4 - TILE_M = 32 - TILE_N = 64 - GROUP_SIZE = 4 - num_stages = 1 - num_warps = 1 # noqa: E225,E231,E702 - elif (Ms, Ks) == (64, 64): - SPLIT_N = 4 - TILE_M = 64 - TILE_N = 128 - GROUP_SIZE = 4 - num_stages = 1 - num_warps = 4 # noqa: E225,E231,E702 - elif (Ms, Ks) == (128, 128): - SPLIT_N = 8 - TILE_M = 64 - TILE_N = 64 - GROUP_SIZE = 4 - num_stages = 1 - num_warps = 4 # noqa: E225,E231,E702 - elif (Ms, Ks) == (256, 256): - SPLIT_N = 4 - TILE_M = 64 - TILE_N = 64 - GROUP_SIZE = 2 - num_stages = 1 - num_warps = 4 # noqa: E225,E231,E702 - elif (M, K, N) == (4096,) * 3: - if (Ms, Ks) == (16, 16): - SPLIT_N = 2 - TILE_M = 16 - TILE_N = 256 - GROUP_SIZE = 2 - num_stages = 1 - num_warps = 2 # noqa: E225,E231,E702 - elif (Ms, Ks) == (32, 32): - SPLIT_N = 2 - TILE_M = 32 - TILE_N = 64 - GROUP_SIZE = 2 - num_stages = 1 - num_warps = 1 # noqa: E225,E231,E702 - elif (Ms, Ks) == (64, 64): - SPLIT_N = 2 - TILE_M = 64 - TILE_N = 128 - GROUP_SIZE = 2 - num_stages = 1 - num_warps = 4 # noqa: E225,E231,E702 - - if SPLIT_N is None: - # Assume NVIDIA GeForce RTX 2060 SUPER: - # With the probality of 92% (99.9% when N > 512), the - # performance will not be worse more than 2% from the - # performance when using an optimal value. Otherwise, when N - # <= 512, using the following heuristics may give upto 15% - # lower performance. - SPLIT_N = { - 16: 1, - 32: 2, - 64: 4, - 128: 8, - 256: 16, - 512: 8, - 1024: 16, - 4096: 32, - 8192: 64, - }.get(N, 16) - if Ms >= 512 and N >= 2048: - SPLIT_N = 1 - Ns = N // SPLIT_N - if TILE_M is None: - TILE_M = min(64 if Ns < 512 else 32, Ms) - if TILE_N is None: - TILE_N = min(64 if Ns < 512 else 32, Ns) - num_stages = num_stages or 1 - if num_warps is None: - if min(M, N) > 1024: - num_warps = {16: 1, 32: 1, 64: 2}.get(Ms, 4) - elif min(M, N) == 1024: - num_warps = {16: 1, 32: 1, 64: 2}.get(Ms, 4) - elif min(M, N) == 256: - num_warps = {16: 1, 32: 4}.get(Ms, 4) - else: - num_warps = {16: 1, 32: 2}.get(Ms, 4) - GROUP_SIZE = GROUP_SIZE or 4 - - assert TILE_M <= Ms, dict(TILE_M=TILE_M, Ms=Ms) - assert TILE_N <= Ns, dict(TILE_N=TILE_N, Ns=Ns) - assert Ms <= M, dict(M=M, Ms=Ms) - assert Ns <= N, dict(N=N, Ns=Ns) - assert Ks <= K, dict(K=K, Ks=Ks) - - return dict( - TILE_M=TILE_M, - TILE_N=TILE_N, - GROUP_SIZE=GROUP_SIZE, - num_stages=num_stages, - num_warps=num_warps, - SPLIT_N=SPLIT_N, - **extra, - ) - +## addmm functionality def bsr_dense_addmm_meta( M, @@ -815,7 +339,6 @@ def bsr_dense_addmm_meta( else: # see [Computing optimal kernel parameters] in # _triton_ops_meta.py for ways to avoid this warning - # from ._triton_ops_meta import optimize_bsr_dense_addmm # optimize_bsr_dense_addmm( # M, # K, @@ -937,207 +460,6 @@ def obj(self): return self._obj_ref() -@lru_cache(maxsize=TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE) -def _bsr_scatter_mm_indices_data( - indices_format, M, K, N, Ms, Ks, nbatches, SPLIT_N, compressed_sparse_tensor_as_key -): - bsr = compressed_sparse_tensor_as_key.obj - assert bsr is not None - crow_indices, col_indices = bsr.crow_indices(), bsr.col_indices() - device = crow_indices.device - indices_dtype = torch.int32 - - if indices_format == "bsr_strided_mm_compressed": - Ns = N // SPLIT_N - q_offsets_lst = [] - b = torch.arange(SPLIT_N, dtype=indices_dtype, device=device) * Ns - for m in range(M // Ms): - r0 = 
crow_indices[m].item() - r1 = crow_indices[m + 1].item() - if r1 == r0: - continue - q_offsets_lst.append( - (col_indices[r0:r1] * (Ks * N)).repeat(SPLIT_N) - + b.repeat_interleave(r1 - r0) - ) - q_offsets = torch.cat(q_offsets_lst) - crow_indices_diff = crow_indices.diff() - non_zero_row_indices = crow_indices_diff.nonzero() - a = non_zero_row_indices * (Ms * N) - r_offsets = (a + b).view(-1) - c_indices = crow_indices - # swizzle operation: mm elements with longer sums are computed first: - nnz_per_row = crow_indices_diff[non_zero_row_indices].repeat_interleave(SPLIT_N) - nnz_per_row, indices = nnz_per_row.sort(descending=True, stable=True) - r_offsets = r_offsets[indices] - return (indices_format, c_indices, r_offsets, q_offsets) - - elif indices_format == "bsr_strided_mm": - Ns = N // SPLIT_N - p_offsets_lst = [] - q_offsets_lst = [] - b = torch.arange(SPLIT_N, dtype=indices_dtype, device=device) * Ns - for m in range(M // Ms): - r0 = crow_indices[m].item() - r1 = crow_indices[m + 1].item() - if r1 == r0: - continue - p_offsets_lst.append( - torch.arange(r0, r1, dtype=indices_dtype, device=device).repeat(SPLIT_N) - ) - q_offsets_lst.append( - (col_indices[r0:r1] * (Ks * N)).repeat(SPLIT_N) - + b.repeat_interleave(r1 - r0) - ) - q_offsets = torch.cat(q_offsets_lst) - crow_indices_diff = crow_indices.diff() - non_zero_row_indices = crow_indices_diff.nonzero() - a = non_zero_row_indices * (Ms * N) - r_offsets = (a + b).view(-1) - c_indices = torch.cat( - ( - crow_indices[:1], - torch.cumsum( - crow_indices_diff[non_zero_row_indices].repeat_interleave(SPLIT_N), - 0, - ), - ) - ) - p_offsets = torch.cat(p_offsets_lst) - return (indices_format, c_indices, r_offsets, p_offsets, q_offsets) - - elif indices_format == "scatter_mm": - Ns = Ms - c_indices = [0] - pq_offsets = [] - # todo: eliminate inner for-loops for efficiency - for b in range(nbatches): - for m in range(M // Ms): - r0 = crow_indices[m].item() - r1 = crow_indices[m + 1].item() - for n in range(N // Ns): - c_indices.append(c_indices[-1] + r1 - r0) - for t in range(r1 - r0): - p = r0 + t - q = (col_indices[p].item() + b * (K // Ks)) * (N // Ns) + n - pq_offsets.append([p, q]) - - return ( - indices_format, - torch.tensor(c_indices, dtype=indices_dtype, device=device), - torch.tensor(pq_offsets, dtype=indices_dtype, device=device), - ) - - else: - raise ValueError( - f"Invalid {indices_format=}. Expected bsr_strided_mm_compressed|bsr_strided_mm|scatter_mm" - ) - - -def bsr_scatter_mm_indices_data( - bsr, other, indices_format="bsr_strided_mm_compressed", **meta_input -): - """Computes indices data for :func:`scatter_mm` used in BSR and - strided tensor matrix multiplication. 
- """ - assert bsr.dense_dim() == 0 - assert bsr.ndim == 2 # no batch dims - blocksize = bsr.values().shape[-2:] - M, K = bsr.shape - Ms, Ks = blocksize - K_, N = other.shape[-2:] - assert K_ == K - nbatches = other.shape[:-2].numel() - - meta = scatter_mm_meta(M, K, N, Ms, Ks, **meta_input) - if "allow_tf32" not in meta_input: - meta.update(allow_tf32=bsr.dtype in {torch.float16, torch.bfloat16}) - SPLIT_N = meta["SPLIT_N"] - indices_data = _bsr_scatter_mm_indices_data( - indices_format, M, K, N, Ms, Ks, nbatches, SPLIT_N, TensorAsKey(bsr) - ) - - if indices_format == "bsr_strided_mm_compressed": - meta.update(is_compressed=True) - return indices_data + (meta,) - elif indices_format == "bsr_strided_mm": - meta.update(is_compressed=False) - return indices_data + (meta,) - else: - return indices_data - - -def bsr_scatter_mm(bsr, other, indices_data=None, out=None): - """BSR @ strided -> strided""" - - assert bsr.ndim == 2 - assert other.ndim >= 2 - - Ms, Ks, Ns = bsr.shape[-2], bsr.shape[-1], other.shape[-1] - blocksize = bsr.values().shape[-2:] - - if indices_data is None: - indices_data = bsr_scatter_mm_indices_data( - bsr, other, indices_format="bsr_strided_mm_compressed" - ) - - indices_format = indices_data[0] - - if out is None: - out = torch.empty( - (*other.shape[:-2], Ms, Ns), dtype=bsr.dtype, device=bsr.device - ) - out_shape = out.shape - out = as1Dbatch(out) - - if bsr._nnz() == 0: - out.zero_() - elif indices_format in {"bsr_strided_mm_compressed", "bsr_strided_mm"}: - out.zero_() - scatter_mm(bsr.values(), other, indices_data, accumulators=out) - elif indices_format == "scatter_mm": - nbatches = other.shape[:-2].numel() - accumulators = torch.zeros( - ( - nbatches * Ms // blocksize[0] * Ns // blocksize[0], - blocksize[0], - blocksize[0], - ), - dtype=bsr.dtype, - device=bsr.device, - ) - others = ( - as1Dbatch(other) - .transpose(-2, -1) - .view( - nbatches, - Ns // blocksize[0], - blocksize[0], - Ks // blocksize[1], - blocksize[1], - ) - .movedim( - (3, 1, 4, 2), (1, 2, 3, 4) - ) # equivalent to .transpose(-3, -2).transpose(-2, -1).transpose(-4, -3) - .flatten(0, 2) - ) - scatter_mm(bsr.values(), others, indices_data, accumulators=accumulators) - out.copy_( - accumulators.unflatten( - 0, (nbatches, Ms // blocksize[0], Ns // blocksize[0]) - ) - .movedim( - (1, 2, 3, 4), (3, 1, 4, 2) - ) # equivalent to .transpose(-4, -3).transpose(-2, -1).transpose(-3, -2) - .reshape(nbatches, Ns, Ms) - .transpose(-2, -1) - ) - else: - raise NotImplementedError(indices_format) - - return out.view(out_shape) - - def _int_bsr_dense_addmm( input: torch.Tensor, bsr: torch.Tensor, @@ -1352,998 +674,6 @@ def kernel(grid, *sliced_tensors): import triton import triton.language as tl - @triton.jit - def _sampled_addmm_kernel( - alpha, - beta, - IS_BETA_ZERO: tl.constexpr, - BLOCKSIZE_ROW: tl.constexpr, - BLOCKSIZE_COL: tl.constexpr, - k, - TILE_K: tl.constexpr, - values_ptr, - values_batch_stride, - values_nnz_stride, - values_row_block_stride, - values_col_block_stride, - crow_indices_ptr, - crow_indices_batch_stride, - crow_indices_stride, - col_indices_ptr, - col_indices_batch_stride, - col_indices_stride, - mat1_ptr, - mat1_batch_stride, - mat1_tiled_row_stride, - mat1_tiled_col_stride, - mat1_row_block_stride, - mat1_col_block_stride, - mat2_ptr, - mat2_batch_stride, - mat2_tiled_row_stride, - mat2_tiled_col_stride, - mat2_row_block_stride, - mat2_col_block_stride, - acc_dtype: tl.constexpr, - allow_tf32: tl.constexpr, - ): - batch_pid = tl.program_id(axis=1) - row_block_pid = 
tl.program_id(axis=0) - - crow_indices_offset_ptr = ( - crow_indices_ptr - + crow_indices_batch_stride * batch_pid - + crow_indices_stride * row_block_pid - ) - nnz_offset = tl.load(crow_indices_offset_ptr) - nnz_offset_next = tl.load(crow_indices_offset_ptr + crow_indices_stride) - - # Compute nnz for the row with number row_block_pid. - # If it is zero, skip the row. - row_nnz = nnz_offset_next - nnz_offset - if row_nnz == 0: - return - - row_block_arange = tl.arange(0, BLOCKSIZE_ROW) - col_block_arange = tl.arange(0, BLOCKSIZE_COL) - - # Pointers are set to the first block of the current row. - values_block_ptrs = ( - values_ptr - + values_batch_stride * batch_pid - + values_nnz_stride * nnz_offset - + values_row_block_stride * row_block_arange[:, None] - + values_col_block_stride * col_block_arange[None, :] - ) - - col_index_nnz_ptr = ( - col_indices_ptr - + col_indices_batch_stride * batch_pid - + col_indices_stride * nnz_offset - ) - - # Advance mat1 to the current tiled row, ignore columns. - mat1_block_ptrs = ( - mat1_ptr - + mat1_batch_stride * batch_pid - + mat1_tiled_row_stride * row_block_pid - + mat1_row_block_stride * row_block_arange[:, None] - ) - - # Advance mat2 in batch and block col dimension. - mat2_block_ptrs = ( - mat2_ptr - + mat2_batch_stride * batch_pid - + mat2_col_block_stride * col_block_arange[None, :] - ) - - k_tile_arange = tl.arange(0, TILE_K) - for _ in range(row_nnz): - acc_block = tl.zeros((BLOCKSIZE_ROW, BLOCKSIZE_COL), dtype=acc_dtype) - - # find column block index - col_block = tl.load(col_index_nnz_ptr) - - for k_tile in range(0, k, TILE_K): - k_offsets = k_tile + k_tile_arange - mask_k = k_offsets < k - - mat1_block = tl.load( - mat1_block_ptrs + mat1_col_block_stride * k_offsets[None, :], - mask=mask_k[None, :], - other=0.0, - ) - - mat2_block = tl.load( - mat2_block_ptrs - + mat2_tiled_col_stride * col_block - + mat2_row_block_stride * k_offsets[:, None], - mask=mask_k[:, None], - other=0.0, - ) - - acc_block += tl.dot( - mat1_block, mat2_block, allow_tf32=allow_tf32, out_dtype=acc_dtype - ) - - if IS_BETA_ZERO: - acc_block *= alpha - else: - acc_block = alpha * acc_block + beta * tl.load(values_block_ptrs) - - # write result - tl.store(values_block_ptrs, acc_block.to(values_ptr.dtype.element_ty)) - - # advance val/col_index ptrs to the next block in the row. - values_block_ptrs += values_nnz_stride - col_index_nnz_ptr += col_indices_stride - - @triton.jit - def _bsr_strided_dense_rowspace_kernel( - # values prologue - values_ptr, - values_batch_stride, - values_nnz_stride, - values_row_block_stride, - values_col_block_stride, - # values epilogue - # crow_indices prologue - crow_indices_ptr, - crow_indices_batch_stride, - crow_indices_stride, - # crow_indices epilogue - # col_indices prologue - col_indices_ptr, - col_indices_batch_stride, - col_indices_stride, - # col_indices epilogue - # dense prologue - dense_ptr, - dense_batch_stride, - dense_tiled_row_stride, - dense_tiled_col_stride, - dense_row_block_stride, - dense_col_block_stride, - # dense epilogue - # output prologue - output_ptr, - output_batch_stride, - output_tiled_row_stride, - output_tiled_col_stride, - output_row_block_stride, - output_col_block_stride, - # output epilogue - # - # gh-113754: Always keep all constexpr arguments at the end of - # triton kernel arguments list because with triton 2.1 or - # earlier non-contiguous outputs will corrupt CUDA state due - # to a triton bug (fixed in openai/triton#2262). 
- BLOCKSIZE_ROW: tl.constexpr, - BLOCKSIZE_COL: tl.constexpr, - acc_dtype: tl.constexpr, - allow_tf32: tl.constexpr, - GROUP_SIZE_ROW: tl.constexpr, - ): - batch_pid = tl.program_id(axis=2) - row_block_pid = tl.program_id(axis=0) - col_block_pid = tl.program_id(axis=1) - n_block_rows = tl.num_programs(axis=0) - n_block_cols = tl.num_programs(axis=1) - - row_block_pid, col_block_pid = tl.swizzle2d( - row_block_pid, col_block_pid, n_block_rows, n_block_cols, GROUP_SIZE_ROW - ) - - crow_indices_offset_ptr = ( - crow_indices_ptr - + crow_indices_batch_stride * batch_pid - + crow_indices_stride * row_block_pid - ) - nnz_offset = tl.load(crow_indices_offset_ptr) - nnz_offset_next = tl.load(crow_indices_offset_ptr + crow_indices_stride) - - # Compute nnz for the row with number row_block_pid. - # If it is zero, skip the row. - row_nnz = nnz_offset_next - nnz_offset - if row_nnz == 0: - return - - row_block_arange = tl.arange(0, BLOCKSIZE_ROW) - col_block_arange = tl.arange(0, BLOCKSIZE_COL) - - # Pointers are set to the first block of the current row. - values_block_ptrs = ( - values_ptr - + values_batch_stride * batch_pid - + values_nnz_stride * nnz_offset - + values_row_block_stride * row_block_arange[:, None] - + values_col_block_stride * col_block_arange[None, :] - ) - - # NOTE: dense is advanced into all dimensions but the tiled row one. - # That will be advanced in the loop according to values in col_indices. - dense_block_ptrs = ( - dense_ptr - + dense_batch_stride * batch_pid - + dense_tiled_col_stride * col_block_pid - + dense_row_block_stride * col_block_arange[:, None] - + dense_col_block_stride * row_block_arange[None, :] - ) - - # Pointers are set to exact write-to locations - output_ptrs = ( - output_ptr - + output_batch_stride * batch_pid - + output_tiled_row_stride * row_block_pid - + output_tiled_col_stride * col_block_pid - + output_row_block_stride * row_block_arange[:, None] - + output_col_block_stride * row_block_arange[None, :] - ) - - # Set pointer to the first nonzero element in the current row - col_index_nnz_ptr = ( - col_indices_ptr - + col_indices_batch_stride * batch_pid - + col_indices_stride * nnz_offset - ) - - output_acc_block = tl.zeros((BLOCKSIZE_ROW, BLOCKSIZE_COL), dtype=acc_dtype) - for _ in range(row_nnz): - values_block = tl.load(values_block_ptrs) - - # find which row of dense needs to get loaded - # for multiplication with values_block. 
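# Added for clarity (editorial comment, not in the original kernel): the
# accumulation loop below walks the stored blocks of the current BSR row;
# for each block it loads the dense tile selected by the block's column
# index and accumulates values_block @ dense_block into the output tile,
# so each program instance produces one output tile of BSR @ dense.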
- dense_row_idx = tl.load(col_index_nnz_ptr) - dense_block = tl.load( - dense_block_ptrs + dense_tiled_row_stride * dense_row_idx - ) - - # do block mm - output_acc_block += tl.dot( - values_block, dense_block, allow_tf32=allow_tf32, out_dtype=acc_dtype - ) - - # move val/col_index ptrs to the next block in the row - values_block_ptrs += values_nnz_stride - col_index_nnz_ptr += col_indices_stride - - # write back the result - tl.store(output_ptrs, output_acc_block.to(output_ptr.dtype.element_ty)) - - def _run_sampled_addmm_kernel( - alpha, - beta, - is_beta_zero, - blocksize, - k, - tile_k, - values, - crow_indices, - col_indices, - mat1, - mat2, - max_grid, - ): - n_batches = values.size(0) - n_block_rows = crow_indices.size(-1) - 1 - - full_grid = (n_batches, n_block_rows) - if max_grid is not None: - grid_blocks = tuple(max_grid[:2][::-1]) + (None,) * (2 - len(max_grid[:2])) - else: - grid_blocks = None - tensor_dims_map = { - values: (0, None), - crow_indices: (0, -1), - col_indices: (0, None), - mat1: (0, -4), - mat2: (0, None), - } - if values.dtype in (torch.half, torch.bfloat16): - acc_dtype = tl.float32 - allow_tf32 = True - else: - acc_dtype = tl.float64 - allow_tf32 = False - - def kernel(grid, *sliced_tensors): - _sampled_addmm_kernel[grid]( - alpha, - beta, - is_beta_zero, - *blocksize, - k, - tile_k, - *ptr_stride_extractor(*sliced_tensors), - acc_dtype=acc_dtype, - allow_tf32=allow_tf32, - num_stages=1, - num_warps=4, - ) - - launch_kernel(kernel, tensor_dims_map, full_grid, grid_blocks) - - def sampled_addmm( - input: torch.Tensor, - mat1: torch.Tensor, - mat2: torch.Tensor, - *, - beta=1.0, - alpha=1.0, - out: Optional[torch.Tensor] = None, - skip_checks: bool = False, - max_grid: Optional[tuple[Optional[int], Optional[int], Optional[int]]] = None, - ): - f_name = "sampled_addmm" - - check_bsr_layout(f_name, input) - input_broadcasted = broadcast_batch_dims_bsr(f_name, input, mat1, mat2) - - if not skip_checks: - check_device(f_name, mat1, input.device) - check_device(f_name, mat2, input.device) - if beta != 0.0 and input.dtype is torch.bool: - check( - False, - f"{f_name}(): having beta == {beta} not equal to 0.0 with boolean mask is not allowed.", - ) - if input.dtype is not torch.bool: - check_dtype(f_name, mat1, input.dtype) - check_dtype(f_name, mat2, input.dtype) - else: - check_dtype(f_name, mat1, mat2.dtype) - check_mm_compatible_shapes(f_name, mat1, mat2) - if out is not None: - check_bsr_layout(f_name, out) - check_device(f_name, out, mat1.device) - check_dtype(f_name, out, input.dtype) - check( - out.shape == input_broadcasted.shape and out._nnz() == input._nnz(), - f"{f_name}(): Expects `out` to be of shape {input_broadcasted.shape} " - f"and with nnz equal to {input_broadcasted._nnz()} " - f"but got out.shape = {out.shape} and out.nnz = {out._nnz()}", - ) - - if out is None: - out = input_broadcasted.to(mat1.dtype, copy=True) - else: - out.copy_(input_broadcasted) - - if out.numel() == 0 or out._nnz() == 0: - return out - - blocksize = out.values().shape[-2:] - k = mat1.size(-1) - - # NOTE: (m, 0) @ (0, n) == zeros(m, n) - if alpha == 0.0 or k == 0: - out.values().mul_(beta) - return out - - # prepare inputs by reshaping them to be kernel-compatible - out_backup = out - crow_indices, col_indices, values, mat1, mat2 = prepare_inputs(out, mat1, mat2) - - mat1 = tile_to_blocksize(mat1, (blocksize[0], k)) - mat2 = tile_to_blocksize(mat2, (k, blocksize[1])) - tile_k = max(*blocksize) - - _run_sampled_addmm_kernel( - alpha, - beta, - beta == 0.0, - blocksize, - k, - 
tile_k, - values, - crow_indices, - col_indices, - mat1, - mat2, - max_grid, - ) - - # If nnz x block strides are not the same in out_backup.values and values, - # it means that out_backup.values and values are not the views of each other, - # so we have to copy. - if out_backup.values().stride()[-3:] != values.stride()[-3:]: - out_backup.values().copy_(values.reshape(out_backup.values().shape)) - return out_backup - - def bsr_dense_mm( - bsr: torch.Tensor, - dense: torch.Tensor, - *, - out: Optional[torch.Tensor] = None, - skip_checks: bool = False, - max_grid: Optional[tuple[Optional[int], Optional[int], Optional[int]]] = None, - meta: Optional[dict] = None, - ): - f_name = "bsr_dense_mm" - m, _kl = bsr.shape[-2:] - if not skip_checks: - check_bsr_layout(f_name, bsr) - check_device(f_name, bsr, dense.device) - check_dtype(f_name, bsr, dense.dtype, (torch.int8,)) - check_mm_compatible_shapes(f_name, bsr, dense) - - n = dense.size(-1) - row_block, col_block = bsr.values().shape[-2:] - check_blocksize(f_name, (row_block, col_block)) - check( - not n % 16, - f"{f_name}(): dense.size(-1) == {n} should be divisible by 16", - ) - else: - _kr, n = dense.shape[-2:] - - original_batch_dims_broadcasted = broadcast_batch_dims(f_name, bsr, dense) - - if out is not None and not skip_checks: - expected_out_shape = original_batch_dims_broadcasted + (m, n) - check( - out.shape == expected_out_shape, - "bsr_dense_mm(): `out` argument has wrong shape, " - f"expected {expected_out_shape}, but got {out.shape}.", - ) - check( - out.is_contiguous() or out.transpose(-2, -1).is_contiguous(), - "bsr_dense_mm(): only row-major/col-major `out` arguments are supported, " - "i.e. (out.is_contiguous() or out.transpose(-2, -1).is_contiguous()) " - "should be True.", - ) - - # Allocate out - if out is None: - out = dense.new_empty(original_batch_dims_broadcasted + (m, n)) - - # Short circuit if lhs is zero - if bsr._nnz() == 0: - return out.zero_() - - # with beta==0, addmm ignores input content, so we can use out - # as a placeholder for input because their shapes match: - return bsr_dense_addmm(out, bsr, dense, alpha=1, beta=0, out=out) - - @triton.jit - def _bsr_softmax_kernel( - crow_indices_ptr, - crow_indices_batch_stride, - crow_indices_stride, - values_ptr, - values_batch_stride, - values_row_block_stride, - values_nnz_col_block_stride, - row_block, - col_block, - MAX_ROW_NNZ: tl.constexpr, - TILE: tl.constexpr, - ): - batch_pid = tl.program_id(axis=2) - row_block_offset_pid = tl.program_id(axis=1) - row_block_pid = tl.program_id(axis=0) - - crow_indices_offset_ptr = ( - crow_indices_ptr - + crow_indices_batch_stride * batch_pid - + crow_indices_stride * row_block_pid - ) - nnz_offset = tl.load(crow_indices_offset_ptr) - nnz_offset_next = tl.load(crow_indices_offset_ptr + crow_indices_stride) - - # Compute nnz for the row with number row_block_pid. - # If it is zero, skip the row. 
- row_nnz = nnz_offset_next - nnz_offset - if row_nnz == 0: - return - - row_arange = tl.arange(0, TILE) - mask = row_arange < row_nnz * col_block - - curr_row_values_ptrs = ( - values_ptr - + values_batch_stride * batch_pid - + values_row_block_stride * row_block_offset_pid - + nnz_offset * col_block - ) - - # find max in the row - row_tile = tl.load( - curr_row_values_ptrs + row_arange, mask=mask, other=-float("inf") - ).to(tl.float32) - max_row_value = tl.max(row_tile, axis=0) - for _ in range(TILE, MAX_ROW_NNZ, TILE): - row_arange += TILE - mask = row_arange < row_nnz * col_block - row_tile = tl.load( - curr_row_values_ptrs + row_arange, mask=mask, other=-float("inf") - ).to(tl.float32) - curr_max_row_value = tl.max(row_tile, axis=0) - max_row_value = tl.where( - max_row_value > curr_max_row_value, max_row_value, curr_max_row_value - ) - - # find denominator for stable softmax - num = tl.exp(row_tile - max_row_value) - denom = tl.sum(num, axis=0) - for _ in range(TILE, MAX_ROW_NNZ, TILE): - row_arange -= TILE - mask = row_arange < row_nnz * col_block - row_tile = tl.load( - curr_row_values_ptrs + row_arange, mask=mask, other=-float("inf") - ).to(tl.float32) - num = tl.exp(row_tile - max_row_value) - denom += tl.sum(num, axis=0) - - # populate output - tl.store( - curr_row_values_ptrs + row_arange, - (num / denom).to(values_ptr.dtype.element_ty), - mask=mask, - ) - for _ in range(TILE, MAX_ROW_NNZ, TILE): - row_arange += TILE - mask = row_arange < row_nnz * col_block - row_tile = tl.load( - curr_row_values_ptrs + row_arange, mask=mask, other=-float("inf") - ).to(tl.float32) - num = tl.exp(row_tile - max_row_value) - tl.store( - curr_row_values_ptrs + row_arange, - (num / denom).to(values_ptr.dtype.element_ty), - mask=mask, - ) - - def bsr_softmax(input, max_row_nnz=None): - f_name = "bsr_softmax" - - check_bsr_layout(f_name, input) - check_dtype(f_name, input, input.dtype) - - if input._nnz() == 0 or input.numel() == 0: - return input.clone() - - m, n = input.shape[-2:] - nnz = input._nnz() - row_block, col_block = input.values().shape[-2:] - - if max_row_nnz is None: - max_row_nnz = triton.next_power_of_2(n) - else: - max_row_nnz = triton.next_power_of_2(max_row_nnz) - - crow_indices = input.crow_indices().unsqueeze(0).flatten(0, -2) - # reshape values from - # (b1, ..., bn, nnz, row_block, col_block) to - # (b1 * ... * bn, row_block, nnz * col_block). - # This simplifies batch dim manipulation and unlocks - # the possibility to access all nnzs in any given row. - if input.values().transpose(-3, -2).is_contiguous(): - # Need to clone to avoid `contiguous` returning a view. - values = input.values().clone() - else: - values = input.values() - values = ( - values.transpose(-3, -2) - .contiguous() - .unsqueeze(0) - .flatten(0, -4) - .reshape(-1, row_block, nnz * col_block) - ) - full_grid = (values.shape[0], row_block, m // row_block) - grid_blocks = None - tensor_dims_map = { - # We span nnz number of blocks, not nnz + 1, - # hence crow_indices[..., :-1] - crow_indices[..., :-1]: (0, None, -1), - values: (0, None, None), - } - - def kernel(grid, *sliced_tensors): - _bsr_softmax_kernel[grid]( - *ptr_stride_extractor(*sliced_tensors), - row_block, - col_block, - max_row_nnz, - # Triton's max numel is bounded by 2 ** 17. 
- min(2**17, max_row_nnz), - ) - - launch_kernel(kernel, tensor_dims_map, full_grid, grid_blocks) - - values = ( - values.reshape(-1, row_block, nnz, col_block) - .transpose(-3, -2) - .reshape(*input.values().shape) - ) - - return torch.sparse_compressed_tensor( - input.crow_indices().clone(), - input.col_indices().clone(), - values, - size=input.shape, - layout=input.layout, - ) - - def _scaled_dot_product_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_mask: Optional[torch.Tensor], - dropout_p: float = 0.0, - is_causal: bool = False, - scale: Optional[float] = None, - ): - f_name = "_scaled_dot_product_attention" - check(not is_causal, f"{f_name}(): is_causal == True is not supported.") - check(attn_mask is not None, f"{f_name}(): attn_mask == None is not supported.") - assert attn_mask is not None - - check( - attn_mask.layout == torch.sparse_bsr, - f"{f_name}(): " - f"attn_mask.layout must be {torch.sparse_bsr}, but got " - f"attn_mask.layout == {attn_mask.layout}.", - ) - - check_device(f_name, key, query.device) - check_device(f_name, value, query.device) - check_device(f_name, attn_mask, query.device) - - check_dtype(f_name, key, query.dtype) - check_dtype(f_name, value, query.dtype) - if attn_mask.dtype is not torch.bool: - check_dtype(f_name, attn_mask, query.dtype) - - sdpa = sampled_addmm( - attn_mask, query, key.transpose(-2, -1), beta=0.0, skip_checks=False - ) - if scale is None and query.size(-1) == 0 or scale == 0.0: - check( - False, - f"{f_name}(): current value of scale == {scale} " - "results in division by zero.", - ) - scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale - sdpa.values().mul_(scale_factor) - sdpa = bsr_softmax(sdpa) - torch.nn.functional.dropout(sdpa.values(), p=dropout_p, inplace=True) - sdpa = bsr_dense_mm(sdpa, value) - return sdpa - - @triton.jit - def _scatter_mm2_kernel( - M: tl.constexpr, - K: tl.constexpr, - N: tl.constexpr, - blocks_ptr, - blocks_stride_P, - blocks_stride_M, - blocks_stride_K, - others_ptr, - others_stride_Q, - others_stride_K, - others_stride_N, - accumulators_ptr, - accumulators_stride_R, - accumulators_stride_M, - accumulators_stride_N, - pq_offsets_ptr, - pq_offsets_stride, - pq_ptr, - pq_stride_T, - pq_stride_1, - dot_out_dtype: tl.constexpr, - TILE_M: tl.constexpr, - TILE_N: tl.constexpr, - allow_tf32: tl.constexpr, - ): - Ms = M // TILE_M - - pid_t = tl.program_id(axis=0) - - pid = tl.program_id(axis=1) - pid_m = pid // Ms - pid_n = pid % Ms - - rm = pid_m * TILE_M + tl.arange(0, TILE_M) - rn = pid_n * TILE_N + tl.arange(0, TILE_N) - rk = tl.arange(0, K) - - A_ptr = blocks_ptr + ( - rm[:, None] * blocks_stride_M + rk[None, :] * blocks_stride_K - ) - B_ptr = others_ptr + ( - rk[:, None] * others_stride_K + rn[None, :] * others_stride_N - ) - - g0 = tl.load(pq_offsets_ptr + pid_t * pq_offsets_stride) - g1 = tl.load(pq_offsets_ptr + (pid_t + 1) * pq_offsets_stride) - - if g0 == g1: - return - - acc_block = tl.zeros((TILE_M, TILE_N), dtype=dot_out_dtype) - - for i in range(g0, g1): - p = tl.load(pq_ptr + i * pq_stride_T) - q = tl.load(pq_ptr + i * pq_stride_T + pq_stride_1) - A = tl.load(A_ptr + p * blocks_stride_P) - B = tl.load(B_ptr + q * others_stride_Q) - acc_block += tl.dot(A, B, out_dtype=dot_out_dtype, allow_tf32=allow_tf32) - - C_ptr = ( - accumulators_ptr - + pid_t * accumulators_stride_R - + ( - rm[:, None] * accumulators_stride_M - + rn[None, :] * accumulators_stride_N - ) - ) - tl.store(C_ptr, acc_block.to(accumulators_ptr.dtype.element_ty)) - - def 
_scatter_mm2( - blocks: torch.Tensor, - others: torch.Tensor, - pq_offsets: torch.Tensor, - pq_indices: torch.Tensor, - accumulators: torch.Tensor, - ): - _P, M, K = blocks.shape - _Q, _, N = others.shape - - meta = dict( - TILE_M=max(16, M // 4), TILE_N=max(16, N // 4), num_stages=1, num_warps=2 - ) - - def grid(META): - return ( - pq_offsets.shape[0] - 1, - triton.cdiv(M, META["TILE_M"]) * triton.cdiv(N, META["TILE_N"]), - 1, - ) - - dot_out_dtype = { - torch.float16: tl.float32, - torch.bfloat16: tl.float32, - torch.float32: tl.float64, - torch.float64: tl.float64, - }[accumulators.dtype] - if "allow_tf32" not in meta: - meta.update(allow_tf32=dot_out_dtype == tl.float32) - _scatter_mm2_kernel[grid]( - M, - K, - N, - blocks, - blocks.stride(0), - blocks.stride(1), - blocks.stride(2), - others, - others.stride(0), - others.stride(1), - others.stride(2), - accumulators, - accumulators.stride(0), - accumulators.stride(1), - accumulators.stride(2), - pq_offsets, - pq_offsets.stride(0), - pq_indices, - pq_indices.stride(0), - pq_indices.stride(1), - dot_out_dtype=dot_out_dtype, - **meta, - ) - - @triton.jit - def _scatter_mm6_kernel( - nbatches, - Ms, - Ks: tl.constexpr, - N, - blocks_ptr, - blocks_stride_P, - blocks_stride_M, - blocks_stride_K, - others_ptr, - others_stride_B, - others_stride_K, - others_stride_N, - accumulators_ptr, - accumulators_stride_B, - accumulators_stride_M, - accumulators_stride_N, - c_indices_ptr, - r_offsets_ptr, - p_offsets_ptr, - q_offsets_ptr, - is_compressed: tl.constexpr, - dot_out_dtype: tl.constexpr, - SPLIT_N: tl.constexpr, - TILE_M: tl.constexpr, - TILE_N: tl.constexpr, - GROUP_SIZE: tl.constexpr, - allow_tf32: tl.constexpr, - ): - Ns = N // SPLIT_N - BLOCKS_M = Ms // TILE_M - BLOCKS_N = Ns // TILE_N - - pid_t_ = tl.program_id(axis=0) - pid = tl.program_id(axis=1) - pid_b = pid_t_ % nbatches - pid_t = pid_t_ // nbatches - - num_pid_in_group = GROUP_SIZE * BLOCKS_N - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE - group_size_m = min(BLOCKS_M - first_pid_m, GROUP_SIZE) - pid_m = first_pid_m + (pid % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - rm = pid_m * TILE_M + tl.arange(0, TILE_M) - rn = pid_n * TILE_N + tl.arange(0, TILE_N) - rk = tl.arange(0, Ks) - A_ptr = blocks_ptr + ( - rm[:, None] * blocks_stride_M + rk[None, :] * blocks_stride_K - ) - B_ptr = ( - others_ptr - + pid_b * others_stride_B - + (rk[:, None] * others_stride_K + rn[None, :] * others_stride_N) - ) - - # When is_compressed is True, r is the only variable that - # depends on pid_t. This property allows sorting r values - # before calling the kernel. The sorting of r is equivalent to - # defining swizzle operator outside of the kernel. 
- r = tl.load(r_offsets_ptr + pid_t) - - if is_compressed: - m = (r // N) // Ms - n = (r % N) // Ns - r0 = tl.load(c_indices_ptr + m) - r1 = tl.load(c_indices_ptr + m + 1) - g0 = n * r1 + (SPLIT_N - n) * r0 - nnz = r1 - r0 - else: - g0 = tl.load(c_indices_ptr + pid_t) - g1 = tl.load(c_indices_ptr + pid_t + 1) - nnz = g1 - g0 - - q_ptr = q_offsets_ptr + g0 - acc_block = tl.zeros((TILE_M, TILE_N), dtype=dot_out_dtype) - - if is_compressed: - A_ptr += r0 * blocks_stride_P # type: ignore[possibly-undefined] - for _ in range(nnz): - q = tl.load(q_ptr) - B = tl.load(B_ptr + q) - A = tl.load(A_ptr) - acc_block += tl.dot( - A, B, out_dtype=dot_out_dtype, allow_tf32=allow_tf32 - ) - A_ptr += blocks_stride_P - q_ptr += 1 - else: - p_ptr = p_offsets_ptr + g0 - for _ in range(nnz): - q = tl.load(q_ptr) - B = tl.load(B_ptr + q) - p = tl.load(p_ptr) - A = tl.load(A_ptr + p * blocks_stride_P) - p_ptr += 1 - q_ptr += 1 - acc_block += tl.dot( - A, B, out_dtype=dot_out_dtype, allow_tf32=allow_tf32 - ) - - C_ptr = ( - accumulators_ptr - + r - + pid_b * accumulators_stride_B - + ( - rm[:, None] * accumulators_stride_M - + rn[None, :] * accumulators_stride_N - ) - ) - tl.store(C_ptr, acc_block.to(accumulators_ptr.dtype.element_ty)) - - def _scatter_mm6( - blocks: torch.Tensor, - others: torch.Tensor, - c_indices: torch.Tensor, - r_offsets: torch.Tensor, - p_offsets: torch.Tensor, - q_offsets: torch.Tensor, - meta: dict, - accumulators: torch.Tensor, - force_contiguous: bool = True, - ): - SPLIT_N = meta["SPLIT_N"] - _P, Ms, Ks = blocks.shape - B, _K, N = others.shape - B_, _M, N_ = accumulators.shape - assert N_ == N - Ns = N // SPLIT_N - assert B_ == B - - def grid(META): - return ( - r_offsets.shape[0] * B, - triton.cdiv(Ms, META["TILE_M"]) * triton.cdiv(Ns, META["TILE_N"]), - ) - - dot_out_dtype = { - torch.float16: tl.float32, - torch.bfloat16: tl.float32, - torch.float32: tl.float64, - torch.float64: tl.float64, - }[accumulators.dtype] - if "allow_tf32" not in meta: - meta.update(allow_tf32=dot_out_dtype == tl.float32) - - assert c_indices.stride(0) == 1 - assert r_offsets.stride(0) == 1 - assert p_offsets.stride(0) == 1 - assert q_offsets.stride(0) == 1 - - # Re non-contiguous tensor arguments. Sometimes triton kernel - # launches may fail with - # - # RuntimeError: Triton Error [CUDA]: an illegal memory access was encountered - # - # that appears to be case when the size of a non-contiguous - # tensor argument is larger than a certain threshold. Could - # this be related to shared memory or L1 cache size of a GPU - # card? In anycase, ensuring that tensor arguments are - # contiguous seems to avoid the above exception. So, in the - # following we'll always convert tensor arguments to - # C-contiguous tensors. 
- - if force_contiguous: - blocks = blocks.contiguous() - others = others.contiguous() - if not accumulators.is_contiguous(): - accumulators_ = accumulators.contiguous() - else: - accumulators_ = accumulators - else: - accumulators_ = accumulators - - _scatter_mm6_kernel[grid]( - B, - Ms, - Ks, - N, - blocks, - blocks.stride(0), - blocks.stride(1), - blocks.stride(2), - others, - others.stride(0), - others.stride(1), - others.stride(2), - accumulators_, - accumulators_.stride(0), - accumulators_.stride(1), - accumulators_.stride(2), - c_indices, - r_offsets, - p_offsets, - q_offsets, - dot_out_dtype=dot_out_dtype, - **meta, - ) - - if force_contiguous and not accumulators.is_contiguous(): - accumulators.copy_(accumulators_) - - def next_power_of_two(n): - assert n > 0 - return 2 ** (n.bit_length()) - @triton.jit def _bsr_strided_addmm_kernel( # values prologue @@ -2557,10 +887,4 @@ def _bsr_strided_addmm_kernel( tl.store(output_ptrs, output_acc_block.to(output_ptr.dtype.element_ty)) else: - bsr_softmax = None # type: ignore[assignment] - bsr_dense_mm = None # type: ignore[assignment] - sampled_addmm = None # type: ignore[assignment] - _scaled_dot_product_attention = None # type: ignore[assignment] - _scatter_mm2 = None # type: ignore[assignment] - _scatter_mm6 = None # type: ignore[assignment] _bsr_strided_addmm_kernel = None # type: ignore[assignment] diff --git a/torchao/prototype/sparsity/superblock/blocksparse.py b/torchao/sparsity/blocksparse.py similarity index 100% rename from torchao/prototype/sparsity/superblock/blocksparse.py rename to torchao/sparsity/blocksparse.py diff --git a/torchao/prototype/sparsity/superblock/supermask.py b/torchao/sparsity/supermask.py similarity index 100% rename from torchao/prototype/sparsity/superblock/supermask.py rename to torchao/sparsity/supermask.py From 5e25b8b0b11c8a017d1255cef6f72fc7d433afee Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Thu, 23 Jan 2025 16:14:49 -0800 Subject: [PATCH 08/23] wip --- torchao/_models/llama/generate.py | 20 ++----------------- .../superblock => sparsity}/bsr_triton_ops.py | 0 2 files changed, 2 insertions(+), 18 deletions(-) rename torchao/{prototype/sparsity/superblock => sparsity}/bsr_triton_ops.py (100%) diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py index 834fe0b4ee..902bca50e3 100644 --- a/torchao/_models/llama/generate.py +++ b/torchao/_models/llama/generate.py @@ -799,7 +799,7 @@ def ffn_or_attn_only(mod, fqn): if "bsr" in sparsity: # Apply Supermask to get sparse weights - from torchao.prototype.sparsity.superblock.supermask import SupermaskLinear + from torchao.sparsity.supermask import SupermaskLinear sparsify_( model, lambda x: SupermaskLinear.from_linear(x, @@ -809,26 +809,10 @@ def ffn_or_attn_only(mod, fqn): filter_fn=ffn_only, ) - from torchao.prototype.sparsity.superblock.blocksparse import block_sparse_weight + from torchao.sparsity.blocksparse import block_sparse_weight sparsify_(model, block_sparse_weight(blocksize=64), filter_fn=ffn_only) - - # from torchao.prototype.sparsity.superblock._triton_ops_meta import optimize_bsr_dense_addmm - # for M, K, N in [(14336, 4096, 8192), (4096, 14336, 8192)]: - # optimize_bsr_dense_addmm( - # M, - # K, - # N, - # 64, - # 64, - # beta=0, - # alpha=1, - # sparsity=0.9, - # dtype=torch.bfloat16, - # opname="bsr_dense_addmm", - # verbose=True, - # ) model_size = get_model_size_in_bytes(model, ignore_embeddings=True) / 1e9 diff --git a/torchao/prototype/sparsity/superblock/bsr_triton_ops.py 
b/torchao/sparsity/bsr_triton_ops.py
similarity index 100%
rename from torchao/prototype/sparsity/superblock/bsr_triton_ops.py
rename to torchao/sparsity/bsr_triton_ops.py

From fe655a2bf8817246efc726da2e12eb5505639638 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Thu, 23 Jan 2025 16:25:38 -0800
Subject: [PATCH 09/23] added tests

---
 test/sparsity/test_bsr.py                   | 69 ++++++++++++++++
 torchao/_models/llama/benchmark_results.txt | 90 +++++++++++++++++++++
 torchao/_models/llama/bsr_benchmarks.sh     |  7 ++
 torchao/sparsity/blocksparse.py             |  7 +-
 4 files changed, 168 insertions(+), 5 deletions(-)
 create mode 100644 test/sparsity/test_bsr.py
 create mode 100644 torchao/_models/llama/bsr_benchmarks.sh

diff --git a/test/sparsity/test_bsr.py b/test/sparsity/test_bsr.py
new file mode 100644
index 0000000000..e9a85e9fd6
--- /dev/null
+++ b/test/sparsity/test_bsr.py
@@ -0,0 +1,69 @@
+import copy
+import logging
+import unittest
+
+import torch
+from torch import nn
+from torch.testing._internal import common_utils
+
+from torchao.dtypes import MarlinSparseLayout, SemiSparseLayout
+from torchao.quantization.quant_api import (
+    int4_weight_only,
+    int8_dynamic_activation_int8_weight,
+    quantize_,
+)
+from torchao.sparsity import apply_fake_sparsity, semi_sparse_weight, sparsify_
+from torchao.utils import (
+    TORCH_VERSION_AT_LEAST_2_3,
+    TORCH_VERSION_AT_LEAST_2_4,
+    TORCH_VERSION_AT_LEAST_2_5,
+    TORCH_VERSION_AT_LEAST_2_6,
+)
+
+logging.basicConfig(
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
+)
+
+
+class TestBlockSparseWeight(common_utils.TestCase):
+    @unittest.skipIf(
+        not TORCH_VERSION_AT_LEAST_2_4,
+        "pytorch 2.4+ feature due to need for custom op support",
+    )
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @common_utils.parametrize("compile", [True, False])
+    def test_sparse(self, compile):
+        input = torch.rand((1024, 1024)).half().cuda()
+        model = (
+            nn.Sequential(
+                nn.Linear(1024, 2048),
+                nn.Linear(2048, 1024),
+            )
+            .half()
+            .cuda()
+            .eval()
+        )
+
+        from torchao.sparsity.utils import create_block_sparse_tensor
+
+        M, N = model[0].weight.shape
+        model[0].weight.data = create_block_sparse_tensor(M, N, 64, 0.5, torch.float16)
+        M, N = model[1].weight.shape
+        model[1].weight.data = create_block_sparse_tensor(M, N, 64, 0.5, torch.float16)
+        dense_result = model(input)
+
+        from torchao.sparsity.blocksparse import (
+            block_sparse_weight,
+        )
+
+        sparsify_(model, block_sparse_weight(blocksize=64))
+        # if compile:
+        #     model = torch.compile(model)
+        sparse_result = model(input)
+
+        torch.testing.assert_close(dense_result, sparse_result, rtol=1e-3, atol=1e-3)
+
+common_utils.instantiate_parametrized_tests(TestBlockSparseWeight)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/torchao/_models/llama/benchmark_results.txt b/torchao/_models/llama/benchmark_results.txt
index d59c5f552e..9edfe7bb3f 100644
--- a/torchao/_models/llama/benchmark_results.txt
+++ b/torchao/_models/llama/benchmark_results.txt
@@ -50,3 +50,93 @@ OTHER BENCHMARKS
 20240910010056, tok/s= 47.85, mem/s= 213.24 GB/s, peak_mem=11.85 GB, model_size= 4.46 GB quant: uintx-4-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization uintx-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8
20240910010647, tok/s= 34.83, mem/s= 261.42 GB/s, peak_mem=14.99 GB, model_size= 7.51 GB quant: uintx-2-8, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization uintx-2-8 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 20240910110958, tok/s=223.95, mem/s= 682.88 GB/s, peak_mem= 5.59 GB, model_size= 3.05 GB quant: sparse-marlin, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 + +20250115111811, tok/s=132.58, tok/s_decode=134.92, ttft=0.0256, mem/s=1989.99 GB/s, peak_mem=16.50 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250115111955, tok/s=132.39, tok/s_decode=134.90, ttft=0.0274, mem/s=1987.19 GB/s, peak_mem=16.50 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250115112851, tok/s=102.36, tok/s_decode=106.53, ttft=0.0759, mem/s= 499.36 GB/s, peak_mem=10.11 GB, model_size= 4.88 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250115113023, tok/s=132.40, tok/s_decode=134.92, ttft=0.0275, mem/s=1987.31 GB/s, peak_mem=16.50 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250115113154, tok/s=102.34, tok/s_decode=106.46, ttft=0.0748, mem/s= 499.29 GB/s, peak_mem=10.11 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250115114035, tok/s= 82.15, tok/s_decode=107.69, ttft=0.5768, mem/s=1233.05 
GB/s, peak_mem=36.46 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250115114623, tok/s= 72.78, tok/s_decode= 88.50, ttft=0.4874, mem/s= 355.08 GB/s, peak_mem=18.27 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250115114936, tok/s=132.34, tok/s_decode=134.85, ttft=0.0274, mem/s=1986.47 GB/s, peak_mem=16.50 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250115115115, tok/s=102.81, tok/s_decode=106.89, ttft=0.0735, mem/s= 501.58 GB/s, peak_mem=10.11 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250115115406, tok/s=132.39, tok/s_decode=134.90, ttft=0.0274, mem/s=1987.10 GB/s, peak_mem=16.50 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250115115503, tok/s=132.41, tok/s_decode=134.91, ttft=0.0273, mem/s=1987.40 GB/s, peak_mem=16.50 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250115120048, tok/s=132.39, tok/s_decode=134.93, ttft=0.0277, mem/s=1987.15 GB/s, peak_mem=16.50 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 
+20250116123651, tok/s=129.31, tok/s_decode=134.38, ttft=0.0576, mem/s= 630.81 GB/s, peak_mem= 6.94 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250116124020, tok/s=110.09, tok/s_decode=132.55, ttft=0.0607, mem/s= 537.06 GB/s, peak_mem= 6.67 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --profile bsr_trace --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250116124956, tok/s=131.75, tok/s_decode=134.13, ttft=0.0263, mem/s=1977.55 GB/s, peak_mem=16.50 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250116130019, tok/s=130.31, tok/s_decode=134.85, ttft=0.0512, mem/s= 635.66 GB/s, peak_mem= 6.67 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250116130350, tok/s= 20.09, tok/s_decode= 20.32, ttft=0.1054, mem/s= 98.00 GB/s, peak_mem=16.97 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121085551, tok/s= 19.53, tok/s_decode= 19.75, ttft=0.1045, mem/s= 117.50 GB/s, peak_mem=16.97 GB, model_size= 6.02 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121090403, tok/s= 5.14, tok/s_decode= 5.17, ttft=0.1720, mem/s= 30.95 GB/s, peak_mem=27.73 GB, model_size= 6.02 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 
200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121090648, tok/s=132.21, tok/s_decode=134.58, ttft=0.0261, mem/s=1984.43 GB/s, peak_mem=16.50 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121090848, tok/s=132.12, tok/s_decode=134.62, ttft=0.0274, mem/s=1983.16 GB/s, peak_mem=16.50 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121091251, tok/s= 5.13, tok/s_decode= 5.16, ttft=0.1628, mem/s= 30.89 GB/s, peak_mem=27.73 GB, model_size= 6.02 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121091339, tok/s=121.71, tok/s_decode=134.38, ttft=0.0315, mem/s=1826.78 GB/s, peak_mem=16.50 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --profile baseline --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121091826, tok/s= 4.65, tok/s_decode= 5.17, ttft=0.1760, mem/s= 27.99 GB/s, peak_mem=27.73 GB, model_size= 6.02 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121092437, tok/s= 4.65, tok/s_decode= 5.16, ttft=0.1638, mem/s= 27.95 GB/s, peak_mem=27.73 GB, model_size= 6.02 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121093419, tok/s= 4.67, tok/s_decode= 5.17, ttft=0.1728, mem/s= 28.10 GB/s, peak_mem=27.73 GB, model_size= 6.02 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 
--profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121093920, tok/s= 2.65, tok/s_decode= 5.14, ttft=0.5703, mem/s= 15.94 GB/s, peak_mem=27.73 GB, model_size= 6.02 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --profile bsr --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121094143, tok/s= 2.66, tok/s_decode= 5.15, ttft=0.5759, mem/s= 16.03 GB/s, peak_mem=27.73 GB, model_size= 6.02 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --profile bsr --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121100759, tok/s= 2.82, tok/s_decode= 5.14, ttft=0.5244, mem/s= 16.97 GB/s, peak_mem=27.73 GB, model_size= 6.02 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --profile bsr --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121101108, tok/s= 2.85, tok/s_decode= 5.13, ttft=0.5582, mem/s= 17.15 GB/s, peak_mem=27.73 GB, model_size= 6.02 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --profile bsr --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121101728, tok/s= 2.82, tok/s_decode= 5.14, ttft=0.5433, mem/s= 16.98 GB/s, peak_mem=27.73 GB, model_size= 6.02 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --profile bsr --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121102340, tok/s= 81.98, tok/s_decode=107.42, ttft=0.5773, mem/s=1230.47 GB/s, peak_mem=36.46 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121102642, tok/s= 82.03, tok/s_decode=107.47, ttft=0.5765, mem/s=1231.23 GB/s, peak_mem=36.44 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path 
../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121102757, tok/s= 82.08, tok/s_decode=107.51, ttft=0.5758, mem/s=1231.94 GB/s, peak_mem=36.19 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121102943, tok/s= 82.10, tok/s_decode=107.54, ttft=0.5757, mem/s=1232.24 GB/s, peak_mem=36.19 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121103057, tok/s= 82.05, tok/s_decode=107.53, ttft=0.5769, mem/s=1231.59 GB/s, peak_mem=36.19 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121103140, tok/s= 81.98, tok/s_decode=107.50, ttft=0.5785, mem/s=1230.47 GB/s, peak_mem=36.19 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121103512, tok/s= 82.09, tok/s_decode=107.54, ttft=0.5757, mem/s=1232.19 GB/s, peak_mem=36.19 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121104154, tok/s= 82.13, tok/s_decode=107.59, ttft=0.5755, mem/s=1232.79 GB/s, peak_mem=36.19 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121104406, tok/s=119.88, tok/s_decode=151.12, ttft=0.3441, mem/s= 584.77 GB/s, 
peak_mem=12.38 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121104646, tok/s= 82.06, tok/s_decode=107.51, ttft=0.5761, mem/s=1231.68 GB/s, peak_mem=36.19 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121104931, tok/s= 77.10, tok/s_decode=107.58, ttft=0.7870, mem/s=1157.20 GB/s, peak_mem=36.70 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121110040, tok/s= 82.08, tok/s_decode=107.53, ttft=0.5756, mem/s=1232.06 GB/s, peak_mem=36.19 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121110148, tok/s= 76.50, tok/s_decode=107.04, ttft=0.5778, mem/s=1148.24 GB/s, peak_mem=36.19 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--profile baseline_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121110258, tok/s=108.99, tok/s_decode=150.54, ttft=0.3432, mem/s= 531.67 GB/s, peak_mem=12.38 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--profile bsr_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121110904, tok/s=203.34, tok/s_decode=214.37, ttft=0.0499, mem/s= 991.92 GB/s, peak_mem= 6.67 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr 
--checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121111229, tok/s=182.26, tok/s_decode=214.52, ttft=0.0467, mem/s= 889.09 GB/s, peak_mem= 6.67 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121112735, tok/s=182.42, tok/s_decode=214.30, ttft=0.0495, mem/s= 889.89 GB/s, peak_mem= 6.67 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121113757, tok/s=182.86, tok/s_decode=214.41, ttft=0.0494, mem/s= 892.01 GB/s, peak_mem= 6.67 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121114610, tok/s=182.63, tok/s_decode=214.34, ttft=0.0503, mem/s= 890.88 GB/s, peak_mem= 6.67 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121122840, tok/s= 69.40, tok/s_decode= 70.52, ttft=0.0455, mem/s=1824.78 GB/s, peak_mem=27.82 GB, model_size=26.30 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121123616, tok/s=205.26, tok/s_decode=214.95, ttft=0.0434, mem/s=1001.28 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121124112, tok/s=204.65, tok/s_decode=214.91, 
ttft=0.0460, mem/s= 998.30 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121124437, tok/s=205.09, tok/s_decode=215.09, ttft=0.0448, mem/s=1000.48 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121135848, tok/s=123.54, tok/s_decode=134.43, ttft=0.0113, mem/s=1854.27 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile baseline --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121135953, tok/s=182.45, tok/s_decode=214.15, ttft=0.0495, mem/s= 890.04 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121140550, tok/s=123.42, tok/s_decode=134.38, ttft=0.0119, mem/s=1852.55 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile baseline --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250121140658, tok/s=182.52, tok/s_decode=214.21, ttft=0.0502, mem/s= 890.35 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250122135225, tok/s=123.34, tok/s_decode=134.41, ttft=0.0121, mem/s=1851.32 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path 
../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile baseline --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250122135328, tok/s=182.94, tok/s_decode=214.26, ttft=0.0487, mem/s= 892.41 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250122135614, tok/s= 11.63, tok/s_decode= 12.98, ttft=0.1701, mem/s= 56.74 GB/s, peak_mem=17.34 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250122154428, tok/s= 1.08, tok/s_decode= 1.18, ttft=0.1716, mem/s= 5.29 GB/s, peak_mem=17.34 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123101614, tok/s=170.05, tok/s_decode=214.41, ttft=0.0481, mem/s= 829.55 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123101846, tok/s=182.85, tok/s_decode=214.25, ttft=0.0474, mem/s= 891.99 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123113033, tok/s= 11.72, tok/s_decode= 13.29, ttft=0.3043, mem/s= 57.15 GB/s, peak_mem=17.34 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123113307, tok/s=179.71, tok/s_decode=213.42, ttft=0.0530, mem/s= 876.67 GB/s, peak_mem= 6.58 GB, 
model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123113418, tok/s=182.62, tok/s_decode=214.15, ttft=0.0490, mem/s= 890.83 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123113647, tok/s=182.27, tok/s_decode=214.18, ttft=0.0488, mem/s= 889.15 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr_padded_trition.json.gz --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123114432, tok/s=182.23, tok/s_decode=217.09, ttft=0.0581, mem/s= 888.94 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr_padded_trition --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123123324, tok/s=186.08, tok/s_decode=217.45, ttft=0.0475, mem/s= 907.74 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr_padded_trition --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123135016, tok/s=186.23, tok/s_decode=217.48, ttft=0.0468, mem/s= 908.45 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr_padded_trition --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123145640, tok/s=185.20, tok/s_decode=216.60, ttft=0.0494, mem/s= 903.44 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py 
--sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr_padded_trition --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123145919, tok/s=185.33, tok/s_decode=217.23, ttft=0.0493, mem/s= 904.08 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr_padded_trition --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123150712, tok/s= 77.17, tok/s_decode=109.19, ttft=0.5785, mem/s=1158.24 GB/s, peak_mem=36.46 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--profile baseline_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123151351, tok/s= 77.72, tok/s_decode=109.31, ttft=0.5766, mem/s=1166.50 GB/s, peak_mem=36.44 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--profile baseline_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123151839, tok/s=102.64, tok/s_decode=154.70, ttft=0.4758, mem/s= 500.68 GB/s, peak_mem=17.94 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--profile bsr_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123152218, tok/s= 77.85, tok/s_decode=109.37, ttft=0.5770, mem/s=1168.55 GB/s, peak_mem=36.19 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--profile baseline_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123152330, tok/s=102.95, tok/s_decode=154.99, ttft=0.4876, mem/s= 502.20 GB/s, peak_mem=17.67 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile 
--compile_prefill --prefill_size 8192--profile bsr_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123152615, tok/s=102.68, tok/s_decode=154.82, ttft=0.4879, mem/s= 500.90 GB/s, peak_mem=17.67 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--profile bsr_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123153256, tok/s=100.57, tok/s_decode=151.81, ttft=0.4890, mem/s= 490.60 GB/s, peak_mem=17.94 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--profile bsr_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123154843, tok/s=101.30, tok/s_decode=152.23, ttft=0.4892, mem/s= 494.15 GB/s, peak_mem=17.92 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--profile bsr_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123155937, tok/s=101.24, tok/s_decode=152.16, ttft=0.4889, mem/s= 493.86 GB/s, peak_mem=17.92 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--profile bsr_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123160302, tok/s=122.71, tok/s_decode=134.11, ttft=0.0120, mem/s=1841.91 GB/s, peak_mem=16.50 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile baseline --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123160508, tok/s=123.29, tok/s_decode=134.19, ttft=0.0116, mem/s=1850.63 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile baseline --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123160810, 
tok/s=123.16, tok/s_decode=134.16, ttft=0.0118, mem/s=1848.66 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile baseline --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123161148, tok/s=185.76, tok/s_decode=217.48, ttft=0.0502, mem/s= 906.15 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr_padded_trition --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123161835, tok/s=123.16, tok/s_decode=134.13, ttft=0.0118, mem/s=1848.54 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile baseline --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123162259, tok/s=123.05, tok/s_decode=134.13, ttft=0.0122, mem/s=1846.98 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile baseline --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123162406, tok/s=186.18, tok/s_decode=217.73, ttft=0.0470, mem/s= 908.22 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr_padded_trition --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 \ No newline at end of file diff --git a/torchao/_models/llama/bsr_benchmarks.sh b/torchao/_models/llama/bsr_benchmarks.sh new file mode 100644 index 0000000000..b22ad7b800 --- /dev/null +++ b/torchao/_models/llama/bsr_benchmarks.sh @@ -0,0 +1,7 @@ +export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder +export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B + +#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 --profile baseline_prefill +#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 --sparsity bsr --profile bsr_prefill +python generate.py --checkpoint_path 
$CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --profile baseline +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr --profile bsr_padded_trition diff --git a/torchao/sparsity/blocksparse.py b/torchao/sparsity/blocksparse.py index 1d204956c4..0cf674dc00 100644 --- a/torchao/sparsity/blocksparse.py +++ b/torchao/sparsity/blocksparse.py @@ -2,12 +2,11 @@ from typing import Any, Callable, Dict, List, Optional, Tuple import torch -from torch.sparse._triton_ops import broadcast_batch_dims, bsr_dense_addmm, bsr_dense_mm from torch.utils._python_dispatch import return_and_correct_aliasing from torchao.quantization.quant_api import _get_linear_subclass_inserter from torchao.utils import TorchAOBaseTensor -from .bsr_triton_ops import bsr_dense_addmm as torchao_bsr_dense_addmm +from .bsr_triton_ops import bsr_dense_addmm, broadcast_batch_dims aten = torch.ops.aten @@ -103,15 +102,13 @@ def blocksparse_addmm( weight_bsr = torch.sparse_bsr_tensor(crow_indices, col_indices, values, size=(M, K)) N_padded = x_padded.shape[1] out = x_padded.new_empty((M, N_padded)) - torchao_bsr_dense_addmm( + bsr_dense_addmm( out, weight_bsr, x_padded, alpha=1, beta=0, out=out, - # left_alpha=left_alpha, - # right_alpha=right_alpha, ) return out From b12df57c3d02b47d5bfc855d17b000ac26b942b1 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Thu, 23 Jan 2025 16:29:24 -0800 Subject: [PATCH 10/23] cleaned up BSR code --- torchao/_models/llama/bsr_benchmarks.sh | 5 +++-- torchao/_models/llama/generate.py | 13 +++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/torchao/_models/llama/bsr_benchmarks.sh b/torchao/_models/llama/bsr_benchmarks.sh index b22ad7b800..86e5f01d5a 100644 --- a/torchao/_models/llama/bsr_benchmarks.sh +++ b/torchao/_models/llama/bsr_benchmarks.sh @@ -3,5 +3,6 @@ export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B #python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 --profile baseline_prefill #python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 --sparsity bsr --profile bsr_prefill -python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --profile baseline -python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr --profile bsr_padded_trition +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-64 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-32 diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py index 902bca50e3..12da66efe4 100644 --- a/torchao/_models/llama/generate.py +++ b/torchao/_models/llama/generate.py @@ -798,20 +798,21 @@ def ffn_or_attn_only(mod, fqn): sparsify_(model.to(device), semi_sparse_weight(), filter_fn=ffn_only) if "bsr" in 
sparsity: - # Apply Supermask to get sparse weights - from torchao.sparsity.supermask import SupermaskLinear + from torchao.sparsity import SupermaskLinear, block_sparse_weight + # parse "bsr-0.9-64" + _, sparsity_level, blocksize = sparsity.split("-") sparsify_( model, lambda x: SupermaskLinear.from_linear(x, - sparsity_level=0.9, - blocksize=64, + sparsity_level=sparsity_level, + blocksize=blocksize, ), filter_fn=ffn_only, ) - from torchao.sparsity.blocksparse import block_sparse_weight + # Accelerate with triton bsr kernels sparsify_(model, - block_sparse_weight(blocksize=64), + block_sparse_weight(blocksize=blocksize), filter_fn=ffn_only) model_size = get_model_size_in_bytes(model, ignore_embeddings=True) / 1e9 From 6df43e0144d892110baf6c5bc99a86488eb0c506 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Thu, 23 Jan 2025 16:37:07 -0800 Subject: [PATCH 11/23] update generate.py --- torchao/_models/llama/benchmark_results.txt | 7 ++++++- torchao/_models/llama/generate.py | 1 + torchao/sparsity/__init__.py | 4 ++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/torchao/_models/llama/benchmark_results.txt b/torchao/_models/llama/benchmark_results.txt index 9edfe7bb3f..f801531825 100644 --- a/torchao/_models/llama/benchmark_results.txt +++ b/torchao/_models/llama/benchmark_results.txt @@ -139,4 +139,9 @@ OTHER BENCHMARKS 20250123161148, tok/s=185.76, tok/s_decode=217.48, ttft=0.0502, mem/s= 906.15 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr_padded_trition --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 20250123161835, tok/s=123.16, tok/s_decode=134.13, ttft=0.0118, mem/s=1848.54 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile baseline --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 20250123162259, tok/s=123.05, tok/s_decode=134.13, ttft=0.0122, mem/s=1846.98 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile baseline --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 -20250123162406, tok/s=186.18, tok/s_decode=217.73, ttft=0.0470, mem/s= 908.22 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr_padded_trition --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 
--temperature 0.8 \ No newline at end of file +20250123162406, tok/s=186.18, tok/s_decode=217.73, ttft=0.0470, mem/s= 908.22 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile bsr_padded_trition --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123162940, tok/s=133.31, tok/s_decode=134.35, ttft=0.0112, mem/s=2000.93 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123163117, tok/s=133.26, tok/s_decode=134.39, ttft=0.0120, mem/s=2000.18 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123163224, tok/s=133.28, tok/s_decode=134.39, ttft=0.0117, mem/s=2000.52 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123163331, tok/s=207.77, tok/s_decode=218.34, ttft=0.0459, mem/s=1013.55 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123163555, tok/s=179.75, tok/s_decode=187.99, ttft=0.0481, mem/s= 879.72 GB/s, peak_mem= 6.32 GB, model_size= 4.89 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 \ No newline at end of file diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py index 12da66efe4..f08d63f5c2 100644 --- a/torchao/_models/llama/generate.py +++ b/torchao/_models/llama/generate.py @@ -801,6 +801,7 @@ def ffn_or_attn_only(mod, fqn): from 
torchao.sparsity import SupermaskLinear, block_sparse_weight # parse "bsr-0.9-64" _, sparsity_level, blocksize = sparsity.split("-") + sparsity_level, blocksize = float(sparsity_level), int(blocksize) sparsify_( model, lambda x: SupermaskLinear.from_linear(x, diff --git a/torchao/sparsity/__init__.py b/torchao/sparsity/__init__.py index 77ccd2c00b..d139b032c0 100644 --- a/torchao/sparsity/__init__.py +++ b/torchao/sparsity/__init__.py @@ -7,6 +7,8 @@ from torchao.quantization.quant_api import ( int8_dynamic_activation_int8_semi_sparse_weight, ) +from .blocksparse import block_sparse_weight +from .supermask import SupermaskLinear from .sparse_api import ( apply_fake_sparsity, @@ -18,9 +20,11 @@ __all__ = [ "WandaSparsifier", + "SupermaskLinear", "PerChannelNormObserver", "apply_fake_sparsity", "sparsify_", "semi_sparse_weight", + "block_sparse_weight", "int8_dynamic_activation_int8_semi_sparse_weight", ] From d7fd2956c5cc5a5031b2ffb047210a7504992733 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Thu, 23 Jan 2025 18:58:38 -0800 Subject: [PATCH 12/23] wip --- test/sparsity/test_bsr.py | 13 +- torchao/_models/llama/benchmark_results.txt | 13 +- torchao/_models/llama/bsr_benchmarks.sh | 4 +- .../sparsity/superblock/supermask.py | 156 ++++++++++++++++++ torchao/sparsity/blocksparse.py | 27 +-- torchao/sparsity/bsr_triton_ops.py | 7 +- 6 files changed, 182 insertions(+), 38 deletions(-) create mode 100644 torchao/prototype/sparsity/superblock/supermask.py diff --git a/test/sparsity/test_bsr.py b/test/sparsity/test_bsr.py index e9a85e9fd6..c8d14eed98 100644 --- a/test/sparsity/test_bsr.py +++ b/test/sparsity/test_bsr.py @@ -31,13 +31,12 @@ class TestBlockSparseWeight(common_utils.TestCase): "pytorch 2.4+ feature due to need for custom op support", ) @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") - @common_utils.parametrize("compile", [True, False]) + @common_utils.parametrize("compile", [True]) def test_sparse(self, compile): - input = torch.rand((1024, 1024)).half().cuda() + input = torch.rand((1, 1024)).half().cuda() model = ( nn.Sequential( nn.Linear(1024, 2048), - nn.Linear(2048, 1024), ) .half() .cuda() @@ -48,11 +47,9 @@ def test_sparse(self, compile): M, N = model[0].weight.shape model[0].weight.data = create_block_sparse_tensor(M, N, 64, 0.5, torch.float16) - M, N = model[1].weight.shape - model[1].weight.data = create_block_sparse_tensor(M, N, 64, 0.5, torch.float16) dense_result = model(input) - from torchao.prototype.sparsity.superblock.blocksparse import ( + from torchao.sparsity import ( block_sparse_weight, ) @@ -61,8 +58,12 @@ def test_sparse(self, compile): # model = torch.compile(model) sparse_result = model(input) + print(dense_result) + print(sparse_result) + torch.testing.assert_close(dense_result, sparse_result, rtol=1e-3, atol=1e-3) + common_utils.instantiate_parametrized_tests(TestBlockSparseWeight) if __name__ == "__main__": diff --git a/torchao/_models/llama/benchmark_results.txt b/torchao/_models/llama/benchmark_results.txt index f801531825..2b4cd8695b 100644 --- a/torchao/_models/llama/benchmark_results.txt +++ b/torchao/_models/llama/benchmark_results.txt @@ -144,4 +144,15 @@ OTHER BENCHMARKS 20250123163117, tok/s=133.26, tok/s_decode=134.39, ttft=0.0120, mem/s=2000.18 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path 
../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 20250123163224, tok/s=133.28, tok/s_decode=134.39, ttft=0.0117, mem/s=2000.52 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 20250123163331, tok/s=207.77, tok/s_decode=218.34, ttft=0.0459, mem/s=1013.55 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 -20250123163555, tok/s=179.75, tok/s_decode=187.99, ttft=0.0481, mem/s= 879.72 GB/s, peak_mem= 6.32 GB, model_size= 4.89 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 \ No newline at end of file +20250123163555, tok/s=179.75, tok/s_decode=187.99, ttft=0.0481, mem/s= 879.72 GB/s, peak_mem= 6.32 GB, model_size= 4.89 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123165038, tok/s=207.95, tok/s_decode=218.24, ttft=0.0447, mem/s=1014.41 GB/s, peak_mem= 6.31 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123170322, tok/s=208.20, tok/s_decode=218.38, ttft=0.0442, mem/s=1015.65 GB/s, peak_mem= 6.31 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123171256, tok/s=208.58, tok/s_decode=218.48, 
ttft=0.0428, mem/s=1017.47 GB/s, peak_mem= 6.31 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123172543, tok/s=146.94, tok/s_decode=149.85, ttft=0.0259, mem/s=1941.80 GB/s, peak_mem=13.94 GB, model_size=13.21 GB quant: None, sparse: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123173042, tok/s=207.86, tok/s_decode=218.47, ttft=0.0461, mem/s=1013.96 GB/s, peak_mem= 6.31 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123173713, tok/s=208.45, tok/s_decode=218.38, ttft=0.0430, mem/s=1016.85 GB/s, peak_mem= 6.31 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123183901, tok/s=207.95, tok/s_decode=218.33, ttft=0.0450, mem/s=1014.43 GB/s, peak_mem= 6.31 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123184904, tok/s=146.81, tok/s_decode=149.91, ttft=0.0275, mem/s=1940.08 GB/s, peak_mem=13.92 GB, model_size=13.21 GB quant: None, sparse: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123184942, tok/s= 63.00, tok/s_decode= 68.67, ttft=0.2616, mem/s= 417.12 GB/s, peak_mem= 9.16 GB, model_size= 6.62 GB quant: int8dq, sparse: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8dq --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth 
--device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123185104, tok/s=207.53, tok/s_decode=218.47, ttft=0.0475, mem/s=1012.36 GB/s, peak_mem= 6.31 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250123185604, tok/s=208.35, tok/s_decode=218.59, ttft=0.0444, mem/s=1016.38 GB/s, peak_mem= 6.31 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 \ No newline at end of file diff --git a/torchao/_models/llama/bsr_benchmarks.sh b/torchao/_models/llama/bsr_benchmarks.sh index 86e5f01d5a..a667b7c882 100644 --- a/torchao/_models/llama/bsr_benchmarks.sh +++ b/torchao/_models/llama/bsr_benchmarks.sh @@ -3,6 +3,6 @@ export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B #python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 --profile baseline_prefill #python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 --sparsity bsr --profile bsr_prefill -python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt +#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-64 -python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-32 +#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-32 diff --git a/torchao/prototype/sparsity/superblock/supermask.py b/torchao/prototype/sparsity/superblock/supermask.py new file mode 100644 index 0000000000..e1f8a67108 --- /dev/null +++ b/torchao/prototype/sparsity/superblock/supermask.py @@ -0,0 +1,156 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. + +import torch.nn as nn +import math +import torch +from torch.autograd import Variable +import torch.nn.functional as F +import numpy as np + +from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter + +# original supermask +scores_min=None +scores_max=9e9 +uniform_init_01 = False + +# adjusted supermask, initialize scores with uniform distribution in [0,1], clamp scores in each step in [0,1] +# scores_min=0. +# scores_max=1. 
+# uniform_init_01 = True + +def percentile(t, q): + """Return the value that is larger than q% of t""" + k = 1 + round(.01 * float(q) * (t.numel() - 1)) + return t.view(-1).kthvalue(k).values + + +class GetSubnet(torch.autograd.Function): + """Supermask STE function""" + @staticmethod + def forward(ctx, scores, zeros, ones, sparsity): + clamped_scores = scores.clamp(min=scores_min,max=scores_max) + k_val = percentile(clamped_scores, sparsity*100) + return torch.where(clamped_scores < k_val, zeros.to(scores.device), ones.to(scores.device)) + @staticmethod + def backward(ctx, g): + return g, None, None, None + + +class ApplyMask(torch.autograd.Function): + """Supermask STE function""" + @staticmethod + def forward(ctx, weight, scores): + return weight * scores + @staticmethod + def backward(ctx, grad_output): + grad_weight = grad_scores = None + if ctx.needs_input_grad[0]: + grad_weight = grad_output + if ctx.needs_input_grad[1]: + grad_scores = grad_output + return grad_weight, grad_scores + + +class SupermaskLinear(nn.Linear): + """Supermask class for Linear layer""" + def __init__(self, sparsity, fixed_mask, fixed_weight, bitwidth, transform, fixed_transform, *args, **kwargs): + tile_size = kwargs.pop("tile_size", 1) + super(SupermaskLinear, self).__init__(*args, **kwargs) + # initialize the scores + max_sparsity = 1 - (1 / math.prod([math.ceil(k / tile_size) for k in self.weight.size()])) + self.sparsity = sparsity + if self.sparsity > max_sparsity: + print( + f"reducing sparsity from {self.sparsity} to {max_sparsity}", + f"(maximum sparsity for layer with shape {self.weight.size()} and tile size {tile_size})" + ) + self.sparsity = max_sparsity + self.tile_size = tile_size + self.sparsify_weights = False + self.scores = nn.Parameter( + torch.empty( + [max(1, int(math.ceil(wn / tile_size))) for wn in self.weight.size()] + ), + requires_grad=not fixed_mask, + ) + nn.init.uniform_(self.scores) if uniform_init_01 else nn.init.kaiming_uniform_(self.scores, a=math.sqrt(5)) + + # the shift and the scale are transformation parameters + # the actually used weights = self.weight*self.scale+self.shift + # the transformation is activated only for quantized weights + self.shift=nn.Parameter(torch.Tensor(1).fill_(0.), requires_grad=False) + self.scale=nn.Parameter(torch.Tensor(1).fill_(1.), requires_grad=False) + + with torch.no_grad(): + # if bitwidth is None, then use floating point values in self.weight + # if bitwidth is not None, then quantize self.weight into k-bit (k=bitwidth) + # quantized values are -2^(k-1), -2^(k-1)+1, ..., 0, 1, ..., 2^(k-1)-1 + # these quantized values are uniformly distributed + if bitwidth is not None: + weights_max = torch.max(self.weight).item() + weights_min = torch.min(self.weight).item() + least_step = (weights_max-weights_min)/pow(2,bitwidth) + left_bound = weights_min-1e-6 + right_bound = weights_min+least_step+1e-6 + # self.shift=nn.Parameter(torch.Tensor(1).fill_( (weights_min+(pow(2,bitwidth-1)+0.5)*least_step) if transform[0] is None else transform[0] ), requires_grad=not fixed_transform[0]) + # self.scale=nn.Parameter(torch.Tensor(1).fill_( least_step if transform[1] is None else transform[1] ), requires_grad=not fixed_transform[1]) + # for example, if using binary weights (k=1) with -a, +a, set transform = [a,2a]; if using binary weights (k=1) with a, 0, set transform = [0,-a]; + self.shift=nn.Parameter(torch.Tensor(1).fill_( 0. 
if transform[0] is None else transform[0] ), requires_grad=not fixed_transform[0]) + self.scale=nn.Parameter(torch.Tensor(1).fill_( 1. if transform[1] is None else transform[1] ), requires_grad=not fixed_transform[1]) + for i in range(-int(pow(2,bitwidth-1)),int(pow(2,bitwidth-1))): + self.weight[torch.logical_and(self.weight>left_bound, self.weight<=right_bound)] = i + left_bound = right_bound + right_bound += least_step + + self.weight.requires_grad = not fixed_weight + + def get_mask(self): + subnet = GetSubnet.apply(self.scores, + torch.zeros_like(self.scores), + torch.ones_like(self.scores), + self.sparsity) + + if self.tile_size != 1: + for i, k in enumerate(self.weight.shape): + subnet = subnet.repeat_interleave(self.tile_size, dim=i) + subnet = torch.narrow(subnet, i, 0, k) + + return subnet + + def sparsify_offline(self): + subnet = self.get_mask() + self.weight.data = (self.weight*self.scale+self.shift) * subnet + self.sparsify_weights = True + + def forward(self, x): + if not self.sparsify_weights: + subnet = self.get_mask() + # w = (self.weight*self.scale+self.shift) + w = ApplyMask.apply(self.weight, subnet) + return F.linear(x, w, self.bias) + return F.linear(x, self.weight, self.bias) + + @classmethod + def from_linear(cls, linear : torch.nn.Linear, sparsity_level:float=0.0, blocksize=1, inference=True): + module_new = None + + assert isinstance(linear, torch.nn.Linear) + module_new = SupermaskLinear( + sparsity_level, False, False, None, None, None, + linear.in_features, + linear.out_features, + bias=linear.bias is not None, + tile_size=blocksize, + ).to(device=linear.weight.device, dtype=linear.weight.dtype) + module_new.weight.data.copy_(linear.weight.data) + if linear.bias is not None: + module_new.bias.data.copy_(linear.bias.data) + if inference: + module_new.sparsify_offline() + return module_new + + @classmethod + def to_linear(cls): + pass + diff --git a/torchao/sparsity/blocksparse.py b/torchao/sparsity/blocksparse.py index 0cf674dc00..a0b04b4863 100644 --- a/torchao/sparsity/blocksparse.py +++ b/torchao/sparsity/blocksparse.py @@ -6,6 +6,8 @@ from torchao.quantization.quant_api import _get_linear_subclass_inserter from torchao.utils import TorchAOBaseTensor +# from torch.sparse._triton_ops import broadcast_batch_dims, bsr_dense_addmm + from .bsr_triton_ops import bsr_dense_addmm, broadcast_batch_dims aten = torch.ops.aten @@ -68,8 +70,6 @@ def blocksparse_linear( bias: torch.Tensor, ) -> torch.Tensor: weight_bsr = torch.sparse_bsr_tensor(crow_indices, col_indices, values, size=(M, K)) - # TODO: Change this to call into Triton kernel directly like int_addmm - # This way we know we must be on the hot path return torch.nn.functional.linear(A, weight_bsr, bias) @@ -327,33 +327,11 @@ def next_power_of_two(n): @implements(torch.nn.functional.linear) def block_sparse_linear(func, types, args, kwargs): - # linear(x, w^t) - # linear(w, x^t)^t x_orig, w, bias = args - # # TODO: Change this to do padding to make sure blocksparse.linear works - # return torch.ops.blocksparse.linear( - # x, w.crow_indices(), w.col_indices(), w.values(), w.shape[0], w.shape[1], bias - # ) x = x_orig.reshape(-1, x_orig.size(-1)).t() M = w.shape[0] K = w.shape[1] N = x.shape[1] - # TODO: Replace this with mul + sum for the mv case similar to - # https://github.com/pytorch/pytorch/blob/a9685767773157440c162caaf125856e04e2981f/torch/_inductor/decomposition.py#L292 - # use .to_dense to get a baseline implementation that works and then use NJT for .sum and such - # if x.size(-1) == 1: - # # 
print("USING THIS") - # # breakpoint() - # out = (torch.mul(w.unsqueeze(2), x.unsqueeze(0))).sum(dim=1) - # out_orig = out.t().reshape(x_orig.shape[:-1] + (M,)) - # if bias is None: - # special_ret = out_orig - # else: - # special_ret = out_orig + bias - # return special_ret - # else: - # N_padded = max(16, next_power_of_two(N)) - # x_padded = torch.nn.functional.pad(x, (0, N_padded - N), 'constant', 0) out = torch.ops.blocksparse.addmm( x, w.crow_indices(), @@ -363,7 +341,6 @@ def block_sparse_linear(func, types, args, kwargs): K, None, ) - # out_orig = out[:, :x.size(-1)].t().reshape(x_orig.shape[:-1] + (M,)) out_orig = out.t() if bias is None: return out_orig diff --git a/torchao/sparsity/bsr_triton_ops.py b/torchao/sparsity/bsr_triton_ops.py index f4cdaed79b..f392e3714a 100644 --- a/torchao/sparsity/bsr_triton_ops.py +++ b/torchao/sparsity/bsr_triton_ops.py @@ -822,17 +822,16 @@ def _bsr_strided_addmm_kernel( ) output_acc_block = tl.zeros((BLOCKSIZE_ROW, PADDED_BLOCKSIZE_COL), dtype=acc_dtype) - + # offsets = tl.arange(0, PADDED_BLOCKSIZE_COL)[None, :] for _ in range(row_nnz): values_block = tl.load(values_block_ptrs) # find which row of dense needs to get loaded # for multiplication with values_block. dense_row_idx = tl.load(col_index_nnz_ptr) - offsets = tl.arange(0, PADDED_BLOCKSIZE_COL)[None, :] dense_block = tl.load( dense_block_ptrs + dense_tiled_row_stride * dense_row_idx, - mask=offsets < BLOCKSIZE_COL, + mask=col_block_arange[None, :] < BLOCKSIZE_COL, ) # do block mm @@ -884,7 +883,7 @@ def _bsr_strided_addmm_kernel( output_acc_block += beta * tl.load(input_ptrs) # write back the result - tl.store(output_ptrs, output_acc_block.to(output_ptr.dtype.element_ty)) + tl.store(output_ptrs, output_acc_block.to(output_ptr.dtype.element_ty), mask=col_block_arange[None, :]< BLOCKSIZE_COL) else: _bsr_strided_addmm_kernel = None # type: ignore[assignment] From 13e230c83ae147f2f157de02ada434d13cde15ef Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Fri, 24 Jan 2025 14:15:52 -0800 Subject: [PATCH 13/23] wip --- test/sparsity/test_bsr.py | 29 +++++++++++++++++++ torchao/_models/llama/benchmark_results.txt | 9 +++++- torchao/_models/llama/bsr_benchmarks.sh | 6 ++-- .../sparsity/superblock/benchmark.py | 6 ++-- .../prototype/sparsity/superblock/utils.py | 2 +- 5 files changed, 45 insertions(+), 7 deletions(-) diff --git a/test/sparsity/test_bsr.py b/test/sparsity/test_bsr.py index c8d14eed98..11a551ec85 100644 --- a/test/sparsity/test_bsr.py +++ b/test/sparsity/test_bsr.py @@ -1,6 +1,7 @@ import copy import logging import unittest +import math import torch from torch import nn @@ -64,7 +65,35 @@ def test_sparse(self, compile): torch.testing.assert_close(dense_result, sparse_result, rtol=1e-3, atol=1e-3) +class TestSupermask(common_utils.TestCase): + + @common_utils.parametrize("sparsity_level", [0.25, 0.5]) + @common_utils.parametrize("blocksize", [2, 4, 8]) + def test_supermask(self, sparsity_level, blocksize): + input = torch.randn((1, 16)).half().cuda() + model = ( + nn.Sequential( + nn.Linear(16, 16, bias=False), + ) + .half() + .cuda() + .eval() + ) + + from torchao.sparsity import SupermaskLinear + + M, N = model[0].weight.shape + sparsify_(model, lambda x: SupermaskLinear.from_linear(x, sparsity_level=sparsity_level, blocksize=blocksize)) + weight_bsr = model[0].weight.to_sparse_bsr(blocksize=blocksize) + + nnz = weight_bsr._nnz() + expected = round((M // blocksize) * (N // blocksize) * (1 - sparsity_level)) + assert nnz == expected, f"Expected {expected} nonzeros, got {nnz}" + + 
common_utils.instantiate_parametrized_tests(TestBlockSparseWeight) +common_utils.instantiate_parametrized_tests(TestSupermask) + if __name__ == "__main__": unittest.main() diff --git a/torchao/_models/llama/benchmark_results.txt b/torchao/_models/llama/benchmark_results.txt index 2b4cd8695b..aaeaeb8c9a 100644 --- a/torchao/_models/llama/benchmark_results.txt +++ b/torchao/_models/llama/benchmark_results.txt @@ -155,4 +155,11 @@ OTHER BENCHMARKS 20250123184904, tok/s=146.81, tok/s_decode=149.91, ttft=0.0275, mem/s=1940.08 GB/s, peak_mem=13.92 GB, model_size=13.21 GB quant: None, sparse: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 20250123184942, tok/s= 63.00, tok/s_decode= 68.67, ttft=0.2616, mem/s= 417.12 GB/s, peak_mem= 9.16 GB, model_size= 6.62 GB quant: int8dq, sparse: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8dq --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 20250123185104, tok/s=207.53, tok/s_decode=218.47, ttft=0.0475, mem/s=1012.36 GB/s, peak_mem= 6.31 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 -20250123185604, tok/s=208.35, tok/s_decode=218.59, ttft=0.0444, mem/s=1016.38 GB/s, peak_mem= 6.31 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 \ No newline at end of file +20250123185604, tok/s=208.35, tok/s_decode=218.59, ttft=0.0444, mem/s=1016.38 GB/s, peak_mem= 6.31 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250124120551, tok/s=148.85, tok/s_decode=157.66, ttft=0.0748, mem/s= 726.10 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth 
--device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250124121349, tok/s= 93.00, tok/s_decode= 93.67, ttft=0.0150, mem/s=1395.96 GB/s, peak_mem=16.47 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250124121533, tok/s=149.71, tok/s_decode=157.95, ttft=0.0695, mem/s= 730.29 GB/s, peak_mem= 6.58 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250124124548, tok/s= 48.92, tok/s_decode= 70.49, ttft=1.2505, mem/s= 734.29 GB/s, peak_mem=36.45 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250124124720, tok/s= 48.95, tok/s_decode= 70.50, ttft=1.2485, mem/s= 734.75 GB/s, peak_mem=36.70 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250124125113, tok/s= 48.87, tok/s_decode= 70.78, ttft=1.2673, mem/s= 733.50 GB/s, peak_mem=36.70 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250124125909, tok/s= 67.03, tok/s_decode= 99.25, ttft=0.9682, mem/s= 326.99 GB/s, peak_mem=18.15 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 \ No newline at end of file diff --git a/torchao/_models/llama/bsr_benchmarks.sh b/torchao/_models/llama/bsr_benchmarks.sh index 
a667b7c882..6e2515dfe8 100644
--- a/torchao/_models/llama/bsr_benchmarks.sh
+++ b/torchao/_models/llama/bsr_benchmarks.sh
@@ -1,8 +1,8 @@
 export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder
 export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
 
-#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 --profile baseline_prefill
-#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 --sparsity bsr --profile bsr_prefill
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 --sparsity bsr-0.9-64
 #python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-64
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-64
 #python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-32
diff --git a/torchao/prototype/sparsity/superblock/benchmark.py b/torchao/prototype/sparsity/superblock/benchmark.py
index 9f6d70e8b0..a53f53c21a 100644
--- a/torchao/prototype/sparsity/superblock/benchmark.py
+++ b/torchao/prototype/sparsity/superblock/benchmark.py
@@ -65,8 +65,7 @@ def main(args):
     ).eval()
 
     # Fake sparsity necessary for BSR, since we find based on SuperBlock
-    # sparsifier_or_none = simulate_sparsity(model, args)
-    sparsifier_or_none = None
+    sparsifier_or_none = simulate_sparsity(model, args)
     if sparsifier_or_none is not None:
         sparsifier_or_none.squash_mask()
 
@@ -82,6 +81,9 @@ def main(args):
     # With quantization, we must use cuSPARSELt to fuse one of the scalar matmuls.
     # Otherwise, we observe the CUTLASS kernels to be faster, so we use those instead.
accelerate_with_sparsity(model, args) + if "bsr" in args.sparsity: + sparsify_(model, block_sparse_weight(blocksize=args.blocksize)) + elif "semi-structured" in args.sparsity: # compile model = torch.compile(model, mode="max-autotune", fullgraph=True) diff --git a/torchao/prototype/sparsity/superblock/utils.py b/torchao/prototype/sparsity/superblock/utils.py index 8928db63cb..990cd96e0e 100644 --- a/torchao/prototype/sparsity/superblock/utils.py +++ b/torchao/prototype/sparsity/superblock/utils.py @@ -379,7 +379,7 @@ def mlp_only_with_args( ### Custom sparsification utils def apply_sparsity(model): for name, module in model.named_modules(): - if isinstance(module, SupermaskLinear) and "feed_forward" in name: + if isinstance(module, SupermaskLinear) and "mlp" in name: module.sparsify_offline() From 560198fd7b6adb602dc83ed970390823a14df457 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Fri, 24 Jan 2025 14:24:55 -0800 Subject: [PATCH 14/23] updated --- .../prototype/sparsity/superblock/utils.py | 26 ++++++--- torchao/sparsity/supermask.py | 55 +++++++++++++++++-- 2 files changed, 70 insertions(+), 11 deletions(-) diff --git a/torchao/prototype/sparsity/superblock/utils.py b/torchao/prototype/sparsity/superblock/utils.py index 990cd96e0e..89a443bdab 100644 --- a/torchao/prototype/sparsity/superblock/utils.py +++ b/torchao/prototype/sparsity/superblock/utils.py @@ -22,6 +22,7 @@ from torchao.prototype.sparsity.superblock.blocksparse import block_sparse_weight from torchao.prototype.sparsity.superblock.supermask import ( SupermaskLinear, + apply_supermask, ) from torchao.quantization import int8_dynamic_activation_int8_weight, quantize_ from torchao.sparsity import semi_sparse_weight, sparsify_ @@ -383,7 +384,7 @@ def apply_sparsity(model): module.sparsify_offline() -def accelerate_with_sparsity(model, args, filter_fn): +def accelerate_with_sparsity(model, args): if args.sparsity == "bsr": apply_sparsity(model) if args.quantization: @@ -392,14 +393,13 @@ def accelerate_with_sparsity(model, args, filter_fn): quantize_( model, int8_dynamic_activation_int8_weight( - layout=BlockSparseLayout(blocksize=args.bsr) + _layout=BlockSparseLayout(blocksize=args.bsr) ), - filter_fn, - + superblock_only, ) else: assert args.bsr is not None, "BSR requires a block size" - quantize_(model, block_sparse_weight(blocksize=args.bsr), filter_fn) + sparsify_(model, block_sparse_weight(blocksize=args.bsr), superblock_only) elif args.sparsity == "semi_structured": if args.quantization: from torchao.dtypes import SemiSparseLayout @@ -417,9 +417,21 @@ def accelerate_with_sparsity(model, args, filter_fn): quantize_(model, int8_dynamic_activation_int8_weight(), mlp_only) -def simulate_sparsity(model, args, filter_fn): +def simulate_sparsity(model, args): if args.sparsity == "bsr": - pass + apply_supermask( + model, + linear_sparsity=args.sparsity_linear, + linear_sp_tilesize=args.bsr, + conv1x1_sparsity=args.sparsity_conv1x1, + conv1x1_sp_tilesize=args.bsr, + conv_sparsity=args.sparsity_conv, + conv_sp_tilesize=args.bsr, + skip_last_layer_sparsity=args.skip_last_layer_sparsity, + skip_first_transformer_sparsity=args.skip_first_transformer_sparsity, + device=args.device, + verbose=False, + ) elif args.sparsity == "semi_structured": sparse_config = [] for name, mod in model.named_modules(): diff --git a/torchao/sparsity/supermask.py b/torchao/sparsity/supermask.py index 73a05802a8..cd54e70a92 100644 --- a/torchao/sparsity/supermask.py +++ b/torchao/sparsity/supermask.py @@ -150,7 +150,54 @@ def from_linear(cls, linear,
sparsity_level=0.0, blocksize=1, inference=True): module_new.sparsify_offline() return module_new - @classmethod - def to_linear(cls): - pass - + +def apply_supermask( + model, + linear_sparsity=0.0, + linear_sp_tilesize=1, + skip_last_layer_sparsity=False, + skip_first_transformer_sparsity=False, + device="cuda", + verbose=False, +): + sparsified_modules = {} + + for n, m in model.named_modules(): + # check conditions for skipping sparsity + if skip_last_layer_sparsity and n == "heads.head": + continue + if skip_first_transformer_sparsity and "encoder.layers.encoder_layer_0" in n: + continue + + if linear_sparsity != 0.0 and isinstance(m, torch.nn.Linear): + new_m = SupermaskLinear( + linear_sparsity, + False, + False, + None, + None, + None, + m.in_features, + m.out_features, + bias=m.bias is not None, + device=device, + tile_size=linear_sp_tilesize, + ) + new_m.weight.data.copy_(m.weight.data) + if m.bias is not None: + new_m.bias.data.copy_(m.bias.data) + sparsified_modules[n] = new_m + continue + + # add modules to model + for k, v in sparsified_modules.items(): + sm_name, ch_name = k.rsplit(".", 1) + sm = model.get_submodule(sm_name) + sm.add_module(ch_name, v) + + if verbose: + print( + f'sparsified module "{k}" with sparsity={v.sparsity}, tile size={v.tile_size}' + ) + + return model From 89f3ad0b89f89b92562a0ae5ec1267945caf87ae Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Fri, 24 Jan 2025 14:27:31 -0800 Subject: [PATCH 15/23] moved file --- torchao/{sparsity/bsr_triton_ops.py => kernel/bsr_trition_ops.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename torchao/{sparsity/bsr_triton_ops.py => kernel/bsr_trition_ops.py} (100%) diff --git a/torchao/sparsity/bsr_triton_ops.py b/torchao/kernel/bsr_trition_ops.py similarity index 100% rename from torchao/sparsity/bsr_triton_ops.py rename to torchao/kernel/bsr_trition_ops.py From b414b49db298b900835c0f2344343d2f6d4851e2 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Fri, 24 Jan 2025 15:28:57 -0800 Subject: [PATCH 16/23] big supermask refactor --- torchao/_models/llama/benchmark_results.txt | 3 +- torchao/_models/llama/bsr_benchmarks.sh | 6 +- torchao/_models/llama/generate.py | 7 + torchao/kernel/__init__.py | 3 + .../{bsr_trition_ops.py => bsr_triton_ops.py} | 0 .../prototype/sparsity/superblock/README.md | 4 +- torchao/sparsity/__init__.py | 6 +- torchao/sparsity/blocksparse.py | 16 +- torchao/sparsity/sparse_api.py | 10 +- torchao/sparsity/supermask.py | 167 +++++------------- 10 files changed, 77 insertions(+), 145 deletions(-) rename torchao/kernel/{bsr_trition_ops.py => bsr_triton_ops.py} (100%) diff --git a/torchao/_models/llama/benchmark_results.txt b/torchao/_models/llama/benchmark_results.txt index aaeaeb8c9a..179d308ce4 100644 --- a/torchao/_models/llama/benchmark_results.txt +++ b/torchao/_models/llama/benchmark_results.txt @@ -162,4 +162,5 @@ OTHER BENCHMARKS 20250124124548, tok/s= 48.92, tok/s_decode= 70.49, ttft=1.2505, mem/s= 734.29 GB/s, peak_mem=36.45 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 20250124124720, tok/s= 48.95, tok/s_decode= 70.50, ttft=1.2485, mem/s= 734.75 GB/s, peak_mem=36.70 GB, model_size=15.01 GB 
quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 20250124125113, tok/s= 48.87, tok/s_decode= 70.78, ttft=1.2673, mem/s= 733.50 GB/s, peak_mem=36.70 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 -20250124125909, tok/s= 67.03, tok/s_decode= 99.25, ttft=0.9682, mem/s= 326.99 GB/s, peak_mem=18.15 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 \ No newline at end of file +20250124125909, tok/s= 67.03, tok/s_decode= 99.25, ttft=0.9682, mem/s= 326.99 GB/s, peak_mem=18.15 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250124152728, tok/s=149.00, tok/s_decode=157.80, ttft=0.0745, mem/s= 726.43 GB/s, peak_mem= 6.67 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 \ No newline at end of file diff --git a/torchao/_models/llama/bsr_benchmarks.sh b/torchao/_models/llama/bsr_benchmarks.sh index 6e2515dfe8..5e0228b6c0 100644 --- a/torchao/_models/llama/bsr_benchmarks.sh +++ b/torchao/_models/llama/bsr_benchmarks.sh @@ -1,8 +1,8 @@ export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B -python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 -python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 --sparsity bsr-0.9-64 +#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt 
--prefill_size 8192 +#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 --sparsity bsr-0.9-64 #python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt -#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-64 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-64 #python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-32 diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py index f08d63f5c2..65a35f2fb3 100644 --- a/torchao/_models/llama/generate.py +++ b/torchao/_models/llama/generate.py @@ -810,6 +810,13 @@ def ffn_or_attn_only(mod, fqn): ), filter_fn=ffn_only, ) + print(model) + sparsify_( + model, + SupermaskLinear.to_linear, + filter_fn=ffn_only, + ) + print(model) # Accelerate with triton bsr kernels sparsify_(model, diff --git a/torchao/kernel/__init__.py b/torchao/kernel/__init__.py index 409da72601..2006d6c403 100644 --- a/torchao/kernel/__init__.py +++ b/torchao/kernel/__init__.py @@ -1,6 +1,9 @@ from torchao.kernel.intmm import int_scaled_matmul, safe_int_mm +from torchao.kernel.bsr_triton_ops import bsr_dense_addmm, broadcast_batch_dims __all__ = [ + "bsr_dense_addmm", + "broadcast_batch_dims", "safe_int_mm", "int_scaled_matmul", ] diff --git a/torchao/kernel/bsr_trition_ops.py b/torchao/kernel/bsr_triton_ops.py similarity index 100% rename from torchao/kernel/bsr_trition_ops.py rename to torchao/kernel/bsr_triton_ops.py diff --git a/torchao/prototype/sparsity/superblock/README.md b/torchao/prototype/sparsity/superblock/README.md index 6fea1a0e3a..bed75c9ad3 100644 --- a/torchao/prototype/sparsity/superblock/README.md +++ b/torchao/prototype/sparsity/superblock/README.md @@ -66,11 +66,11 @@ Please refer to [TRAINING.md](TRAINING.md) for training from scratch. We use [To For example, if you would like to train a `vit_b_16` from scratch using Supermask, you can use the respective torchvision command found in [TRAINING.md](TRAINING.md) and append the supermask arguments: ``` torchrun --nproc_per_node=8 train.py\ - --model vit_h_14 --epochs 3 --batch-size 64 --opt adamw --lr 0.003 --wd 0.3\ + --model vit_b_16 --epochs 1 --batch-size 64 --opt adamw --lr 0.003 --wd 0.3\ --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs 30\ --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 \ --clip-grad-norm 1 --cutmix-alpha 1.0 --model-ema\ - --sparsity semi_structured --data-path $IMAGENET_PATH + --sparsity bsr --data-path $IMAGENET_PATH ``` Through this command, we are training a `vit_b_16` with 90% sparsity to linear layers using 32x32 tiles. diff --git a/torchao/sparsity/__init__.py b/torchao/sparsity/__init__.py index d139b032c0..96b74fdb70 100644 --- a/torchao/sparsity/__init__.py +++ b/torchao/sparsity/__init__.py @@ -4,15 +4,12 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree.
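The generate.py change above relies on a three-step flow: simulate block sparsity with SupermaskLinear, bake the mask back into a dense Linear, then swap in the BSR tensor subclass for the fast kernel. A rough, self-contained sketch, assuming the post-refactor imports introduced in this series; the toy model, filter, sparsity level, and blocksize are placeholders:

```python
import torch
from torchao.sparsity import SupermaskLinear, block_sparse_weight, sparsify_

# Toy stand-in for the FFN Linear layers that ffn_only selects in generate.py.
model = torch.nn.Sequential(torch.nn.Linear(256, 256)).cuda().to(torch.bfloat16)

def linear_only(mod, fqn):
    return isinstance(mod, torch.nn.Linear)

# 1. Swap Linear -> SupermaskLinear to simulate a 64x64 block-sparse mask.
sparsify_(
    model,
    lambda lin: SupermaskLinear.from_linear(lin, sparsity_level=0.9, blocksize=64),
    filter_fn=linear_only,
)
# 2. Bake the mask back into a plain nn.Linear (what the new to_linear call does).
sparsify_(model, SupermaskLinear.to_linear, filter_fn=linear_only)
# 3. Replace the now block-sparse dense weights with BlockSparseTensor so that
#    F.linear dispatches to the Triton BSR kernel.
sparsify_(model, block_sparse_weight(blocksize=64), filter_fn=linear_only)
out = model(torch.randn(8, 256, dtype=torch.bfloat16, device="cuda"))
```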
-from torchao.quantization.quant_api import ( - int8_dynamic_activation_int8_semi_sparse_weight, -) -from .blocksparse import block_sparse_weight from .supermask import SupermaskLinear from .sparse_api import ( apply_fake_sparsity, semi_sparse_weight, + block_sparse_weight, sparsify_, ) from .utils import PerChannelNormObserver # noqa: F403 @@ -26,5 +23,4 @@ "sparsify_", "semi_sparse_weight", "block_sparse_weight", - "int8_dynamic_activation_int8_semi_sparse_weight", ] diff --git a/torchao/sparsity/blocksparse.py b/torchao/sparsity/blocksparse.py index a0b04b4863..f4c9d20c39 100644 --- a/torchao/sparsity/blocksparse.py +++ b/torchao/sparsity/blocksparse.py @@ -6,13 +6,11 @@ from torchao.quantization.quant_api import _get_linear_subclass_inserter from torchao.utils import TorchAOBaseTensor -# from torch.sparse._triton_ops import broadcast_batch_dims, bsr_dense_addmm - -from .bsr_triton_ops import bsr_dense_addmm, broadcast_batch_dims +from torchao.kernel.bsr_triton_ops import bsr_dense_addmm, broadcast_batch_dims aten = torch.ops.aten - +# custom op definition @torch.library.custom_op("blocksparse::int_addmm", mutates_args=()) def blocksparse_int_addmm( crow_indices: torch.Tensor, @@ -320,10 +318,6 @@ def block_sparse_col_indices(func, types, args, kwargs): def block_sparse__nnz(func, types, args, kwargs): return args[0].bsr_values.shape[0] -def next_power_of_two(n): - assert n > 0 - return 2 ** (n.bit_length()) - @implements(torch.nn.functional.linear) def block_sparse_linear(func, types, args, kwargs): @@ -346,9 +340,3 @@ def block_sparse_linear(func, types, args, kwargs): return out_orig return out_orig + bias - - -def block_sparse_weight(blocksize=64): - return _get_linear_subclass_inserter( - partial(BlockSparseTensor.from_dense, blocksize=blocksize) - ) diff --git a/torchao/sparsity/sparse_api.py b/torchao/sparsity/sparse_api.py index 3dd7971525..3277518f87 100644 --- a/torchao/sparsity/sparse_api.py +++ b/torchao/sparsity/sparse_api.py @@ -1,9 +1,11 @@ from typing import Callable, Optional +from functools import partial import torch -from torch.ao.pruning import WeightNormSparsifier from torch.sparse import to_sparse_semi_structured +from torchao.prototype.sparsity.sparsifier.weight_norm_sparsifier import WeightNormSparsifier +from torchao.sparsity.blocksparse import BlockSparseTensor from torchao.quantization.quant_api import ( _get_linear_subclass_inserter, _is_linear, @@ -31,6 +33,12 @@ def apply_fake_sparsity(model, **kwargs): sparsifier.squash_mask() +def block_sparse_weight(blocksize=64): + return _get_linear_subclass_inserter( + partial(BlockSparseTensor.from_dense, blocksize=blocksize) + ) + + def semi_sparse_weight(): """ Convert the weight of linear moduels to semi-structured (2:4) sparsity diff --git a/torchao/sparsity/supermask.py b/torchao/sparsity/supermask.py index cd54e70a92..0f2fec55f3 100644 --- a/torchao/sparsity/supermask.py +++ b/torchao/sparsity/supermask.py @@ -12,12 +12,6 @@ # original supermask scores_min=None scores_max=9e9 -uniform_init_01 = False - -# adjusted supermask, initialize scores with uniform distribution in [0,1], clamp scores in each step in [0,1] -# scores_min=0. -# scores_max=1. 
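For context on the GetSubnet/percentile code retained below: per tile of the weight matrix, the supermask keeps the highest-scoring fraction and zeroes the rest. A rough restatement (the percentile body is abridged in this hunk, so the helper shown here is a plausible equivalent rather than a verbatim copy):

```python
import torch

def percentile(t, q):
    # value that is larger than q% of the entries of t (one common implementation)
    k = 1 + round(0.01 * float(q) * (t.numel() - 1))
    return t.view(-1).kthvalue(k).values

scores = torch.rand(8, 8)            # one learnable score per tile of the weight
sparsity_level = 0.75
threshold = percentile(scores, sparsity_level * 100)
mask = torch.where(scores < threshold, torch.zeros_like(scores), torch.ones_like(scores))
print(mask.mean())                   # roughly 1 - sparsity_level of the tiles survive
```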
-# uniform_init_01 = True def percentile(t, q): """Return the value that is larger than q% of t""" @@ -32,6 +26,7 @@ def forward(ctx, scores, zeros, ones, sparsity): clamped_scores = scores.clamp(min=scores_min,max=scores_max) k_val = percentile(clamped_scores, sparsity*100) return torch.where(clamped_scores < k_val, zeros.to(scores.device), ones.to(scores.device)) + @staticmethod def backward(ctx, g): return g, None, None, None @@ -54,54 +49,28 @@ def backward(ctx, grad_output): class SupermaskLinear(nn.Linear): """Supermask class for Linear layer""" - def __init__(self, sparsity, fixed_mask, fixed_weight, bitwidth, transform, fixed_transform, *args, **kwargs): - tile_size = kwargs.pop("tile_size", 1) + def __init__(self, sparsity_level, blocksize, fixed_mask, fixed_weight, *args, **kwargs): super(SupermaskLinear, self).__init__(*args, **kwargs) - # initialize the scores - max_sparsity = 1 - (1 / math.prod([math.ceil(k / tile_size) for k in self.weight.size()])) - self.sparsity = sparsity - if self.sparsity > max_sparsity: + # calculate the maximum sparsity given blocksize for the layer + max_sparsity_level = 1 - (1 / math.prod([math.ceil(k / blocksize) for k in self.weight.size()])) + self.sparsity_level = sparsity_level + if self.sparsity_level > max_sparsity_level: print( f"reducing sparsity from {self.sparsity} to {max_sparsity}", - f"(maximum sparsity for layer with shape {self.weight.size()} and tile size {tile_size})" + f"(maximum sparsity for layer with shape {self.weight.size()} and tile size {blocksize})" ) - self.sparsity = max_sparsity - self.tile_size = tile_size + self.sparsity_level = max_sparsity_level + self.blocksize = blocksize self.sparsify_weights = False self.scores = nn.Parameter( torch.empty( - [max(1, int(math.ceil(wn / tile_size))) for wn in self.weight.size()] + [max(1, int(math.ceil(wn / blocksize))) for wn in self.weight.size()] ), requires_grad=not fixed_mask, ) - nn.init.uniform_(self.scores) if uniform_init_01 else nn.init.kaiming_uniform_(self.scores, a=math.sqrt(5)) - - # the shift and the scale are transformation parameters - # the actually used weights = self.weight*self.scale+self.shift - # the transformation is activated only for quantized weights - self.shift=nn.Parameter(torch.Tensor(1).fill_(0.), requires_grad=False) - self.scale=nn.Parameter(torch.Tensor(1).fill_(1.), requires_grad=False) - - with torch.no_grad(): - # if bitwidth is None, then use floating point values in self.weight - # if bitwidth is not None, then quantize self.weight into k-bit (k=bitwidth) - # quantized values are -2^(k-1), -2^(k-1)+1, ..., 0, 1, ..., 2^(k-1)-1 - # these quantized values are uniformly distributed - if bitwidth is not None: - weights_max = torch.max(self.weight).item() - weights_min = torch.min(self.weight).item() - least_step = (weights_max-weights_min)/pow(2,bitwidth) - left_bound = weights_min-1e-6 - right_bound = weights_min+least_step+1e-6 - # self.shift=nn.Parameter(torch.Tensor(1).fill_( (weights_min+(pow(2,bitwidth-1)+0.5)*least_step) if transform[0] is None else transform[0] ), requires_grad=not fixed_transform[0]) - # self.scale=nn.Parameter(torch.Tensor(1).fill_( least_step if transform[1] is None else transform[1] ), requires_grad=not fixed_transform[1]) - # for example, if using binary weights (k=1) with -a, +a, set transform = [a,2a]; if using binary weights (k=1) with a, 0, set transform = [0,-a]; - self.shift=nn.Parameter(torch.Tensor(1).fill_( 0. 
if transform[0] is None else transform[0] ), requires_grad=not fixed_transform[0]) - self.scale=nn.Parameter(torch.Tensor(1).fill_( 1. if transform[1] is None else transform[1] ), requires_grad=not fixed_transform[1]) - for i in range(-int(pow(2,bitwidth-1)),int(pow(2,bitwidth-1))): - self.weight[torch.logical_and(self.weight>left_bound, self.weight<=right_bound)] = i - left_bound = right_bound - right_bound += least_step + nn.init.kaiming_uniform_(self.scores, a=math.sqrt(5)) + + # NOTE: the previous implementation of Supermask supported quantizing the weights, this has been removed. self.weight.requires_grad = not fixed_weight @@ -109,95 +78,55 @@ def get_mask(self): subnet = GetSubnet.apply(self.scores, torch.zeros_like(self.scores), torch.ones_like(self.scores), - self.sparsity) + self.sparsity_level) - if self.tile_size != 1: + if self.blocksize != 1: for i, k in enumerate(self.weight.shape): - subnet = subnet.repeat_interleave(self.tile_size, dim=i) + subnet = subnet.repeat_interleave(self.blocksize, dim=i) subnet = torch.narrow(subnet, i, 0, k) return subnet - def sparsify_offline(self): - subnet = self.get_mask() - self.weight.data = (self.weight*self.scale+self.shift) * subnet - self.sparsify_weights = True def forward(self, x): - if not self.sparsify_weights: - subnet = self.get_mask() - # w = (self.weight*self.scale+self.shift) - w = ApplyMask.apply(self.weight, subnet) - return F.linear(x, w, self.bias) - return F.linear(x, self.weight, self.bias) + subnet = self.get_mask() + w = ApplyMask.apply(self.weight, subnet) + return F.linear(x, w, self.bias) @classmethod - def from_linear(cls, linear, sparsity_level=0.0, blocksize=1, inference=True): - module_new = None - + def from_linear(cls, linear, sparsity_level=0.0, blocksize=1, ): + """ + Main entrypoint for creating a SupermaskLinear from a Linear layer. 
+ """ assert isinstance(linear, torch.nn.Linear) - module_new = SupermaskLinear( - sparsity_level, False, False, None, None, None, + + supermask_linear = SupermaskLinear( + sparsity_level, blocksize, False, False, linear.in_features, linear.out_features, bias=linear.bias is not None, - tile_size=blocksize, ).to(device=linear.weight.device, dtype=linear.weight.dtype) - module_new.weight.data.copy_(linear.weight.data) + supermask_linear.weight.data.copy_(linear.weight.data) if linear.bias is not None: - module_new.bias.data.copy_(linear.bias.data) - if inference: - module_new.sparsify_offline() - return module_new - - -def apply_supermask( - model, - linear_sparsity=0.0, - linear_sp_tilesize=1, - skip_last_layer_sparsity=False, - skip_first_transformer_sparsity=False, - device="cuda", - verbose=False, -): - sparsified_modules = {} - - for n, m in model.named_modules(): - # check conditions for skipping sparsity - if skip_last_layer_sparsity and n == "heads.head": - continue - if skip_first_transformer_sparsity and "encoder.layers.encoder_layer_0" in n: - continue - - if linear_sparsity != 0.0 and isinstance(m, torch.nn.Linear): - new_m = SupermaskLinear( - linear_sparsity, - False, - False, - None, - None, - None, - m.in_features, - m.out_features, - bias=m.bias is not None, - device=device, - tile_size=linear_sp_tilesize, - ) - new_m.weight.data.copy_(m.weight.data) - if m.bias is not None: - new_m.bias.data.copy_(m.bias.data) - sparsified_modules[n] = new_m - continue - - # add modules to model - for k, v in sparsified_modules.items(): - sm_name, ch_name = k.rsplit(".", 1) - sm = model.get_submodule(sm_name) - sm.add_module(ch_name, v) - - if verbose: - print( - f'sparsified module "{k}" with sparsity={v.sparsity}, tile size={v.tile_size}' - ) + supermask_linear.bias.data.copy_(linear.bias.data) + return supermask_linear - return model + @classmethod + def to_linear(cls, supermask_linear): + """ + Convert a SupermaskLinear to a Linear layer. + Replaces the old sparsify_offline() function. 
+ """ + self = supermask_linear + + linear = torch.nn.Linear( + self.in_features, + self.out_features, + bias=self.bias is not None, + ).to(device=self.weight.device, dtype=self.weight.dtype) + + mask = self.get_mask() + linear.weight.data.copy_(self.weight * mask) + if self.bias is not None: + linear.bias.data.copy_(self.bias.data) + return linear From 1ff8aa04bfeb0ea73ecaa825aa3721f567a644fa Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Mon, 27 Jan 2025 20:46:00 -0800 Subject: [PATCH 17/23] update --- benchmarks/benchmark_gpu_sparsity.py | 60 +++++++++++-------- .../{test_bsr.py => test_supermask.py} | 49 +++------------ 2 files changed, 44 insertions(+), 65 deletions(-) rename test/sparsity/{test_bsr.py => test_supermask.py} (57%) diff --git a/benchmarks/benchmark_gpu_sparsity.py b/benchmarks/benchmark_gpu_sparsity.py index 9e22f6d43a..832108f552 100644 --- a/benchmarks/benchmark_gpu_sparsity.py +++ b/benchmarks/benchmark_gpu_sparsity.py @@ -13,6 +13,8 @@ ) from torchao.utils import benchmark_model +from torchao.sparsity.blocksparse import BlockSparseTensor + torch.set_printoptions( precision=2, threshold=None, @@ -43,7 +45,8 @@ def run_gpu_sparse_benchmark(m, k, n, args): A = create_block_sparse_tensor( m, k, args.block_size, args.sparsity_level, dtype ) - A_sparse = A.to_sparse_bsr(blocksize=args.block_size) + # A_sparse = A.to_sparse_bsr(blocksize=args.block_size) + A_sparse = BlockSparseTensor.from_dense(A, args.block_size).detach() # BSR kernel tuning if args.bsr_autotune: print("Tuning kernel params") @@ -61,13 +64,16 @@ def run_gpu_sparse_benchmark(m, k, n, args): raise ValueError(f"Unknown sparsity: {args.sparsity}") if args.eval_fn == "linear": - b = torch.randn(m, dtype=dtype).cuda() + # b = torch.randn(m, dtype=dtype).cuda() + b = None # can't use lambda - def dense_func(): + @torch.compile(mode="max-autotune") + def dense_func(x, A, b): return F.linear(x, A, b) - def sparse_func(): + @torch.compile(mode="max-autotune") + def sparse_func(x, A_sparse, b): return F.linear(x, A_sparse, b) elif args.eval_fn == "mm": @@ -101,20 +107,27 @@ def sparse_func(): else: raise ValueError(f"Unknown eval_fn: {args.eval_fn}") - dense_time = benchmark_model_with_warmup(dense_func, "dense.json.gz") - sparse_time = benchmark_model_with_warmup(sparse_func, "sparse.json.gz") - dense_func_c = torch.compile(dense_func, mode="max-autotune") - dense_time_c = benchmark_model_with_warmup( - dense_func_c, "dense_compile.json.gz" + dense_time, sparse_time = 0, 0 + + #WARMUP + benchmark_model( + dense_func, 3, args=(x, A, b), device_type="cuda" + ) + + dense_time_c = benchmark_model( + dense_func, 10, args=(x, A, b), device_type="cuda" ) - sparse_func_c = torch.compile(sparse_func, mode="max-autotune") - sparse_time_c = benchmark_model_with_warmup( - sparse_func_c, "sparse_compile.json.gz" + + # WARMUP + benchmark_model( + sparse_func, 3, args=(x, A_sparse, b), device_type="cuda" ) - torch._dynamo.reset() + sparse_time_c = benchmark_model( + sparse_func, 10, args=(x, A_sparse, b), device_type="cuda" + ) return { "test_function": args.eval_fn, @@ -126,8 +139,7 @@ def sparse_func(): "dense": dense_time, "dense_c": dense_time_c, "sparse_c": sparse_time_c, - "speedup (d/s)": min(dense_time, dense_time_c) - / min(sparse_time, sparse_time_c), + "speedup (d/s)": dense_time_c / sparse_time_c, } @@ -200,15 +212,15 @@ def sparse_func(): ) elif args.mode == "llama3-8b-w": mm_shapes = [ - (16, 4096, 11008), - (16, 4096, 4096), - (16, 11008, 4096), - (4096, 4096, 11008), - (4096, 4096, 4096), - (4096, 11008, 4096), - 
(8192, 4096, 11008), - (8192, 4096, 4096), - (8192, 11008, 4096), + (4096, 11008, 16), + (11008, 4096, 16), + # (16, 4096, 4096), + # (4096, 4096, 11008), + # (4096, 4096, 4096), + # (4096, 11008, 4096), + # (8192, 4096, 11008), + # (8192, 4096, 4096), + # (8192, 11008, 4096), ] results = ( run_gpu_sparse_benchmark(m, k, n, args) for (m, k, n) in tqdm(mm_shapes) diff --git a/test/sparsity/test_bsr.py b/test/sparsity/test_supermask.py similarity index 57% rename from test/sparsity/test_bsr.py rename to test/sparsity/test_supermask.py index 11a551ec85..233826163f 100644 --- a/test/sparsity/test_bsr.py +++ b/test/sparsity/test_supermask.py @@ -25,46 +25,6 @@ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO ) - -class TestBlockSparseWeight(common_utils.TestCase): - @unittest.skipIf( - not TORCH_VERSION_AT_LEAST_2_4, - "pytorch 2.4+ feature due to need for custom op support", - ) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") - @common_utils.parametrize("compile", [True]) - def test_sparse(self, compile): - input = torch.rand((1, 1024)).half().cuda() - model = ( - nn.Sequential( - nn.Linear(1024, 2048), - ) - .half() - .cuda() - .eval() - ) - - from torchao.sparsity.utils import create_block_sparse_tensor - - M, N = model[0].weight.shape - model[0].weight.data = create_block_sparse_tensor(M, N, 64, 0.5, torch.float16) - dense_result = model(input) - - from torchao.sparsity import ( - block_sparse_weight, - ) - - sparsify_(model, block_sparse_weight(blocksize=64)) - # if compile: - # model = torch.compile(model) - sparse_result = model(input) - - print(dense_result) - print(sparse_result) - - torch.testing.assert_close(dense_result, sparse_result, rtol=1e-3, atol=1e-3) - - class TestSupermask(common_utils.TestCase): @common_utils.parametrize("sparsity_level", [0.25, 0.5]) @@ -84,14 +44,21 @@ def test_supermask(self, sparsity_level, blocksize): M, N = model[0].weight.shape sparsify_(model, lambda x: SupermaskLinear.from_linear(x, sparsity_level=sparsity_level, blocksize=blocksize)) + sparsify_(model, SupermaskLinear.to_linear) weight_bsr = model[0].weight.to_sparse_bsr(blocksize=blocksize) + # Test correct sparsity level nnz = weight_bsr._nnz() expected = round((M // blocksize) * (N // blocksize) * (1 - sparsity_level)) assert nnz == expected, f"Expected {expected} nonzeros, got {nnz}" + def test_from_linear(self): + from torchao.sparsity import SupermaskLinear + linear = nn.Linear(128, 128) + supermask_linear = SupermaskLinear.from_linear(linear, sparsity_level=0.5, blocksize=4) + assert supermask_linear.weight.shape == linear.weight.shape + -common_utils.instantiate_parametrized_tests(TestBlockSparseWeight) common_utils.instantiate_parametrized_tests(TestSupermask) From d503e5d5278e440774cbe5a4c5f66ee0b8078136 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Tue, 28 Jan 2025 15:01:02 -0800 Subject: [PATCH 18/23] bsr triton updateS --- benchmarks/benchmark_gpu_sparsity.py | 50 +++++++++++++----------- torchao/kernel/bsr_triton_ops.py | 58 +++++++--------------------- torchao/sparsity/blocksparse.py | 18 ++++++++- 3 files changed, 57 insertions(+), 69 deletions(-) diff --git a/benchmarks/benchmark_gpu_sparsity.py b/benchmarks/benchmark_gpu_sparsity.py index 832108f552..3918622b25 100644 --- a/benchmarks/benchmark_gpu_sparsity.py +++ b/benchmarks/benchmark_gpu_sparsity.py @@ -1,4 +1,5 @@ import argparse +from typing import Callable, List, Optional, Tuple import pandas as pd import torch @@ -11,7 +12,7 @@ create_block_sparse_tensor, 
create_semi_structured_tensor, ) -from torchao.utils import benchmark_model +import torch.utils.benchmark as benchmark from torchao.sparsity.blocksparse import BlockSparseTensor @@ -29,6 +30,17 @@ def benchmark_model_with_warmup(func, x, N_WARMUP=3): benchmark_model(func, N_WARMUP, device_type="cuda") return benchmark_model(func, 10, device_type="cuda") +def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> float: + # warmup + for _ in range(1): + func(*args, **kwargs) + # t0 = benchmark.Timer( + # stmt="func(*args, **kwargs)", + # globals={"args": args, "kwargs": kwargs, "func": func}, + # ) + # return t0.adaptive_autorange(min_run_time=0.1).median * 1e6 + return 1 + def run_gpu_sparse_benchmark(m, k, n, args): with torch.no_grad(): @@ -69,11 +81,11 @@ def run_gpu_sparse_benchmark(m, k, n, args): # can't use lambda @torch.compile(mode="max-autotune") - def dense_func(x, A, b): + def dense_func(x): return F.linear(x, A, b) @torch.compile(mode="max-autotune") - def sparse_func(x, A_sparse, b): + def sparse_func(x): return F.linear(x, A_sparse, b) elif args.eval_fn == "mm": @@ -107,27 +119,17 @@ def sparse_func(): else: raise ValueError(f"Unknown eval_fn: {args.eval_fn}") - + # print(x) + # print(A) + # print(A_sparse.crow_indices()) + # print(A_sparse.col_indices()) + # print(A_sparse.values()) dense_time, sparse_time = 0, 0 + dense_time_c, sparse_time_c = 1, 1 #WARMUP - benchmark_model( - dense_func, 3, args=(x, A, b), device_type="cuda" - ) - - dense_time_c = benchmark_model( - dense_func, 10, args=(x, A, b), device_type="cuda" - ) - - - # WARMUP - benchmark_model( - sparse_func, 3, args=(x, A_sparse, b), device_type="cuda" - ) - - sparse_time_c = benchmark_model( - sparse_func, 10, args=(x, A_sparse, b), device_type="cuda" - ) + # dense_time_c = benchmark_torch_function_in_microseconds(dense_func, x) + sparse_time_c = benchmark_torch_function_in_microseconds(sparse_func, x) return { "test_function": args.eval_fn, @@ -212,8 +214,10 @@ def sparse_func(): ) elif args.mode == "llama3-8b-w": mm_shapes = [ - (4096, 11008, 16), - (11008, 4096, 16), + # (32, 32, 16), + (4096, 14336, 1), + # (14336, 4096, 1), + # (11008, 4096, 16), # (16, 4096, 4096), # (4096, 4096, 11008), # (4096, 4096, 4096), diff --git a/torchao/kernel/bsr_triton_ops.py b/torchao/kernel/bsr_triton_ops.py index f392e3714a..f6f28ee4a0 100644 --- a/torchao/kernel/bsr_triton_ops.py +++ b/torchao/kernel/bsr_triton_ops.py @@ -13,6 +13,7 @@ from torch.sparse._triton_ops_meta import get_meta + TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE = int( os.getenv("TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE", 2) ) @@ -504,6 +505,7 @@ def _int_bsr_dense_addmm( def bsr_dense_addmm( input: torch.Tensor, bsr: torch.Tensor, + row_indices: torch.Tensor, dense: torch.Tensor, *, beta=1, @@ -655,6 +657,7 @@ def kernel(grid, *sliced_tensors): BLOCKSIZE_ROW=BM, BLOCKSIZE_INNER=BK, BLOCKSIZE_COL=BN, + BLOCKSIZE_K=32, allow_tf32=dot_out_dtype == tl.float32, acc_dtype=dot_out_dtype, **meta, @@ -743,6 +746,7 @@ def _bsr_strided_addmm_kernel( BLOCKSIZE_ROW: tl.constexpr, BLOCKSIZE_COL: tl.constexpr, BLOCKSIZE_INNER: tl.constexpr, + BLOCKSIZE_K: tl.constexpr, acc_dtype: tl.constexpr, allow_tf32: tl.constexpr, GROUP_SIZE_ROW: tl.constexpr, @@ -778,10 +782,10 @@ def _bsr_strided_addmm_kernel( row_block_arange = tl.arange(0, BLOCKSIZE_ROW) inner_block_arange = tl.arange(0, BLOCKSIZE_INNER) - if BLOCKSIZE_COL < 16 or BLOCKSIZE_COL % 16 != 0: - PADDED_BLOCKSIZE_COL : tl.constexpr = 16 - else: - PADDED_BLOCKSIZE_COL: tl.constexpr = 
BLOCKSIZE_COL + PADDED_BLOCKSIZE_COL : tl.constexpr = 16 + # if BLOCKSIZE_COL < 16 or BLOCKSIZE_COL % 16 != 0: + # else: + # PADDED_BLOCKSIZE_COL: tl.constexpr = BLOCKSIZE_COL col_block_arange = tl.arange(0, PADDED_BLOCKSIZE_COL) @@ -822,8 +826,11 @@ def _bsr_strided_addmm_kernel( ) output_acc_block = tl.zeros((BLOCKSIZE_ROW, PADDED_BLOCKSIZE_COL), dtype=acc_dtype) - # offsets = tl.arange(0, PADDED_BLOCKSIZE_COL)[None, :] - for _ in range(row_nnz): + + nsub_blocks = tl.cdiv(BLOCKSIZE_ROW, BLOCKSIZE_K) + + + for i in range(row_nnz): values_block = tl.load(values_block_ptrs) # find which row of dense needs to get loaded @@ -843,45 +850,6 @@ def _bsr_strided_addmm_kernel( values_block_ptrs += values_nnz_stride col_index_nnz_ptr += col_indices_stride - if not alpha_is_one: - output_acc_block *= alpha - - if not left_alpha_is_one: - left_alpha_ptrs = ( - left_alpha_ptr - + left_alpha_batch_stride * batch_pid - + left_alpha_tiled_row_stride * row_block_pid - + left_alpha_tiled_col_stride * col_block_pid - + left_alpha_row_block_stride * row_block_arange[:, None] - + left_alpha_col_block_stride * col_block_arange[None, :] - ) - output_acc_block *= tl.load(left_alpha_ptrs) - - if not right_alpha_is_one: - right_alpha_ptrs = ( - right_alpha_ptr - + right_alpha_batch_stride * batch_pid - + right_alpha_tiled_row_stride * row_block_pid - + right_alpha_tiled_col_stride * col_block_pid - + right_alpha_row_block_stride * row_block_arange[:, None] - + right_alpha_col_block_stride * col_block_arange[None, :] - ) - output_acc_block *= tl.load(right_alpha_ptrs) - - if beta_is_nonzero: - input_ptrs = ( - input_ptr - + input_batch_stride * batch_pid - + input_tiled_row_stride * row_block_pid - + input_tiled_col_stride * col_block_pid - + input_row_block_stride * row_block_arange[:, None] - + input_col_block_stride * col_block_arange[None, :] - ) - if beta_is_one: - output_acc_block += tl.load(input_ptrs) - else: - output_acc_block += beta * tl.load(input_ptrs) - # write back the result tl.store(output_ptrs, output_acc_block.to(output_ptr.dtype.element_ty), mask=col_block_arange[None, :]< BLOCKSIZE_COL) diff --git a/torchao/sparsity/blocksparse.py b/torchao/sparsity/blocksparse.py index f4c9d20c39..d4e92cc940 100644 --- a/torchao/sparsity/blocksparse.py +++ b/torchao/sparsity/blocksparse.py @@ -10,6 +10,7 @@ aten = torch.ops.aten + # custom op definition @torch.library.custom_op("blocksparse::int_addmm", mutates_args=()) def blocksparse_int_addmm( @@ -91,6 +92,7 @@ def blocksparse_addmm( x_padded: torch.Tensor, crow_indices: torch.Tensor, col_indices: torch.Tensor, + row_indices: torch.Tensor, values: torch.Tensor, M: int, K: int, @@ -103,6 +105,7 @@ def blocksparse_addmm( bsr_dense_addmm( out, weight_bsr, + row_indices, x_padded, alpha=1, beta=0, @@ -116,6 +119,7 @@ def blocksparse_addmm_abstract( x_padded: torch.Tensor, crow_indices: torch.Tensor, col_indices: torch.Tensor, + row_indices: torch.Tensor, values: torch.Tensor, M: int, K: int, @@ -129,10 +133,11 @@ def blocksparse_addmm_abstract( class BlockSparseTensor(TorchAOBaseTensor): bsr_crow_indices: Optional[torch.Tensor] bsr_col_indices: Optional[torch.Tensor] + bsr_row_indices: Optional[torch.Tensor] bsr_values: Optional[torch.Tensor] blocksize: int - __slots__ = ["bsr_crow_indices", "bsr_col_indices", "bsr_values"] + __slots__ = ["bsr_crow_indices", "bsr_col_indices", "bsr_row_indices", "bsr_values"] @staticmethod def __new__( # noqa: PYI034 @@ -141,6 +146,7 @@ def __new__( # noqa: PYI034 blocksize: int, bsr_crow_indices: Optional[torch.Tensor], 
bsr_col_indices: Optional[torch.Tensor], + bsr_row_indices: Optional[torch.Tensor], bsr_values: Optional[torch.Tensor], requires_grad: bool = False, ): @@ -162,6 +168,7 @@ def __new__( # noqa: PYI034 tensor.bsr_crow_indices = bsr_crow_indices tensor.bsr_values = bsr_values tensor.bsr_col_indices = bsr_col_indices + tensor.bsr_row_indices = bsr_row_indices return tensor def __repr__(self) -> str: # type: ignore[override] @@ -189,18 +196,22 @@ def __tensor_unflatten__( blocksize=blocksize, bsr_crow_indices=inner_tensors.get("bsr_crow_indices", None), bsr_col_indices=inner_tensors.get("bsr_col_indices", None), + bsr_row_indices=inner_tensors.get("bsr_row_indices", None), bsr_values=inner_tensors.get("bsr_values", None), requires_grad=requires_grad, ) + @classmethod def from_dense(cls, dense_tensor, blocksize): bsr_tensor = dense_tensor.to_sparse_bsr(blocksize) + bsr_tensor_t = dense_tensor.t().contiguous().to_sparse_bsr(blocksize) return cls( shape=dense_tensor.shape, blocksize=blocksize, bsr_crow_indices=bsr_tensor.crow_indices(), bsr_col_indices=bsr_tensor.col_indices(), + bsr_row_indices=bsr_tensor_t.col_indices(), bsr_values=bsr_tensor.values(), requires_grad=False, ) @@ -211,6 +222,7 @@ def apply_fn_to_shard(self, func): blocksize=self.blocksize, bsr_crow_indices=func(self.bsr_crow_indices), bsr_col_indices=func(self.bsr_col_indices), + bsr_row_indices=func(self.bsr_row_indices), bsr_values=func(self.bsr_values), requires_grad=self.requires_grad, ) @@ -313,6 +325,9 @@ def block_sparse_crow_indices(func, types, args, kwargs): def block_sparse_col_indices(func, types, args, kwargs): return args[0].bsr_col_indices.detach() +@implements(aten.row_indices.default) +def block_sparse_col_indices(func, types, args, kwargs): + return args[0].bsr_row_indices.detach() @implements(aten._nnz.default) def block_sparse__nnz(func, types, args, kwargs): @@ -330,6 +345,7 @@ def block_sparse_linear(func, types, args, kwargs): x, w.crow_indices(), w.col_indices(), + w.row_indices(), w.values(), M, K, From c09368147709e5c8ac0054fcdcc375e5992b9a36 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Tue, 11 Feb 2025 18:36:18 -0800 Subject: [PATCH 19/23] wip --- benchmarks/benchmark_gpu_sparsity.py | 1 + test/sparsity/test_bsr_sum_prod.py | 41 +++++++ test/sparsity/test_supermask.py | 14 +++ torchao/_models/llama/benchmark_results.txt | 28 ++++- torchao/_models/llama/bsr_benchmarks.sh | 2 +- torchao/kernel/bsr_triton_ops.py | 75 +++++++++---- torchao/sparsity/blocksparse.py | 114 ++++++++++++++------ 7 files changed, 220 insertions(+), 55 deletions(-) create mode 100644 test/sparsity/test_bsr_sum_prod.py diff --git a/benchmarks/benchmark_gpu_sparsity.py b/benchmarks/benchmark_gpu_sparsity.py index 3918622b25..f871af8989 100644 --- a/benchmarks/benchmark_gpu_sparsity.py +++ b/benchmarks/benchmark_gpu_sparsity.py @@ -217,6 +217,7 @@ def sparse_func(): # (32, 32, 16), (4096, 14336, 1), # (14336, 4096, 1), + # (14336, 4096, 1), # (11008, 4096, 16), # (16, 4096, 4096), # (4096, 4096, 11008), diff --git a/test/sparsity/test_bsr_sum_prod.py b/test/sparsity/test_bsr_sum_prod.py new file mode 100644 index 0000000000..8415abbfe1 --- /dev/null +++ b/test/sparsity/test_bsr_sum_prod.py @@ -0,0 +1,41 @@ +import torch + +import triton +import triton.language as tl +import pdb + +from torchao.sparsity.utils import create_block_sparse_tensor +from torchao.sparsity.blocksparse import BlockSparseTensor +from torch.library import wrap_triton, triton_op + + + +@torch.compile(dynamic=False, fullgraph=True) +def test(w, x): + b = 
x.unsqueeze(0) + out= (torch.mul(w, b)).sum(dim=1) + return out + +torch.set_printoptions(profile='full', linewidth=100000) +torch.manual_seed(0) +size = 98432 + +with torch.no_grad(): + create_block_sparse_tensor = torch.compiler.disable(create_block_sparse_tensor) + a = create_block_sparse_tensor(32, 32, 16, 0.5, torch.bfloat16).cuda() * torch.randn(32, 32, dtype=torch.bfloat16).cuda() + a[:16, :16] *= 4 + a[16:, 16:] *= 4 + a[16:, :16] *= 2 + a[:16, 16:] *= 1 + # print(a) + # print(x) + w = BlockSparseTensor.from_dense(a, 16).detach() + x = torch.arange(32).reshape((32, 1)).to(torch.bfloat16).cuda() + # expected= test(a.unsqueeze(2), x) + # print(expected) + # print("strides", w.unsqueeze(2).stride()) + # print("strides", w.stride()) + out = test(w.unsqueeze(2), x) + # print(out) + + # torch.testing.assert_close(out, expected, rtol=1e-2, atol=1e-2) diff --git a/test/sparsity/test_supermask.py b/test/sparsity/test_supermask.py index 233826163f..4306f0a8cb 100644 --- a/test/sparsity/test_supermask.py +++ b/test/sparsity/test_supermask.py @@ -14,6 +14,8 @@ quantize_, ) from torchao.sparsity import apply_fake_sparsity, semi_sparse_weight, sparsify_ +from torchao.sparsity.blocksparse import BlockSparseTensor +from torchao.sparsity.utils import create_block_sparse_tensor from torchao.utils import ( TORCH_VERSION_AT_LEAST_2_3, TORCH_VERSION_AT_LEAST_2_4, @@ -58,6 +60,18 @@ def test_from_linear(self): supermask_linear = SupermaskLinear.from_linear(linear, sparsity_level=0.5, blocksize=4) assert supermask_linear.weight.shape == linear.weight.shape + def test_fastpath(self): + a = create_block_sparse_tensor(128, 128, 64, 0.5, torch.bfloat16).cuda() + # print(a) + w = a + x = torch.randn(128, 1).to(torch.bfloat16).cuda() + expected = (torch.mul(w.unsqueeze(2), x.unsqueeze(0))).sum(dim=1) + + a_sparse = BlockSparseTensor.from_dense(a, 64) + w = a_sparse + out = (torch.mul(w.unsqueeze(2), x.unsqueeze(0))).sum(dim=1) + torch.testing.assert_close(out, expected, rtol=1e-2, atol=1e-2) + common_utils.instantiate_parametrized_tests(TestSupermask) diff --git a/torchao/_models/llama/benchmark_results.txt b/torchao/_models/llama/benchmark_results.txt index 179d308ce4..155adfe19f 100644 --- a/torchao/_models/llama/benchmark_results.txt +++ b/torchao/_models/llama/benchmark_results.txt @@ -163,4 +163,30 @@ OTHER BENCHMARKS 20250124124720, tok/s= 48.95, tok/s_decode= 70.50, ttft=1.2485, mem/s= 734.75 GB/s, peak_mem=36.70 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 20250124125113, tok/s= 48.87, tok/s_decode= 70.78, ttft=1.2673, mem/s= 733.50 GB/s, peak_mem=36.70 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 20250124125909, tok/s= 67.03, tok/s_decode= 99.25, ttft=0.9682, mem/s= 326.99 GB/s, peak_mem=18.15 GB, model_size= 4.88 GB quant: 
None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 -20250124152728, tok/s=149.00, tok/s_decode=157.80, ttft=0.0745, mem/s= 726.43 GB/s, peak_mem= 6.67 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 \ No newline at end of file +20250124152728, tok/s=149.00, tok/s_decode=157.80, ttft=0.0745, mem/s= 726.43 GB/s, peak_mem= 6.67 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250129115156, tok/s= 48.69, tok/s_decode= 70.11, ttft=1.2547, mem/s= 730.83 GB/s, peak_mem=36.45 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250129120020, tok/s= 69.94, tok/s_decode=103.46, ttft=0.9261, mem/s= 341.04 GB/s, peak_mem=17.88 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --prefill_size 8192--num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250129120430, tok/s= 92.81, tok/s_decode= 94.27, ttft=0.0336, mem/s=1393.04 GB/s, peak_mem=16.47 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250129121107, tok/s=156.23, tok/s_decode=165.67, ttft=0.0726, mem/s= 761.84 GB/s, peak_mem= 6.67 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path 
../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250129122328, tok/s= 93.58, tok/s_decode= 94.27, ttft=0.0152, mem/s=1404.66 GB/s, peak_mem=16.47 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250129122559, tok/s=175.88, tok/s_decode=187.83, ttft=0.0720, mem/s= 857.69 GB/s, peak_mem= 6.40 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250129143459, tok/s= 93.09, tok/s_decode= 94.32, ttft=0.0280, mem/s=1397.21 GB/s, peak_mem=16.47 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250129143733, tok/s=183.34, tok/s_decode=196.18, ttft=0.0710, mem/s= 894.06 GB/s, peak_mem= 6.40 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250203164133, tok/s= 93.24, tok/s_decode= 93.97, ttft=0.0162, mem/s=1399.57 GB/s, peak_mem=16.24 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250203220542, tok/s=149.48, tok/s_decode=158.11, ttft=0.0727, mem/s= 728.77 GB/s, peak_mem= 6.44 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250203222522, tok/s=149.09, tok/s_decode=157.80, ttft=0.0737, mem/s= 726.87 GB/s, peak_mem= 6.44 GB, model_size= 4.88 GB quant: 
None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250203222822, tok/s=148.75, tok/s_decode=157.61, ttft=0.0752, mem/s= 725.24 GB/s, peak_mem= 6.44 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250203223826, tok/s=133.34, tok/s_decode=140.28, ttft=0.0739, mem/s= 650.07 GB/s, peak_mem= 6.74 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250203224115, tok/s=132.88, tok/s_decode=139.87, ttft=0.0748, mem/s= 647.86 GB/s, peak_mem= 6.65 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250203224408, tok/s=133.34, tok/s_decode=140.40, ttft=0.0750, mem/s= 650.07 GB/s, peak_mem= 6.65 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250203231700, tok/s=133.51, tok/s_decode=140.46, ttft=0.0738, mem/s= 650.90 GB/s, peak_mem= 6.65 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250203232813, tok/s=133.80, tok/s_decode=141.08, ttft=0.0767, mem/s= 652.32 GB/s, peak_mem= 6.74 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path 
../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250203233649, tok/s=180.76, tok/s_decode=193.95, ttft=0.0749, mem/s= 881.28 GB/s, peak_mem= 6.44 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250203235829, tok/s=181.28, tok/s_decode=193.93, ttft=0.0717, mem/s= 883.79 GB/s, peak_mem= 6.44 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250204070120, tok/s=133.92, tok/s_decode=141.25, ttft=0.0771, mem/s= 652.93 GB/s, peak_mem= 6.74 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250204081135, tok/s=118.95, tok/s_decode=140.12, ttft=0.0797, mem/s= 579.91 GB/s, peak_mem= 6.64 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile sparse_sum_prod.json.gz --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250204083318, tok/s=157.18, tok/s_decode=192.39, ttft=0.0842, mem/s= 766.33 GB/s, peak_mem= 6.44 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile sparse_bsr_addmm.json.gz --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250204090000, tok/s=117.18, tok/s_decode=140.68, ttft=0.0812, mem/s= 571.28 GB/s, peak_mem= 6.74 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile sparse_bsr_addmm.json.gz --num_samples 5 
--max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250204092213, tok/s=138.33, tok/s_decode=146.00, ttft=0.0757, mem/s= 675.49 GB/s, peak_mem= 6.66 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250204093828, tok/s=119.76, tok/s_decode=125.40, ttft=0.0748, mem/s= 585.69 GB/s, peak_mem= 6.55 GB, model_size= 4.89 GB quant: None, sparse: bsr-0.9-16, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-16 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250204102741, tok/s=169.35, tok/s_decode=180.85, ttft=0.0747, mem/s= 826.97 GB/s, peak_mem= 6.45 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 \ No newline at end of file diff --git a/torchao/_models/llama/bsr_benchmarks.sh b/torchao/_models/llama/bsr_benchmarks.sh index 5e0228b6c0..fadcef9623 100644 --- a/torchao/_models/llama/bsr_benchmarks.sh +++ b/torchao/_models/llama/bsr_benchmarks.sh @@ -4,5 +4,5 @@ export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B #python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 #python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 --sparsity bsr-0.9-64 #python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt -python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-64 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-32 #python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-32 diff --git a/torchao/kernel/bsr_triton_ops.py b/torchao/kernel/bsr_triton_ops.py index f6f28ee4a0..1b5031d796 100644 --- a/torchao/kernel/bsr_triton_ops.py +++ b/torchao/kernel/bsr_triton_ops.py @@ -13,7 +13,6 @@ from torch.sparse._triton_ops_meta import get_meta - TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE = int( os.getenv("TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE", 2) ) @@ -354,13 +353,14 @@ def bsr_dense_addmm_meta( # verbose=True, # ) # get padded key - padded_key = (M, K, 16, Ms, Ks, beta == 0, beta == 1, alpha == 1) - meta = 
get_meta( - "bsr_dense_addmm", - padded_key, - device_name, - version=(_version, version_dtype, sparsity), - ) + # padded_key = (M, K, 16, Ms, Ks, beta == 0, beta == 1, alpha == 1) + # meta = get_meta( + # "bsr_dense_addmm", + # padded_key, + # device_name, + # version=(_version, version_dtype, sparsity), + # ) + pass # breakpoint() # return meta # message @@ -372,7 +372,7 @@ def bsr_dense_addmm_meta( SPLIT_N = SPLIT_N or max(N // Ms, 1) GROUP_SIZE_ROW = GROUP_SIZE_ROW or 4 - num_stages = num_stages or 1 + num_stages = num_stages or 4 num_warps = num_warps or 4 return dict( SPLIT_N=SPLIT_N, @@ -505,7 +505,6 @@ def _int_bsr_dense_addmm( def bsr_dense_addmm( input: torch.Tensor, bsr: torch.Tensor, - row_indices: torch.Tensor, dense: torch.Tensor, *, beta=1, @@ -657,7 +656,6 @@ def kernel(grid, *sliced_tensors): BLOCKSIZE_ROW=BM, BLOCKSIZE_INNER=BK, BLOCKSIZE_COL=BN, - BLOCKSIZE_K=32, allow_tf32=dot_out_dtype == tl.float32, acc_dtype=dot_out_dtype, **meta, @@ -746,7 +744,6 @@ def _bsr_strided_addmm_kernel( BLOCKSIZE_ROW: tl.constexpr, BLOCKSIZE_COL: tl.constexpr, BLOCKSIZE_INNER: tl.constexpr, - BLOCKSIZE_K: tl.constexpr, acc_dtype: tl.constexpr, allow_tf32: tl.constexpr, GROUP_SIZE_ROW: tl.constexpr, @@ -782,10 +779,10 @@ def _bsr_strided_addmm_kernel( row_block_arange = tl.arange(0, BLOCKSIZE_ROW) inner_block_arange = tl.arange(0, BLOCKSIZE_INNER) - PADDED_BLOCKSIZE_COL : tl.constexpr = 16 - # if BLOCKSIZE_COL < 16 or BLOCKSIZE_COL % 16 != 0: - # else: - # PADDED_BLOCKSIZE_COL: tl.constexpr = BLOCKSIZE_COL + if BLOCKSIZE_COL < 16 or BLOCKSIZE_COL % 16 != 0: + PADDED_BLOCKSIZE_COL : tl.constexpr = 16 + else: + PADDED_BLOCKSIZE_COL: tl.constexpr = BLOCKSIZE_COL col_block_arange = tl.arange(0, PADDED_BLOCKSIZE_COL) @@ -826,11 +823,8 @@ def _bsr_strided_addmm_kernel( ) output_acc_block = tl.zeros((BLOCKSIZE_ROW, PADDED_BLOCKSIZE_COL), dtype=acc_dtype) - - nsub_blocks = tl.cdiv(BLOCKSIZE_ROW, BLOCKSIZE_K) - - - for i in range(row_nnz): + # offsets = tl.arange(0, PADDED_BLOCKSIZE_COL)[None, :] + for _ in range(row_nnz): values_block = tl.load(values_block_ptrs) # find which row of dense needs to get loaded @@ -850,6 +844,45 @@ def _bsr_strided_addmm_kernel( values_block_ptrs += values_nnz_stride col_index_nnz_ptr += col_indices_stride + if not alpha_is_one: + output_acc_block *= alpha + + if not left_alpha_is_one: + left_alpha_ptrs = ( + left_alpha_ptr + + left_alpha_batch_stride * batch_pid + + left_alpha_tiled_row_stride * row_block_pid + + left_alpha_tiled_col_stride * col_block_pid + + left_alpha_row_block_stride * row_block_arange[:, None] + + left_alpha_col_block_stride * col_block_arange[None, :] + ) + output_acc_block *= tl.load(left_alpha_ptrs) + + if not right_alpha_is_one: + right_alpha_ptrs = ( + right_alpha_ptr + + right_alpha_batch_stride * batch_pid + + right_alpha_tiled_row_stride * row_block_pid + + right_alpha_tiled_col_stride * col_block_pid + + right_alpha_row_block_stride * row_block_arange[:, None] + + right_alpha_col_block_stride * col_block_arange[None, :] + ) + output_acc_block *= tl.load(right_alpha_ptrs) + + if beta_is_nonzero: + input_ptrs = ( + input_ptr + + input_batch_stride * batch_pid + + input_tiled_row_stride * row_block_pid + + input_tiled_col_stride * col_block_pid + + input_row_block_stride * row_block_arange[:, None] + + input_col_block_stride * col_block_arange[None, :] + ) + if beta_is_one: + output_acc_block += tl.load(input_ptrs) + else: + output_acc_block += beta * tl.load(input_ptrs) + # write back the result tl.store(output_ptrs, 
output_acc_block.to(output_ptr.dtype.element_ty), mask=col_block_arange[None, :] < BLOCKSIZE_COL) diff --git a/torchao/sparsity/blocksparse.py b/torchao/sparsity/blocksparse.py index d4e92cc940..b2d8cc3423 100644 --- a/torchao/sparsity/blocksparse.py +++ b/torchao/sparsity/blocksparse.py @@ -2,6 +2,9 @@ from typing import Any, Callable, Dict, List, Optional, Tuple import torch +import triton +import triton.language as tl +from torch.library import wrap_triton, triton_op from torch.utils._python_dispatch import return_and_correct_aliasing from torchao.quantization.quant_api import _get_linear_subclass_inserter from torchao.utils import TorchAOBaseTensor @@ -10,8 +13,65 @@ aten = torch.ops.aten +@triton.jit +def sum_with_offsets_kernel(values, crow_indices, output, BLOCK_SIZE: tl.constexpr): + # For each kernel invocation, we assume we are dealing with a specific block row + + pid = tl.program_id(0) + + # Compute the start and end offset for our given row + start = tl.load(crow_indices + pid) + end = tl.load(crow_indices + pid + 1) + + # Number of nonzero elements in the row + row_nnz = end - start + BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE + + row_block_arange = tl.arange(0, BLOCK_SIZE) + inner_block_arange = tl.arange(0, BLOCK_SIZE) + + # Calculate correct pointer offset + values += BLOCK_ELEMENTS * start + BLOCK_SIZE * row_block_arange[:, None] + inner_block_arange[None, :] + + # Accumulate rowwise + acc = tl.zeros((BLOCK_SIZE, ), dtype=tl.float32) + + # Loop over the block and accumulate the sum using offsets + for i in range(row_nnz): + # should we be storing/loading the sumprod? + acc += tl.sum(tl.load(values), axis=1) + # tl.device_print("vals", tl.load(values)) + # tl.device_print("acc", acc) + + # move to next block in values + values += BLOCK_ELEMENTS + + + # Write the result to the output + output_arange = tl.arange(0, BLOCK_SIZE) + tl.store(output + BLOCK_SIZE * pid + output_arange, acc.to(output.dtype.element_ty)) + +@triton_op("blocksparse::sum", mutates_args=()) +def sum_with_offsets( + values: torch.Tensor, + crow_indices: torch.Tensor, + M: int, +) -> torch.Tensor: + + # Define the block size and the number of block rows + BLOCK_SIZE = values.shape[1] + num_offsets = crow_indices.numel() - 1 + grid = lambda meta: (triton.cdiv(num_offsets, 1), ) + + # Allocate output tensor, matching the dtype/device of the BSR values + y = torch.empty((M, 1), dtype=values.dtype, device=values.device) + + # Launch the kernel + wrap_triton(sum_with_offsets_kernel)[grid](values, crow_indices, y, BLOCK_SIZE) + + # Return the per-row sums computed by the kernel + return y -# custom op definition @torch.library.custom_op("blocksparse::int_addmm", mutates_args=()) def blocksparse_int_addmm( crow_indices: torch.Tensor, @@ -87,26 +147,26 @@ def blocksparse_linear_abstract( # bsr wrapper custom op +# @triton_op("blocksparse::addmm", mutates_args=()) @torch.library.custom_op("blocksparse::addmm", mutates_args=()) def blocksparse_addmm( x_padded: torch.Tensor, crow_indices: torch.Tensor, col_indices: torch.Tensor, - row_indices: torch.Tensor, values: torch.Tensor, M: int, K: int, bias: torch.Tensor, ) -> torch.Tensor: assert bias is None - weight_bsr = torch.sparse_bsr_tensor(crow_indices, col_indices, values, size=(M, K)) + bsr = torch.sparse_bsr_tensor(crow_indices, col_indices, values, size=(M, K)) N_padded = x_padded.shape[1] out = x_padded.new_empty((M, N_padded)) bsr_dense_addmm( out, - weight_bsr, - row_indices, + bsr, x_padded, + # (M, K), alpha=1, beta=0, out=out, @@ -119,7 +179,6 @@ def blocksparse_addmm_abstract( x_padded: torch.Tensor, crow_indices: torch.Tensor,
col_indices: torch.Tensor, - row_indices: torch.Tensor, values: torch.Tensor, M: int, K: int, @@ -133,11 +192,10 @@ def blocksparse_addmm_abstract( class BlockSparseTensor(TorchAOBaseTensor): bsr_crow_indices: Optional[torch.Tensor] bsr_col_indices: Optional[torch.Tensor] - bsr_row_indices: Optional[torch.Tensor] bsr_values: Optional[torch.Tensor] blocksize: int - __slots__ = ["bsr_crow_indices", "bsr_col_indices", "bsr_row_indices", "bsr_values"] + __slots__ = ["bsr_crow_indices", "bsr_col_indices", "bsr_values"] @staticmethod def __new__( # noqa: PYI034 @@ -146,7 +204,6 @@ def __new__( # noqa: PYI034 blocksize: int, bsr_crow_indices: Optional[torch.Tensor], bsr_col_indices: Optional[torch.Tensor], - bsr_row_indices: Optional[torch.Tensor], bsr_values: Optional[torch.Tensor], requires_grad: bool = False, ): @@ -168,7 +225,6 @@ def __new__( # noqa: PYI034 tensor.bsr_crow_indices = bsr_crow_indices tensor.bsr_values = bsr_values tensor.bsr_col_indices = bsr_col_indices - tensor.bsr_row_indices = bsr_row_indices return tensor def __repr__(self) -> str: # type: ignore[override] @@ -191,12 +247,12 @@ def __tensor_unflatten__( outer_stride, ) -> torch.Tensor: shape, requires_grad, blocksize = tensor_meta + # print("unflatten", outer_size, outer_stride) return cls( shape=shape, blocksize=blocksize, bsr_crow_indices=inner_tensors.get("bsr_crow_indices", None), bsr_col_indices=inner_tensors.get("bsr_col_indices", None), - bsr_row_indices=inner_tensors.get("bsr_row_indices", None), bsr_values=inner_tensors.get("bsr_values", None), requires_grad=requires_grad, ) @@ -205,13 +261,12 @@ def __tensor_unflatten__( @classmethod def from_dense(cls, dense_tensor, blocksize): bsr_tensor = dense_tensor.to_sparse_bsr(blocksize) - bsr_tensor_t = dense_tensor.t().contiguous().to_sparse_bsr(blocksize) + # bsr_tensor_t = dense_tensor.t().contiguous().to_sparse_bsr(blocksize) return cls( shape=dense_tensor.shape, blocksize=blocksize, bsr_crow_indices=bsr_tensor.crow_indices(), bsr_col_indices=bsr_tensor.col_indices(), - bsr_row_indices=bsr_tensor_t.col_indices(), bsr_values=bsr_tensor.values(), requires_grad=False, ) @@ -222,7 +277,6 @@ def apply_fn_to_shard(self, func): blocksize=self.blocksize, bsr_crow_indices=func(self.bsr_crow_indices), bsr_col_indices=func(self.bsr_col_indices), - bsr_row_indices=func(self.bsr_row_indices), bsr_values=func(self.bsr_values), requires_grad=self.requires_grad, ) @@ -260,7 +314,8 @@ def block_sparse_unsqueeze(func, types, args, kwargs): bsr.blocksize, bsr.crow_indices(), bsr.col_indices(), - bsr.values().unsqueeze(-1)) + bsr.values().unsqueeze(-1), + requires_grad=False) @implements(aten.mul.Tensor) @@ -276,7 +331,7 @@ def my_mul(bsr, t): assert t.dim() == 3 assert not bsr.requires_grad assert t.size(0) == 1 - t_blocked = t.view(t.size(0), t.size(1) // 64, 64, 1) + t_blocked = t.view(t.size(0), t.size(1) // bsr.blocksize, bsr.blocksize, 1) masked_t = t_blocked.transpose(0, 1).index_select(0, bsr.col_indices()) new_values = bsr.values() * masked_t return BlockSparseTensor(bsr.shape, @@ -298,18 +353,8 @@ def block_sparse_sum(func, types, args, kwargs): dim = dim[0] bsr_dim = bsr.dim() assert dim == 1 - out = torch.empty((bsr.shape[0], bsr.shape[2]), dtype=bsr.dtype, device=bsr.device) - crow_indices = bsr.crow_indices() - blocksize = bsr.blocksize - - for i in range(crow_indices.shape[0]-1): - start, stop = crow_indices[i], crow_indices[i+1] - temp_sum = bsr.values()[start:stop] - temp_sum = temp_sum.sum(dim=0).sum(dim=1) - out[i * blocksize : (i + 1) * blocksize] = temp_sum + 
return torch.ops.blocksparse.sum(bsr.values(), bsr.crow_indices(), bsr.shape[0]) - return out - @implements(aten.values.default) def block_sparse_values(func, types, args, kwargs): @@ -325,10 +370,6 @@ def block_sparse_crow_indices(func, types, args, kwargs): def block_sparse_col_indices(func, types, args, kwargs): return args[0].bsr_col_indices.detach() -@implements(aten.row_indices.default) -def block_sparse_col_indices(func, types, args, kwargs): - return args[0].bsr_row_indices.detach() - @implements(aten._nnz.default) def block_sparse__nnz(func, types, args, kwargs): return args[0].bsr_values.shape[0] @@ -341,11 +382,20 @@ def block_sparse_linear(func, types, args, kwargs): M = w.shape[0] K = w.shape[1] N = x.shape[1] + + # if x.size(-1) == 1: + # out = (torch.mul(w.unsqueeze(2), x.unsqueeze(0))).sum(dim=1) + # out_orig = out.t().view(x_orig.shape[:-1] + (M,)) + # if bias is None: + # special_ret = out_orig + # else: + # special_ret = out_orig + bias + # return special_ret + # else: out = torch.ops.blocksparse.addmm( x, w.crow_indices(), w.col_indices(), - w.row_indices(), w.values(), M, K, From 8241ef71488d5a24ffabccbc9433d52db49a72d6 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Wed, 12 Feb 2025 13:05:09 -0800 Subject: [PATCH 20/23] wip --- torchao/_models/llama/benchmark_results.txt | 4 +++- torchao/_models/llama/bsr_benchmarks.sh | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/torchao/_models/llama/benchmark_results.txt b/torchao/_models/llama/benchmark_results.txt index 155adfe19f..a111f6d858 100644 --- a/torchao/_models/llama/benchmark_results.txt +++ b/torchao/_models/llama/benchmark_results.txt @@ -189,4 +189,6 @@ OTHER BENCHMARKS 20250204090000, tok/s=117.18, tok/s_decode=140.68, ttft=0.0812, mem/s= 571.28 GB/s, peak_mem= 6.74 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --profile sparse_bsr_addmm.json.gz --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 20250204092213, tok/s=138.33, tok/s_decode=146.00, ttft=0.0757, mem/s= 675.49 GB/s, peak_mem= 6.66 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 20250204093828, tok/s=119.76, tok/s_decode=125.40, ttft=0.0748, mem/s= 585.69 GB/s, peak_mem= 6.55 GB, model_size= 4.89 GB quant: None, sparse: bsr-0.9-16, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-16 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 -20250204102741, tok/s=169.35, tok/s_decode=180.85, ttft=0.0747, mem/s= 826.97 GB/s, peak_mem= 6.45 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: 
Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 \ No newline at end of file +20250204102741, tok/s=169.35, tok/s_decode=180.85, ttft=0.0747, mem/s= 826.97 GB/s, peak_mem= 6.45 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250212113401, tok/s=240.05, tok/s_decode=254.28, ttft=0.0460, mem/s=1172.21 GB/s, peak_mem= 6.47 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250212122003, tok/s=248.29, tok/s_decode=262.66, ttft=0.0435, mem/s=1210.52 GB/s, peak_mem= 6.46 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 \ No newline at end of file diff --git a/torchao/_models/llama/bsr_benchmarks.sh b/torchao/_models/llama/bsr_benchmarks.sh index fadcef9623..549ad482e0 100644 --- a/torchao/_models/llama/bsr_benchmarks.sh +++ b/torchao/_models/llama/bsr_benchmarks.sh @@ -1,8 +1,8 @@ export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B -#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 +#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 #python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8192 --sparsity bsr-0.9-64 -#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt -python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-32 -#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-32 +#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result 
benchmark_results.txt +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --sparsity bsr-0.9-64 From 684ee072566697ff2cee657ee7463d5a90f3d9e1 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Tue, 18 Feb 2025 12:51:48 -0800 Subject: [PATCH 21/23] undo benchmark change --- torchao/prototype/sparsity/superblock/benchmark.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/torchao/prototype/sparsity/superblock/benchmark.py b/torchao/prototype/sparsity/superblock/benchmark.py index a53f53c21a..b87834afae 100644 --- a/torchao/prototype/sparsity/superblock/benchmark.py +++ b/torchao/prototype/sparsity/superblock/benchmark.py @@ -81,9 +81,6 @@ def main(args): # With quantization, we must use cuSPARSELt to fuse one of the scalar matmuls. # Otherwise, we observe the CUTLASS kernels to be faster, so we use those instead. accelerate_with_sparsity(model, args) - if "bsr" in args.sparsity: - sparsify_(model, block_sparse_weight(blocksize=args.blocksize)) - elif "semi-structured" in args.sparsityk # compile model = torch.compile(model, mode="max-autotune", fullgraph=True) From e05680d8d2b5d4a69de72927c366ab0bc9a8dcaf Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Tue, 18 Feb 2025 13:01:08 -0800 Subject: [PATCH 22/23] deleted supermask in prototype --- .../sparsity/superblock/supermask.py | 156 ------------------ 1 file changed, 156 deletions(-) delete mode 100644 torchao/prototype/sparsity/superblock/supermask.py diff --git a/torchao/prototype/sparsity/superblock/supermask.py b/torchao/prototype/sparsity/superblock/supermask.py deleted file mode 100644 index e1f8a67108..0000000000 --- a/torchao/prototype/sparsity/superblock/supermask.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -import torch.nn as nn -import math -import torch -from torch.autograd import Variable -import torch.nn.functional as F -import numpy as np - -from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter - -# original supermask -scores_min=None -scores_max=9e9 -uniform_init_01 = False - -# adjusted supermask, initialize scores with uniform distribution in [0,1], clamp scores in each step in [0,1] -# scores_min=0. -# scores_max=1. 
-# uniform_init_01 = True - -def percentile(t, q): - """Return the value that is larger than q% of t""" - k = 1 + round(.01 * float(q) * (t.numel() - 1)) - return t.view(-1).kthvalue(k).values - - -class GetSubnet(torch.autograd.Function): - """Supermask STE function""" - @staticmethod - def forward(ctx, scores, zeros, ones, sparsity): - clamped_scores = scores.clamp(min=scores_min,max=scores_max) - k_val = percentile(clamped_scores, sparsity*100) - return torch.where(clamped_scores < k_val, zeros.to(scores.device), ones.to(scores.device)) - @staticmethod - def backward(ctx, g): - return g, None, None, None - - -class ApplyMask(torch.autograd.Function): - """Supermask STE function""" - @staticmethod - def forward(ctx, weight, scores): - return weight * scores - @staticmethod - def backward(ctx, grad_output): - grad_weight = grad_scores = None - if ctx.needs_input_grad[0]: - grad_weight = grad_output - if ctx.needs_input_grad[1]: - grad_scores = grad_output - return grad_weight, grad_scores - - -class SupermaskLinear(nn.Linear): - """Supermask class for Linear layer""" - def __init__(self, sparsity, fixed_mask, fixed_weight, bitwidth, transform, fixed_transform, *args, **kwargs): - tile_size = kwargs.pop("tile_size", 1) - super(SupermaskLinear, self).__init__(*args, **kwargs) - # initialize the scores - max_sparsity = 1 - (1 / math.prod([math.ceil(k / tile_size) for k in self.weight.size()])) - self.sparsity = sparsity - if self.sparsity > max_sparsity: - print( - f"reducing sparsity from {self.sparsity} to {max_sparsity}", - f"(maximum sparsity for layer with shape {self.weight.size()} and tile size {tile_size})" - ) - self.sparsity = max_sparsity - self.tile_size = tile_size - self.sparsify_weights = False - self.scores = nn.Parameter( - torch.empty( - [max(1, int(math.ceil(wn / tile_size))) for wn in self.weight.size()] - ), - requires_grad=not fixed_mask, - ) - nn.init.uniform_(self.scores) if uniform_init_01 else nn.init.kaiming_uniform_(self.scores, a=math.sqrt(5)) - - # the shift and the scale are transformation parameters - # the actually used weights = self.weight*self.scale+self.shift - # the transformation is activated only for quantized weights - self.shift=nn.Parameter(torch.Tensor(1).fill_(0.), requires_grad=False) - self.scale=nn.Parameter(torch.Tensor(1).fill_(1.), requires_grad=False) - - with torch.no_grad(): - # if bitwidth is None, then use floating point values in self.weight - # if bitwidth is not None, then quantize self.weight into k-bit (k=bitwidth) - # quantized values are -2^(k-1), -2^(k-1)+1, ..., 0, 1, ..., 2^(k-1)-1 - # these quantized values are uniformly distributed - if bitwidth is not None: - weights_max = torch.max(self.weight).item() - weights_min = torch.min(self.weight).item() - least_step = (weights_max-weights_min)/pow(2,bitwidth) - left_bound = weights_min-1e-6 - right_bound = weights_min+least_step+1e-6 - # self.shift=nn.Parameter(torch.Tensor(1).fill_( (weights_min+(pow(2,bitwidth-1)+0.5)*least_step) if transform[0] is None else transform[0] ), requires_grad=not fixed_transform[0]) - # self.scale=nn.Parameter(torch.Tensor(1).fill_( least_step if transform[1] is None else transform[1] ), requires_grad=not fixed_transform[1]) - # for example, if using binary weights (k=1) with -a, +a, set transform = [a,2a]; if using binary weights (k=1) with a, 0, set transform = [0,-a]; - self.shift=nn.Parameter(torch.Tensor(1).fill_( 0. 
if transform[0] is None else transform[0] ), requires_grad=not fixed_transform[0]) - self.scale=nn.Parameter(torch.Tensor(1).fill_( 1. if transform[1] is None else transform[1] ), requires_grad=not fixed_transform[1]) - for i in range(-int(pow(2,bitwidth-1)),int(pow(2,bitwidth-1))): - self.weight[torch.logical_and(self.weight>left_bound, self.weight<=right_bound)] = i - left_bound = right_bound - right_bound += least_step - - self.weight.requires_grad = not fixed_weight - - def get_mask(self): - subnet = GetSubnet.apply(self.scores, - torch.zeros_like(self.scores), - torch.ones_like(self.scores), - self.sparsity) - - if self.tile_size != 1: - for i, k in enumerate(self.weight.shape): - subnet = subnet.repeat_interleave(self.tile_size, dim=i) - subnet = torch.narrow(subnet, i, 0, k) - - return subnet - - def sparsify_offline(self): - subnet = self.get_mask() - self.weight.data = (self.weight*self.scale+self.shift) * subnet - self.sparsify_weights = True - - def forward(self, x): - if not self.sparsify_weights: - subnet = self.get_mask() - # w = (self.weight*self.scale+self.shift) - w = ApplyMask.apply(self.weight, subnet) - return F.linear(x, w, self.bias) - return F.linear(x, self.weight, self.bias) - - @classmethod - def from_linear(cls, linear : torch.nn.Linear, sparsity_level:float=0.0, blocksize=1, inference=True): - module_new = None - - assert isinstance(linear, torch.nn.Linear) - module_new = SupermaskLinear( - sparsity_level, False, False, None, None, None, - linear.in_features, - linear.out_features, - bias=linear.bias is not None, - tile_size=blocksize, - ).to(device=linear.weight.device, dtype=linear.weight.dtype) - module_new.weight.data.copy_(linear.weight.data) - if linear.bias is not None: - module_new.bias.data.copy_(linear.bias.data) - if inference: - module_new.sparsify_offline() - return module_new - - @classmethod - def to_linear(cls): - pass - From dc5cf3359fdb9917b60ed24dd184e2a087bf1265 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Tue, 18 Feb 2025 13:03:57 -0800 Subject: [PATCH 23/23] update blocksparse API --- torchao/sparsity/blocksparse.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/torchao/sparsity/blocksparse.py b/torchao/sparsity/blocksparse.py index b2d8cc3423..059d554561 100644 --- a/torchao/sparsity/blocksparse.py +++ b/torchao/sparsity/blocksparse.py @@ -147,7 +147,6 @@ def blocksparse_linear_abstract( # bsr wrapper custom op -# @triton_op("blocksparse::addmm", mutates_args=()) @torch.library.custom_op("blocksparse::addmm", mutates_args=()) def blocksparse_addmm( x_padded: torch.Tensor, @@ -282,15 +281,6 @@ def apply_fn_to_shard(self, func): ) - def dense(self): - return torch.sparse_bsr_tensor( - crow_indices=self.bsr_crow_indices, - col_indices=self.bsr_col_indices, - values=self.bsr_values, - size=self.shape, - ).to_dense() - - # Subclass op dispatch registration implements = BlockSparseTensor.implements @@ -383,15 +373,6 @@ def block_sparse_linear(func, types, args, kwargs): K = w.shape[1] N = x.shape[1] - # if x.size(-1) == 1: - # out = (torch.mul(w.unsqueeze(2), x.unsqueeze(0))).sum(dim=1) - # out_orig = out.t().view(x_orig.shape[:-1] + (M,)) - # if bias is None: - # special_ret = out_orig - # else: - # special_ret = out_orig + bias - # return special_ret - # else: out = torch.ops.blocksparse.addmm( x, w.crow_indices(),