From 2691f85aad6337fcbe08e5c6e6bfe3e629d14e28 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Tue, 4 Jun 2024 20:23:40 +0000 Subject: [PATCH 01/12] activation ordering --- src/sparseml/modifiers/quantization/gptq/base.py | 1 + src/sparseml/modifiers/quantization/gptq/pytorch.py | 2 +- .../modifiers/quantization/gptq/utils/gptq_wrapper.py | 11 +++++++++++ src/sparseml/modifiers/utils/layer_compressor.py | 4 ++-- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/sparseml/modifiers/quantization/gptq/base.py b/src/sparseml/modifiers/quantization/gptq/base.py index 004fce2ee7a..c3254ab31ca 100644 --- a/src/sparseml/modifiers/quantization/gptq/base.py +++ b/src/sparseml/modifiers/quantization/gptq/base.py @@ -80,6 +80,7 @@ class GPTQModifier(Modifier): and activation 8 bit quantization on the Linear layers. """ + actorder: bool = False sequential_update: Optional[bool] = False targets: Union[str, List[str], None] = None block_size: int = 128 diff --git a/src/sparseml/modifiers/quantization/gptq/pytorch.py b/src/sparseml/modifiers/quantization/gptq/pytorch.py index e9e3f715625..66898688f12 100644 --- a/src/sparseml/modifiers/quantization/gptq/pytorch.py +++ b/src/sparseml/modifiers/quantization/gptq/pytorch.py @@ -156,7 +156,7 @@ def apply_compression( layer_compressor.pre_compress() _LOGGER.info(f"Calibrating {layer_compressor.name}...") run_calibration_forward(self.model, dataloader, mask_padding=True) - layer_compressor.compress() + layer_compressor.compress(self.actorder) layer_compressor.post_compress() layer_compressor.revert_layer_wrappers() torch.cuda.empty_cache() diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py index 73321c0d0aa..f7b54f56038 100644 --- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -81,6 +81,7 @@ def add_batch(self, inp: torch.Tensor, out: 
torch.Tensor): def fasterprune( self, + actorder: bool = False, blocksize: int = 128, percdamp: float = 0.01, ): @@ -109,6 +110,12 @@ def fasterprune( self.H[dead, dead] = 1 W[:, dead] = 0 + if actorder: + perm = torch.argsort(torch.diag(H), descending=True) + W = W[:, perm] + H = H[perm][:, perm] + invperm = torch.argsort(perm) + Losses = torch.zeros(self.rows, device=self.dev) damp = percdamp * torch.mean(torch.diag(self.H)) @@ -153,6 +160,7 @@ def fasterprune( for i in range(count): w = W1[:, i] d = Hinv1[i, i] + q = w.clone() if sparsity >= SPARSITY_THRESHOLD: q[mask1[:, i]] = 0 @@ -227,6 +235,9 @@ def fasterprune( _LOGGER.info("time %.2f" % (time.time() - tick)) _LOGGER.info("error %.2f" % torch.sum(Losses).item()) + if actorder: + W = W[:, invperm] + if isinstance(self.layer, transformers.Conv1D): W = W.t() W = W.reshape(final_shape).to(final_dtype) diff --git a/src/sparseml/modifiers/utils/layer_compressor.py b/src/sparseml/modifiers/utils/layer_compressor.py index e5a36f77278..5090539d84e 100644 --- a/src/sparseml/modifiers/utils/layer_compressor.py +++ b/src/sparseml/modifiers/utils/layer_compressor.py @@ -131,7 +131,7 @@ def revert_layer_wrappers(self): module_wrapper.free() self.modules = None - def compress(self): + def compress(self, actorder: bool = False): """ Apply compression to each wrapped submodule in the layer """ @@ -141,7 +141,7 @@ def prune(module): if isinstance(module, self.module_compressor_class): full_name = self._get_full_submodule_name(module.name) _LOGGER.info(f"Compressing {full_name}...") - module.fasterprune(**self.args) + module.fasterprune(actorder=actorder, **self.args) self.layer.apply(prune) From 4cdbb8d55517f13ce10a0b0ba1bb6f8018bdbd51 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Wed, 5 Jun 2024 15:27:07 +0000 Subject: [PATCH 02/12] self reference to H --- .../modifiers/quantization/gptq/utils/gptq_wrapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py index f7b54f56038..de472c66ecb 100644 --- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -111,9 +111,9 @@ def fasterprune( W[:, dead] = 0 if actorder: - perm = torch.argsort(torch.diag(H), descending=True) + perm = torch.argsort(torch.diag(self.H), descending=True) W = W[:, perm] - H = H[perm][:, perm] + self.H = self.H[perm][:, perm] invperm = torch.argsort(perm) Losses = torch.zeros(self.rows, device=self.dev) From eb6ad2bd7cf59df71658ed0b682bf5988c499251 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Wed, 5 Jun 2024 18:40:17 +0000 Subject: [PATCH 03/12] doc string --- src/sparseml/modifiers/quantization/gptq/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sparseml/modifiers/quantization/gptq/base.py b/src/sparseml/modifiers/quantization/gptq/base.py index c3254ab31ca..08656e0c538 100644 --- a/src/sparseml/modifiers/quantization/gptq/base.py +++ b/src/sparseml/modifiers/quantization/gptq/base.py @@ -50,6 +50,7 @@ class GPTQModifier(Modifier): - LayerCompressor.revert_layer_wrappers() + :param actorder: Whether to use activation reordering or not :param sequential_update: Whether or not to update weights sequentially by layer, True saves on GPU memory :param targets: list of layer names to compress during GPTQ, or '__ALL__' From d39e7f9e68f41bd9a6ec53858ab21bf0c7480f99 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Mon, 17 Jun 2024 19:58:29 +0000 Subject: [PATCH 04/12] comments --- src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py | 1 + src/sparseml/modifiers/utils/layer_compressor.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py index de472c66ecb..30a13196d92 100644 --- 
a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -89,6 +89,7 @@ def fasterprune( Run pruning and quantization(if applicable) on the layer up to the target sparsity value. + :param actorder: Flag to apply activation reordering :param blocksize: Number of columns to compress in one pass :param percdamp: Amount of dampening to apply to H, as a fraction of the diagonal norm diff --git a/src/sparseml/modifiers/utils/layer_compressor.py b/src/sparseml/modifiers/utils/layer_compressor.py index 5090539d84e..2d7fdf53e00 100644 --- a/src/sparseml/modifiers/utils/layer_compressor.py +++ b/src/sparseml/modifiers/utils/layer_compressor.py @@ -134,6 +134,8 @@ def revert_layer_wrappers(self): def compress(self, actorder: bool = False): """ Apply compression to each wrapped submodule in the layer + + :param: actorder: flag to apply activation reordering """ @torch.no_grad() From 828e185cf6753bff1fea0faa643df7d8338d8690 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Mon, 24 Jun 2024 13:32:16 +0000 Subject: [PATCH 05/12] add g_idx --- .../quantization/gptq/utils/gptq_wrapper.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py index 30a13196d92..29927c832ad 100644 --- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -137,6 +137,14 @@ def fasterprune( if sparsity >= SPARSITY_THRESHOLD else None ) + + g_idx = [] + if actorder: + g_idx = [perm[i] // quant_scheme.weights.group_size for i in range(self.columns)] + g_idx = g_idx[invperm] + else: + g_idx = [i // quant_scheme.weights.group_size for i in range(self.columns)] + g_idx = torch.tensor(g_idx, dtype=torch.int32, device=W.device) # See section 3.4 of https://arxiv.org/abs/2203.07259 for i1 in 
range(0, self.columns, blocksize): @@ -148,6 +156,15 @@ def fasterprune( Err1 = torch.zeros_like(W1) Losses1 = torch.zeros_like(W1) Hinv1 = Hinv[i1:i2, i1:i2] + + # """ + # if not channel wise + + # strategy = quant_scheme.weights.strategy + # if strategy is not QuantizationStrategy.CHANNEL: + # idx = i + + # """ if sparsity >= SPARSITY_THRESHOLD: tmp = ( @@ -176,6 +193,7 @@ def fasterprune( else: q = torch.quantize_per_channel(q, scale, zero_point, 0, dtype) q = torch.dequantize(q) + elif hasattr(self.layer, "quantization_scheme"): quant_scheme = self.layer.quantization_scheme if quant_scheme.weights is not None: @@ -235,9 +253,11 @@ def fasterprune( _LOGGER.info("time %.2f" % (time.time() - tick)) _LOGGER.info("error %.2f" % torch.sum(Losses).item()) - + + if actorder: W = W[:, invperm] + # g_idx = g_idx[invperm] if isinstance(self.layer, transformers.Conv1D): W = W.t() @@ -247,6 +267,7 @@ def fasterprune( # place, clone() or direct assignment won't work self.layer.weight -= self.layer.weight self.layer.weight += W + self.g_idx = g_idx def free(self): """ From 94196b58558ce1e972f95bc503eaac5b15acf8e8 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Tue, 25 Jun 2024 16:55:28 +0000 Subject: [PATCH 06/12] overwrite intialized g_idx in gptq_wrapper --- .../quantization/gptq/utils/gptq_wrapper.py | 34 ++++++++----------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py index 8b6ae1e5e01..5e8052ffe7f 100644 --- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -17,6 +17,7 @@ from sparseml.modifiers.utils import SPARSITY_THRESHOLD from sparseml.modifiers.utils.compression_wrapper import ModuleCompressionWrapper +from torch.nn import Parameter try: import transformers @@ -119,6 +120,7 @@ def fasterprune( self.H[dead, dead] = 1 W[:, dead] = 0 + 
# Or read from self.layer.quantization_scheme if actorder: perm = torch.argsort(torch.diag(self.H), descending=True) W = W[:, perm] @@ -135,15 +137,6 @@ def fasterprune( self.H = torch.linalg.cholesky(self.H, upper=True) Hinv = self.H - - g_idx = [] - if actorder: - g_idx = [perm[i] // quant_scheme.weights.group_size for i in range(self.columns)] - g_idx = g_idx[invperm] - else: - g_idx = [i // quant_scheme.weights.group_size for i in range(self.columns)] - g_idx = torch.tensor(g_idx, dtype=torch.int32, device=W.device) - # See section 3.4 of https://arxiv.org/abs/2203.07259 for i1 in range(0, self.columns, blocksize): i2 = min(i1 + blocksize, self.columns) @@ -154,15 +147,6 @@ def fasterprune( Err1 = torch.zeros_like(W1) Losses1 = torch.zeros_like(W1) Hinv1 = Hinv[i1:i2, i1:i2] - - # """ - # if not channel wise - - # strategy = quant_scheme.weights.strategy - # if strategy is not QuantizationStrategy.CHANNEL: - # idx = i - - # """ if preserve_zeros: W1_nz_mask = W_nz_mask[:, i1:i2] @@ -189,6 +173,18 @@ def fasterprune( if quant_scheme.weights is not None: scale = self.layer.weight_scale zero_point = self.layer.weight_zero_point + + group_size = quant_scheme.weights.group_size + if group_size is None or group_size == -1: + group_size = self.layer.weight.shape[1] + + if actorder: + g_idx = torch.Tensor([perm[j] // group_size for j in range(self.columns)], dtype=torch.int32, device=invperm.device) + g_idx = g_idx[invperm] + self.layer.weight_g_idx = Parameter(g_idx, requires_grad=False,) + else: + g_idx = torch.Tensor([j // group_size for j in range(self.columns)], dtype=torch.int32, device=W.device) + from compressed_tensors.quantization import QuantizationStrategy from compressed_tensors.quantization.lifecycle.forward import ( fake_quantize, @@ -255,7 +251,6 @@ def fasterprune( if actorder: W = W[:, invperm] - # g_idx = g_idx[invperm] if isinstance(self.layer, transformers.Conv1D): W = W.t() @@ -265,7 +260,6 @@ def fasterprune( # place, clone() or direct assignment 
won't work self.layer.weight -= self.layer.weight self.layer.weight += W - self.g_idx = g_idx def free(self): """ From 2525f699ff2339641b064e52b8fef7d489356f98 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Tue, 25 Jun 2024 18:10:50 +0000 Subject: [PATCH 07/12] update g_idx --- .../quantization/gptq/utils/gptq_wrapper.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py index 5e8052ffe7f..43fe205a97d 100644 --- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -14,10 +14,11 @@ import time +from torch.nn import Parameter + from sparseml.modifiers.utils import SPARSITY_THRESHOLD from sparseml.modifiers.utils.compression_wrapper import ModuleCompressionWrapper -from torch.nn import Parameter try: import transformers @@ -177,13 +178,25 @@ def fasterprune( group_size = quant_scheme.weights.group_size if group_size is None or group_size == -1: group_size = self.layer.weight.shape[1] - + if actorder: - g_idx = torch.Tensor([perm[j] // group_size for j in range(self.columns)], dtype=torch.int32, device=invperm.device) + g_idx = torch.tensor( + [perm[j] // group_size for j in range(self.columns)], + dtype=torch.int32, + device=invperm.device + ) + g_idx = g_idx[invperm] - self.layer.weight_g_idx = Parameter(g_idx, requires_grad=False,) + self.layer.weight_g_idx = Parameter( + g_idx, + requires_grad=False, + ) else: - g_idx = torch.Tensor([j // group_size for j in range(self.columns)], dtype=torch.int32, device=W.device) + g_idx = torch.Tensor( + [j // group_size for j in range(self.columns)], + + device=W.device, + ) from compressed_tensors.quantization import QuantizationStrategy from compressed_tensors.quantization.lifecycle.forward import ( @@ -191,13 +204,14 @@ def fasterprune( ) strategy = 
quant_scheme.weights.strategy - + breakpoint() if strategy == QuantizationStrategy.TENSOR: q = fake_quantize( q, scale, zero_point, self.layer.quantization_scheme.weights, + g_idx, ) elif strategy == QuantizationStrategy.CHANNEL: # TODO: for channelwise why isn't this just a 1d tensor? @@ -205,6 +219,7 @@ def fasterprune( q, scale[:, 0], zero_point[:, 0], + # g_idx, quant_scheme.weights, ) else: # strategy == QuantizationStrategy.GROUP @@ -222,6 +237,7 @@ def fasterprune( q, scale[:, input_dim_group], zero_point[:, input_dim_group], + # g_idx, altered_qargs, ) @@ -247,8 +263,7 @@ def fasterprune( _LOGGER.info("time %.2f" % (time.time() - tick)) _LOGGER.info("error %.2f" % torch.sum(Losses).item()) - - + if actorder: W = W[:, invperm] From c6b5b28f6600c478b4107303c1c67f69bf1bde1c Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Wed, 26 Jun 2024 17:56:33 +0000 Subject: [PATCH 08/12] g_idx to fakequantize --- .../quantization/gptq/utils/gptq_wrapper.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py index 43fe205a97d..1745b7c802b 100644 --- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -192,10 +192,10 @@ def fasterprune( requires_grad=False, ) else: - g_idx = torch.Tensor( + g_idx = torch.tensor( [j // group_size for j in range(self.columns)], - - device=W.device, + dtype=torch.int32, + device=W.device ) from compressed_tensors.quantization import QuantizationStrategy @@ -204,14 +204,12 @@ def fasterprune( ) strategy = quant_scheme.weights.strategy - breakpoint() if strategy == QuantizationStrategy.TENSOR: q = fake_quantize( q, scale, zero_point, self.layer.quantization_scheme.weights, - g_idx, ) elif strategy == QuantizationStrategy.CHANNEL: # TODO: for channelwise why isn't this just a 1d tensor? 
@@ -228,16 +226,20 @@ def fasterprune( input_dim_group = ( column_idx // quant_scheme.weights.group_size ) - # Since we're only applying quantization to a slice, this # ends up being a channelwise application altered_qargs = copy(quant_scheme.weights) altered_qargs.strategy = QuantizationStrategy.CHANNEL + + # # apply g_idx + # if g_idx is not None: + # scale = scale[g_idx] + # zero_point = zero_point[g_idx] + q = fake_quantize( q, scale[:, input_dim_group], zero_point[:, input_dim_group], - # g_idx, altered_qargs, ) From 28744fd8c256bb16b8c12ba735666412c9d86d94 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Wed, 26 Jun 2024 19:51:36 +0000 Subject: [PATCH 09/12] apply g_idx --- .../modifiers/quantization/gptq/pytorch.py | 2 +- .../quantization/gptq/utils/gptq_wrapper.py | 74 ++++++++++++------- .../modifiers/utils/layer_compressor.py | 6 +- 3 files changed, 50 insertions(+), 32 deletions(-) diff --git a/src/sparseml/modifiers/quantization/gptq/pytorch.py b/src/sparseml/modifiers/quantization/gptq/pytorch.py index 66898688f12..e9e3f715625 100644 --- a/src/sparseml/modifiers/quantization/gptq/pytorch.py +++ b/src/sparseml/modifiers/quantization/gptq/pytorch.py @@ -156,7 +156,7 @@ def apply_compression( layer_compressor.pre_compress() _LOGGER.info(f"Calibrating {layer_compressor.name}...") run_calibration_forward(self.model, dataloader, mask_padding=True) - layer_compressor.compress(self.actorder) + layer_compressor.compress() layer_compressor.post_compress() layer_compressor.revert_layer_wrappers() torch.cuda.empty_cache() diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py index 1745b7c802b..c2b72ffa487 100644 --- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -83,7 +83,6 @@ def add_batch(self, inp: torch.Tensor, out: torch.Tensor): def fasterprune( self, - actorder: bool = False, 
blocksize: int = 128, percdamp: float = 0.01, ): @@ -121,13 +120,6 @@ def fasterprune( self.H[dead, dead] = 1 W[:, dead] = 0 - # Or read from self.layer.quantization_scheme - if actorder: - perm = torch.argsort(torch.diag(self.H), descending=True) - W = W[:, perm] - self.H = self.H[perm][:, perm] - invperm = torch.argsort(perm) - Losses = torch.zeros(self.rows, device=self.dev) damp = percdamp * torch.mean(torch.diag(self.H)) @@ -138,6 +130,9 @@ def fasterprune( self.H = torch.linalg.cholesky(self.H, upper=True) Hinv = self.H + actorder = False + invperm = None + # See section 3.4 of https://arxiv.org/abs/2203.07259 for i1 in range(0, self.columns, blocksize): i2 = min(i1 + blocksize, self.columns) @@ -171,7 +166,15 @@ def fasterprune( elif hasattr(self.layer, "quantization_scheme"): quant_scheme = self.layer.quantization_scheme + actorder = quant_scheme.weights.actorder if quant_scheme.weights is not None: + + if actorder: + perm = torch.argsort(torch.diag(self.H), descending=True) + W = W[:, perm] + self.H = self.H[perm][:, perm] + invperm = torch.argsort(perm) + scale = self.layer.weight_scale zero_point = self.layer.weight_zero_point @@ -180,23 +183,15 @@ def fasterprune( group_size = self.layer.weight.shape[1] if actorder: - g_idx = torch.tensor( - [perm[j] // group_size for j in range(self.columns)], - dtype=torch.int32, - device=invperm.device - ) - + indices = torch.arange(self.columns, device=invperm.device) + g_idx = (perm[indices] // group_size).to(dtype=torch.int32) g_idx = g_idx[invperm] - self.layer.weight_g_idx = Parameter( - g_idx, - requires_grad=False, - ) + self.layer.weight_g_idx.data = g_idx else: - g_idx = torch.tensor( - [j // group_size for j in range(self.columns)], - dtype=torch.int32, - device=W.device + indices = torch.arange( + self.columns, device=W.device, dtype=torch.int32 ) + g_idx = indices // group_size from compressed_tensors.quantization import QuantizationStrategy from compressed_tensors.quantization.lifecycle.forward import ( 
@@ -217,7 +212,6 @@ def fasterprune( q, scale[:, 0], zero_point[:, 0], - # g_idx, quant_scheme.weights, ) else: # strategy == QuantizationStrategy.GROUP @@ -230,11 +224,16 @@ def fasterprune( # ends up being a channelwise application altered_qargs = copy(quant_scheme.weights) altered_qargs.strategy = QuantizationStrategy.CHANNEL - - # # apply g_idx - # if g_idx is not None: - # scale = scale[g_idx] - # zero_point = zero_point[g_idx] + + # apply g_idx + if g_idx is not None: + # scale and zp already transformed by group_size + # extract first index of group_idze + indices_to_extract = torch.arange( + 0, g_idx.shape[0], group_size + ) + scale = scale[:, g_idx[indices_to_extract]] + zero_point = zero_point[:, g_idx[indices_to_extract]] q = fake_quantize( q, @@ -284,3 +283,22 @@ def free(self): """ delattr(self, "H") super().free() + + +""" +(Pdb) scale.shape +torch.Size([4096, 32]) +(Pdb) self.layer.shape +*** AttributeError: 'Linear' object has no attribute 'shape' +(Pdb) self.layer.weight.shape +torch.Size([4096, 4096]) + + + +(Pdb) scale.shape +torch.Size([11008, 32]) +(Pdb) self.layer.weight.shape +torch.Size([11008, 4096]) + + +""" diff --git a/src/sparseml/modifiers/utils/layer_compressor.py b/src/sparseml/modifiers/utils/layer_compressor.py index 2d7fdf53e00..eb0b51cf269 100644 --- a/src/sparseml/modifiers/utils/layer_compressor.py +++ b/src/sparseml/modifiers/utils/layer_compressor.py @@ -131,10 +131,10 @@ def revert_layer_wrappers(self): module_wrapper.free() self.modules = None - def compress(self, actorder: bool = False): + def compress(self): """ Apply compression to each wrapped submodule in the layer - + :param: actorder: flag to apply activation reordering """ @@ -143,7 +143,7 @@ def prune(module): if isinstance(module, self.module_compressor_class): full_name = self._get_full_submodule_name(module.name) _LOGGER.info(f"Compressing {full_name}...") - module.fasterprune(actorder=actorder, **self.args) + module.fasterprune(**self.args) 
self.layer.apply(prune) From 51c36c223ab480f295e45058b845364daedcef30 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Wed, 26 Jun 2024 20:02:06 +0000 Subject: [PATCH 10/12] read actorder from quant_args --- src/sparseml/modifiers/quantization/gptq/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sparseml/modifiers/quantization/gptq/base.py b/src/sparseml/modifiers/quantization/gptq/base.py index 58f5c37b463..833fa284531 100644 --- a/src/sparseml/modifiers/quantization/gptq/base.py +++ b/src/sparseml/modifiers/quantization/gptq/base.py @@ -82,7 +82,6 @@ class GPTQModifier(Modifier): and activation 8 bit quantization on the Linear layers. """ - actorder: bool = False sequential_update: Optional[bool] = False targets: Union[str, List[str], None] = None block_size: int = 128 From a75c462e0d9f233dffa3573cfe4ef34898c26f25 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Fri, 28 Jun 2024 15:15:04 +0000 Subject: [PATCH 11/12] lint --- src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py index c2b72ffa487..68c737e752f 100644 --- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -14,8 +14,6 @@ import time -from torch.nn import Parameter - from sparseml.modifiers.utils import SPARSITY_THRESHOLD from sparseml.modifiers.utils.compression_wrapper import ModuleCompressionWrapper From bea971cfade46505a59ecf479613db710b710d86 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Mon, 1 Jul 2024 14:32:37 +0000 Subject: [PATCH 12/12] remove commented code --- .../quantization/gptq/utils/gptq_wrapper.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
index 68c737e752f..9f660b987fb 100644 --- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -281,22 +281,3 @@ def free(self): """ delattr(self, "H") super().free() - - -""" -(Pdb) scale.shape -torch.Size([4096, 32]) -(Pdb) self.layer.shape -*** AttributeError: 'Linear' object has no attribute 'shape' -(Pdb) self.layer.weight.shape -torch.Size([4096, 4096]) - - - -(Pdb) scale.shape -torch.Size([11008, 32]) -(Pdb) self.layer.weight.shape -torch.Size([11008, 4096]) - - -"""