From 2691f85aad6337fcbe08e5c6e6bfe3e629d14e28 Mon Sep 17 00:00:00 2001
From: George Ohashi <george@neuralmagic.com>
Date: Tue, 4 Jun 2024 20:23:40 +0000
Subject: [PATCH 01/12] activation ordering

---
 src/sparseml/modifiers/quantization/gptq/base.py      |  1 +
 src/sparseml/modifiers/quantization/gptq/pytorch.py   |  2 +-
 .../modifiers/quantization/gptq/utils/gptq_wrapper.py | 11 +++++++++++
 src/sparseml/modifiers/utils/layer_compressor.py      |  4 ++--
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/sparseml/modifiers/quantization/gptq/base.py b/src/sparseml/modifiers/quantization/gptq/base.py
index 004fce2ee7a..c3254ab31ca 100644
--- a/src/sparseml/modifiers/quantization/gptq/base.py
+++ b/src/sparseml/modifiers/quantization/gptq/base.py
@@ -80,6 +80,7 @@ class GPTQModifier(Modifier):
         and activation 8 bit quantization on the Linear layers.
     """
 
+    actorder: bool = False
     sequential_update: Optional[bool] = False
     targets: Union[str, List[str], None] = None
     block_size: int = 128
diff --git a/src/sparseml/modifiers/quantization/gptq/pytorch.py b/src/sparseml/modifiers/quantization/gptq/pytorch.py
index e9e3f715625..66898688f12 100644
--- a/src/sparseml/modifiers/quantization/gptq/pytorch.py
+++ b/src/sparseml/modifiers/quantization/gptq/pytorch.py
@@ -156,7 +156,7 @@ def apply_compression(
                 layer_compressor.pre_compress()
                 _LOGGER.info(f"Calibrating {layer_compressor.name}...")
                 run_calibration_forward(self.model, dataloader, mask_padding=True)
-            layer_compressor.compress()
+            layer_compressor.compress(self.actorder)
             layer_compressor.post_compress()
             layer_compressor.revert_layer_wrappers()
             torch.cuda.empty_cache()
diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
index 73321c0d0aa..f7b54f56038 100644
--- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
+++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -81,6 +81,7 @@ def add_batch(self, inp: torch.Tensor, out: torch.Tensor):
 
     def fasterprune(
         self,
+        actorder: bool = False,
         blocksize: int = 128,
         percdamp: float = 0.01,
     ):
@@ -109,6 +110,12 @@ def fasterprune(
         self.H[dead, dead] = 1
         W[:, dead] = 0
 
+        if actorder:
+            perm = torch.argsort(torch.diag(H), descending=True)
+            W = W[:, perm]
+            H = H[perm][:, perm]
+            invperm = torch.argsort(perm)
+
         Losses = torch.zeros(self.rows, device=self.dev)
 
         damp = percdamp * torch.mean(torch.diag(self.H))
@@ -153,6 +160,7 @@ def fasterprune(
             for i in range(count):
                 w = W1[:, i]
                 d = Hinv1[i, i]
+
                 q = w.clone()
                 if sparsity >= SPARSITY_THRESHOLD:
                     q[mask1[:, i]] = 0
@@ -227,6 +235,9 @@ def fasterprune(
         _LOGGER.info("time %.2f" % (time.time() - tick))
         _LOGGER.info("error %.2f" % torch.sum(Losses).item())
 
+        if actorder:
+            W = W[:, invperm]
+
         if isinstance(self.layer, transformers.Conv1D):
             W = W.t()
         W = W.reshape(final_shape).to(final_dtype)
diff --git a/src/sparseml/modifiers/utils/layer_compressor.py b/src/sparseml/modifiers/utils/layer_compressor.py
index e5a36f77278..5090539d84e 100644
--- a/src/sparseml/modifiers/utils/layer_compressor.py
+++ b/src/sparseml/modifiers/utils/layer_compressor.py
@@ -131,7 +131,7 @@ def revert_layer_wrappers(self):
             module_wrapper.free()
         self.modules = None
 
-    def compress(self):
+    def compress(self, actorder: bool = False):
         """
         Apply compression to each wrapped submodule in the layer
         """
@@ -141,7 +141,7 @@ def prune(module):
             if isinstance(module, self.module_compressor_class):
                 full_name = self._get_full_submodule_name(module.name)
                 _LOGGER.info(f"Compressing {full_name}...")
-                module.fasterprune(**self.args)
+                module.fasterprune(actorder=actorder, **self.args)
 
         self.layer.apply(prune)
 

From 4cdbb8d55517f13ce10a0b0ba1bb6f8018bdbd51 Mon Sep 17 00:00:00 2001
From: George Ohashi <george@neuralmagic.com>
Date: Wed, 5 Jun 2024 15:27:07 +0000
Subject: [PATCH 02/12] self reference to H

---
 .../modifiers/quantization/gptq/utils/gptq_wrapper.py         | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
index f7b54f56038..de472c66ecb 100644
--- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
+++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -111,9 +111,9 @@ def fasterprune(
         W[:, dead] = 0
 
         if actorder:
-            perm = torch.argsort(torch.diag(H), descending=True)
+            perm = torch.argsort(torch.diag(self.H), descending=True)
             W = W[:, perm]
-            H = H[perm][:, perm]
+            self.H = self.H[perm][:, perm]
             invperm = torch.argsort(perm)
 
         Losses = torch.zeros(self.rows, device=self.dev)

From eb6ad2bd7cf59df71658ed0b682bf5988c499251 Mon Sep 17 00:00:00 2001
From: George Ohashi <george@neuralmagic.com>
Date: Wed, 5 Jun 2024 18:40:17 +0000
Subject: [PATCH 03/12] doc string

---
 src/sparseml/modifiers/quantization/gptq/base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/sparseml/modifiers/quantization/gptq/base.py b/src/sparseml/modifiers/quantization/gptq/base.py
index c3254ab31ca..08656e0c538 100644
--- a/src/sparseml/modifiers/quantization/gptq/base.py
+++ b/src/sparseml/modifiers/quantization/gptq/base.py
@@ -50,6 +50,7 @@ class GPTQModifier(Modifier):
             - LayerCompressor.revert_layer_wrappers()
 
 
+    :param actorder: Whether to use activation reordering or not
     :param sequential_update: Whether or not to update weights sequentially by layer,
         True saves on GPU memory
     :param targets: list of layer names to compress during GPTQ, or '__ALL__'

From d39e7f9e68f41bd9a6ec53858ab21bf0c7480f99 Mon Sep 17 00:00:00 2001
From: George Ohashi <george@neuralmagic.com>
Date: Mon, 17 Jun 2024 19:58:29 +0000
Subject: [PATCH 04/12] comments

---
 src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py | 1 +
 src/sparseml/modifiers/utils/layer_compressor.py               | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
index de472c66ecb..30a13196d92 100644
--- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
+++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -89,6 +89,7 @@ def fasterprune(
         Run pruning and quantization(if applicable) on the layer up to the target
         sparsity value.
 
+        :param actorder: Flag to apply activation reordering
         :param blocksize: Number of columns to compress in one pass
         :param percdamp: Amount of dampening to apply to H, as a fraction of the
             diagonal norm
diff --git a/src/sparseml/modifiers/utils/layer_compressor.py b/src/sparseml/modifiers/utils/layer_compressor.py
index 5090539d84e..2d7fdf53e00 100644
--- a/src/sparseml/modifiers/utils/layer_compressor.py
+++ b/src/sparseml/modifiers/utils/layer_compressor.py
@@ -134,6 +134,8 @@ def revert_layer_wrappers(self):
     def compress(self, actorder: bool = False):
         """
         Apply compression to each wrapped submodule in the layer
+        
+        :param: actorder: flag to apply activation reordering
         """
 
         @torch.no_grad()

From 828e185cf6753bff1fea0faa643df7d8338d8690 Mon Sep 17 00:00:00 2001
From: George Ohashi <george@neuralmagic.com>
Date: Mon, 24 Jun 2024 13:32:16 +0000
Subject: [PATCH 05/12] add g_idx

---
 .../quantization/gptq/utils/gptq_wrapper.py   | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
index 30a13196d92..29927c832ad 100644
--- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
+++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -137,6 +137,14 @@ def fasterprune(
             if sparsity >= SPARSITY_THRESHOLD
             else None
         )
+        
+        g_idx = []
+        if actorder:
+            g_idx = [perm[i] // quant_scheme.weights.group_size for i in range(self.columns)]
+            g_idx = g_idx[invperm]
+        else:
+            g_idx = [i // quant_scheme.weights.group_size for i in range(self.columns)]         
+        g_idx = torch.tensor(g_idx, dtype=torch.int32, device=W.device)
 
         # See section 3.4 of https://arxiv.org/abs/2203.07259
         for i1 in range(0, self.columns, blocksize):
@@ -148,6 +156,15 @@ def fasterprune(
             Err1 = torch.zeros_like(W1)
             Losses1 = torch.zeros_like(W1)
             Hinv1 = Hinv[i1:i2, i1:i2]
+            
+            # """
+            # if not channel wise
+            
+            # strategy = quant_scheme.weights.strategy
+            # if strategy is not QuantizationStrategy.CHANNEL:
+            #     idx = i
+            
+            # """
 
             if sparsity >= SPARSITY_THRESHOLD:
                 tmp = (
@@ -176,6 +193,7 @@ def fasterprune(
                     else:
                         q = torch.quantize_per_channel(q, scale, zero_point, 0, dtype)
                     q = torch.dequantize(q)
+
                 elif hasattr(self.layer, "quantization_scheme"):
                     quant_scheme = self.layer.quantization_scheme
                     if quant_scheme.weights is not None:
@@ -235,9 +253,11 @@ def fasterprune(
 
         _LOGGER.info("time %.2f" % (time.time() - tick))
         _LOGGER.info("error %.2f" % torch.sum(Losses).item())
-
+        
+       
         if actorder:
             W = W[:, invperm]
+            # g_idx = g_idx[invperm]
 
         if isinstance(self.layer, transformers.Conv1D):
             W = W.t()
@@ -247,6 +267,7 @@ def fasterprune(
         # place, clone() or direct assignment won't work
         self.layer.weight -= self.layer.weight
         self.layer.weight += W
+        self.g_idx = g_idx
 
     def free(self):
         """

From 94196b58558ce1e972f95bc503eaac5b15acf8e8 Mon Sep 17 00:00:00 2001
From: George Ohashi <george@neuralmagic.com>
Date: Tue, 25 Jun 2024 16:55:28 +0000
Subject: [PATCH 06/12] overwrite initialized g_idx in gptq_wrapper

---
 .../quantization/gptq/utils/gptq_wrapper.py   | 34 ++++++++-----------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
index 8b6ae1e5e01..5e8052ffe7f 100644
--- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
+++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -17,6 +17,7 @@
 from sparseml.modifiers.utils import SPARSITY_THRESHOLD
 from sparseml.modifiers.utils.compression_wrapper import ModuleCompressionWrapper
 
+from torch.nn import Parameter
 
 try:
     import transformers
@@ -119,6 +120,7 @@ def fasterprune(
         self.H[dead, dead] = 1
         W[:, dead] = 0
 
+        # Or read from self.layer.quantization_scheme
         if actorder:
             perm = torch.argsort(torch.diag(self.H), descending=True)
             W = W[:, perm]
@@ -135,15 +137,6 @@ def fasterprune(
         self.H = torch.linalg.cholesky(self.H, upper=True)
         Hinv = self.H
 
-
-        g_idx = []
-        if actorder:
-            g_idx = [perm[i] // quant_scheme.weights.group_size for i in range(self.columns)]
-            g_idx = g_idx[invperm]
-        else:
-            g_idx = [i // quant_scheme.weights.group_size for i in range(self.columns)]         
-        g_idx = torch.tensor(g_idx, dtype=torch.int32, device=W.device)
-
         # See section 3.4 of https://arxiv.org/abs/2203.07259
         for i1 in range(0, self.columns, blocksize):
             i2 = min(i1 + blocksize, self.columns)
@@ -154,15 +147,6 @@ def fasterprune(
             Err1 = torch.zeros_like(W1)
             Losses1 = torch.zeros_like(W1)
             Hinv1 = Hinv[i1:i2, i1:i2]
-            
-            # """
-            # if not channel wise
-            
-            # strategy = quant_scheme.weights.strategy
-            # if strategy is not QuantizationStrategy.CHANNEL:
-            #     idx = i
-            
-            # """
 
             if preserve_zeros:
                 W1_nz_mask = W_nz_mask[:, i1:i2]
@@ -189,6 +173,18 @@ def fasterprune(
                     if quant_scheme.weights is not None:
                         scale = self.layer.weight_scale
                         zero_point = self.layer.weight_zero_point
+
+                        group_size = quant_scheme.weights.group_size
+                        if group_size is None or group_size == -1:
+                            group_size = self.layer.weight.shape[1]
+                            
+                        if actorder:
+                            g_idx = torch.Tensor([perm[j] // group_size for j in range(self.columns)],  dtype=torch.int32, device=invperm.device)
+                            g_idx = g_idx[invperm]
+                            self.layer.weight_g_idx = Parameter(g_idx, requires_grad=False,)
+                        else:
+                            g_idx = torch.Tensor([j // group_size for j in range(self.columns)], dtype=torch.int32, device=W.device)
+
                         from compressed_tensors.quantization import QuantizationStrategy
                         from compressed_tensors.quantization.lifecycle.forward import (
                             fake_quantize,
@@ -255,7 +251,6 @@ def fasterprune(
        
         if actorder:
             W = W[:, invperm]
-            # g_idx = g_idx[invperm]
 
         if isinstance(self.layer, transformers.Conv1D):
             W = W.t()
@@ -265,7 +260,6 @@ def fasterprune(
         # place, clone() or direct assignment won't work
         self.layer.weight -= self.layer.weight
         self.layer.weight += W
-        self.g_idx = g_idx
 
     def free(self):
         """

From 2525f699ff2339641b064e52b8fef7d489356f98 Mon Sep 17 00:00:00 2001
From: George Ohashi <george@neuralmagic.com>
Date: Tue, 25 Jun 2024 18:10:50 +0000
Subject: [PATCH 07/12] update g_idx

---
 .../quantization/gptq/utils/gptq_wrapper.py   | 31 ++++++++++++++-----
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
index 5e8052ffe7f..43fe205a97d 100644
--- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
+++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -14,10 +14,11 @@
 
 import time
 
+from torch.nn import Parameter
+
 from sparseml.modifiers.utils import SPARSITY_THRESHOLD
 from sparseml.modifiers.utils.compression_wrapper import ModuleCompressionWrapper
 
-from torch.nn import Parameter
 
 try:
     import transformers
@@ -177,13 +178,25 @@ def fasterprune(
                         group_size = quant_scheme.weights.group_size
                         if group_size is None or group_size == -1:
                             group_size = self.layer.weight.shape[1]
-                            
+
                         if actorder:
-                            g_idx = torch.Tensor([perm[j] // group_size for j in range(self.columns)],  dtype=torch.int32, device=invperm.device)
+                            g_idx = torch.tensor(
+                                [perm[j] // group_size for j in range(self.columns)],
+                                dtype=torch.int32,
+                                device=invperm.device
+                            )
+                            
                             g_idx = g_idx[invperm]
-                            self.layer.weight_g_idx = Parameter(g_idx, requires_grad=False,)
+                            self.layer.weight_g_idx = Parameter(
+                                g_idx,
+                                requires_grad=False,
+                            )
                         else:
-                            g_idx = torch.Tensor([j // group_size for j in range(self.columns)], dtype=torch.int32, device=W.device)
+                            g_idx = torch.Tensor(
+                                [j // group_size for j in range(self.columns)],
+                                
+                                device=W.device,
+                            )
 
                         from compressed_tensors.quantization import QuantizationStrategy
                         from compressed_tensors.quantization.lifecycle.forward import (
@@ -191,13 +204,14 @@ def fasterprune(
                         )
 
                         strategy = quant_scheme.weights.strategy
-
+                        breakpoint()
                         if strategy == QuantizationStrategy.TENSOR:
                             q = fake_quantize(
                                 q,
                                 scale,
                                 zero_point,
                                 self.layer.quantization_scheme.weights,
+                                g_idx,
                             )
                         elif strategy == QuantizationStrategy.CHANNEL:
                             # TODO: for channelwise why isn't this just a 1d tensor?
@@ -205,6 +219,7 @@ def fasterprune(
                                 q,
                                 scale[:, 0],
                                 zero_point[:, 0],
+                                # g_idx,
                                 quant_scheme.weights,
                             )
                         else:  # strategy == QuantizationStrategy.GROUP
@@ -222,6 +237,7 @@ def fasterprune(
                                 q,
                                 scale[:, input_dim_group],
                                 zero_point[:, input_dim_group],
+                                # g_idx,
                                 altered_qargs,
                             )
 
@@ -247,8 +263,7 @@ def fasterprune(
 
         _LOGGER.info("time %.2f" % (time.time() - tick))
         _LOGGER.info("error %.2f" % torch.sum(Losses).item())
-        
-       
+
         if actorder:
             W = W[:, invperm]
 

From c6b5b28f6600c478b4107303c1c67f69bf1bde1c Mon Sep 17 00:00:00 2001
From: George Ohashi <george@neuralmagic.com>
Date: Wed, 26 Jun 2024 17:56:33 +0000
Subject: [PATCH 08/12] g_idx to fakequantize

---
 .../quantization/gptq/utils/gptq_wrapper.py      | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
index 43fe205a97d..1745b7c802b 100644
--- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
+++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -192,10 +192,10 @@ def fasterprune(
                                 requires_grad=False,
                             )
                         else:
-                            g_idx = torch.Tensor(
+                            g_idx = torch.tensor(
                                 [j // group_size for j in range(self.columns)],
-                                
-                                device=W.device,
+                                dtype=torch.int32,
+                                device=W.device
                             )
 
                         from compressed_tensors.quantization import QuantizationStrategy
@@ -204,14 +204,12 @@ def fasterprune(
                         )
 
                         strategy = quant_scheme.weights.strategy
-                        breakpoint()
                         if strategy == QuantizationStrategy.TENSOR:
                             q = fake_quantize(
                                 q,
                                 scale,
                                 zero_point,
                                 self.layer.quantization_scheme.weights,
-                                g_idx,
                             )
                         elif strategy == QuantizationStrategy.CHANNEL:
                             # TODO: for channelwise why isn't this just a 1d tensor?
@@ -228,16 +226,20 @@ def fasterprune(
                             input_dim_group = (
                                 column_idx // quant_scheme.weights.group_size
                             )
-
                             # Since we're only applying quantization to a slice, this
                             # ends up being a channelwise application
                             altered_qargs = copy(quant_scheme.weights)
                             altered_qargs.strategy = QuantizationStrategy.CHANNEL
+                            
+                            # # apply g_idx 
+                            # if g_idx is not None:
+                            #     scale = scale[g_idx]
+                            #     zero_point = zero_point[g_idx]
+
                             q = fake_quantize(
                                 q,
                                 scale[:, input_dim_group],
                                 zero_point[:, input_dim_group],
-                                # g_idx,
                                 altered_qargs,
                             )
 

From 28744fd8c256bb16b8c12ba735666412c9d86d94 Mon Sep 17 00:00:00 2001
From: George Ohashi <george@neuralmagic.com>
Date: Wed, 26 Jun 2024 19:51:36 +0000
Subject: [PATCH 09/12] apply g_idx

---
 .../modifiers/quantization/gptq/pytorch.py    |  2 +-
 .../quantization/gptq/utils/gptq_wrapper.py   | 74 ++++++++++++-------
 .../modifiers/utils/layer_compressor.py       |  6 +-
 3 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/src/sparseml/modifiers/quantization/gptq/pytorch.py b/src/sparseml/modifiers/quantization/gptq/pytorch.py
index 66898688f12..e9e3f715625 100644
--- a/src/sparseml/modifiers/quantization/gptq/pytorch.py
+++ b/src/sparseml/modifiers/quantization/gptq/pytorch.py
@@ -156,7 +156,7 @@ def apply_compression(
                 layer_compressor.pre_compress()
                 _LOGGER.info(f"Calibrating {layer_compressor.name}...")
                 run_calibration_forward(self.model, dataloader, mask_padding=True)
-            layer_compressor.compress(self.actorder)
+            layer_compressor.compress()
             layer_compressor.post_compress()
             layer_compressor.revert_layer_wrappers()
             torch.cuda.empty_cache()
diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
index 1745b7c802b..c2b72ffa487 100644
--- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
+++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -83,7 +83,6 @@ def add_batch(self, inp: torch.Tensor, out: torch.Tensor):
 
     def fasterprune(
         self,
-        actorder: bool = False,
         blocksize: int = 128,
         percdamp: float = 0.01,
     ):
@@ -121,13 +120,6 @@ def fasterprune(
         self.H[dead, dead] = 1
         W[:, dead] = 0
 
-        # Or read from self.layer.quantization_scheme
-        if actorder:
-            perm = torch.argsort(torch.diag(self.H), descending=True)
-            W = W[:, perm]
-            self.H = self.H[perm][:, perm]
-            invperm = torch.argsort(perm)
-
         Losses = torch.zeros(self.rows, device=self.dev)
 
         damp = percdamp * torch.mean(torch.diag(self.H))
@@ -138,6 +130,9 @@ def fasterprune(
         self.H = torch.linalg.cholesky(self.H, upper=True)
         Hinv = self.H
 
+        actorder = False
+        invperm = None
+
         # See section 3.4 of https://arxiv.org/abs/2203.07259
         for i1 in range(0, self.columns, blocksize):
             i2 = min(i1 + blocksize, self.columns)
@@ -171,7 +166,15 @@ def fasterprune(
 
                 elif hasattr(self.layer, "quantization_scheme"):
                     quant_scheme = self.layer.quantization_scheme
+                    actorder = quant_scheme.weights.actorder
                     if quant_scheme.weights is not None:
+
+                        if actorder:
+                            perm = torch.argsort(torch.diag(self.H), descending=True)
+                            W = W[:, perm]
+                            self.H = self.H[perm][:, perm]
+                            invperm = torch.argsort(perm)
+
                         scale = self.layer.weight_scale
                         zero_point = self.layer.weight_zero_point
 
@@ -180,23 +183,15 @@ def fasterprune(
                             group_size = self.layer.weight.shape[1]
 
                         if actorder:
-                            g_idx = torch.tensor(
-                                [perm[j] // group_size for j in range(self.columns)],
-                                dtype=torch.int32,
-                                device=invperm.device
-                            )
-                            
+                            indices = torch.arange(self.columns, device=invperm.device)
+                            g_idx = (perm[indices] // group_size).to(dtype=torch.int32)
                             g_idx = g_idx[invperm]
-                            self.layer.weight_g_idx = Parameter(
-                                g_idx,
-                                requires_grad=False,
-                            )
+                            self.layer.weight_g_idx.data = g_idx
                         else:
-                            g_idx = torch.tensor(
-                                [j // group_size for j in range(self.columns)],
-                                dtype=torch.int32,
-                                device=W.device
+                            indices = torch.arange(
+                                self.columns, device=W.device, dtype=torch.int32
                             )
+                            g_idx = indices // group_size
 
                         from compressed_tensors.quantization import QuantizationStrategy
                         from compressed_tensors.quantization.lifecycle.forward import (
@@ -217,7 +212,6 @@ def fasterprune(
                                 q,
                                 scale[:, 0],
                                 zero_point[:, 0],
-                                # g_idx,
                                 quant_scheme.weights,
                             )
                         else:  # strategy == QuantizationStrategy.GROUP
@@ -230,11 +224,16 @@ def fasterprune(
                             # ends up being a channelwise application
                             altered_qargs = copy(quant_scheme.weights)
                             altered_qargs.strategy = QuantizationStrategy.CHANNEL
-                            
-                            # # apply g_idx 
-                            # if g_idx is not None:
-                            #     scale = scale[g_idx]
-                            #     zero_point = zero_point[g_idx]
+
+                            # apply g_idx
+                            if g_idx is not None:
+                                # scale and zp already transformed by group_size
+                                # extract first index of group_size
+                                indices_to_extract = torch.arange(
+                                    0, g_idx.shape[0], group_size
+                                )
+                                scale = scale[:, g_idx[indices_to_extract]]
+                                zero_point = zero_point[:, g_idx[indices_to_extract]]
 
                             q = fake_quantize(
                                 q,
@@ -284,3 +283,22 @@ def free(self):
         """
         delattr(self, "H")
         super().free()
+
+
+"""
+(Pdb) scale.shape
+torch.Size([4096, 32])
+(Pdb) self.layer.shape
+*** AttributeError: 'Linear' object has no attribute 'shape'
+(Pdb) self.layer.weight.shape
+torch.Size([4096, 4096])
+
+
+
+(Pdb) scale.shape
+torch.Size([11008, 32])
+(Pdb) self.layer.weight.shape
+torch.Size([11008, 4096])
+
+
+"""
diff --git a/src/sparseml/modifiers/utils/layer_compressor.py b/src/sparseml/modifiers/utils/layer_compressor.py
index 2d7fdf53e00..eb0b51cf269 100644
--- a/src/sparseml/modifiers/utils/layer_compressor.py
+++ b/src/sparseml/modifiers/utils/layer_compressor.py
@@ -131,10 +131,10 @@ def revert_layer_wrappers(self):
             module_wrapper.free()
         self.modules = None
 
-    def compress(self, actorder: bool = False):
+    def compress(self):
         """
         Apply compression to each wrapped submodule in the layer
-        
+
         :param: actorder: flag to apply activation reordering
         """
 
@@ -143,7 +143,7 @@ def prune(module):
             if isinstance(module, self.module_compressor_class):
                 full_name = self._get_full_submodule_name(module.name)
                 _LOGGER.info(f"Compressing {full_name}...")
-                module.fasterprune(actorder=actorder, **self.args)
+                module.fasterprune(**self.args)
 
         self.layer.apply(prune)
 

From 51c36c223ab480f295e45058b845364daedcef30 Mon Sep 17 00:00:00 2001
From: George Ohashi <george@neuralmagic.com>
Date: Wed, 26 Jun 2024 20:02:06 +0000
Subject: [PATCH 10/12] read actorder from quant_args

---
 src/sparseml/modifiers/quantization/gptq/base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/sparseml/modifiers/quantization/gptq/base.py b/src/sparseml/modifiers/quantization/gptq/base.py
index 58f5c37b463..833fa284531 100644
--- a/src/sparseml/modifiers/quantization/gptq/base.py
+++ b/src/sparseml/modifiers/quantization/gptq/base.py
@@ -82,7 +82,6 @@ class GPTQModifier(Modifier):
         and activation 8 bit quantization on the Linear layers.
     """
 
-    actorder: bool = False
     sequential_update: Optional[bool] = False
     targets: Union[str, List[str], None] = None
     block_size: int = 128

From a75c462e0d9f233dffa3573cfe4ef34898c26f25 Mon Sep 17 00:00:00 2001
From: George Ohashi <george@neuralmagic.com>
Date: Fri, 28 Jun 2024 15:15:04 +0000
Subject: [PATCH 11/12] lint

---
 src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
index c2b72ffa487..68c737e752f 100644
--- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
+++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -14,8 +14,6 @@
 
 import time
 
-from torch.nn import Parameter
-
 from sparseml.modifiers.utils import SPARSITY_THRESHOLD
 from sparseml.modifiers.utils.compression_wrapper import ModuleCompressionWrapper
 

From bea971cfade46505a59ecf479613db710b710d86 Mon Sep 17 00:00:00 2001
From: George Ohashi <george@neuralmagic.com>
Date: Mon, 1 Jul 2024 14:32:37 +0000
Subject: [PATCH 12/12] remove commented code

---
 .../quantization/gptq/utils/gptq_wrapper.py   | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
index 68c737e752f..9f660b987fb 100644
--- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
+++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -281,22 +281,3 @@ def free(self):
         """
         delattr(self, "H")
         super().free()
-
-
-"""
-(Pdb) scale.shape
-torch.Size([4096, 32])
-(Pdb) self.layer.shape
-*** AttributeError: 'Linear' object has no attribute 'shape'
-(Pdb) self.layer.weight.shape
-torch.Size([4096, 4096])
-
-
-
-(Pdb) scale.shape
-torch.Size([11008, 32])
-(Pdb) self.layer.weight.shape
-torch.Size([11008, 4096])
-
-
-"""