
Commit c35f05b

[enh] Updating versions of all packages + corresponding modifications. Further hydra config parametrization.
1 parent 9ff340d commit c35f05b

14 files changed, +115 -129 lines

.gitignore

Lines changed: 3 additions & 1 deletion
@@ -3,4 +3,6 @@
 .idea
 *~
 *.w2v
-*.json*.txt
+*.json*.txt
+*.log
+.hydra

README.md

Lines changed: 8 additions & 32 deletions
@@ -2,54 +2,30 @@
 
 Yet another PyTorch implementation of the model described in the paper [**An Unsupervised Neural Attention Model for Aspect Extraction**](https://aclweb.org/anthology/papers/P/P17/P17-1036/) by He, Ruidan and Lee, Wee Sun and Ng, Hwee Tou and Dahlmeier, Daniel, **ACL2017**.
 
+**NOTA BENE**: now `gensim>=4.0.0` and `hydra` are required.
+
 ## Example
 
 **For a working example of a whole pipeline please refer to `example_run.sh`**
 
 Let's get some data:
 
 ```
-wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
-gunzip reviews_Electronics_5.json.gz
-python3 custom_format_converter.py reviews_Electronics_5.json
+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz
+gunzip reviews_Cell_Phones_and_Accessories_5.json.gz
+python3 custom_format_converter.py reviews_Cell_Phones_and_Accessories_5.json
 ```
 
 Then we need to train the word vectors:
 
 ```
-python3 word2vec.py reviews_Electronics_5.json.txt
+python3 word2vec.py reviews_Cell_Phones_and_Accessories_5.json.txt
 ```
 And run
 
+**TODO**: running with hydra params example is in progress
 ```
-usage: main.py [-h] [--word-vectors-path <str>] [--batch-size BATCH_SIZE]
-               [--aspects-number ASPECTS_NUMBER] [--ortho-reg ORTHO_REG]
-               [--epochs EPOCHS] [--optimizer {adam,adagrad,sgd}]
-               [--negative-samples NEG_SAMPLES] [--dataset-path DATASET_PATH]
-               [--maxlen MAXLEN]
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --word-vectors-path <str>, -wv <str>
-                        path to word vectors file
-  --batch-size BATCH_SIZE, -b BATCH_SIZE
-                        Batch size for training
-  --aspects-number ASPECTS_NUMBER, -as ASPECTS_NUMBER
-                        A total number of aspects
-  --ortho-reg ORTHO_REG, -orth ORTHO_REG
-                        Ortho-regularization impact coefficient
-  --epochs EPOCHS, -e EPOCHS
-                        Epochs count
-  --optimizer {adam,adagrad,sgd}, -opt {adam,adagrad,sgd}
-                        Optimizer
-  --negative-samples NEG_SAMPLES, -ns NEG_SAMPLES
-                        Negative samples per positive one
-  --dataset-path DATASET_PATH, -d DATASET_PATH
-                        Path to a training texts file. One sentence per line,
-                        tokens separated wiht spaces.
-  --maxlen MAXLEN, -l MAXLEN
-                        Max length of the considered sentence; the rest is
-                        clipped if longer
+usage: main.py ...
 
 ```
 
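As a quick sanity check of the word-vector step above, here is a minimal sketch (not part of the diff) that inspects the trained embeddings with the gensim 4 API required by the NOTA BENE. It assumes `word2vec.py` saves a full `Word2Vec` model under `word_vectors/` (the path is taken from the embeddings config in this commit); the query token "battery" is only illustrative.

```python
# Hedged sketch: inspect the vectors produced by word2vec.py (gensim >= 4.0 API).
from gensim.models import Word2Vec

w2v = Word2Vec.load("word_vectors/reviews_Cell_Phones_and_Accessories_5.json.txt.w2v")

print(w2v.wv.vector_size)                       # embedding dimensionality
print(len(w2v.wv.key_to_index))                 # vocabulary size (gensim 4 attribute)
print(w2v.wv.most_similar("battery", topn=5))   # illustrative query; raises KeyError if absent
```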

configs/config.yaml

Lines changed: 9 additions & 5 deletions
@@ -1,14 +1,18 @@
 defaults:
   - embeddings: word2vec-custom
-  - optimizers: adam
+  - optimizer: adam
 
 data:
-  path: "reviews_Electronics_5.json.txt"
+  path: "reviews_Cell_Phones_and_Accessories_5.json.txt"
 
 model:
   batch_size: 50
-  ortho_reg: 0.1
-  aspects_number: 40
+  ortho_reg: 0.2
+  aspects_number: 25
   epochs: 1
   negative_samples: 5
-  max_len: 201
+  max_len: 201
+
+hydra:
+  run:
+    dir: . #results/sessions_${now:%Y-%m-%d}_${now:%H-%M-%S}
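
The defaults list above selects one file from each config group (`embeddings`, `optimizer`). A minimal sketch of composing and overriding this configuration from Python with `hydra-core>=1.1` follows; the group and key names come from the files in this commit, but the override values and the command line in the comment are only illustrative, since the README still marks a hydra usage example as TODO.

```python
# Hedged sketch: compose configs/config.yaml via the hydra compose API (hydra-core >= 1.1).
# Roughly what an invocation like `python main.py optimizer=sgd model.aspects_number=30`
# would resolve to (illustrative overrides, run from the repository root).
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(config_path="configs"):
    cfg = compose(config_name="config",
                  overrides=["optimizer=sgd", "model.aspects_number=30"])
    print(OmegaConf.to_yaml(cfg))                           # full resolved configuration
    print(cfg.optimizer.name, cfg.optimizer.learning_rate)  # -> sgd 0.05
```
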
Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 name: word2vec-custom
-path: "word_vectors/reviews_Cell_Phones_and_Accessories_5.json.txt.w2v"
+path: word_vectors/reviews_Cell_Phones_and_Accessories_5.json.txt.w2v
File renamed without changes.

configs/optimizer/asgd.yaml

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+name: asgd
+learning_rate: 0.05

configs/optimizer/sgd.yaml

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+name: sgd
+learning_rate: 0.05

custom_format_converter.py

Lines changed: 1 addition & 1 deletion
@@ -53,6 +53,6 @@ def read_amazon_format(path: str, sentence=True):
 if len(sys.argv) > 1:
     path = sys.argv[1]
 else:
-    path = "reviews_Electronics_5.json"
+    path = "reviews_Cell_Phones_and_Accessories_5.json"
 
 read_amazon_format(path, sentence=True)

example_run.sh

Lines changed: 2 additions & 1 deletion
@@ -19,4 +19,5 @@ if [ ! -f ./word_vectors/$DATA_NAME.json.txt.w2v ]; then
 fi
 
 echo "Training ABAE..."
-python main.py -as 30 -d $DATA_NAME.json.txt -wv word_vectors/$DATA_NAME.json.txt.w2v
+echo "A working example is in progress... Please see 'main.py' code."
+#python main.py -as 30 -d $DATA_NAME.json.txt -wv word_vectors/$DATA_NAME.json.txt.w2v

main.py

Lines changed: 63 additions & 64 deletions
@@ -1,77 +1,76 @@
 # -*- coding: utf-8 -*-
+import logging
+
+import hydra
 import numpy as np
 import torch
-import hydra
+
 from model import ABAE
 from reader import get_centroids, get_w2v, read_data_tensors
 
+logger = logging.getLogger(__name__)
+
 
 @hydra.main("configs", "config")
 def main(cfg):
-
     w2v_model = get_w2v(cfg.embeddings.path)
-    print(cfg)
-    print(w2v_model)
-    # wv_dim = w2v_model.vector_size
-    # y = torch.zeros(args.batch_size, 1)
-    #
-    # model = ABAE(wv_dim=wv_dim,
-    #              asp_count=args.aspects_number,
-    #              init_aspects_matrix=get_centroids(w2v_model, aspects_count=args.aspects_number))
-    # print(model)
-    #
-    # criterion = torch.nn.MSELoss(reduction="sum")
-    #
-    # optimizer = None
-    # scheduler = None
-    #
-    # if args.optimizer == "adam":
-    #     optimizer = torch.optim.Adam(model.parameters())
-    # elif args.optimizer == "sgd":
-    #     optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
-    # elif args.optimizer == "adagrad":
-    #     optimizer = torch.optim.Adagrad(model.parameters())
-    # elif args.optimizer == "asgd":
-    #     optimizer = torch.optim.ASGD(model.parameters(), lr=0.05)
-    # else:
-    #     raise Exception("Optimizer '%s' is not supported" % args.optimizer)
-    #
-    # for t in range(args.epochs):
-    #
-    #     print("Epoch %d/%d" % (t + 1, args.epochs))
-    #
-    #     data_iterator = read_data_tensors(args.dataset_path, args.wv_path,
-    #                                       batch_size=args.batch_size, maxlen=args.maxlen)
-    #
-    #     for item_number, (x, texts) in enumerate(data_iterator):
-    #         if x.shape[0] < args.batch_size:  # pad with 0 if smaller than batch size
-    #             x = np.pad(x, ((0, args.batch_size - x.shape[0]), (0, 0), (0, 0)))
-    #
-    #         x = torch.from_numpy(x)
-    #
-    #         # extracting bad samples from the very same batch; not sure if this is OK, so todo
-    #         negative_samples = torch.stack(
-    #             tuple([x[torch.randperm(x.shape[0])[:args.neg_samples]] for _ in range(args.batch_size)]))
-    #
-    #         # prediction
-    #         y_pred = model(x, negative_samples)
-    #
-    #         # error computation
-    #         loss = criterion(y_pred, y)
-    #         optimizer.zero_grad()
-    #         loss.backward()
-    #         optimizer.step()
-    #
-    #         if item_number % 1000 == 0:
-    #
-    #             print(item_number, "batches, and LR:", optimizer.param_groups[0]['lr'])
-    #
-    #             for i, aspect in enumerate(model.get_aspect_words(w2v_model)):
-    #                 print(i + 1, " ".join([a for a in aspect]))
-    #
-    #             print("Loss:", loss.item())
-    #             print()
+    wv_dim = w2v_model.vector_size
+    y = torch.zeros((cfg.model.batch_size, 1))
+
+    model = ABAE(wv_dim=wv_dim,
+                 asp_count=cfg.model.aspects_number,
+                 init_aspects_matrix=get_centroids(w2v_model, aspects_count=cfg.model.aspects_number))
+    logger.debug(str(model))
+
+    criterion = torch.nn.MSELoss(reduction="sum")
+
+    if cfg.optimizer.name == "adam":
+        optimizer = torch.optim.Adam(model.parameters())
+    elif cfg.optimizer.name == "sgd":
+        optimizer = torch.optim.SGD(model.parameters(), lr=cfg.optimizer.learning_rate)
+    elif cfg.optimizer.name == "adagrad":
+        optimizer = torch.optim.Adagrad(model.parameters())
+    elif cfg.optimizer.name == "asgd":
+        optimizer = torch.optim.ASGD(model.parameters(), lr=cfg.optimizer.learning_rate)
+    else:
+        raise Exception("Optimizer '%s' is not supported" % cfg.optimizer.name)
+
+    for t in range(cfg.model.epochs):
+
+        logger.debug("Epoch %d/%d" % (t + 1, cfg.model.epochs))
+
+        data_iterator = read_data_tensors(cfg.data.path, cfg.embeddings.path,
+                                          batch_size=cfg.model.batch_size, maxlen=cfg.model.max_len)
+
+        for item_number, (x, texts) in enumerate(data_iterator):
+            if x.shape[0] < cfg.model.batch_size:  # pad with 0 if smaller than batch size
+                x = np.pad(x, ((0, cfg.model.batch_size - x.shape[0]), (0, 0), (0, 0)))
+
+            x = torch.from_numpy(x)
+
+            # extracting bad samples from the very same batch; not sure if this is OK, so todo
+            negative_samples = torch.stack(
+                tuple([x[torch.randperm(x.shape[0])[:cfg.model.negative_samples]]
+                       for _ in range(cfg.model.batch_size)]))
+
+            # prediction
+            y_pred = model(x, negative_samples)
+
+            # error computation
+            loss = criterion(y_pred, y)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            if item_number % 1000 == 0:
+
+                logger.info("%d batches, and LR: %.5f" % (item_number, optimizer.param_groups[0]['lr']))
+
+                for i, aspect in enumerate(model.get_aspect_words(w2v_model, logger)):
+                    logger.info("[%d] %s" % (i + 1, " ".join([a for a in aspect])))
+
+                logger.info("Loss: %.4f" % loss.item())
 
 
 if __name__ == "__main__":
-    main()
+    main()
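
The in-batch negative sampling above ("extracting bad samples from the very same batch") is easier to follow with concrete shapes. The sketch below replays the same tensor manipulation on dummy data, assuming the `(batch, maxlen, wv_dim)` layout implied by the padding call; the sizes are made up.

```python
# Hedged sketch: shapes involved in the in-batch negative sampling (dummy sizes).
import torch

batch_size, maxlen, wv_dim, negative_samples = 4, 10, 8, 2
x = torch.randn(batch_size, maxlen, wv_dim)  # one padded batch of word-vector sequences

# for each batch element, pick `negative_samples` random rows of the same batch
neg = torch.stack(
    tuple(x[torch.randperm(x.shape[0])[:negative_samples]] for _ in range(batch_size)))

print(x.shape)    # torch.Size([4, 10, 8])
print(neg.shape)  # torch.Size([4, 2, 10, 8]) -> (batch, negatives, maxlen, wv_dim)
```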

model.py

Lines changed: 11 additions & 9 deletions
@@ -13,10 +13,10 @@ def __init__(self, wv_dim: int, maxlen: int):
         # max sentence length -- batch 2nd dim size
         self.maxlen = maxlen
         self.M = Parameter(torch.empty(size=(wv_dim, wv_dim)))
-        init.kaiming_uniform(self.M.data)
+        init.kaiming_uniform_(self.M.data)
 
         # softmax for attending to wod vectors
-        self.attention_softmax = torch.nn.Softmax()
+        self.attention_softmax = torch.nn.Softmax(dim=-1)
 
     def forward(self, input_embeddings):
         # (b, wv, 1)
@@ -63,7 +63,7 @@ def __init__(self, wv_dim: int = 200, asp_count: int = 30,
 
         self.attention = SelfAttention(wv_dim, maxlen)
         self.linear_transform = torch.nn.Linear(self.wv_dim, self.asp_count)
-        self.softmax_aspects = torch.nn.Softmax()
+        self.softmax_aspects = torch.nn.Softmax(dim=-1)
         self.aspects_embeddings = Parameter(torch.empty(size=(wv_dim, asp_count)))
 
         if init_aspects_matrix is None:
@@ -108,7 +108,9 @@ def forward(self, text_embeddings, negative_samples_texts):
         reconstruction_triplet_loss = ABAE._reconstruction_loss(weighted_text_emb,
                                                                 recovered_emb,
                                                                 averaged_negative_samples)
-        max_margin = torch.max(reconstruction_triplet_loss, torch.zeros_like(reconstruction_triplet_loss))
+        max_margin = torch \
+            .max(reconstruction_triplet_loss, torch.zeros_like(reconstruction_triplet_loss)) \
+            .unsqueeze(dim=-1)
 
         return self.ortho * self._ortho_regularizer() + max_margin
 
@@ -126,20 +128,20 @@ def _ortho_regularizer(self):
             torch.matmul(self.aspects_embeddings.t(), self.aspects_embeddings) \
             - torch.eye(self.asp_count))
 
-    def get_aspect_words(self, w2v_model, topn=15):
+    def get_aspect_words(self, w2v_model, logger, topn=15):
         words = []
 
         # getting aspects embeddings
         aspects = self.aspects_embeddings.detach().numpy()
 
         # getting scalar products of word embeddings and aspect embeddings;
         # to obtain the ``probabilities'', one should also apply softmax
-        words_scores = w2v_model.wv.syn0.dot(aspects)
+        # words_scores = w2v_model.wv.syn0.dot(aspects)
+        words_scores = w2v_model.wv.vectors.dot(aspects)
 
         for row in range(aspects.shape[1]):
            argmax_scalar_products = np.argsort(- words_scores[:, row])[:topn]
-            # print([w2v_model.wv.index2word[i] for i in argmax_scalar_products])
-            # print([w for w, dist in w2v_model.similar_by_vector(aspects.T[row])[:topn]])
-            words.append([w2v_model.wv.index2word[i] for i in argmax_scalar_products])
+            # print([w for w, dist in w2v_model.wv.similar_by_vector(aspects.T[row])[:topn]])
+            words.append([w2v_model.wv.index_to_key[i] for i in argmax_scalar_products])
 
         return words
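
Most of the `model.py` edits (and the `reader.py` ones below) are the gensim 3.x to 4.x `KeyedVectors` renames. As a compact reference, here is a hedged sketch for a generic trained model; the helper function itself is hypothetical and not part of this commit.

```python
# Hedged reference: gensim 3.x attributes vs. the 4.x equivalents used in this commit.
from gensim.models import Word2Vec


def show_gensim4_api(w2v_model: Word2Vec, topn: int = 5):
    wv = w2v_model.wv
    print(wv.vectors.shape)        # gensim 3.x: wv.syn0         (full embedding matrix)
    print(len(wv.key_to_index))    # gensim 3.x: wv.vocab        (token -> index mapping)
    print(wv.index_to_key[:topn])  # gensim 3.x: wv.index2word   (index -> token)
    # similarity helpers such as similar_by_vector are likewise reached via wv in gensim 4
```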

reader.py

Lines changed: 4 additions & 5 deletions
@@ -4,7 +4,7 @@
 from sklearn.cluster import MiniBatchKMeans
 
 
-def read_data_batches(path, batch_size=50, minlength=5):
+def read_data_batches(path: str, batch_size: int=50, minlength: int=5):
     """
     Reading batched texts of given min. length
     :param path: path to the text file ``one line -- one normalized sentence''
@@ -26,7 +26,7 @@ def read_data_batches(path, batch_size=50, minlength=5):
             yield batch
 
 
-def text2vectors(text, w2v_model, maxlen, vocabulary):
+def text2vectors(text: list, w2v_model, maxlen: int, vocabulary):
     """
     Token sequence -- to a list of word vectors;
     if token not in vocabulary, it is skipped; the rest of
@@ -40,7 +40,7 @@ def text2vectors(text, w2v_model, maxlen, vocabulary):
     acc_vecs = []
 
     for word in text:
-        if word in w2v_model and (vocabulary is None or word in vocabulary):
+        if word in w2v_model.wv and (vocabulary is None or word in vocabulary):
             acc_vecs.append(w2v_model.wv[word])
 
     # padding for consistent length with ZERO vectors
@@ -94,11 +94,10 @@ def get_centroids(w2v_model, aspects_count):
     km = MiniBatchKMeans(n_clusters=aspects_count, verbose=0, n_init=100)
     m = []
 
-    for k in w2v_model.wv.vocab:
+    for k in w2v_model.wv.key_to_index:
         m.append(w2v_model.wv[k])
 
     m = np.matrix(m)
-
     km.fit(m)
     clusters = km.cluster_centers_
 

requirements.txt

Lines changed: 7 additions & 8 deletions
@@ -1,8 +1,7 @@
-nltk>=3.5
-gensim>=3.8.3
-torch>=1.5.0
-torchvision>=0.6.0
-tqdm>=4.45.0
-scikit-learn>=0.22.2.post1
-numpy>=1.18.4
-hydra>=2.5
+nltk>=3.6.2
+gensim>=4.1.0
+torch>=1.9.0
+torchvision>=0.10.0
+scikit-learn>=0.24.2
+numpy>=1.21.2
+hydra-core>=1.1.1
