
Commit eae7067

[enh] Minor improvements. E.g. 'python3' -> 'python', possible encoding troubles fixed, README updated (another dataset for faster experiments), requirements.txt added, etc.
1 parent 059e51d commit eae7067

File tree

7 files changed: +27 additions, -24 deletions


custom_format_converter.py
Lines changed: 2 additions & 2 deletions

@@ -27,9 +27,9 @@ def read_amazon_format(path: str, sentence=True):
     :param path: a path to a filename
     :param sentence: whether to split the reviews into sentences
     """
-    with open(path + ("" if sentence else "-full_text") + ".txt", "w+") as wf:
+    with open(path + ("" if sentence else "-full_text") + ".txt", "w+", encoding="utf-8") as wf:

-        for line in tqdm(open(path)):
+        for line in tqdm(open(path, "r", encoding="utf-8")):
             # reading the text
             text = json.loads(line.strip())["reviewText"].replace("\n", " ")
             # splitting into sentences
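Both open() calls now pin the encoding, so the converter behaves the same regardless of the platform's default locale. A minimal sketch of the same read/write pattern (the file names here are hypothetical, not from this repository):

import json
from tqdm import tqdm

# Explicit UTF-8 on both handles avoids UnicodeDecodeError/UnicodeEncodeError
# on systems whose locale default is not UTF-8 (e.g. cp1252 on Windows).
with open("reviews.json", "r", encoding="utf-8") as rf, \
        open("reviews.txt", "w+", encoding="utf-8") as wf:
    for line in tqdm(rf):
        text = json.loads(line.strip())["reviewText"].replace("\n", " ")
        wf.write(text + "\n")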

example_run.sh
Lines changed: 4 additions & 4 deletions

@@ -1,22 +1,22 @@
 #!/usr/bin/env bash

-DATA_NAME=reviews_Electronics_5
+DATA_NAME=reviews_Cell_Phones_and_Accessories_5

 if [ ! -f ./$DATA_NAME.json.txt ]; then
     echo "File not found! Downloading..."

     ### this may take a while
     wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/$DATA_NAME.json.gz
     gunzip $DATA_NAME.json.gz
-    python3 custom_format_converter.py $DATA_NAME.json
+    python custom_format_converter.py $DATA_NAME.json
     rm $DATA_NAME.json.gz $DATA_NAME.json
     mkdir word_vectors
 fi

 if [ ! -f ./word_vectors/$DATA_NAME.json.txt.w2v ]; then
     echo "Training custom word vectors..."
-    python3 word2vec.py $DATA_NAME.json.txt
+    python word2vec.py $DATA_NAME.json.txt
 fi

 echo "Training ABAE..."
-python3 main.py -as 30 -d $DATA_NAME.json.txt
+python main.py -as 30 -d $DATA_NAME.json.txt -wv word_vectors/$DATA_NAME.json.txt.w2v

main.py
Lines changed: 4 additions & 9 deletions

@@ -13,7 +13,6 @@

 parser.add_argument("--word-vectors-path", "-wv",
                     dest="wv_path", type=str, metavar='<str>',
-                    default="word_vectors/reviews_Electronics_5.json.txt.w2v",
                     help="path to word vectors file")

 parser.add_argument("--batch-size", "-b", dest="batch_size", type=int, default=50,
@@ -29,7 +28,7 @@
                     help="Epochs count")

 parser.add_argument("--optimizer", "-opt", dest="optimizer", type=str, default="adam", help="Optimizer",
-                    choices=["adam", "adagrad", "sgd"])
+                    choices=["adam", "sgd", "asgd", "adagrad"])

 parser.add_argument("--negative-samples", "-ns", dest="neg_samples", type=int, default=5,
                     help="Negative samples per positive one")
@@ -56,17 +55,14 @@
 optimizer = None
 scheduler = None

-# if args.optimizer == "cycsgd":
-#     optimizer = torch.optim.SGD(model.parameters(), lr=0.05, momentum=0.9)
-#     scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=1e-5, max_lr=0.05, mode="triangular2")
-# elif args.optimizer == "adam":
-
 if args.optimizer == "adam":
     optimizer = torch.optim.Adam(model.parameters())
 elif args.optimizer == "sgd":
     optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
 elif args.optimizer == "adagrad":
     optimizer = torch.optim.Adagrad(model.parameters())
+elif args.optimizer == "asgd":
+    optimizer = torch.optim.ASGD(model.parameters(), lr=0.05)
 else:
     raise Exception("Optimizer '%s' is not supported" % args.optimizer)

@@ -95,14 +91,13 @@
         optimizer.zero_grad()
         loss.backward()
         optimizer.step()
-        # scheduler.step(epoch=t)

         if item_number % 1000 == 0:

             print(item_number, "batches, and LR:", optimizer.param_groups[0]['lr'])

             for i, aspect in enumerate(model.get_aspect_words(w2v_model)):
-                print(i + 1, " ".join(["%10s" % a for a in aspect]))
+                print(i + 1, " ".join([a for a in aspect]))

             print("Loss:", loss.item())
             print()
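Two things change above: the hard-coded default word-vectors path is removed, so -wv must now be passed explicitly (as example_run.sh does), and torch.optim.ASGD joins the supported optimizers. A minimal standalone sketch of the same selection logic written as a lookup table; the placeholder model and the learning rates are illustrative, only the optimizer names come from the diff:

import torch

model = torch.nn.Linear(200, 30)  # placeholder module, not the ABAE model

# Map each allowed --optimizer choice to a constructor; the SGD-style
# optimizers get an explicit learning rate, as in the patched main.py.
optimizers = {
    "adam": lambda p: torch.optim.Adam(p),
    "sgd": lambda p: torch.optim.SGD(p, lr=0.05),
    "asgd": lambda p: torch.optim.ASGD(p, lr=0.05),
    "adagrad": lambda p: torch.optim.Adagrad(p),
}

choice = "asgd"
if choice not in optimizers:
    raise Exception("Optimizer '%s' is not supported" % choice)
optimizer = optimizers[choice](model.parameters())
print(optimizer)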

model.py
Lines changed: 7 additions & 6 deletions

@@ -6,13 +6,13 @@


 class SelfAttention(torch.nn.Module):
-    def __init__(self, wv_dim, maxlen):
+    def __init__(self, wv_dim: int, maxlen: int):
         super(SelfAttention, self).__init__()
         self.wv_dim = wv_dim

         # max sentence length -- batch 2nd dim size
         self.maxlen = maxlen
-        self.M = Parameter(torch.Tensor(wv_dim, wv_dim))
+        self.M = Parameter(torch.empty(size=(wv_dim, wv_dim)))
         init.kaiming_uniform(self.M.data)

         # softmax for attending to wod vectors
@@ -44,7 +44,8 @@ class ABAE(torch.nn.Module):

     """

-    def __init__(self, wv_dim=200, asp_count=30, ortho_reg=0.1, maxlen=201, init_aspects_matrix=None):
+    def __init__(self, wv_dim: int = 200, asp_count: int = 30,
+                 ortho_reg: float = 0.1, maxlen: int = 201, init_aspects_matrix=None):
         """
         Initializing the model

@@ -63,7 +64,7 @@ def __init__(self, wv_dim=200, asp_count=30, ortho_reg=0.1, maxlen=201, init_asp
         self.attention = SelfAttention(wv_dim, maxlen)
         self.linear_transform = torch.nn.Linear(self.wv_dim, self.asp_count)
         self.softmax_aspects = torch.nn.Softmax()
-        self.aspects_embeddings = Parameter(torch.Tensor(wv_dim, asp_count))
+        self.aspects_embeddings = Parameter(torch.empty(size=(wv_dim, asp_count)))

         if init_aspects_matrix is None:
             torch.nn.init.xavier_uniform(self.aspects_embeddings)
@@ -80,8 +81,8 @@ def get_aspects_importances(self, text_embeddings):

         # multiplying text embeddings by attention scores -- and summing
         # (matmul: we sum every word embedding's coordinate with attention weights)
-        weighted_text_emb = torch.matmul(attention_weights.unsqueeze(1),  # (batch, 1, sentence)
-                                         text_embeddings  # (batch, sentence, wv_dim)
+        weighted_text_emb = torch.matmul(attention_weights.unsqueeze(1),  # (batch, 1, sentence)
+                                         text_embeddings                  # (batch, sentence, wv_dim)
                                          ).squeeze()

         # encoding with a simple feed-forward layer (wv_dim) -> (aspects_count)
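The last hunk only realigns the inline shape comments; the attention-weighted sum itself is unchanged. A quick shape check of that matmul, with toy sizes chosen purely for illustration:

import torch

batch, sentence, wv_dim = 4, 10, 200  # arbitrary illustration values
attention_weights = torch.softmax(torch.randn(batch, sentence), dim=-1)
text_embeddings = torch.randn(batch, sentence, wv_dim)

# (batch, 1, sentence) x (batch, sentence, wv_dim) -> (batch, 1, wv_dim)
weighted_text_emb = torch.matmul(attention_weights.unsqueeze(1), text_embeddings)
print(weighted_text_emb.shape)            # torch.Size([4, 1, 200])
print(weighted_text_emb.squeeze().shape)  # torch.Size([4, 200])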

reader.py
Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ def read_data_batches(path, batch_size=50, minlength=5):
     """
     batch = []

-    for line in open(path):
+    for line in open(path, encoding="utf-8"):
         line = line.strip().split()

         # lines with less than `minlength` words are omitted

requirements.txt
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+nltk>=3.5
+gensim>=3.8.3
+torch>=1.5.0
+torchvision>=0.6.0
+tqdm>=4.45.0
+scikit-learn>=0.22.2.post1
+numpy>=1.18.4

word2vec.py
Lines changed: 2 additions & 2 deletions

@@ -8,11 +8,11 @@


 class Sentences(object):
-    def __init__(self, filename):
+    def __init__(self, filename: str):
         self.filename = filename

     def __iter__(self):
-        for line in tqdm(codecs.open(self.filename, "r", "utf-8"), self.filename):
+        for line in tqdm(codecs.open(self.filename, "r", encoding="utf-8"), self.filename):
            yield line.strip().split()
