
Commit 80a6e26

[enh] Hydra configs fixes, README, LICENSE and example_run.sh script updated.

1 parent c35f05b

File tree

8 files changed: +19 −17 lines

LICENSE

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2019 Anton Alekseev
+Copyright (c) 2019-2021 Anton Alekseev
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
```

README.md

Lines changed: 4 additions & 5 deletions

````diff
@@ -2,7 +2,7 @@
 
 Yet another PyTorch implementation of the model described in the paper [**An Unsupervised Neural Attention Model for Aspect Extraction**](https://aclweb.org/anthology/papers/P/P17/P17-1036/) by He, Ruidan and Lee, Wee Sun and Ng, Hwee Tou and Dahlmeier, Daniel, **ACL2017**.
 
-**NOTA BENE**: now `gensim>=4.0.0` and `hydra` are required.
+**NOTA BENE**: as of August 2021, `gensim>=4.1.0` and `hydra-core>=1.1.0` are required.
 
 ## Example
 
@@ -23,13 +23,12 @@ python3 word2vec.py reviews_Cell_Phones_and_Accessories_5.json.txt
 ```
 And run
 
-**TODO**: running with hydra params example is in progress
 ```
-usage: main.py ...
-
+python main.py model.aspects_number=35 data.path=$DATA_NAME.json.txt model.log_progress_steps=1000
 ```
 
-For a working example of a whole pipeline please refer to `example_run.sh`
+Please see all passable params in the `configs/` directory. For a working example of a whole pipeline
+please refer to `example_run.sh`
 
 I acknowledge the implementation is raw, code modification requests and issues are welcome.
````

configs/config.yaml

Lines changed: 3 additions & 1 deletion

```diff
@@ -1,6 +1,7 @@
 defaults:
   - embeddings: word2vec-custom
   - optimizer: adam
+  - _self_
 
 data:
   path: "reviews_Cell_Phones_and_Accessories_5.json.txt"
@@ -12,7 +13,8 @@ model:
   epochs: 1
   negative_samples: 5
   max_len: 201
+  log_progress_steps: 1000
 
 hydra:
   run:
-    dir: . #results/sessions_${now:%Y-%m-%d}_${now:%H-%M-%S}
+    dir: results/sessions_${now:%Y-%m-%d}_${now:%H-%M-%S}
```
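Some context on the `_self_` entry added above: in Hydra 1.1, the position of `_self_` in the `defaults` list determines whether the primary config is merged before or after the configs it composes; placing it last means values written directly in `config.yaml` override the composed defaults. A minimal sketch (the group and option names here are illustrative, not from this repo):

```yaml
defaults:
  - optimizer: adam   # composes configs/optimizer/adam.yaml
  - _self_            # merge this file last, so the value below wins

optimizer:
  lr: 0.001           # overrides whatever adam.yaml sets for lr
```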

custom_format_converter.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -13,7 +13,7 @@
 stops = set(stopwords.words("english"))
 
 
-@lru_cache(1000000000)
+@lru_cache(maxsize=1000000000)
 def lemmatize(w: str):
     # caching the word-based lemmatizer to speed the process up
     return lemmatizer.lemmatize(w)
@@ -29,7 +29,7 @@ def read_amazon_format(path: str, sentence=True):
     """
     with open(path + ("" if sentence else "-full_text") + ".txt", "w+", encoding="utf-8") as wf:
 
-        for line in tqdm(open(path, "r", encoding="utf-8")):
+        for line in tqdm(open(path, "r", encoding="utf-8"), "normalizing texts read from [%s]" % path):
             # reading the text
             text = json.loads(line.strip())["reviewText"].replace("\n", " ")
             # splitting into sentences
```
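The `@lru_cache(maxsize=...)` change spells out `functools.lru_cache`'s first parameter explicitly; the decorator memoizes one result per distinct argument, which is why wrapping a word-level lemmatizer pays off on review text full of repeated tokens. A minimal sketch with a hypothetical stand-in for the NLTK lemmatizer:

```python
from functools import lru_cache

calls = {"n": 0}  # counts how often the function body actually runs

@lru_cache(maxsize=1_000_000)  # explicit keyword form, as in the commit
def lemmatize_stub(word: str) -> str:
    # hypothetical stand-in for nltk's WordNetLemmatizer.lemmatize
    calls["n"] += 1
    return word.rstrip("s")

print(lemmatize_stub("phones"))  # computed -> phone
print(lemmatize_stub("phones"))  # cache hit, body is not re-run
print(calls["n"])                # -> 1
```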

example_run.sh

Lines changed: 3 additions & 4 deletions

```diff
@@ -8,16 +8,15 @@ if [ ! -f ./$DATA_NAME.json.txt ]; then
     ### this may take a while
     wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/$DATA_NAME.json.gz
     gunzip $DATA_NAME.json.gz
-    python custom_format_converter.py $DATA_NAME.json
+    python3 custom_format_converter.py $DATA_NAME.json
     rm $DATA_NAME.json.gz $DATA_NAME.json
     mkdir word_vectors
 fi
 
 if [ ! -f ./word_vectors/$DATA_NAME.json.txt.w2v ]; then
     echo "Training custom word vectors..."
-    python word2vec.py $DATA_NAME.json.txt
+    python3 word2vec.py $DATA_NAME.json.txt
 fi
 
 echo "Training ABAE..."
-echo "A working example is in progress... Please see 'main.py' code."
-#python main.py -as 30 -d $DATA_NAME.json.txt -wv word_vectors/$DATA_NAME.json.txt.w2v
+python3 main.py model.aspects_number=35 data.path=$DATA_NAME.json.txt model.log_progress_steps=1000
```
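The script keys each expensive stage on the existence of its output file, so re-running it resumes rather than re-downloading and re-training. The guard pattern, sketched with an illustrative sentinel file rather than the repo's real data files:

```shell
# Idempotence guard as used in example_run.sh: run a step only when its
# output file is missing. Names here are illustrative, not from the repo.
run_once() {
    # $1: output/sentinel file, $2: message standing in for the real work
    if [ ! -f "$1" ]; then
        echo "$2"
        touch "$1"
    fi
}

sentinel=$(mktemp -u)                        # a path that does not exist yet
run_once "$sentinel" "downloading corpus"    # runs: prints the message
run_once "$sentinel" "downloading corpus"    # skipped: file now exists
rm -f "$sentinel"
```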

main.py

Lines changed: 5 additions & 3 deletions

```diff
@@ -4,6 +4,7 @@
 import hydra
 import numpy as np
 import torch
+import os
 
 from model import ABAE
 from reader import get_centroids, get_w2v, read_data_tensors
@@ -13,7 +14,7 @@
 
 @hydra.main("configs", "config")
 def main(cfg):
-    w2v_model = get_w2v(cfg.embeddings.path)
+    w2v_model = get_w2v(os.path.join(hydra.utils.get_original_cwd(), cfg.embeddings.path))
     wv_dim = w2v_model.vector_size
     y = torch.zeros((cfg.model.batch_size, 1))
 
@@ -39,7 +40,8 @@ def main(cfg):
 
         logger.debug("Epoch %d/%d" % (t + 1, cfg.model.epochs))
 
-        data_iterator = read_data_tensors(cfg.data.path, cfg.embeddings.path,
+        data_iterator = read_data_tensors(os.path.join(hydra.utils.get_original_cwd(), cfg.data.path),
+                                          os.path.join(hydra.utils.get_original_cwd(), cfg.embeddings.path),
                                           batch_size=cfg.model.batch_size, maxlen=cfg.model.max_len)
 
         for item_number, (x, texts) in enumerate(data_iterator):
@@ -62,7 +64,7 @@ def main(cfg):
             loss.backward()
             optimizer.step()
 
-            if item_number % 1000 == 0:
+            if item_number % cfg.model.log_progress_steps == 0:
 
                 logger.info("%d batches, and LR: %.5f" % (item_number, optimizer.param_groups[0]['lr']))
```
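The `get_original_cwd()` joins address Hydra's behavior (since 1.0) of changing the working directory to `hydra.run.dir` before calling `main`, which silently breaks relative paths taken from the config. `os.path.join` also leaves absolute paths untouched, so the join is safe either way. A stdlib-only sketch, with the `original_cwd` argument standing in for `hydra.utils.get_original_cwd()` and example paths that are purely illustrative:

```python
import os

def resolve(original_cwd: str, path: str) -> str:
    """Resolve a config path against the launch directory, as main.py does.

    os.path.join discards `original_cwd` when `path` is already absolute,
    so absolute paths in the config keep working unchanged.
    """
    return os.path.join(original_cwd, path)

print(resolve("/home/user/abae", "reviews.json.txt"))
# -> /home/user/abae/reviews.json.txt
print(resolve("/home/user/abae", "/data/reviews.json.txt"))
# -> /data/reviews.json.txt
```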

word2vec.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -19,7 +19,7 @@ def __iter__(self):
 def main(path):
     sentences = Sentences(path)
     model = gensim.models.Word2Vec(sentences, vector_size=200, window=5, min_count=5, workers=7, sg=1,
-                                   negative=5, iter=1, max_vocab_size=20000)
+                                   negative=5, max_vocab_size=20000)
     model.save("word_vectors/" + path + ".w2v")
     # model.wv.save_word2vec_format("word_vectors/" + domain + ".txt", binary=False)
```
word_vectors/DO_NOT_README

Whitespace-only changes.
