handling DTM, topic modeling and various documentation; using abstract class for LatentTopicModeling.py

stephenhky · stephenhky · commit 74eeb387f081 · 2020-09-07T15:42:47.000-04:00
diff --git a/docs/codes.rst b/docs/codes.rst
@@ -19,6 +19,25 @@ Module `shorttext.utils.textpreprocessing`
 .. automodule:: shorttext.utils.textpreprocessing
    :members:
 
+Document-Term Matrix (DTM)
+--------------------------
+
+Module `shorttext.utils.dtm`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. automodule:: shorttext.utils.dtm
+   :members:
+
+One-hot Encoding
+----------------
+
+Module `shorttext.generators.charbase.char2vec`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. automodule:: shorttext.generators.charbase.char2vec
+   :members:
+
+
 Topic Models
 ------------
 
diff --git a/docs/tutorial_charbaseonehot.rst b/docs/tutorial_charbaseonehot.rst
@@ -44,6 +44,11 @@ We can also convert a list of sentences by
 
 You can decide whether or not to output a sparse matrix by specifiying the parameter `sparse`.
 
+
+.. automodule:: shorttext.generators.charbase.char2vec
+   :members:
+
+
 Reference
 ---------
 
diff --git a/docs/tutorial_dtm.rst b/docs/tutorial_dtm.rst
@@ -33,6 +33,9 @@ With the corpus ready in this form, we can create a `DocumentTermMatrix` class f
 
 >>> usprez_dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids)
 
+.. autoclass:: shorttext.utils.dtm.DocumentTermMatrix
+   :members:
+
 One can get the document frequency of any token (the number of documents that the given
 token is in) by:
 
@@ -64,6 +67,9 @@ To load this class later, enter:
 
 >>> usprez_dtm2 = shorttext.utils.load_DocumentTermMatrix('/path/to/whatever.bin')
 
+.. automodule:: shorttext.utils.dtm
+   :members: load_DocumentTermMatrix
+
 Reference
 ---------
 
diff --git a/docs/tutorial_textpreprocessing.rst b/docs/tutorial_textpreprocessing.rst
@@ -25,6 +25,9 @@ Then define the preprocessor, a function, by just calling:
 
 >>> preprocessor1 = standard_text_preprocessor_1()
 
+.. automodule:: shorttext.utils.textpreprocessing
+    :members: standard_text_preprocessor_1
+
 It is a function that perform the preprocessing in the steps above:
 
 >>> preprocessor1('Maryland Blue Crab')  # output:  'maryland blue crab'
@@ -72,6 +75,9 @@ Some examples are:
 >>> preprocessor2('Maryland blue crab in Annapolis')  # output: 'MARYLAND-8 BLUE-4 CRAB-4 IN-2 ANNAPOLIS-9'
 >>> preprocessor2('generative adversarial networks')  # output: 'GENERATIVE-10 ADVERSARIAL-11 NETWORK-7'
 
+.. automodule:: shorttext.utils.textpreprocessing
+    :members: text_preprocessor
+
 Tokenization
 ------------
 
diff --git a/docs/tutorial_topic.rst b/docs/tutorial_topic.rst
@@ -88,6 +88,11 @@ not saved. To load the model, enter:
 
 >>> topicmodeler2 = shorttext.classifiers.load_gensimtopicmodel('/path/to/nihlda128', compact=False)
 
+
+.. automodule:: shorttext.generators.bow.GensimTopicModeling
+   :members:
+
+
 AutoEncoder
 -----------
 
@@ -137,6 +142,11 @@ The default is to weigh. To not weigh, initialize it as:
 
 >>> autoencoder3 = shorttext.generators.AutoencodingTopicModeler(toweigh=False)
 
+
+.. automodule:: shorttext.generators.bow.AutoEncodingTopicModeling
+   :members:
+
+
 Appendix: Unzipping Model I/O
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/shorttext/generators/bow/LatentTopicModeling.py b/shorttext/generators/bow/LatentTopicModeling.py
@@ -1,11 +1,13 @@
 
+from abc import ABC, abstractmethod
+
 import numpy as np
 
 from shorttext.utils import textpreprocessing as textpreprocess, gensim_corpora as gc, classification_exceptions as e
 from shorttext.utils.textpreprocessing import tokenize
 
 # abstract class
-class LatentTopicModeler:
+class LatentTopicModeler(ABC):
     """
     Abstract class for various topic modeler.
     """
@@ -33,7 +35,7 @@ def generate_corpus(self, classdict):
         """
         self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora(classdict,
                                                                                     preprocess_and_tokenize=lambda sent: tokenize(self.preprocessor(sent)))
-
+    @abstractmethod
     def train(self, classdict, nb_topics, *args, **kwargs):
         """ Train the modeler.
 
@@ -79,6 +81,7 @@ def retrieve_bow_vector(self, shorttext, normalize=True):
             vec /= np.linalg.norm(vec)
         return vec
 
+    @abstractmethod
     def retrieve_topicvec(self, shorttext):
         """ Calculate the topic vector representation of the short text.
 
@@ -92,6 +95,7 @@ def retrieve_topicvec(self, shorttext):
         """
         raise e.NotImplementedException()
 
+    @abstractmethod
     def get_batch_cos_similarities(self, shorttext):
         """ Calculate the cosine similarities of the given short text and all the class labels.
 
@@ -113,6 +117,7 @@ def __contains__(self, shorttext):
             raise e.ModelNotTrainedException()
         return True
 
+    @abstractmethod
     def loadmodel(self, nameprefix):
         """ Load the model from files.
 
@@ -125,6 +130,7 @@ def loadmodel(self, nameprefix):
         """
         raise e.NotImplementedException()
 
+    @abstractmethod
     def savemodel(self, nameprefix):
         """ Save the model to files.
 
diff --git a/shorttext/generators/bow/__init__.py b/shorttext/generators/bow/__init__.py
@@ -1,3 +1,4 @@
+
 from . import AutoEncodingTopicModeling
 from . import GensimTopicModeling
-from . import LatentTopicModeling
+from . import LatentTopicModeling