Skip to content

Commit 74eeb38

Browse files
committed
handling DTM, topic modeling and various documentation; using abstract class for LatentTopicModeling.py
1 parent 60abfeb commit 74eeb38

7 files changed

+56
-3
lines changed

docs/codes.rst

+19
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,25 @@ Module `shorttext.utils.textpreprocessing`
1919
.. automodule:: shorttext.utils.textpreprocessing
2020
:members:
2121

22+
Document-Term Matrix (DTM)
23+
--------------------------
24+
25+
Module `shorttext.utils.dtm`
26+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
27+
28+
.. automodule:: shorttext.utils.dtm
29+
:members:
30+
31+
One-hot Encoding
32+
----------------
33+
34+
Module `shorttext.generators.charbase.char2vec`
35+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36+
37+
.. automodule:: shorttext.generators.charbase.char2vec
38+
:members:
39+
40+
2241
Topic Models
2342
------------
2443

docs/tutorial_charbaseonehot.rst

+5
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ We can also convert a list of sentences by
4444

4545
You can decide whether or not to output a sparse matrix by specifying the parameter `sparse`.
4646

47+
48+
.. automodule:: shorttext.generators.charbase.char2vec
49+
:members:
50+
51+
4752
Reference
4853
---------
4954

docs/tutorial_dtm.rst

+6
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ With the corpus ready in this form, we can create a `DocumentTermMatrix` class f
3333

3434
>>> usprez_dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids)
3535

36+
.. autoclass:: shorttext.utils.dtm.DocumentTermMatrix
37+
:members:
38+
3639
One can get the document frequency of any token (the number of documents that the given
3740
token is in) by:
3841

@@ -64,6 +67,9 @@ To load this class later, enter:
6467

6568
>>> usprez_dtm2 = shorttext.utils.load_DocumentTermMatrix('/path/to/whatever.bin')
6669

70+
.. automodule:: shorttext.utils.dtm
71+
:members: load_DocumentTermMatrix
72+
6773
Reference
6874
---------
6975

docs/tutorial_textpreprocessing.rst

+6
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ Then define the preprocessor, a function, by just calling:
2525

2626
>>> preprocessor1 = standard_text_preprocessor_1()
2727

28+
.. automodule:: shorttext.utils.textpreprocessing
29+
:members: standard_text_preprocessor_1
30+
2831
It is a function that performs the preprocessing in the steps above:
2932

3033
>>> preprocessor1('Maryland Blue Crab') # output: 'maryland blue crab'
@@ -72,6 +75,9 @@ Some examples are:
7275
>>> preprocessor2('Maryland blue crab in Annapolis') # output: 'MARYLAND-8 BLUE-4 CRAB-4 IN-2 ANNAPOLIS-9'
7376
>>> preprocessor2('generative adversarial networks') # output: 'GENERATIVE-10 ADVERSARIAL-11 NETWORK-7'
7477

78+
.. automodule:: shorttext.utils.textpreprocessing
79+
:members: text_preprocessor
80+
7581
Tokenization
7682
------------
7783

docs/tutorial_topic.rst

+10
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,11 @@ not saved. To load the model, enter:
8888

8989
>>> topicmodeler2 = shorttext.classifiers.load_gensimtopicmodel('/path/to/nihlda128', compact=False)
9090

91+
92+
.. automodule:: shorttext.generators.bow.GensimTopicModeling
93+
:members:
94+
95+
9196
AutoEncoder
9297
-----------
9398

@@ -137,6 +142,11 @@ The default is to weigh. To not weigh, initialize it as:
137142

138143
>>> autoencoder3 = shorttext.generators.AutoencodingTopicModeler(toweigh=False)
139144

145+
146+
.. automodule:: shorttext.generators.bow.AutoEncodingTopicModeling
147+
:members:
148+
149+
140150
Appendix: Unzipping Model I/O
141151
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
142152

shorttext/generators/bow/LatentTopicModeling.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11

2+
from abc import ABC, abstractmethod
3+
24
import numpy as np
35

46
from shorttext.utils import textpreprocessing as textpreprocess, gensim_corpora as gc, classification_exceptions as e
57
from shorttext.utils.textpreprocessing import tokenize
68

79
# abstract class
8-
class LatentTopicModeler:
10+
class LatentTopicModeler(ABC):
911
"""
1012
Abstract class for various topic modeler.
1113
"""
@@ -33,7 +35,7 @@ def generate_corpus(self, classdict):
3335
"""
3436
self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora(classdict,
3537
preprocess_and_tokenize=lambda sent: tokenize(self.preprocessor(sent)))
36-
38+
@abstractmethod
3739
def train(self, classdict, nb_topics, *args, **kwargs):
3840
""" Train the modeler.
3941
@@ -79,6 +81,7 @@ def retrieve_bow_vector(self, shorttext, normalize=True):
7981
vec /= np.linalg.norm(vec)
8082
return vec
8183

84+
@abstractmethod
8285
def retrieve_topicvec(self, shorttext):
8386
""" Calculate the topic vector representation of the short text.
8487
@@ -92,6 +95,7 @@ def retrieve_topicvec(self, shorttext):
9295
"""
9396
raise e.NotImplementedException()
9497

98+
@abstractmethod
9599
def get_batch_cos_similarities(self, shorttext):
96100
""" Calculate the cosine similarities of the given short text and all the class labels.
97101
@@ -113,6 +117,7 @@ def __contains__(self, shorttext):
113117
raise e.ModelNotTrainedException()
114118
return True
115119

120+
@abstractmethod
116121
def loadmodel(self, nameprefix):
117122
""" Load the model from files.
118123
@@ -125,6 +130,7 @@ def loadmodel(self, nameprefix):
125130
"""
126131
raise e.NotImplementedException()
127132

133+
@abstractmethod
128134
def savemodel(self, nameprefix):
129135
""" Save the model to files.
130136

shorttext/generators/bow/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
from . import AutoEncodingTopicModeling
23
from . import GensimTopicModeling
3-
from . import LatentTopicModeling
4+
from . import LatentTopicModeling

0 commit comments

Comments
 (0)