added docstring and unit tests

stephenhky · stephenhky · commit 5b3061883d91 · 2018-05-15T13:35:19.000-04:00
diff --git a/shorttext/spell/basespellcorrector.py b/shorttext/spell/basespellcorrector.py
@@ -2,8 +2,25 @@
 import shorttext.utils.classification_exceptions as ce
 
 class SpellCorrector:
+    """ Base class for all spell correctors.
+
+    This class is not implemented; this can be seen as an "abstract class."
+
+    """
     def train(self, text):
+        """ Train the spell corrector with the given corpus.
+
+        :param text: training corpus
+        :type text: str
+        """
         raise ce.NotImplementedException()
 
     def correct(self, word):
+        """ Recommend a spelling correction for the given word.
+
+        :param word: word to be checked
+        :return: recommended correction
+        :type word: str
+        :rtype: str
+        """
         return word
diff --git a/shorttext/spell/binarize.py b/shorttext/spell/binarize.py
@@ -11,7 +11,6 @@
 default_specialsignals = {'eos': '#', 'unk': '_', 'number': '@'}
 default_signaldenotions = {'<eos>': 'eos', '<unk>': 'unk'}
 
-## TODO: need to refine the array settings
 
 class SpellingToConcatCharVecEncoder:
     def __init__(self, alph):
@@ -30,6 +29,11 @@ def hasnum(word):
 
 
 class SCRNNBinarizer:
+    """ A class used by Sakaguchi's spell corrector to convert text into numerical vectors.
+
+    No documentation for this class.
+
+    """
     def __init__(self, alpha, signalchar_dict):
         self.signalchar_dict = signalchar_dict
         self.concatchar_encoder = SpellingToConcatCharVecEncoder(alpha)
diff --git a/shorttext/spell/norvig.py b/shorttext/spell/norvig.py
@@ -8,23 +8,62 @@
 from .editor import compute_set_edits1, compute_set_edits2
 
 class NorvigSpellCorrector(SpellCorrector):
+    """ Spell corrector described by Peter Norvig in his blog. (https://norvig.com/spell-correct.html)
+
+    """
     def __init__(self):
+        """ Instantiate the class
+
+        """
         self.train('')
 
     def train(self, text):
+        """ Given the text, train the spell corrector.
+
+        :param text: training corpus
+        :type text: str
+        """
         self.words = re.findall(r'\w+', text.lower())
         self.WORDS = Counter(self.words)
         self.N = sum(self.WORDS.values())
 
     def P(self, word):
+        """ Compute the probability of the given word being randomly sampled from the training corpus.
+
+        :param word: a word
+        :return: probability of the word sampled randomly in the corpus
+        :type word: str
+        :rtype: float
+        """
         return self.WORDS[word] / float(self.N)
 
     def correct(self, word):
+        """ Recommend a spelling correction for the given word.
+
+        :param word: a word
+        :return: recommended correction
+        :type word: str
+        :rtype: str
+        """
         return max(self.candidates(word), key=self.P)
 
     def known(self, words):
+        """ Filter away the words that are not found in the training corpus.
+
+        :param words: list of words
+        :return: set of words that can be found in the training corpus
+        :type words: list
+        :rtype: set
+        """
         return set(w for w in words if w in self.WORDS)
 
     def candidates(self, word):
+        """ List potential candidates for the corrected spelling of the given word.
+
+        :param word: a word
+        :return: set of known candidate corrections, or a list holding the word itself if none is known
+        :type word: str
+        :rtype: set or list
+        """
         return (self.known([word]) or self.known(compute_set_edits1(word)) or self.known(compute_set_edits2(word)) or [word])
 
diff --git a/shorttext/spell/sakaguchi.py b/shorttext/spell/sakaguchi.py
@@ -18,12 +18,34 @@
 
 
 class SCRNNSpellCorrector(SpellCorrector):
+    """ scRNN (semi-character-level recurrent neural network) Spell Corrector.
+
+    Reference:
+    Keisuke Sakaguchi, Kevin Duh, Matt Post, Benjamin Van Durme, "Robsut Wrod Reocginiton via semi-Character Recurrent Neural Network," arXiv:1608.02214 (2016). [`arXiv
+    <https://arxiv.org/abs/1608.02214>`_]
+
+    """
     def __init__(self, operation,
                  alph=default_alph,
                  specialsignals=default_specialsignals,
                  concatcharvec_encoder=None,
                  batchsize=1,
                  nb_hiddenunits=650):
+        """ Instantiate the scRNN spell corrector.
+
+        :param operation: types of distortion of words in training (options: "NOISE-INSERT", "NOISE-DELETE", "NOISE-REPLACE", "JUMBLE-WHOLE", "JUMBLE-BEG", "JUMBLE-END", and "JUMBLE-INT")
+        :param alph: default string of characters (Default: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,:;'*!?`$%&(){}[]-/\@_#")
+        :param specialsignals: dictionary of special signals (Default built-in)
+        :param concatcharvec_encoder: one-hot encoder for characters, initialize if None. (Default: None)
+        :param batchsize: batch size. (Default: 1)
+        :param nb_hiddenunits: number of hidden units. (Default: 650)
+        :type operation: str
+        :type alph: str
+        :type specialsignals: dict
+        :type concatcharvec_encoder: shorttext.spell.binarize.SpellingToConcatCharVecEncoder
+        :type batchsize: int
+        :type nb_hiddenunits: int
+        """
         self.operation = operation
         self.binarizer = SCRNNBinarizer(alph, specialsignals)
         self.concatcharvec_encoder = SpellingToConcatCharVecEncoder(alph) if concatcharvec_encoder==None else concatcharvec_encoder
@@ -33,6 +55,13 @@ def __init__(self, operation,
         self.nb_hiddenunits = nb_hiddenunits
 
     def preprocess_text_train(self, text):
+        """ A generator that outputs numpy vectors for the text for training.
+
+        :param text: text
+        :return: generator that outputs the numpy vectors for training
+        :type text: str
+        :rtype: generator
+        """
         for token in nospace_tokenize(text):
             if self.operation.upper().startswith('NOISE'):
                 xvec, _ = self.binarizer.noise_char(token, self.operation.upper()[6:])
@@ -43,13 +72,34 @@ def preprocess_text_train(self, text):
             yield xvec, yvec
 
     def preprocess_text_correct(self, text):
+        """ A generator that outputs numpy vectors for the text for correction.
+
+        ModelNotTrainedException is raised if the model has not been trained.
+
+        :param text: text
+        :return: generator that outputs the numpy vectors for correction
+        :type text: str
+        :rtype: generator
+        :raise: ModelNotTrainedException
+        """
         if not self.trained:
             raise ce.ModelNotTrainedException()
         for token in nospace_tokenize(text):
             xvec, _ = self.binarizer.change_nothing(token, self.operation)
             yield xvec
 
-    def train(self, text, nb_epoch=100, optimizer='rmsprop'):
+    def train(self, text, nb_epoch=100, dropout_rate=0.01, optimizer='rmsprop'):
+        """ Train the scRNN model.
+
+        :param text: training corpus
+        :param nb_epoch: number of epochs (Default: 100)
+        :param dropout_rate: dropout rate (Default: 0.01)
+        :param optimizer: optimizer (Default: "rmsprop")
+        :type text: str
+        :type nb_epoch: int
+        :type dropout_rate: float
+        :type optimizer: str
+        """
         self.dictionary = Dictionary([nospace_tokenize(text), default_specialsignals.values()])
         self.onehotencoder.fit(np.arange(len(self.dictionary)).reshape((len(self.dictionary), 1)))
         xylist = [(xvec.transpose(), yvec.transpose()) for xvec, yvec in self.preprocess_text_train(text)]
@@ -59,26 +109,35 @@ def train(self, text, nb_epoch=100, optimizer='rmsprop'):
         # neural network here
         model = Sequential()
         model.add(LSTM(self.nb_hiddenunits, return_sequences=True, batch_input_shape=(None, self.batchsize, len(self.concatcharvec_encoder)*3)))
-        model.add(Dropout(0.01))
+        model.add(Dropout(dropout_rate))
         model.add(TimeDistributed(Dense(len(self.dictionary))))
         model.add(Activation('softmax'))
 
         # compile... more arguments
-        model.compile(loss='categorical_crossentropy', optimizer=optimizer
-                      #metrics=['accuracy'])
-                      )
-
-        print xtrain.shape
-        print ytrain.shape
+        model.compile(loss='categorical_crossentropy', optimizer=optimizer)
 
+        # training
         model.fit(xtrain, ytrain, epochs=nb_epoch)
 
         self.model = model
         self.trained = True
 
     def correct(self, word):
+        """ Recommend a spelling correction for the given word.
+
+        :param word: a given word
+        :return: recommended correction
+        :type word: str
+        :rtype: str
+        """
         xmat = np.array([xvec.transpose() for xvec in self.preprocess_text_correct(word)])
         yvec = self.model.predict(xmat)
 
         maxy = yvec.argmax(axis=-1)
-        return ' '.join([self.dictionary[y] for y in maxy[0]])
+        return ' '.join([self.dictionary[y] for y in maxy[0]])
+
+    def loadmodel(self, prefix):
+        pass
+
+    def savemodel(self, prefix):
+        pass
diff --git a/test/test_sakaguchispell.py b/test/test_sakaguchispell.py
@@ -0,0 +1,42 @@
+
+import unittest
+
+import shorttext.spell.sakaguchi as sk
+
+class TestSCRNN(unittest.TestCase):
+    def setUp(self):
+        pass
+
+    def tearDown(self):
+        pass
+
+    def generalproc(self, operation):
+        corrector = sk.SCRNNSpellCorrector(operation)
+        corrector.train('I am a nerd . Natural language processing is sosad .')
+        self.assertEqual(corrector.correct('langudge'), 'language')
+
+    def test_NOISE_INSERT(self):
+        self.generalproc('NOISE-INSERT')
+
+    def test_NOISE_DELETE(self):
+        self.generalproc('NOISE-DELETE')
+
+    def test_NOISE_REPLACE(self):
+        self.generalproc('NOISE-REPLACE')
+
+    def test_JUMBLE_WHOLE(self):  # run the shared train-and-correct check with the whole-word jumble operation
+        self.generalproc('JUMBLE-WHOLE')
+
+    def test_JUMBLE_BEG(self):
+        self.generalproc('JUMBLE-BEG')
+
+    def test_JUMBLE_END(self):
+        self.generalproc('JUMBLE-END')
+
+    def test_JUMBLE_INT(self):
+        self.generalproc('JUMBLE-INT')
+
+
+if __name__ == '__main__':
+    unittest.main()
+