
Commit 48a77be

push source code
1 parent 328028c


51 files changed: +1316 -33 lines

.DS_Store

6 KB
Binary file not shown.

.gitignore

Lines changed: 8 additions & 32 deletions
@@ -1,32 +1,8 @@
-# Prerequisites
-*.d
-
-# Compiled Object files
-*.slo
-*.lo
-*.o
-*.obj
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Compiled Dynamic libraries
-*.so
-*.dylib
-*.dll
-
-# Fortran module files
-*.mod
-*.smod
-
-# Compiled Static libraries
-*.lai
-*.la
-*.a
-*.lib
-
-# Executables
-*.exe
-*.out
-*.app
+# Model files
+model.pkl.*
+
+# Dataset
+english_short_corpus_small_13to20_test.txt
+english_short_corpus_small_4to12_test.txt
+english_short_corpus_small_4to12_train.txt
+english_short_corpus_small_4to12_valid.txt

README.md

Lines changed: 55 additions & 1 deletion
@@ -1 +1,55 @@
-# autoencoder-interpretation
+# How to train
+1. Download the PubMed 20k/200k RCT dataset (the variant with numbers replaced by an at sign) from https://github.com/Franck-Dernoncourt/pubmed-rct .
+2. Put glove.6B.300d.txt, train.txt, dev.txt and test.txt into data/ .
+3. cd src/ and run the following command to preprocess and generate the training data.
+```
+python make_dataset.py ../data/
+```
+
+4. Run the following command to prepare a model folder. Feel free to open config.json to tune the hyperparameters.
+```
+mkdir ../models/your_model_folder
+cp ../models/hnn/config.json ../models/your_model_folder
+```
+
+5. Run the following command to start training.
+```
+python train.py ../models/your_model_folder/
+```
+
+6. The trained model will be saved in the folder 'your_model_folder/'.
+
+
+
+# How to predict
+## Download the pretrained model
+1. Run the following shell script.
+```
+./download.sh
+```
+2. cd src/
+
+## To predict test.txt
+1. Run the following command to predict test.txt. Change the x in '--epoch x' to the best epoch from your training run.
+```
+python predict.py ../models/hnn/ --epoch 7 --input_mode 1 --input_dir ../data/test.pkl --output_dir result.txt
+
+or
+
+python predict.py ../models/your_model_folder/ --epoch 7 --input_mode 1 --input_dir ../data/test.pkl --output_dir result.txt
+```
+
+## To predict an abstract
+1. Run the following command to predict interactive input.
+```
+python predict.py ../models/hnn/ --epoch 7
+
+or
+
+python predict.py ../models/your_model_folder/ --epoch 7
+```
+
+2. Type in the abstract and it will output the result. Try the following abstract as input.
+```
+To evaluate the performance ( efficacy , safety and acceptability ) of a new micro-adherent absorbent dressing ( UrgoClean ) compared with a hydrofiber dressing ( Aquacel ) in the local management of venous leg ulcers , in the debridement stage .$$$A non-inferiority European randomised controlled clinical trial ( RCT ) was conducted in @ centres , on patients presenting with venous or predominantly venous , mixed aetiology leg ulcers at their sloughy stage ( with more than @ % of the wound bed covered with slough at baseline ) .$$$Patients were followed over a @-week period and assessed weekly .$$$The primary judgement criteria was the relative regression of the wound surface area after the @-week treatment period .
+```

data/.DS_Store

6 KB
Binary file not shown.

data/config.json

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+{
+    "train_path": "../data/english_short_corpus_small_4to12_train.txt",
+    "test_path": "../data/english_short_corpus_small_4to12_test.txt",
+    "valid_path": "../data/english_short_corpus_small_4to12_valid.txt",
+    "long_test_path": "../data/english_short_corpus_small_13to20_test.txt",
+    "word_set_path": "../data/word_set.pkl",
+    "vocab_path": "../data/vocab.pkl"
+}
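
These paths hint at how the preprocessing stage is wired together. The script that consumes this file is not shown in rendered form in this commit, so the sketch below is only an assumption of how src/make_dataset.py might read it; the key names are the only part taken from the config itself.

```
import json
import pickle

# Hypothetical consumer of data/config.json; only the key names
# are taken from the commit -- the logic here is an assumed sketch.
with open('../data/config.json') as f:
    cfg = json.load(f)

# Collect the vocabulary from the raw training corpus.
word_set = set()
with open(cfg['train_path']) as f:
    for line in f:
        word_set.update(line.strip().split())

# Persist it where the rest of the pipeline expects to find it.
with open(cfg['word_set_path'], 'wb') as f:
    pickle.dump(word_set, f)
```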

data/long_test.pkl

1.29 MB
Binary file not shown.

data/test.pkl

722 KB
Binary file not shown.

data/train.pkl

13.3 MB
Binary file not shown.

data/valid.pkl

1.65 MB
Binary file not shown.

data/vocab.pkl

285 KB
Binary file not shown.

data/word_set.pkl

420 KB
Binary file not shown.

learning_graph.ipynb

Lines changed: 147 additions & 0 deletions
Large diffs are not rendered by default.
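
The notebook body is not rendered here, but since MetricsLogger in src/callbacks.py (below) writes per-epoch train/valid logs as JSON, a learning-curve plot of the kind this notebook presumably draws could look like the following sketch; the log path is an assumption.

```
import json
import matplotlib.pyplot as plt

# Assumed log location; the JSON layout matches what MetricsLogger writes:
# {"train": [{"epoch": 0, "loss": ...}, ...], "valid": [...]}
with open('../models/your_model_folder/log.json') as f:
    history = json.load(f)

for split in ('train', 'valid'):
    plt.plot([e['epoch'] for e in history[split]],
             [e['loss'] for e in history[split]],
             label=split)

plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()
```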

models/.DS_Store

6 KB
Binary file not shown.

models/seq2seq/.DS_Store

6 KB
Binary file not shown.

models/seq2seq/config.json

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+{
+    "arch": "Predictor",
+    "train": "../data/train.pkl",
+    "words_dict": "../data/vocab.pkl",
+    "model_parameters": {
+        "encoder_model_name": "EncoderRNN",
+        "decoder_model_name": "DecoderRNN",
+        "hidden_size": 50,
+        "batch_size": 32,
+        "max_epochs": 200,
+        "max_iters_in_epoch": 1e20,
+        "grad_accumulate_steps": 1,
+        "num_workers": 4,
+        "weight_decay": 1e-5,
+        "optimizer": "Adam",
+        "learning_rate": 1e-4,
+        "valid": "../data/valid.pkl"
+    }
+}
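
Several keys under model_parameters (batch_size, max_epochs, learning_rate, max_iters_in_epoch, grad_accumulate_steps, num_workers) line up with the keyword arguments of BasePredictor.__init__ in src/base_predictor.py below. train.py itself is not rendered in this commit, so the wiring in this sketch is an assumption:

```
import json
import pickle

# Assumed glue code; train.py is not shown in this commit.
with open('../models/seq2seq/config.json') as f:
    config = json.load(f)

with open(config['train'], 'rb') as f:   # ../data/train.pkl
    train_data = pickle.load(f)

params = config['model_parameters']
# Keep only the keys BasePredictor.__init__ actually accepts.
# Note: "valid" holds a path here; it would need unpickling before
# being passed as the valid dataset.
predictor_kwargs = {k: params[k]
                    for k in ('batch_size', 'max_epochs', 'learning_rate',
                              'max_iters_in_epoch', 'grad_accumulate_steps',
                              'num_workers')
                    if k in params}
```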

src/.DS_Store

6 KB
Binary file not shown.
12 additional binary files under src/ (2.07 KB to 5.14 KB each; file names not rendered in this view). Binary files not shown.

src/base_predictor.py

Lines changed: 202 additions & 0 deletions
@@ -0,0 +1,202 @@
+import torch
+import torch.utils.data.dataloader
+from torch.utils.data.dataloader import default_collate
+from torch.utils import data as Data
+from tqdm import tqdm
+
+
+class BasePredictor():
+    def __init__(self,
+                 batch_size=10,
+                 max_epochs=10,
+                 valid=None,
+                 device=None,
+                 metrics={},
+                 learning_rate=1e-3,
+                 max_iters_in_epoch=1e20,
+                 grad_accumulate_steps=1,
+                 num_workers=2):
+
+        self.batch_size = batch_size
+        self.max_epochs = max_epochs
+        self.valid = valid
+        self.metrics = metrics
+        self.learning_rate = learning_rate
+        self.max_iters_in_epoch = max_iters_in_epoch
+        self.grad_accumulate_steps = grad_accumulate_steps
+        self.num_workers = num_workers
+
+        if device is not None:
+            self.device = torch.device(device)
+        else:
+            self.device = torch.device('cuda:0' if torch.cuda.is_available()
+                                       else 'cpu')
+
+        self.epoch = 0
+
+    def fit_dataset(self, data, collate_fn=default_collate, callbacks=[]):
+        # Start the training loop.
+        while self.epoch < self.max_epochs:
+
+            # train and evaluate train score
+            print('training %i' % self.epoch)
+
+            dataloader = Data.DataLoader(dataset=data,
+                                         batch_size=self.batch_size,
+                                         shuffle=True,
+                                         collate_fn=collate_fn,
+                                         num_workers=self.num_workers)
+
+            # train epoch
+            log_train = self._run_epoch(dataloader, True)
+
+            # evaluate valid score
+            if self.valid is not None:
+                print('dev evaluating %i' % self.epoch)
+                dataloader = Data.DataLoader(dataset=self.valid,
+                                             batch_size=self.batch_size,
+                                             shuffle=False,
+                                             collate_fn=collate_fn,
+                                             num_workers=self.num_workers)
+
+                log_valid = self._run_epoch(dataloader, False)
+            else:
+                log_valid = None
+
+            for callback in callbacks:
+                callback.on_epoch_end(log_train, log_valid, self)
+
+            self.epoch += 1
+
+    def predict_dataset(self, data,
+                        collate_fn=default_collate,
+                        batch_size=None,
+                        predict_fn=None):
+        if batch_size is None:
+            batch_size = self.batch_size
+        if predict_fn is None:
+            predict_fn = self._predict_batch
+
+        # set model to eval mode
+        self.encoder.eval()
+        self.decoder.eval()
+
+        # make dataloader
+        dataloader = Data.DataLoader(dataset=data,
+                                     batch_size=batch_size,  # honor the override (was self.batch_size)
+                                     shuffle=False,
+                                     collate_fn=collate_fn,
+                                     num_workers=self.num_workers)
+
+        ys_ = []
+        with torch.no_grad():
+            for batch in tqdm(dataloader):
+                batch_y_ = predict_fn(batch)
+                ys_.append(batch_y_)
+
+        return ys_
+
+    def save(self, path):
+        torch.save({
+            'epoch': self.epoch + 1,
+            'encoder': self.encoder.state_dict(),
+            'decoder': self.decoder.state_dict(),
+            'optimizer': self.optimizer.state_dict()
+        }, path)
+
+    def load(self, path):
+        checkpoint = torch.load(path)  # load once rather than once per key
+        self.encoder.load_state_dict(checkpoint['encoder'])
+        self.decoder.load_state_dict(checkpoint['decoder'])
+        self.optimizer.load_state_dict(checkpoint['optimizer'])
+        self.epoch = checkpoint['epoch']
+
+    def _run_epoch(self, dataloader, training):
+        self.encoder.train(training)
+        self.decoder.train(training)
+
+        loss = 0
+
+        # reset metric accumulators
+        for metric in self.metrics:
+            metric.reset()
+
+        if training:
+            iter_in_epoch = min(len(dataloader), self.max_iters_in_epoch)
+            description = 'training'
+        else:
+            iter_in_epoch = len(dataloader)
+            description = 'evaluating'
+
+        # run batches
+        trange = tqdm(enumerate(dataloader),
+                      total=iter_in_epoch,
+                      desc=description)
+        for i, batch in trange:
+            if training and i >= iter_in_epoch:
+                break
+
+            if training:
+                output, batch_loss = \
+                    self._run_iter(batch, training)
+
+                # scale the loss so the accumulated gradient averages
+                # over grad_accumulate_steps batches
+                batch_loss /= self.grad_accumulate_steps
+
+                # accumulate gradient - zero_grad
+                if i % self.grad_accumulate_steps == 0:
+                    self.optimizer.zero_grad()
+
+                batch_loss.backward()
+
+                # accumulate gradient - step
+                if (i + 1) % self.grad_accumulate_steps == 0:
+                    self.optimizer.step()
+            else:
+                with torch.no_grad():
+                    output, batch_loss = \
+                        self._run_iter(batch, training)
+
+            # accumulate loss and metric scores
+            loss += batch_loss.item()
+
+            for metric in self.metrics:
+                metric.update(output, batch)
+            trange.set_postfix(
+                loss=loss / (i + 1),
+                **{m.name: m.print_score() for m in self.metrics if m.name == 'Accuracy'})
+
+        # calculate average loss and metrics
+        loss /= iter_in_epoch
+
+        epoch_log = {}
+        epoch_log['loss'] = float(loss)
+        for metric in self.metrics:
+            score = metric.get_score()
+            print('{}: {} '.format(metric.name, score))
+            epoch_log[metric.name] = score
+
+        print('loss=%f\n' % loss)
+        return epoch_log
+
+    def _run_iter(self, batch, training):
+        """Run one iteration for training.
+
+        Args:
+            batch (dict)
+            training (bool)
+
+        Returns:
+            predicts: Prediction of the batch.
+            loss (FloatTensor): Loss of the batch.
+        """
+        pass
+
+    def _predict_batch(self, batch):
+        """Run one iteration for predicting.
+
+        Args:
+            batch (dict)
+
+        Returns:
+            predicts: Prediction of the batch.
+        """
+        pass
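
BasePredictor leaves _run_iter and _predict_batch abstract and assumes a subclass sets self.encoder, self.decoder and self.optimizer before training starts. The commit's actual Predictor/EncoderRNN/DecoderRNN classes are not rendered above, so the following minimal subclass is a hypothetical sketch of the expected contract, not the repository's implementation:

```
import torch
import torch.nn as nn

from base_predictor import BasePredictor


class ToyPredictor(BasePredictor):
    """Hypothetical subclass illustrating the BasePredictor contract."""

    def __init__(self, vocab_size, hidden_size=50, **kwargs):
        super().__init__(**kwargs)
        # BasePredictor expects these three attributes to exist.
        self.encoder = nn.GRU(hidden_size, hidden_size,
                              batch_first=True).to(self.device)
        self.decoder = nn.Linear(hidden_size, vocab_size).to(self.device)
        self.loss_fn = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(
            list(self.encoder.parameters()) + list(self.decoder.parameters()),
            lr=self.learning_rate)

    def _run_iter(self, batch, training):
        # The batch layout is an assumption: pre-embedded inputs and labels.
        x = batch['x'].to(self.device)       # (batch, seq_len, hidden_size)
        y = batch['y'].to(self.device)       # (batch,)
        _, h = self.encoder(x)               # h: (1, batch, hidden_size)
        logits = self.decoder(h.squeeze(0))  # (batch, vocab_size)
        return logits, self.loss_fn(logits, y)

    def _predict_batch(self, batch):
        x = batch['x'].to(self.device)
        _, h = self.encoder(x)
        return self.decoder(h.squeeze(0)).argmax(dim=-1)
```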

src/callbacks.py

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+import math
+import json
+import pickle
+
+
+class Callback:
+    def __init__(self):
+        pass
+
+    def on_epoch_end(self, log_train, log_valid, model):
+        pass
+
+
+class MetricsLogger(Callback):
+    def __init__(self, log_dest):
+        self.history = {
+            'train': [],
+            'valid': []
+        }
+        self.log_dest = log_dest
+
+    def on_epoch_end(self, log_train, log_valid, model):
+        log_train['epoch'] = model.epoch
+        log_valid['epoch'] = model.epoch
+        self.history['train'].append(log_train)
+        self.history['valid'].append(log_valid)
+        with open(self.log_dest, 'w') as f:
+            json.dump(self.history, f, indent=' ')
+
+    def load(self, epoch):
+        # truncate the history so a resumed run continues from `epoch`
+        with open(self.log_dest, 'r') as f:
+            self.history = json.loads(f.read())
+        self.history['train'] = self.history['train'][:epoch + 1]
+        self.history['valid'] = self.history['valid'][:epoch + 1]
+
+
+class ModelCheckpoint(Callback):
+    def __init__(self, filepath,
+                 monitor='loss',
+                 verbose=0,
+                 mode='min'):
+        self._filepath = filepath
+        self._verbose = verbose
+        self._monitor = monitor
+        self._best = math.inf if mode == 'min' else -math.inf
+        self._mode = mode
+
+    def on_epoch_end(self, log_train, log_valid, model):
+        # assumes a validation set is configured, so log_valid is not None
+        score = log_valid[self._monitor]
+        if self._mode == 'min':
+            if score < self._best:
+                self._best = score
+                model.save(self._filepath)
+                if self._verbose > 0:
+                    print('Best model saved (%f)' % score)
+
+        elif self._mode == 'max':
+            if score > self._best:
+                self._best = score
+                model.save(self._filepath)  # save only the best model; ignore the epoch
+                if self._verbose > 0:
+                    print('Best model saved (%f)' % score)
+
+        elif self._mode == 'all':
+            model.save('{}.{}'
+                       .format(self._filepath, model.epoch))
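
A usage sketch tying the two callbacks to BasePredictor.fit_dataset; predictor and train_data stand in for objects built elsewhere in the repo, and the paths are assumptions. Note that ModelCheckpoint reads log_valid, so a validation set must be configured, and that mode='all' writes model.pkl.<epoch>, which is exactly what the new .gitignore pattern model.pkl.* matches.

```
from callbacks import MetricsLogger, ModelCheckpoint

# predictor: any BasePredictor subclass; train_data: a torch Dataset.
callbacks = [
    MetricsLogger('../models/your_model_folder/log.json'),
    ModelCheckpoint('../models/your_model_folder/model.pkl',
                    monitor='loss', verbose=1, mode='all'),
]
predictor.fit_dataset(train_data, callbacks=callbacks)
```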
