finallyupper
diff --git a/‎README.md
+153 b/‎README.md
+153
diff --git a/‎__init__.py b/‎__init__.py
diff --git a/‎assets/transformer.png
162 KB b/‎assets/transformer.png
162 KB
diff --git a/‎data/config.yaml
+19 b/‎data/config.yaml
+19
diff --git a/‎dataset.py
+64 b/‎dataset.py
+64
diff --git a/‎debug.ipynb
+175 b/‎debug.ipynb
+175
diff --git a/‎logs/best_model.png
33.4 KB b/‎logs/best_model.png
33.4 KB
diff --git a/‎main.py
+97 b/‎main.py
+97
diff --git a/‎model/__init__.py
+1 b/‎model/__init__.py
+1
@@ -0,0 +1,153 @@
+# Pytorch-Transformer Implementation
+This repository contains an implementation of the [Transformer model (Attention is All You Need)](https://arxiv.org/abs/1706.03762) in PyTorch. The model is trained and tested on a dummy dataset consisting of tokens `<sos>=0`, `<eos>=1`, `<pad>=2`, and additional tokens 3 and 4, representing sequences. The core architecture is located in the `model/` directory. 
+
+<img src="./assets/transformer.png" alt="transformer" width="50%">
+  
+## Guide
+Before you run the commands, modify the configurations in `data/config.yaml` as per your requirements.  
+### Training
+Run the following command to start training the model:
+```
+python main.py --output ${OUTPUT_PATH} --log ${LOG_PATH} --cfg ${CFG_PATH}
+```
+
+### Testing
+Run the following command to start testing the model:
+```
+python test.py --model ${MODEL_PATH} --cfg ${CFG_PATH}
+```
+## Implementation
+The main Transformer architecture is defined as follows. Other components like the Encoder and Decoder are implemented separately in the `model/` directory:
+```
+class Transformer(nn.Module):
+    """Encoder-decoder Transformer that derives its attention masks from the token ids.
+
+    `forward` builds a padding mask for the source and a combined
+    padding + causal mask for the target, runs the encoder, then the decoder,
+    and returns the decoder output without applying softmax.
+    """
+    def __init__(self, enc_vsize, dec_vsize, d_model, max_len, dropout_p=0.1, n_heads=8, n_layers=6, d_ff=2048, device=None,
+                 src_pad_idx=0, tgt_pad_idx=0):
+        """
+        Args:
+            enc_vsize: vocabulary size passed to the Encoder.
+            dec_vsize: vocabulary size passed to the Decoder.
+            d_model, max_len, dropout_p, n_heads, n_layers, d_ff: forwarded
+                unchanged to both the Encoder and the Decoder.
+            device: forwarded to the Encoder/Decoder and used to place the
+                causal mask built in `make_target_mask`.
+            src_pad_idx: token id treated as padding in the source sequence.
+            tgt_pad_idx: token id treated as padding in the target sequence.
+        """
+        super(Transformer, self).__init__()
+        self.device = device 
+
+        self.encoder = Encoder(vocab_size=enc_vsize,
+                               d_model=d_model,
+                               max_len=max_len,
+                               dropout_p=dropout_p,
+                               n_heads=n_heads,
+                               n_layers=n_layers,
+                               d_ff=d_ff,
+                               device=device)
+        
+        self.decoder = Decoder(vocab_size=dec_vsize,
+                               d_model=d_model,
+                               max_len = max_len,
+                               dropout_p=dropout_p,
+                               n_heads=n_heads,
+                               n_layers=n_layers,
+                               d_ff=d_ff,
+                               device=device)
+        # Padding ids are kept so the masks can be rebuilt for every batch.
+        self.src_pad_idx = src_pad_idx 
+        self.tgt_pad_idx = tgt_pad_idx
+
+    def make_src_mask(self, source) -> torch.Tensor:
+        """Return the source padding mask: True where the token is not `src_pad_idx`."""
+        src_mask = (source != self.src_pad_idx).unsqueeze(1).unsqueeze(2) #  batch_size x seq_len -> batch_size x 1 x 1 x seq_len
+        return src_mask 
+    
+    def make_target_mask(self, target) -> torch.Tensor:
+        """
+        Combine two boolean masks for the target sequence:
+        1) padding mask - False at positions holding `tgt_pad_idx`
+        2) no-peek (causal) mask - lower-triangular, so position i only sees positions <= i
+
+        The result broadcasts to batch_size x 1 x tgt_seq_len x tgt_seq_len.
+        """
+        # batch_size x tgt_seq_len -> batch_size x 1 x tgt_seq_len x 1 (masks whole rows of padded positions)
+        padding_mask = (target != self.tgt_pad_idx).unsqueeze(1).unsqueeze(3)
+        target_seq_len = target.size(1)
+        # 1 - strict upper triangle == lower triangle including the diagonal; shape 1 x tgt_seq_len x tgt_seq_len
+        nopeak_mask = (1 - torch.triu(torch.ones(1, target_seq_len, target_seq_len), diagonal=1)).bool().to(self.device)
+        target_mask = nopeak_mask & padding_mask
+        
+        return target_mask 
+    
+    def forward(self, src, tgt):
+        """Encode `src`, decode `tgt` against it, and return the raw decoder output."""
+        src_mask = self.make_src_mask(src) # batch_size x 1 x 1 x src_seq_len
+        tgt_mask = self.make_target_mask(tgt) # batch_size x 1 x tgt_seq_len x tgt_seq_len
+
+        enc_emb = self.encoder(src, src_mask) # batch_size x src_seq_len x d_model
+        tgt_emb = self.decoder(enc_emb, tgt, src_mask, tgt_mask) # batch_size x tgt_seq_len x tgt_vocab_size
+        return tgt_emb # No softmax here: it is applied inside CrossEntropyLoss
+
+```
+
+## Dataset
+**Tokens**:  
+- `SOS` token: `0`   
+-  `EOS` token: `1`  
+-  `PAD` token: `2` (not used by `generate_random_data`, since every sequence has the same length)  
+-  `WORDS`: `3`, `4` (used to generate patterns)  
+    
+**Patterns**:   
+- Sequence of all 3s: `[0, 3, 3, 3, 3, 3, 3, 3, 3, 1]`  
+- Sequence of all 4s: `[0, 4, 4, 4, 4, 4, 4, 4, 4, 1]`  
+- Alternating 3s and 4s starting with 3: `[0, 3, 4, 3, 4, 3, 4, 3, 4, 1]`  
+- Alternating 3s and 4s starting with 4: `[0, 4, 3, 4, 3, 4, 3, 4, 3, 1]`  
+      
+## Results 
+### Training
+The graph below shows the training log of the model trained for 20 epochs with 5 warmup steps. You can download the trained model [here](https://drive.google.com/file/d/1R-JXH_cFMXFKrfejEqrBj36gUigDgIyX/view?usp=sharing).
+  
+<img src="./logs/best_model.png" alt="log" width="80%"/>
+
+### Inference    
+``` 
+Example 0
+Input: [3, 3, 3, 3, 3, 3, 3, 3]
+Continuation: [3, 3, 3, 3, 3, 3, 3, 3]
+
+Example 1
+Input: [4, 4, 4, 4, 4, 4, 4, 4]
+Continuation: [4, 4, 4, 4, 4, 4, 4, 4]
+
+Example 2
+Input: [3, 4, 3, 4, 3, 4, 3, 4]
+Continuation: [3, 4, 3, 4, 3, 4, 3, 4]
+
+Example 3
+Input: [4, 3, 4, 3, 4, 3, 4, 3]
+Continuation: [3, 4, 3, 4, 3, 4, 3, 4]
+
+Example 4
+Input: [3, 4, 3]
+Continuation: [3, 4, 3, 4, 3, 4, 3, 4]
+```
+
+## Configurations Structure
+
+```
+train:  
+  batch_size:      
+  epochs:    
+  learning_rate: 
+  d_model:   
+  n_heads:  
+  n_layers:   
+  d_ff:  
+  dropout_p: 
+  max_len:   
+  warmup_steps:   
+
+test:
+  d_model:
+  n_heads: 
+  n_layers:
+  d_ff: 
+  dropout_p: 
+  max_len: 
+
+```
+
+## References 
+- [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
+- [Transformer: PyTorch Implementation of "Attention Is All You Need"](https://github.com/hyunwoongko/transformer/tree/master)
+- [A detailed guide to PyTorch’s nn.Transformer() module](https://towardsdatascience.com/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1)
+
+## TO-DO
+- [x] Add Encoder, Decoder
+- [x] Training/Validation logic with dataset
+- [x] Refactoring
+- [x] Add other parts
+    - [ ] label smoothing
+    - [ ] Add BLEU & PPL (https://brunch.co.kr/@leadbreak/11)
@@ -0,0 +1,19 @@
+train:
+  batch_size: 16 # 16 32
+  epochs: 20 # 50 100
+  learning_rate: 0.001 # 0.01  0.0005 0.0001 
+  d_model: 512
+  n_heads: 8
+  n_layers: 6
+  d_ff: 2048
+  dropout_p: 0.1
+  max_len: 10 # 100 10
+  warmup_steps: 5 # 4000 50
+
+test:
+  d_model: 512
+  n_heads: 8
+  n_layers: 6
+  d_ff: 2048
+  dropout_p: 0.1
+  max_len: 10 
@@ -0,0 +1,64 @@
+import numpy as np
+import random 
+
def generate_random_data(n:int, length:int=8, sos_idx:int=0, eos_idx:int=1, pad_idx:int=2) -> list:
    """
    Generate synthetic (input, target) sequence pairs for training/testing.

    Every sequence is ``[SOS, <length word tokens>, EOS]``. The target is the
    continuation of the input pattern, i.e. what would come next if the input
    pattern kept going. No padding is produced because all sequences share
    the same length.

    Tokens:
        SOS token: ``sos_idx`` (default 0)
        EOS token: ``eos_idx`` (default 1)
        PAD token: ``pad_idx`` (default 2, not used in this function)
        WORDS: 3, 4 (used to generate patterns)

    Patterns (one third of the data each, shown for length=8):
        - All 3s:  X = y = [0, 3, 3, 3, 3, 3, 3, 3, 3, 1]
        - All 4s:  X = y = [0, 4, 4, 4, 4, 4, 4, 4, 4, 1]
        - Alternating 3/4 with a random starting word, e.g.
          X = [0, 4, 3, 4, 3, 4, 3, 4, 3, 1]; y starts with the word that
          follows X's last word, so the alternation continues across X -> y.

    Args:
        n (int): Requested number of sequences. ``n // 3`` pairs are produced
            per pattern, so the result holds ``3 * (n // 3)`` pairs.
        length (int, optional): Number of word tokens, excluding SOS and EOS. Default is 8.
        sos_idx (int, optional): Index for the SOS token. Default is 0.
        eos_idx (int, optional): Index for the EOS token. Default is 1.
        pad_idx (int, optional): Index for the PAD token (accepted for API symmetry,
            not used in this function). Default is 2.

    Returns:
        list: A shuffled list of ``[X, y]`` pairs (two-element lists), each element
        a 1-D float numpy array of size ``length + 2``.
    """
    sos_token = np.array([sos_idx])
    eos_token = np.array([eos_idx])

    def _wrap(words):
        # Surround the word tokens with the SOS / EOS markers.
        return np.concatenate((sos_token, words, eos_token))

    per_pattern = n // 3
    data = []

    # Constant patterns: the continuation of "all w" is again "all w".
    for word in (3, 4):
        for _ in range(per_pattern):
            data.append([_wrap(word * np.ones(length)), _wrap(word * np.ones(length))])

    # Alternating patterns with a random phase.
    for _ in range(per_pattern):
        X = np.ones(length) * 3
        start = random.randint(0, 1)
        X[start::2] = 4

        y = np.ones(length) * 3
        # The continuation must begin with the word opposite to X's last word.
        # (The previous check compared against 0, which never occurs in X, so
        # y always started with 3 regardless of how X ended.)
        if length > 0 and X[-1] == 3:
            y[::2] = 4
        else:
            y[1::2] = 4

        data.append([_wrap(X), _wrap(y)])

    np.random.shuffle(data)
    return data
+
+
@@ -0,0 +1,97 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.optim.lr_scheduler import LambdaLR
+from model.transformer import Transformer
+from train import train
+from dataset import generate_random_data 
+from utils import load_yaml, count_parameters, initialize_weights, batchify_data, save_logs
+import os 
+import argparse
+import numpy as np 
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
def parse_args(argv=None):
    """Parse the command-line options of the training script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        argparse.Namespace with the string attributes ``output``, ``log`` and
        ``cfg``; all three options are required.
    """
    # The title is passed as `description`; as a positional argument it would
    # be taken as `prog` and shown as the program name in the usage line.
    parser = argparse.ArgumentParser(description="Implementation of Transformer in Pytorch")
    required_options = (
        ("--output", "output path for the trained model"),
        ("--log", "output path for saving the logs (including filename)"),
        ("--cfg", "configuration path"),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, required=True, type=str, help=help_text)
    return parser.parse_args(argv)
+
+    
def main():
    """Train the Transformer on the synthetic dataset and save the loss logs.

    Steps: parse the CLI arguments, load the ``train`` section of the YAML
    config, generate and batch the train/validation data, build the model,
    loss, optimizer and warmup scheduler, run ``train`` and pass the returned
    losses to ``save_logs``.
    """
    args = parse_args()
    log_save_path = args.log
    model_save_path = args.output
    cfg = load_yaml(args.cfg)['train']

    os.makedirs(model_save_path, exist_ok=True)

    # Use the default CUDA device instead of a hard-coded index: 'cuda:5' fails
    # on any machine with fewer than six GPUs. To pick a specific GPU, set
    # CUDA_VISIBLE_DEVICES before launching the script.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'[INFO] Using device: {device}')

    print(f'[INFO] n_warmup: {cfg["warmup_steps"]} | max length : {cfg["max_len"]} | batch size : {cfg["batch_size"]} | epochs : {cfg["epochs"]} | lr : {cfg["learning_rate"]}')
    print(f'[INFO] d_model : {cfg["d_model"]} | n_heads : {cfg["n_heads"]} | n_layers : {cfg["n_layers"]} | d_ff : {cfg["d_ff"]} | dropout_p : {cfg["dropout_p"]}')

    print('[INFO] Load dataset ...')
    # max_len counts SOS and EOS, hence the "- 2" for the word tokens.
    # generate_random_data emits n // 3 pairs per pattern, so these requests
    # yield 19998 and 6000 pairs respectively.
    n_words = cfg['max_len'] - 2
    train_data = generate_random_data(20000, length=n_words) # 10000
    val_data = generate_random_data(6000, length=n_words)  # 3000

    train_loader = batchify_data(train_data, batch_size=cfg['batch_size'])
    val_loader = batchify_data(val_data, batch_size=cfg['batch_size'])

    print('[INFO] Load model ...')
    # Vocabulary of 5 ids: sos, eos, padding, 3, 4
    model = Transformer(
        enc_vsize=5,
        dec_vsize=5,
        d_model=cfg['d_model'],
        max_len=cfg['max_len'],
        dropout_p=cfg['dropout_p'],
        n_heads=cfg['n_heads'],
        n_layers=cfg['n_layers'],
        d_ff=cfg['d_ff'],
        device=device,
        src_pad_idx=2,
        tgt_pad_idx=2
    ).to(device)

    print(f'[INFO] # of trainable parameters : {count_parameters(model):,}')
    model.apply(initialize_weights)

    # ignore_index=2 keeps padding positions out of the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=2)
    optimizer = optim.Adam(model.parameters(),
                           betas=(0.9, 0.98),
                           lr=cfg['learning_rate'], # default 0.001
                           eps=1e-9)

    def lr_scheduler(optimizer, warmup_steps, d_model):
        """Warmup schedule, equation (3) of "Attention Is All You Need".

        NOTE: LambdaLR multiplies the optimizer's base lr by the returned
        factor, so the effective rate is learning_rate * lrate(step).
        """
        def lrate(step):
            # step + 1 avoids 0 ** -0.5 on the first call (LambdaLR starts at 0).
            return (d_model ** -0.5) * min((step + 1) ** -0.5, (step + 1) * warmup_steps ** -1.5)
        return LambdaLR(optimizer, lr_lambda=lrate)

    scheduler = lr_scheduler(optimizer,
                             warmup_steps=cfg['warmup_steps'],
                             d_model=cfg['d_model'])

    # model_save_path is handed to train(); presumably checkpoints are written
    # there - confirm against train.py.
    tr_losses, val_losses = train(model, train_loader, val_loader,
                                  criterion, optimizer, scheduler,
                                  cfg['epochs'], device, model_save_path)

    save_logs(log_save_path, tr_losses, val_losses)
    print('[INFO] Successfully saved model!')
+
+if __name__ == "__main__":
+    main()
+
@@ -0,0 +1 @@
+"""Codebase for implementation of transformer in PyTorch"""
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+"""Codebase for implementation of transformer in PyTorch"""`