@@ -5,30 +5,43 @@ This directory contains files for running the PyTorch example program,
using PyTorch module `DistributedDataParallel` for parallel training and
`PnetCDF-Python` for reading data from a NetCDF file.
- ---
## Running the MNIST Example Program
- * Firstly, run commands below to generate the python program file and NetCDF file.
+ * Firstly, run the command below to generate the Python program file.
```sh
- make mnist_main.py`
- make mnist_images.nc`
+ make mnist_main.py
```
* Run the command below to train the model using 4 MPI processes.
```sh
mpiexec -n 4 python mnist_main.py --batch-size 4 --test-batch-size 2 --epochs 3 --input-file mnist_images.nc
```
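When run under `mpiexec`, each MPI process trains on its own subset of the samples. As a rough illustration (not the program's actual code; the rank and sample counts here simply match the 4-process, 60-sample example above), a round-robin split in the spirit of PyTorch's `torch.utils.data.distributed.DistributedSampler` looks like this:

```python
# Illustration only: split 60 training samples across 4 ranks round-robin,
# similar in spirit to torch.utils.data.distributed.DistributedSampler.
def partition(num_samples, num_ranks, rank):
    """Return the sample indices assigned to one rank."""
    return list(range(rank, num_samples, num_ranks))

parts = [partition(60, 4, r) for r in range(4)]

# Every rank gets 15 samples, and together they cover all 60 exactly once.
assert all(len(p) == 15 for p in parts)
assert sorted(i for p in parts for i in p) == list(range(60))
print(parts[1][:3])  # first three indices assigned to rank 1: [1, 5, 9]
```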
+ * `mnist_main.py` command-line options:
+ ```
+ -h, --help            show this help message and exit
+ --batch-size N        input batch size for training (default: 64)
+ --test-batch-size N   input batch size for testing (default: 1000)
+ --epochs N            number of epochs to train (default: 14)
+ --lr LR               learning rate (default: 1.0)
+ --gamma M             Learning rate step gamma (default: 0.7)
+ --no-cuda             disables CUDA training
+ --no-mps              disables macOS GPU training
+ --dry-run             quickly check a single pass
+ --seed S              random seed (default: 1)
+ --log-interval N      how many batches to wait before logging training status
+ --save-model          For Saving the current Model
+ --input-file INPUT_FILE
+                       NetCDF file storing train and test samples
+ ```
+
## Testing
* Command `make check` will do the following:
+ Downloads the Python source file
[main.py](https://github.com/pytorch/examples/blob/main/mnist/main.py)
from [PyTorch Examples](https://github.com/pytorch/examples) as file
`mnist_main.py`.
+ Applies patch file [mnist.patch](./mnist.patch) to `mnist_main.py`.
- + Downloads the MNIST data sets from [ ] ( )
- + Run utility program [create_mnist_netcdf.py](./create_mnist_netcdf.py)
- to extract a subset of images into a NetCDF file.
- + Run the training program `mnist_main.py`.
+ + Runs the training program `mnist_main.py` in parallel using 4 MPI processes.
* Testing output shown on screen:
```
@@ -51,25 +64,15 @@ using PyTorch module `DistributedDataParallel` for parallel training and
Test set: Average loss: 1.2531, Accuracy: 7/12 (58%)
```
- ## mnist_main.py command-line options
- ```
- -h, --help            show this help message and exit
- --batch-size N        input batch size for training (default: 64)
- --test-batch-size N   input batch size for testing (default: 1000)
- --epochs N            number of epochs to train (default: 14)
- --lr LR               learning rate (default: 1.0)
- --gamma M             Learning rate step gamma (default: 0.7)
- --no-cuda             disables CUDA training
- --no-mps              disables macOS GPU training
- --dry-run             quickly check a single pass
- --seed S              random seed (default: 1)
- --log-interval N      how many batches to wait before logging training status
- --save-model          For Saving the current Model
- --input-file INPUT_FILE
-                       NetCDF file storing train and test samples
- ```
-
- ## create_mnist_netcdf.py command-line options
+ ## Generate the Input NetCDF File From MNIST Datasets
+ * Utility program [create_mnist_netcdf.py](./create_mnist_netcdf.py)
+ can be used to extract a subset of images into a NetCDF file.
+ * Command `make mnist_images.nc` will first download the MNIST data files from
+ https://yann.lecun.com/exdb/mnist and extract 60 images as training samples
+ and 12 images as testing samples into a new file named `mnist_images.nc`.
+ * `create_mnist_netcdf.py` can also be run individually to extract a different
+ number of images using the command-line options shown below.
+ * `create_mnist_netcdf.py` command-line options:
```
-h, --help            show this help message and exit
--verbose             Verbose mode
@@ -83,9 +86,34 @@ using PyTorch module `DistributedDataParallel` for parallel training and
                      (Optional) input file name of testing data
--test-label-file TEST_LABEL_FILE
                      (Optional) input file name of testing labels
+ --out-file OUT_FILE   (Optional) output NetCDF file name
+ ```
+ * The NetCDF file metadata can be obtained by running the command `ncmpidump -h`
+ or `ncdump -h`.
+ ```sh
+ % ncmpidump -h mnist_images.nc
+ netcdf mnist_images {
+ // file format: CDF-5 (big variables)
+ dimensions:
+         height = 28 ;
+         width = 28 ;
+         train_num = 60 ;
+         test_num = 12 ;
+ variables:
+         ubyte train_samples(train_num, height, width) ;
+                 train_samples:long_name = "training data samples" ;
+         ubyte train_labels(train_num) ;
+                 train_labels:long_name = "labels of training samples" ;
+         ubyte test_samples(test_num, height, width) ;
+                 test_samples:long_name = "testing data samples" ;
+         ubyte test_labels(test_num) ;
+                 test_labels:long_name = "labels of testing samples" ;
+
+ // global attributes:
+                 :url = "https://yann.lecun.com/exdb/mnist/" ;
+ }
```
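As a quick sanity check of this metadata, the on-disk data size of each variable follows directly from the dimensions, since every `ubyte` element occupies one byte. A minimal sketch (plain Python, independent of any NetCDF library):

```python
# Sizes implied by the CDL dump above: ubyte elements are 1 byte each.
dims = {"height": 28, "width": 28, "train_num": 60, "test_num": 12}

train_bytes = dims["train_num"] * dims["height"] * dims["width"]
test_bytes = dims["test_num"] * dims["height"] * dims["width"]

print(train_bytes)  # 60 * 28 * 28 = 47040 bytes of training pixels
print(test_bytes)   # 12 * 28 * 28 = 9408 bytes of testing pixels
```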
- ---
## Files in this directory
* [mnist.patch](./mnist.patch) --
a patch file to be applied on
@@ -103,7 +131,6 @@ using PyTorch module `DistributedDataParallel` for parallel training and
a utility Python program that reads the MNIST files, extracts a subset of the
samples, and stores them into a newly created file in NetCDF format.
- ---
### Notes:
- The test set accuracy may vary slightly depending on how the data is distributed across the MPI processes.
- The accuracy and loss reported after each epoch are averaged across all MPI processes.
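Conceptually, this per-epoch averaging is an MPI `allreduce` sum divided by the number of processes. A sketch without MPI, using made-up per-rank losses (the values are illustrative only, not output of `mnist_main.py`):

```python
# Sketch: global metric = allreduce(SUM) / nprocs. The per-rank values
# below are hypothetical; in the real run each rank would contribute the
# loss computed on its own shard of the test data.
rank_losses = [1.30, 1.22, 1.27, 1.21]  # one local average loss per MPI rank

global_avg = sum(rank_losses) / len(rank_losses)  # what allreduce-average yields
print(round(global_avg, 2))  # 1.25
```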