{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "prepare_data.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "machine_shape": "hm"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "lG5yxTxARRJ8"
      },
      "source": [
        "import urllib.request\n",
        "import numpy as np\n",
        "import subprocess\n",
        "import os\n",
        "from google.colab import files\n",
        "import shutil\n",
        "import torch\n",
        "from torch.utils.data import TensorDataset, DataLoader"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GMeq7D7ERzwI",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 124
        },
        "outputId": "56930527-5205-4886-b5f7-457666b99b84"
      },
      "source": [
        "from google.colab import drive\n",
        "root_dir = '/content/drive/'\n",
        "drive.mount(root_dir, force_remount=True)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n",
            "\n",
            "Enter your authorization code:\n",
            "··········\n",
            "Mounted at /content/drive/\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hDGIsOIhRfOK"
      },
      "source": [
        "Download the Data:\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "UzXJ_XvBRhIG"
      },
      "source": [
        "def download(classes):\n",
        "    link = 'https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap'\n",
        "    base_dir = root_dir + 'My Drive/CS7643_Final_Project/Dataset'\n",
        "    print('Downloading ...')\n",
        "    for c in classes:\n",
        "        cname = c.replace(\" \", \"%20\")  # Google Cloud Storage URLs encode spaces as %20\n",
        "        print(f'{link}/{cname}.npy')\n",
        "        urllib.request.urlretrieve(f'{link}/{cname}.npy', f'{base_dir}/{c}.npy')"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Z3jsoh4LGGtW",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 69
        },
        "outputId": "2975ff4d-bd40-4a86-916b-42006576e5fc"
      },
      "source": [
        "download(['flamingo', 'sheep'])"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Downloading ...\n",
            "https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/flamingo.npy\n",
            "https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/sheep.npy\n"
          ],
          "name": "stdout"
        }
      ]
    },
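    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Each downloaded `.npy` file should hold a 2-D array of flattened 28x28 grayscale bitmaps, one drawing per row (784 pixels, matching the shapes used in `prepare_data` and `load_data` below). A minimal sanity-check sketch, assuming the `flamingo.npy` file fetched above:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Sketch: inspect one downloaded file (assumes the download(['flamingo', 'sheep']) call above succeeded)\n",
        "sample = np.load(root_dir + 'My Drive/CS7643_Final_Project/Dataset/flamingo.npy', allow_pickle=True)\n",
        "print(sample.shape)  # expected (num_drawings, 784): one flattened 28x28 bitmap per row\n",
        "print(sample[0].reshape(28, 28).shape)  # a single drawing restored to a 28x28 image"
      ],
      "execution_count": null,
      "outputs": []
    },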
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "aaP-Rq17RkGt"
      },
      "source": [
        "Split the data into training, validation, and test sets, and save them:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "i8ATINdPRn6w"
      },
      "source": [
        "def prepare_data():\n",
        "    \"\"\"\n",
        "    Split data into training, validation, and testing data\n",
        "    \"\"\"\n",
        "    base_dir = root_dir + 'My Drive/CS7643_Final_Project/Dataset'\n",
        "    file_names = os.listdir(base_dir)\n",
        "\n",
        "    x_train, y_train = np.empty([0, 784]), np.empty([0])\n",
        "    x_valid, y_valid = np.empty([0, 784]), np.empty([0])\n",
        "    x_test, y_test = np.empty([0, 784]), np.empty([0])\n",
        "\n",
        "    for idx, fname in enumerate(file_names):\n",
        "        print(fname)\n",
        "\n",
        "        data = np.load(f'{base_dir}/{fname}', allow_pickle=True)\n",
        "        labels = np.full(data.shape[0], idx)  # the file's index doubles as its class label\n",
        "\n",
        "        # per class: first 8400 rows -> train, next 3600 -> validation, next 3000 -> test\n",
        "        x_train = np.concatenate((x_train, data[:8400, :]), axis=0)\n",
        "        y_train = np.append(y_train, labels[:8400])\n",
        "\n",
        "        x_valid = np.concatenate((x_valid, data[8400:12000, :]), axis=0)\n",
        "        y_valid = np.append(y_valid, labels[8400:12000])\n",
        "\n",
        "        x_test = np.concatenate((x_test, data[12000:15000, :]), axis=0)\n",
        "        y_test = np.append(y_test, labels[12000:15000])\n",
        "\n",
        "    # shuffle the training set so the classes are interleaved\n",
        "    N = y_train.shape[0]\n",
        "    rand = np.random.permutation(N)\n",
        "    x_train, y_train = x_train[rand, :], y_train[rand]\n",
        "\n",
        "    print(f'{len(y_train)} training files, {len(y_valid)} validation files, {len(y_test)} testing files')\n",
        "\n",
        "    # makedirs with exist_ok avoids a crash if the folders already exist from a previous run\n",
        "    os.makedirs(root_dir + 'My Drive/CS7643_Final_Project/Train', exist_ok=True)\n",
        "    np.savez_compressed(root_dir + 'My Drive/CS7643_Final_Project/Train/Train', data=x_train, target=y_train)\n",
        "    os.makedirs(root_dir + 'My Drive/CS7643_Final_Project/Validation', exist_ok=True)\n",
        "    np.savez_compressed(root_dir + 'My Drive/CS7643_Final_Project/Validation/Validation', data=x_valid, target=y_valid)\n",
        "    os.makedirs(root_dir + 'My Drive/CS7643_Final_Project/Test', exist_ok=True)\n",
        "    np.savez_compressed(root_dir + 'My Drive/CS7643_Final_Project/Test/Test', data=x_test, target=y_test)"
      ],
      "execution_count": null,
      "outputs": []
    },
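    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Per class, the slices above keep the first 8400 drawings for training, the next 3600 (rows 8400-11999) for validation, and the next 3000 (rows 12000-14999) for testing. With the 10 class files in the dataset folder this gives 10 x 8400 = 84000 training, 10 x 3600 = 36000 validation, and 10 x 3000 = 30000 test samples, matching the output below."
      ]
    },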
    {
      "cell_type": "code",
      "metadata": {
        "id": "Sn31eFKMR5s8",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 208
        },
        "outputId": "6bc2e5b3-661e-4908-cbf4-c3cda684c7fd"
      },
      "source": [
        "prepare_data()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "tree.npy\n",
            "t-shirt.npy\n",
            "ice cream.npy\n",
            "fish.npy\n",
            "face.npy\n",
            "car.npy\n",
            "bowtie.npy\n",
            "apple.npy\n",
            "flamingo.npy\n",
            "sheep.npy\n",
            "84000 training files, 36000 validation files, 30000 testing files\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "RknXBtKqach4"
      },
      "source": [
        "def load_data(batch_size=128):\n",
        "    \"\"\"\n",
        "    Returns a list of PyTorch DataLoaders in order of train, validation, and test loaders\n",
        "    \"\"\"\n",
        "    dirnames = [root_dir + 'My Drive/CS7643_Final_Project/Train/',\n",
        "                root_dir + 'My Drive/CS7643_Final_Project/Validation/',\n",
        "                root_dir + 'My Drive/CS7643_Final_Project/Test/']\n",
        "    loaders = []\n",
        "    for dirname in dirnames:\n",
        "        dataname = os.listdir(dirname)[0]\n",
        "        arrays = np.load(dirname + dataname)  # load the .npz once instead of twice\n",
        "        xs = arrays['data'].reshape(-1, 28, 28)\n",
        "        ys = arrays['target']\n",
        "        print(f'Loading {dataname} into PyTorch DataLoader ... Xs shape: {xs.shape}, Ys shape: {ys.shape}')\n",
        "        # torch.Tensor (rather than bare Tensor, which was never imported) yields float32 tensors\n",
        "        loaders.append(DataLoader(TensorDataset(torch.Tensor(xs), torch.Tensor(ys)),\n",
        "                                  batch_size=batch_size, shuffle=True))\n",
        "\n",
        "    return loaders"
      ],
      "execution_count": null,
      "outputs": []
    },
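    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "A minimal usage sketch for `load_data`, assuming `prepare_data()` has already written the three `.npz` files to Drive:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Sketch: unpack the three loaders and peek at one training batch\n",
        "train_loader, valid_loader, test_loader = load_data(batch_size=128)\n",
        "images, targets = next(iter(train_loader))\n",
        "print(images.shape, targets.shape)  # expected torch.Size([128, 28, 28]) and torch.Size([128])"
      ],
      "execution_count": null,
      "outputs": []
    }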
  ]
}