Commit ed06a46
Add files via upload
1 parent 3f71a51 commit ed06a46

1 file changed: +236 −0 lines

prepare_data.ipynb (236 additions, 0 deletions)
@@ -0,0 +1,236 @@
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "prepare_data.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "machine_shape": "hm"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "lG5yxTxARRJ8"
      },
      "source": [
        "import urllib.request\n",
        "import numpy as np\n",
        "import subprocess\n",
        "import os\n",
        "from google.colab import files\n",
        "import shutil\n",
        "import torch\n",
        "from torch.utils.data import TensorDataset, DataLoader"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GMeq7D7ERzwI",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 124
        },
        "outputId": "56930527-5205-4886-b5f7-457666b99b84"
      },
      "source": [
        "from google.colab import drive\n",
        "root_dir = '/content/drive/'\n",
        "drive.mount(root_dir, force_remount=True)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n",
            "\n",
            "Enter your authorization code:\n",
            "··········\n",
            "Mounted at /content/drive/\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hDGIsOIhRfOK"
      },
      "source": [
        "Download the Data:\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "UzXJ_XvBRhIG"
      },
      "source": [
        "def download(classes):\n",
        "    \"\"\"Fetch the Quick, Draw! bitmap file for each class into the Dataset folder.\"\"\"\n",
        "    link = 'https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap'\n",
        "    base_dir = root_dir + 'My Drive/CS7643_Final_Project/Dataset'\n",
        "    print('Downloading ...')\n",
        "    for c in classes:\n",
        "        cname = c.replace(\" \", \"%20\")  # Google Cloud Storage URLs encode spaces as %20\n",
        "        print(f'{link}/{cname}.npy')\n",
        "        urllib.request.urlretrieve(f'{link}/{cname}.npy', f'{base_dir}/{c}.npy')"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Z3jsoh4LGGtW",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 69
        },
        "outputId": "2975ff4d-bd40-4a86-916b-42006576e5fc"
      },
      "source": [
        "download(['flamingo', 'sheep'])"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Downloading ...\n",
            "https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/flamingo.npy\n",
            "https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/sheep.npy\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "aaP-Rq17RkGt"
      },
      "source": [
        "Split the data into training, validation, and test sets, and save them:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "i8ATINdPRn6w"
      },
      "source": [
        "def prepare_data():\n",
        "    \"\"\"\n",
        "    Split the data into training, validation, and test sets and save each as a compressed .npz file\n",
        "    \"\"\"\n",
        "    base_dir = root_dir + 'My Drive/CS7643_Final_Project/Dataset'\n",
        "    file_names = os.listdir(base_dir)\n",
        "\n",
        "    x_train, y_train = np.empty([0, 784]), np.empty([0])\n",
        "    x_valid, y_valid = np.empty([0, 784]), np.empty([0])\n",
        "    x_test, y_test = np.empty([0, 784]), np.empty([0])\n",
        "\n",
        "    # 8400 training / 3600 validation / 3000 test samples per class\n",
        "    for idx, fname in enumerate(file_names):\n",
        "        print(fname)\n",
        "\n",
        "        data = np.load(f'{base_dir}/{fname}', allow_pickle=True)\n",
        "        labels = np.full(data.shape[0], idx)\n",
        "\n",
        "        x_train = np.concatenate((x_train, data[:8400, :]), axis=0)\n",
        "        y_train = np.append(y_train, labels[:8400])\n",
        "\n",
        "        x_valid = np.concatenate((x_valid, data[8400:12000, :]), axis=0)\n",
        "        y_valid = np.append(y_valid, labels[8400:12000])\n",
        "\n",
        "        x_test = np.concatenate((x_test, data[12000:15000, :]), axis=0)\n",
        "        y_test = np.append(y_test, labels[12000:15000])\n",
        "\n",
        "    # shuffle the training set\n",
        "    N = y_train.shape[0]\n",
        "    rand = np.random.permutation(N)\n",
        "    x_train, y_train = x_train[rand, :], y_train[rand]\n",
        "\n",
        "    print(f'{len(y_train)} training samples, {len(y_valid)} validation samples, {len(y_test)} test samples')\n",
        "\n",
        "    # makedirs with exist_ok=True so the cell can be re-run without crashing on existing folders\n",
        "    os.makedirs(root_dir + 'My Drive/CS7643_Final_Project/Train', exist_ok=True)\n",
        "    np.savez_compressed(root_dir + 'My Drive/CS7643_Final_Project/Train/Train', data=x_train, target=y_train)\n",
        "    os.makedirs(root_dir + 'My Drive/CS7643_Final_Project/Validation', exist_ok=True)\n",
        "    np.savez_compressed(root_dir + 'My Drive/CS7643_Final_Project/Validation/Validation', data=x_valid, target=y_valid)\n",
        "    os.makedirs(root_dir + 'My Drive/CS7643_Final_Project/Test', exist_ok=True)\n",
        "    np.savez_compressed(root_dir + 'My Drive/CS7643_Final_Project/Test/Test', data=x_test, target=y_test)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Sn31eFKMR5s8",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 208
        },
        "outputId": "6bc2e5b3-661e-4908-cbf4-c3cda684c7fd"
      },
      "source": [
        "prepare_data()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "tree.npy\n",
            "t-shirt.npy\n",
            "ice cream.npy\n",
            "fish.npy\n",
            "face.npy\n",
            "car.npy\n",
            "bowtie.npy\n",
            "apple.npy\n",
            "flamingo.npy\n",
            "sheep.npy\n",
            "84000 training samples, 36000 validation samples, 30000 test samples\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "RknXBtKqach4"
      },
      "source": [
        "def load_data(batch_size=128):\n",
        "    \"\"\"\n",
        "    Return a list of PyTorch DataLoaders in the order train, validation, test\n",
        "    \"\"\"\n",
        "    dirnames = [root_dir + 'My Drive/CS7643_Final_Project/Train/',\n",
        "                root_dir + 'My Drive/CS7643_Final_Project/Validation/',\n",
        "                root_dir + 'My Drive/CS7643_Final_Project/Test/']\n",
        "    loaders = []\n",
        "    for dirname in dirnames:\n",
        "        dataname = os.listdir(dirname)[0]\n",
        "        arrays = np.load(dirname + dataname)  # load the .npz once per split\n",
        "        xs = arrays['data'].reshape(-1, 28, 28)\n",
        "        ys = arrays['target']\n",
        "        print(f'Loading {dataname} into PyTorch DataLoader ... Xs shape: {xs.shape}, Ys shape: {ys.shape}')\n",
        "        # torch.Tensor (the bare Tensor was never imported) converts the arrays to float32\n",
        "        loaders.append(DataLoader(TensorDataset(torch.Tensor(xs), torch.Tensor(ys)), batch_size=batch_size, shuffle=True))\n",
        "\n",
        "    return loaders"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}
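
For reference, a minimal sketch of how the notebook's three steps chain together. It assumes the cells above have been run, so that download, prepare_data, and load_data are defined and Drive is mounted at root_dir; the class list is the one visible in the recorded prepare_data() output, and the split sizes come from the slicing in prepare_data.

# Hypothetical driver for the notebook's three steps.
classes = ['tree', 't-shirt', 'ice cream', 'fish', 'face',
           'car', 'bowtie', 'apple', 'flamingo', 'sheep']

download(classes)   # fetch one .npy bitmap file per class into Dataset/
prepare_data()      # 8400/3600/3000 samples per class -> Train/Validation/Test .npz
train_loader, valid_loader, test_loader = load_data(batch_size=128)

# Each batch is a (batch_size, 28, 28) float tensor of bitmaps and a
# (batch_size,) float tensor of class indices.
for xb, yb in train_loader:
    print(xb.shape, yb.shape)  # torch.Size([128, 28, 28]) torch.Size([128])
    break

With these ten classes, the recorded counts follow directly: 10 × 8400 = 84000 training, 10 × 3600 = 36000 validation, and 10 × 3000 = 30000 test samples.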
