Skip to content
This repository was archived by the owner on Dec 17, 2020. It is now read-only.

Commit 2084d49

Browse files
committed
Adding vocabulary and dataset builder files.
1 parent 77fb7da commit 2084d49

File tree

2 files changed

+180
-0
lines changed

2 files changed

+180
-0
lines changed

Build Dataset.ipynb

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Build Dataset\n",
8+
"Split our raw test data into train/dev/test folders.\n",
9+
"\n",
10+
"We do a split of `80%/10%/10%`"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": 69,
16+
"metadata": {},
17+
"outputs": [],
18+
"source": [
19+
"import os\n",
20+
"import random\n",
21+
"from tqdm import tqdm\n",
22+
"from shutil import copyfile"
23+
]
24+
},
25+
{
26+
"cell_type": "code",
27+
"execution_count": 70,
28+
"metadata": {},
29+
"outputs": [],
30+
"source": [
31+
"raw_data_dir = './raw_data/'\n",
32+
"assert os.path.isdir(data_dir)\n",
33+
"\n",
34+
"output_dir = './processed_data/'"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": 71,
40+
"metadata": {},
41+
"outputs": [],
42+
"source": [
43+
"filenames = os.listdir(raw_data_dir)\n",
44+
"\n",
45+
"# Transform filenames into pairs (.gui, .png)\n",
46+
"filenames = [(f[:-3] + 'gui', f[:-3] + 'png') for f in filenames if f.endswith('.gui')]\n",
47+
"\n",
48+
"random.seed(12345)\n",
49+
"filenames.sort()\n",
50+
"random.shuffle(filenames)\n",
51+
"\n",
52+
"split_1 = int(0.8 * len(filenames))\n",
53+
"split_2 = int(0.9 * len(filenames))\n",
54+
"\n",
55+
"filenames = {\n",
56+
" 'train': filenames[:split_1],\n",
57+
" 'dev': filenames[split_1:split_2],\n",
58+
" 'test': filenames[split_2:]\n",
59+
"}"
60+
]
61+
},
62+
{
63+
"cell_type": "code",
64+
"execution_count": 72,
65+
"metadata": {},
66+
"outputs": [
67+
{
68+
"name": "stdout",
69+
"output_type": "stream",
70+
"text": [
71+
"Warning: output dir ./processed_data/ already exists.\n"
72+
]
73+
}
74+
],
75+
"source": [
76+
"if not os.path.exists(output_dir):\n",
77+
" os.mkdir(output_dir)\n",
78+
"else:\n",
79+
" print('Warning: output dir {} already exists.'.format(output_dir))"
80+
]
81+
},
82+
{
83+
"cell_type": "code",
84+
"execution_count": 73,
85+
"metadata": {},
86+
"outputs": [
87+
{
88+
"name": "stderr",
89+
"output_type": "stream",
90+
"text": [
91+
" 20%|█▉ | 275/1400 [00:00<00:00, 2741.09it/s]"
92+
]
93+
},
94+
{
95+
"name": "stdout",
96+
"output_type": "stream",
97+
"text": [
98+
"Warning: output dir ./processed_data/data_train already exists.\n",
99+
"Processing train data, saving to ./processed_data/data_train.\n"
100+
]
101+
},
102+
{
103+
"name": "stderr",
104+
"output_type": "stream",
105+
"text": [
106+
"100%|██████████| 1400/1400 [00:00<00:00, 2771.02it/s]\n",
107+
"100%|██████████| 175/175 [00:00<00:00, 2749.07it/s]\n",
108+
"100%|██████████| 175/175 [00:00<00:00, 2787.35it/s]"
109+
]
110+
},
111+
{
112+
"name": "stdout",
113+
"output_type": "stream",
114+
"text": [
115+
"Warning: output dir ./processed_data/data_dev already exists.\n",
116+
"Processing dev data, saving to ./processed_data/data_dev.\n",
117+
"Warning: output dir ./processed_data/data_test already exists.\n",
118+
"Processing test data, saving to ./processed_data/data_test.\n"
119+
]
120+
},
121+
{
122+
"name": "stderr",
123+
"output_type": "stream",
124+
"text": [
125+
"\n"
126+
]
127+
}
128+
],
129+
"source": [
130+
"for split in ['train', 'dev', 'test']:\n",
131+
" output_dir_split = os.path.join(output_dir, 'data_{}'.format(split))\n",
132+
" \n",
133+
" if not os.path.exists(output_dir_split):\n",
134+
" os.mkdir(output_dir_split)\n",
135+
" else:\n",
136+
" print('Warning: output dir {} already exists.'.format(output_dir_split))\n",
137+
" \n",
138+
" print('Processing {} data, saving to {}.'.format(split, output_dir_split))\n",
139+
" \n",
140+
" for (gui, png) in tqdm(filenames[split]):\n",
141+
" src_path_gui = os.path.join(raw_data_dir, gui)\n",
142+
" output_path_gui = os.path.join(output_dir_split, gui)\n",
143+
" src_path_png = os.path.join(raw_data_dir, png)\n",
144+
" output_path_png = os.path.join(output_dir_split, png)\n",
145+
" \n",
146+
" copyfile(src_path_gui, output_path_gui)\n",
147+
" copyfile(src_path_png, output_path_png)"
148+
]
149+
},
150+
{
151+
"cell_type": "code",
152+
"execution_count": null,
153+
"metadata": {},
154+
"outputs": [],
155+
"source": []
156+
}
157+
],
158+
"metadata": {
159+
"kernelspec": {
160+
"display_name": "Environment (conda_pytorch_p36)",
161+
"language": "python",
162+
"name": "conda_pytorch_p36"
163+
},
164+
"language_info": {
165+
"codemirror_mode": {
166+
"name": "ipython",
167+
"version": 3
168+
},
169+
"file_extension": ".py",
170+
"mimetype": "text/x-python",
171+
"name": "python",
172+
"nbconvert_exporter": "python",
173+
"pygments_lexer": "ipython3",
174+
"version": "3.6.4"
175+
}
176+
},
177+
"nbformat": 4,
178+
"nbformat_minor": 2
179+
}

bootstrap.vocab

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
, { } small-title text quadruple row btn-inactive btn-orange btn-green btn-red double <START> header btn-active <END> single

0 commit comments

Comments
 (0)