Adding vocabulary and dataset builder files.

andrewthebold · andrewthebold · commit 2084d49e2317 · 2018-02-23T17:17:57.000-08:00
diff --git a/Build Dataset.ipynb b/Build Dataset.ipynb
@@ -0,0 +1,179 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Build Dataset\n",
+    "Split our raw test data into train/dev/test folders.\n",
+    "\n",
+    "We do a split of `80%/10%/10%`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import random\n",
+    "from tqdm import tqdm\n",
+    "from shutil import copyfile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_data_dir = './raw_data/'\n",
+    "assert os.path.isdir(data_dir)\n",
+    "\n",
+    "output_dir = './processed_data/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "filenames = os.listdir(raw_data_dir)\n",
+    "\n",
+    "# Transform filenames into pairs (.gui, .png)\n",
+    "filenames = [(f[:-3] + 'gui', f[:-3] + 'png') for f in filenames if f.endswith('.gui')]\n",
+    "\n",
+    "random.seed(12345)\n",
+    "filenames.sort()\n",
+    "random.shuffle(filenames)\n",
+    "\n",
+    "split_1 = int(0.8 * len(filenames))\n",
+    "split_2 = int(0.9 * len(filenames))\n",
+    "\n",
+    "filenames = {\n",
+    "    'train': filenames[:split_1],\n",
+    "    'dev': filenames[split_1:split_2],\n",
+    "    'test': filenames[split_2:]\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning: output dir ./processed_data/ already exists.\n"
+     ]
+    }
+   ],
+   "source": [
+    "if not os.path.exists(output_dir):\n",
+    "    os.mkdir(output_dir)\n",
+    "else:\n",
+    "    print('Warning: output dir {} already exists.'.format(output_dir))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 20%|█▉        | 275/1400 [00:00<00:00, 2741.09it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning: output dir ./processed_data/data_train already exists.\n",
+      "Processing train data, saving to ./processed_data/data_train.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 1400/1400 [00:00<00:00, 2771.02it/s]\n",
+      "100%|██████████| 175/175 [00:00<00:00, 2749.07it/s]\n",
+      "100%|██████████| 175/175 [00:00<00:00, 2787.35it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning: output dir ./processed_data/data_dev already exists.\n",
+      "Processing dev data, saving to ./processed_data/data_dev.\n",
+      "Warning: output dir ./processed_data/data_test already exists.\n",
+      "Processing test data, saving to ./processed_data/data_test.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "for split in ['train', 'dev', 'test']:\n",
+    "    output_dir_split = os.path.join(output_dir, 'data_{}'.format(split))\n",
+    "    \n",
+    "    if not os.path.exists(output_dir_split):\n",
+    "        os.mkdir(output_dir_split)\n",
+    "    else:\n",
+    "        print('Warning: output dir {} already exists.'.format(output_dir_split))\n",
+    "        \n",
+    "    print('Processing {} data, saving to {}.'.format(split, output_dir_split))\n",
+    "    \n",
+    "    for (gui, png) in tqdm(filenames[split]):\n",
+    "        src_path_gui = os.path.join(raw_data_dir, gui)\n",
+    "        output_path_gui = os.path.join(output_dir_split, gui)\n",
+    "        src_path_png = os.path.join(raw_data_dir, png)\n",
+    "        output_path_png = os.path.join(output_dir_split, png)\n",
+    "        \n",
+    "        copyfile(src_path_gui, output_path_gui)\n",
+    "        copyfile(src_path_png, output_path_png)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Environment (conda_pytorch_p36)",
+   "language": "python",
+   "name": "conda_pytorch_p36"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/bootstrap.vocab b/bootstrap.vocab
@@ -0,0 +1 @@
+, { } small-title text quadruple row btn-inactive btn-orange btn-green btn-red double <START> header btn-active <END> single

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+, { } small-title text quadruple row btn-inactive btn-orange btn-green btn-red double <START> header btn-active <END> single`