uom-cse-realitix
diff --git a/‎.ipynb_checkpoints/SVM-checkpoint.ipynb
Lines changed: 328 additions & 0 deletions b/‎.ipynb_checkpoints/SVM-checkpoint.ipynb
Lines changed: 328 additions & 0 deletions
diff --git a/‎.ipynb_checkpoints/Text Classification - FYP-Copy1-checkpoint.ipynb
Lines changed: 870 additions & 0 deletions b/‎.ipynb_checkpoints/Text Classification - FYP-Copy1-checkpoint.ipynb
Lines changed: 870 additions & 0 deletions
diff --git a/‎.ipynb_checkpoints/Text_Classifier-checkpoint.ipynb
Lines changed: 300 additions & 0 deletions b/‎.ipynb_checkpoints/Text_Classifier-checkpoint.ipynb
Lines changed: 300 additions & 0 deletions
@@ -0,0 +1,300 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "RVB7GyPmE0Fm"
+   },
+   "source": [
+    "## NOTE:\n",
+    "\n",
+    "Create a folder named `data` in the files tab and upload the dataset files from https://github.com/uom-cse-realitix/text-classification-fyp/tree/master/data\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "HRu_bwnmEWnL"
+   },
+   "source": [
+    "## NOTE:\n",
+    "**Tensorflow and Keras versions should same as that in local machine or lstm model won't work.**\n",
+    "Check local versions and use the below cells to change Colab versions accordingly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "h2UDlKqN8AFK"
+   },
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'keras'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-1-be96226b35f7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mkeras\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkeras\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'keras'"
+     ]
+    }
+   ],
+   "source": [
+    "import keras\n",
+    "print(keras.__version__)\n",
+    "import tensorflow as tf\n",
+    "print(tf.__version__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "YgK0023c6eeL"
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from keras.preprocessing.text import Tokenizer\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "from nltk.tokenize.treebank import TreebankWordDetokenizer\n",
+    "from keras.preprocessing.sequence import pad_sequences\n",
+    "from keras.models import Sequential\n",
+    "from keras.callbacks import EarlyStopping\n",
+    "from keras.layers import *\n",
+    "import matplotlib.pyplot as plt\n",
+    "from keras.regularizers import l2\n",
+    "# from keras.utils.vis_utils import plot_model\n",
+    "from imblearn.over_sampling import SMOTE\n",
+    "\n",
+    "import nltk\n",
+    "import numpy as np\n",
+    "\n",
+    "# The maximum number of words to be used. (most frequent)\n",
+    "from keras.models import load_model\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "JUR-9Ewh6fMy"
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "MAX_NB_WORDS = 50000\n",
+    "# Max number of words in each complaint.\n",
+    "MAX_SEQUENCE_LENGTH = 250\n",
+    "# This is fixed.\n",
+    "EMBEDDING_DIM = 32\n",
+    "# Stop words\n",
+    "stopwords_list = [\"i\", \"me\", \"my\", \"myself\", \"we\", \"our\", \"ours\", \"ourselves\", \"you\", \"your\", \"yours\", \"yourself\",\n",
+    "                  \"yourselves\", \"he\", \"him\", \"his\", \"himself\", \"she\", \"her\", \"hers\", \"herself\", \"it\", \"its\",\n",
+    "                  \"itself\", \"they\", \"them\", \"their\", \"theirs\", \"themselves\", \"which\", \"who\", \"whom\", \"these\",\n",
+    "                  \"those\", \"am\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\", \"have\", \"has\", \"had\", \"having\",\n",
+    "                  \"do\", \"does\", \"did\", \"doing\", \"a\", \"an\", \"the\", \"and\", \"but\", \"if\", \"or\", \"because\", \"as\",\n",
+    "                  \"until\", \"while\", \"of\", \"at\", \"by\", \"for\", \"with\", \"against\", \"into\", \"through\", \"during\",\n",
+    "                  \"before\", \"after\", \"above\", \"below\", \"to\", \"from\", \"up\", \"down\", \"in\", \"out\", \"on\", \"off\", \"over\",\n",
+    "                  \"under\", \"again\", \"further\", \"then\", \"once\", \"here\", \"there\", \"when\", \"why\", \"how\", \"all\", \"any\",\n",
+    "                  \"both\", \"each\", \"few\", \"more\", \"most\", \"other\", \"some\", \"such\", \"no\", \"nor\", \"not\", \"only\", \"own\",\n",
+    "                  \"same\", \"so\", \"than\", \"too\", \"very\", \"s\", \"t\", \"don\", \"should\", \"now\"]\n",
+    "\n",
+    "\n",
+    "def import_and_prepare(filepath):\n",
+    "    df = pd.read_csv(filepath, names=['sentence', 'operation'], sep=',', engine='python')\n",
+    "    # df = shuffle(df)\n",
+    "    sentences = df['sentence'].values\n",
+    "    y = df['operation'].values\n",
+    "    return df, sentences, y\n",
+    "\n",
+    "\n",
+    "def filter_stopwords(sentences, stopwords_list):\n",
+    "    stopwords_set = set(stopwords_list)\n",
+    "    filtered = []\n",
+    "    for sentence in sentences:\n",
+    "        tokenized_sentence = word_tokenize(sentence)\n",
+    "        filtered_sentence = []\n",
+    "        for w in tokenized_sentence:\n",
+    "            if w not in stopwords_set:\n",
+    "                filtered_sentence.append(w)\n",
+    "        filtered.append(filtered_sentence)\n",
+    "    return filtered\n",
+    "\n",
+    "\n",
+    "def detokenize(filtered_sentences):\n",
+    "    detokenized_sentences = []\n",
+    "    for sentence in filtered_sentences:\n",
+    "        detokenized_sentences.append(TreebankWordDetokenizer().detokenize(sentence))\n",
+    "    return detokenized_sentences\n",
+    "\n",
+    "\n",
+    "def plot_history(history):\n",
+    "    plt.title('Loss')\n",
+    "    plt.plot(history.history['loss'], label='train')\n",
+    "    plt.plot(history.history['val_loss'], label='test')\n",
+    "    plt.legend()\n",
+    "    plt.show()\n",
+    "\n",
+    "\n",
+    "def plot_label_distribution(dataframe):\n",
+    "    dataframe['operation'].value_counts().plot(kind=\"bar\")\n",
+    "\n",
+    "\n",
+    "def init_tokenizer(MAX_NB_WORDS, dataframe):\n",
+    "    tokenizer = Tokenizer(MAX_NB_WORDS, filters='!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)\n",
+    "    tokenizer.fit_on_texts(dataframe['filtered_sentence'].values)\n",
+    "    word_index = tokenizer.word_index\n",
+    "    print('Found %s unique tokens.' % len(word_index))\n",
+    "    return tokenizer\n",
+    "\n",
+    "\n",
+    "def create_model(max_words, embedding_dimensions, X):\n",
+    "    model = Sequential()\n",
+    "    model.add(Embedding(max_words, embedding_dimensions, input_length=X.shape[1]))\n",
+    "    model.add(SpatialDropout1D(0.2))\n",
+    "    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=l2(0.01)))\n",
+    "    model.add(Dense(3, activation='softmax'))\n",
+    "    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "def lstm_train(df, tokenizer, max_sequence_length, embedding_dimensions):\n",
+    "    X = tokenizer.texts_to_sequences(df['filtered_sentence'].values)\n",
+    "    X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)\n",
+    "    print('Shape of data tensor:', X.shape)\n",
+    "    Y = pd.get_dummies(df['operation']).values\n",
+    "\n",
+    "    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)\n",
+    "\n",
+    "    # Oversampling the minority class\n",
+    "    smote = SMOTE('minority')\n",
+    "    X_train, Y_train = smote.fit_sample(X_train, Y_train)\n",
+    "\n",
+    "    model = create_model(max_sequence_length, embedding_dimensions, X)\n",
+    "    epochs = 150\n",
+    "    batch_size = 100\n",
+    "    history = model.fit(X_train, Y_train,\n",
+    "                        epochs=epochs, batch_size=batch_size,\n",
+    "                        validation_split=0.1,\n",
+    "                        callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])\n",
+    "\n",
+    "    accr = model.evaluate(X_test, Y_test)\n",
+    "    print(model.summary())\n",
+    "    print('Test set\\n  Loss: {:0.3f}\\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))\n",
+    "    # plot_model(model, to_file='model.png')\n",
+    "    return model, history\n",
+    "\n",
+    "def infer(sentence, tokenizer, model):\n",
+    "    sentence_as_array = [sentence]\n",
+    "    filtered_commands = filter_stopwords(sentence_as_array, stopwords_list)\n",
+    "    seq = tokenizer.texts_to_sequences(filtered_commands)\n",
+    "    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)\n",
+    "    pred = model.predict(padded)\n",
+    "    return pred\n",
+    "\n",
+    "def pre_initialize():\n",
+    "    df, sentences, y = import_and_prepare('data/dataset_new.txt')\n",
+    "    # df_temp, sentences_temp, y_temp = import_and_prepare('data/dataset_new.txt')\n",
+    "    plot_label_distribution(df)\n",
+    "    filtered_sentences = filter_stopwords(sentences, stopwords_list)\n",
+    "    detokenized_sentences = detokenize(filtered_sentences)\n",
+    "    df['filtered_sentence'] = detokenized_sentences\n",
+    "    tokenizer = init_tokenizer(MAX_NB_WORDS, df)\n",
+    "    return df, tokenizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "V1tgQvAm6prk"
+   },
+   "outputs": [],
+   "source": [
+    "# df, sentences, y = import_and_prepare('data/dataset.txt')\n",
+    "nltk.download('punkt')\n",
+    "\n",
+    "df, tokenizer = pre_initialize()\n",
+    "model, history = lstm_train(df, tokenizer, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH)\n",
+    "model.save('lstm.h5')\n",
+    "# plot_history(history)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "lUYDRHX26voC"
+   },
+   "outputs": [],
+   "source": [
+    "model = load_model('./lstm.h5')\n",
+    "new_command = ['Track the pen']\n",
+    "filtered_commands = filter_stopwords(new_command, stopwords_list)\n",
+    "seq = tokenizer.texts_to_sequences(filtered_commands)\n",
+    "padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)\n",
+    "pred = model.predict(padded)\n",
+    "\n",
+    "labels = ['Locate', 'Describe', 'No_Op']\n",
+    "print(\"Predicted vector: \", pred, \" Predicted Class: \", labels[np.argmax(pred)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "K-i_QcHtHqG7"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "name": "Text Classifier",
+   "provenance": [],
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}