
Commit c4ebc15

Author: drox2014
Commit message: final update
1 parent a8fd6b2, commit c4ebc15

11 files changed (+4903, -1198 lines)

.ipynb_checkpoints/SVM-checkpoint.ipynb

Lines changed: 328 additions & 0 deletions
Large diffs are not rendered by default.

.ipynb_checkpoints/Text Classification - FYP-Copy1-checkpoint.ipynb

Lines changed: 870 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 300 additions & 0 deletions
@@ -0,0 +1,300 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "RVB7GyPmE0Fm"
},
"source": [
"## NOTE:\n",
"\n",
"Create a folder named `data` in the files tab and upload the dataset files from https://github.com/uom-cse-realitix/text-classification-fyp/tree/master/data\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "HRu_bwnmEWnL"
},
"source": [
"## NOTE:\n",
"**The TensorFlow and Keras versions should be the same as those on the local machine, or the LSTM model won't work.**\n",
"Check the local versions and use the cells below to change the Colab versions accordingly."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "h2UDlKqN8AFK"
},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'keras'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-1-be96226b35f7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mkeras\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkeras\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'keras'"
]
}
],
"source": [
"import keras\n",
"print(keras.__version__)\n",
"import tensorflow as tf\n",
"print(tf.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "YgK0023c6eeL"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from keras.preprocessing.text import Tokenizer\n",
"from sklearn.model_selection import train_test_split\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.tokenize.treebank import TreebankWordDetokenizer\n",
"from keras.preprocessing.sequence import pad_sequences\n",
"from keras.models import Sequential\n",
"from keras.callbacks import EarlyStopping\n",
"from keras.layers import *\n",
"import matplotlib.pyplot as plt\n",
"from keras.regularizers import l2\n",
"# from keras.utils.vis_utils import plot_model\n",
"from imblearn.over_sampling import SMOTE\n",
"\n",
"import nltk\n",
"import numpy as np\n",
"\n",
"# The maximum number of words to be used. (most frequent)\n",
"from keras.models import load_model\n"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "JUR-9Ewh6fMy"
},
"outputs": [],
"source": [
"\n",
"MAX_NB_WORDS = 50000\n",
"# Max number of words in each complaint.\n",
"MAX_SEQUENCE_LENGTH = 250\n",
"# This is fixed.\n",
"EMBEDDING_DIM = 32\n",
"# Stop words\n",
"stopwords_list = [\"i\", \"me\", \"my\", \"myself\", \"we\", \"our\", \"ours\", \"ourselves\", \"you\", \"your\", \"yours\", \"yourself\",\n",
"                  \"yourselves\", \"he\", \"him\", \"his\", \"himself\", \"she\", \"her\", \"hers\", \"herself\", \"it\", \"its\",\n",
"                  \"itself\", \"they\", \"them\", \"their\", \"theirs\", \"themselves\", \"which\", \"who\", \"whom\", \"these\",\n",
"                  \"those\", \"am\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\", \"have\", \"has\", \"had\", \"having\",\n",
"                  \"do\", \"does\", \"did\", \"doing\", \"a\", \"an\", \"the\", \"and\", \"but\", \"if\", \"or\", \"because\", \"as\",\n",
"                  \"until\", \"while\", \"of\", \"at\", \"by\", \"for\", \"with\", \"against\", \"into\", \"through\", \"during\",\n",
"                  \"before\", \"after\", \"above\", \"below\", \"to\", \"from\", \"up\", \"down\", \"in\", \"out\", \"on\", \"off\", \"over\",\n",
"                  \"under\", \"again\", \"further\", \"then\", \"once\", \"here\", \"there\", \"when\", \"why\", \"how\", \"all\", \"any\",\n",
"                  \"both\", \"each\", \"few\", \"more\", \"most\", \"other\", \"some\", \"such\", \"no\", \"nor\", \"not\", \"only\", \"own\",\n",
"                  \"same\", \"so\", \"than\", \"too\", \"very\", \"s\", \"t\", \"don\", \"should\", \"now\"]\n",
"\n",
"\n",
"def import_and_prepare(filepath):\n",
"    df = pd.read_csv(filepath, names=['sentence', 'operation'], sep=',', engine='python')\n",
"    # df = shuffle(df)\n",
"    sentences = df['sentence'].values\n",
"    y = df['operation'].values\n",
"    return df, sentences, y\n",
"\n",
"\n",
"def filter_stopwords(sentences, stopwords_list):\n",
"    stopwords_set = set(stopwords_list)\n",
"    filtered = []\n",
"    for sentence in sentences:\n",
"        tokenized_sentence = word_tokenize(sentence)\n",
"        filtered_sentence = []\n",
"        for w in tokenized_sentence:\n",
"            if w not in stopwords_set:\n",
"                filtered_sentence.append(w)\n",
"        filtered.append(filtered_sentence)\n",
"    return filtered\n",
"\n",
"\n",
"def detokenize(filtered_sentences):\n",
"    detokenized_sentences = []\n",
"    for sentence in filtered_sentences:\n",
"        detokenized_sentences.append(TreebankWordDetokenizer().detokenize(sentence))\n",
"    return detokenized_sentences\n",
"\n",
"\n",
"def plot_history(history):\n",
"    plt.title('Loss')\n",
"    plt.plot(history.history['loss'], label='train')\n",
"    plt.plot(history.history['val_loss'], label='test')\n",
"    plt.legend()\n",
"    plt.show()\n",
"\n",
"\n",
"def plot_label_distribution(dataframe):\n",
"    dataframe['operation'].value_counts().plot(kind=\"bar\")\n",
"\n",
"\n",
"def init_tokenizer(MAX_NB_WORDS, dataframe):\n",
"    tokenizer = Tokenizer(MAX_NB_WORDS, filters='!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)\n",
"    tokenizer.fit_on_texts(dataframe['filtered_sentence'].values)\n",
"    word_index = tokenizer.word_index\n",
"    print('Found %s unique tokens.' % len(word_index))\n",
"    return tokenizer\n",
"\n",
"\n",
"def create_model(max_words, embedding_dimensions, X):\n",
"    model = Sequential()\n",
"    model.add(Embedding(max_words, embedding_dimensions, input_length=X.shape[1]))\n",
"    model.add(SpatialDropout1D(0.2))\n",
"    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=l2(0.01)))\n",
"    model.add(Dense(3, activation='softmax'))\n",
"    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
"    return model\n",
"\n",
"\n",
"def lstm_train(df, tokenizer, max_sequence_length, embedding_dimensions):\n",
"    X = tokenizer.texts_to_sequences(df['filtered_sentence'].values)\n",
"    X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)\n",
"    print('Shape of data tensor:', X.shape)\n",
"    Y = pd.get_dummies(df['operation']).values\n",
"\n",
"    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)\n",
"\n",
"    # Oversampling the minority class\n",
"    smote = SMOTE('minority')\n",
"    X_train, Y_train = smote.fit_sample(X_train, Y_train)\n",
"\n",
"    model = create_model(max_sequence_length, embedding_dimensions, X)\n",
"    epochs = 150\n",
"    batch_size = 100\n",
"    history = model.fit(X_train, Y_train,\n",
"                        epochs=epochs, batch_size=batch_size,\n",
"                        validation_split=0.1,\n",
"                        callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])\n",
"\n",
"    accr = model.evaluate(X_test, Y_test)\n",
"    print(model.summary())\n",
"    print('Test set\\n Loss: {:0.3f}\\n Accuracy: {:0.3f}'.format(accr[0], accr[1]))\n",
"    # plot_model(model, to_file='model.png')\n",
"    return model, history\n",
"\n",
"def infer(sentence, tokenizer, model):\n",
"    sentence_as_array = [sentence]\n",
"    filtered_commands = filter_stopwords(sentence_as_array, stopwords_list)\n",
"    seq = tokenizer.texts_to_sequences(filtered_commands)\n",
"    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)\n",
"    pred = model.predict(padded)\n",
"    return pred\n",
"\n",
"def pre_initialize():\n",
"    df, sentences, y = import_and_prepare('data/dataset_new.txt')\n",
"    # df_temp, sentences_temp, y_temp = import_and_prepare('data/dataset_new.txt')\n",
"    plot_label_distribution(df)\n",
"    filtered_sentences = filter_stopwords(sentences, stopwords_list)\n",
"    detokenized_sentences = detokenize(filtered_sentences)\n",
"    df['filtered_sentence'] = detokenized_sentences\n",
"    tokenizer = init_tokenizer(MAX_NB_WORDS, df)\n",
"    return df, tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "V1tgQvAm6prk"
},
"outputs": [],
"source": [
"# df, sentences, y = import_and_prepare('data/dataset.txt')\n",
"nltk.download('punkt')\n",
"\n",
"df, tokenizer = pre_initialize()\n",
"model, history = lstm_train(df, tokenizer, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH)\n",
"model.save('lstm.h5')\n",
"# plot_history(history)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "lUYDRHX26voC"
},
"outputs": [],
"source": [
"model = load_model('./lstm.h5')\n",
"new_command = ['Track the pen']\n",
"filtered_commands = filter_stopwords(new_command, stopwords_list)\n",
"seq = tokenizer.texts_to_sequences(filtered_commands)\n",
"padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)\n",
"pred = model.predict(padded)\n",
"\n",
"labels = ['Locate', 'Describe', 'No_Op']\n",
"print(\"Predicted vector: \", pred, \" Predicted Class: \", labels[np.argmax(pred)])"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "K-i_QcHtHqG7"
},
"outputs": [],
"source": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "Text Classifier",
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
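
Setup note: the first markdown cell asks for a `data` folder to be created manually in the Colab files tab, with the dataset files uploaded from https://github.com/uom-cse-realitix/text-classification-fyp/tree/master/data. A minimal sketch of doing the same from a notebook cell, assuming the repository is public and its `data` directory layout is unchanged, is:

!git clone https://github.com/uom-cse-realitix/text-classification-fyp.git
!cp -r text-classification-fyp/data ./data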
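
Version note: the second markdown cell warns that the Colab TensorFlow and Keras versions must match the machine that saved lstm.h5, but the committed cells only print the installed versions. A minimal sketch of pinning them (the version numbers below are placeholders, not taken from this repository; substitute the versions printed locally) is:

!pip install tensorflow==1.15.0 keras==2.2.5
# Restart the Colab runtime after installing, then re-check:
import keras
import tensorflow as tf
print(keras.__version__, tf.__version__)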
