Merge pull request #55 from MahimaRamireddy/main

sanjay-kv · web-flow · commit 0e5bad4c37f7 · 2024-05-14T16:49:10.000+10:00
Addition of sentiment analysis function
diff --git a/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb b/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb
@@ -0,0 +1,107 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.cluster import KMeans\n",
+    "from sklearn.metrics import silhouette_score\n",
+    "from nltk.sentiment import SentimentIntensityAnalyzer\n",
+    "\n",
+    "# Load NLTK's VADER sentiment analyzer (requires the 'vader_lexicon' resource, e.g. nltk.download('vader_lexicon'))\n",
+    "sid = SentimentIntensityAnalyzer()\n",
+    "\n",
+    "data = pd.read_csv('Product listing.csv')\n",
+    "\n",
+    "# Data preprocessing\n",
+    "def preprocess_text(text):\n",
+    "    # Convert text to lowercase\n",
+    "    text = text.lower()\n",
+    "    # Tokenization can be done using regex or libraries like NLTK or spaCy\n",
+    "    # Here, a simple split by space is used\n",
+    "    tokens = text.split()\n",
+    "    # Remove stopwords using a small hard-coded English list (swap in a fuller list, e.g. NLTK's, if needed)\n",
+    "    stopwords = set(['the', 'and', 'is', 'in', 'to', 'it', 'this', 'of', 'for', 'with', 'as'])\n",
+    "    tokens = [token for token in tokens if token not in stopwords]\n",
+    "    return ' '.join(tokens)\n",
+    "\n",
+    "data['clean_text'] = data['product'].apply(preprocess_text)\n",
+    "\n",
+    "# TF-IDF vectorization\n",
+    "tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed\n",
+    "tfidf_matrix = tfidf_vectorizer.fit_transform(data['clean_text'])\n",
+    "\n",
+    "# Clustering with K-means\n",
+    "k = 5  # Number of clusters (you can adjust this)\n",
+    "kmeans = KMeans(n_clusters=k, random_state=42)\n",
+    "kmeans.fit(tfidf_matrix)\n",
+    "\n",
+    "# Assign cluster labels to each product listing\n",
+    "data['cluster_label'] = kmeans.labels_\n",
+    "\n",
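+    "# Sentiment analysis (applied below to the lowercased 'clean_text' column; VADER also weighs capitalization, so scoring the raw 'product' text may differ)\n",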
+    "def get_sentiment(text):\n",
+    "    # Score the text with NLTK's VADER analyzer; the result dict has 'neg', 'neu', 'pos' and 'compound' keys\n",
+    "    sentiment_scores = sid.polarity_scores(text)\n",
+    "    # Classify by the compound score (range -1 to 1) using the conventional +/-0.05 VADER thresholds\n",
+    "    if sentiment_scores['compound'] >= 0.05:\n",
+    "        return 'Positive'\n",
+    "    elif sentiment_scores['compound'] <= -0.05:\n",
+    "        return 'Negative'\n",
+    "    else:\n",
+    "        return 'Neutral'\n",
+    "    \n",
+    "data['sentiment'] = data['clean_text'].apply(get_sentiment)\n",
+    "\n",
+    "\n",
+    "# Evaluate clustering using silhouette score\n",
+    "silhouette_avg = silhouette_score(tfidf_matrix, kmeans.labels_)\n",
+    "print(f\"Silhouette Score: {silhouette_avg}\")\n",
+    "\n",
+    "# Display some product listings from each cluster\n",
+    "for cluster_id in range(k):\n",
+    "    cluster_samples = data[data['cluster_label'] == cluster_id].sample(5)  # Show 5 random samples per cluster (raises ValueError if a cluster has fewer than 5 rows)\n",
+    "    print(f\"\\nCluster {cluster_id}:\")\n",
+    "    for index, row in cluster_samples.iterrows():\n",
+    "        print(row['product'])\n",
+    "        print(\"Sentiment:\", row['sentiment'])\n",
+    "        print('-' * 50)\n",
+    "\n",
+    "# You can further analyze the clusters and refine the process as needed\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/clustering movie review.ipynb b/clustering movie review.ipynb
@@ -2,83 +2,19 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Silhouette Score: 0.057004055728191866\n",
-      "\n",
-      "Cluster 0:\n",
-      "Acer ED322QR 31.5 Inch (80.01 cm) Full HD Curved VA Backlit LED Monitor I 144Hz Refresh Rate I Zero Frame I AMD Free Sync I Eye Care Features I Stereo Speakers\n",
-      "--------------------------------------------------\n",
-      "HP 3.1 USB HP 32 GB Flash Drive\n",
-      "--------------------------------------------------\n",
-      "Logitech MX Anywhere 3 Compact Performance Mouse – Wireless, Magnetic Scrolling, Ergonomic, 4000DPI Sensor, Custom Buttons, USB-C, Bluetooth, Apple Mac, iPad, Windows PC, Linux, Chrome - Graphite\n",
-      "--------------------------------------------------\n",
-      "SanDisk Cruzer Blade 32GB USB Flash Drive\n",
-      "--------------------------------------------------\n",
-      "APLT-Portable Slim Wireless Mouse for Laptops 2.4Ghz Silent Wireless Optical Mouse for Laptop, Desktop ( White)\n",
-      "--------------------------------------------------\n",
-      "\n",
-      "Cluster 1:\n",
-      "Zebronics Zeb-Corolla In Ear Wired Earphone with Mic, 3.5mm Jack, 1.2 Meter Cable, Multi Function Button\n",
-      "--------------------------------------------------\n",
-      "MINISO We Bare Bears in-Ear Wired Headphones with Microphone, Comfortable Earbuds Cute Earphones for Mobile Smartphones Apple Xiaomi Realme Oppo Samsung and More - Brown\n",
-      "--------------------------------------------------\n",
-      "pTron Tangent Evo with 14Hrs Playback, Bluetooth 5.0 Wireless Headphones with Deep Bass, IPX4 Water Resistance, Ergonomic & Snug-fit, Voice Assistance, Magnetic Earbuds & Built-in HD Mic (Black)\n",
-      "--------------------------------------------------\n",
-      "Ambrane Dots 38 True Wireless Earbuds TWS with Pure HD Bass, 16H Playtime, IPX4 Waterproof, Responsive Touch Sensors for Multifunctions, Compact Type-C Charging Case (Green), Normal\n",
-      "--------------------------------------------------\n",
-      "Peripage A6 203dpi Thermal Label Printer Inkless Pocket Printer Bluetooth Connection Office Assistant/Life Helper DIY Printing Travel Recorder for iOS/Android/Windows\n",
-      "--------------------------------------------------\n",
-      "\n",
-      "Cluster 2:\n",
-      "AVITA LIBER V NS14A8INF542-CS Thin and Light 14 inch (35.56cm) Laptop( Intel Core i5-10210U/ 8GB/256GB SSD /Win 10 Home/ Backlit Keyboard/ Fingerprint Sensor/ MSO 365) 1.28kg, Cloud Silver\n",
-      "--------------------------------------------------\n",
-      "(Renewed) HP ProBook 7th Gen Core i5 Laptop, 16 GB RAM, 240GB NVME SSD, Intel HD Graphics, 15.6 inch (39.62 cms) FHD Screen, Win 10, MS Office, Backlit Keyboard, Fingerprint sensor, Black\n",
-      "--------------------------------------------------\n",
-      "CHIST Gaming Desktop Intel Core i5 8GB,GT 710 2GB Graphic Card, 19 Full HD Monitor, Keyboard Mouse, Wi-Fi Ready to Play (120GB SSD 1TB HDD)\n",
-      "--------------------------------------------------\n",
-      "(Renewed) Lenovo ThinkCenter M58 19-inch (48.26 cm) Desktop (Intel Core2 Duo 4 GB 500 GB HDD Windows 7 Professional MS Office), Black\n",
-      "--------------------------------------------------\n",
-      "Lenovo ThinkBook 15 Intel 11th Gen Core i5 15.6\" (39.62 cm) FHD IPS 300 nits Antiglare 100% sRGB Thin and Light Laptop (16GB/1TB HDD+128GB SSD/Windows 10/MS Office/Mineral Grey/1.7 Kg), 20VEA0HKIH\n",
-      "--------------------------------------------------\n",
-      "\n",
-      "Cluster 3:\n",
-      "Mi 80 cm (32 inches) Horizon Edition HD Ready Android Smart LED TV 4A|L32M6-EI (Grey)\n",
-      "--------------------------------------------------\n",
-      "Foxsky 127 cm (50 inches) 4K Ultra HD Smart LED TV 50FS-VS (Black) (2021 Model) | With Voice Assistant\n",
-      "--------------------------------------------------\n",
-      "Kevin 80 cm (32 Inches) HD Ready Smart LED TV KN32A (Black) (2021 Model) | With Alexa Built-in\n",
-      "--------------------------------------------------\n",
-      "Samsung 108 cm (43 inches) 4K Ultra HD Smart QLED TV QA43Q60AAKLXL (Black) (2021 Model)\n",
-      "--------------------------------------------------\n",
-      "eAirtec 60 cm (24 Inches) HD Ready Smart Android LED TV 24DJSmart (Black) (2021 Model)\n",
-      "--------------------------------------------------\n",
-      "\n",
-      "Cluster 4:\n",
-      "Ovista- 10000mAH Digital Display Power Bank with inbuilt 4 in 1 Cable USB Input Port with Fast Charging 10000mAh Slim Power Bank with 5V/2A Fast Charging (Model-PRB035)- Black\n",
-      "--------------------------------------------------\n",
-      "Ambrane 5000mAh Li-Polymer Powerbank with Fast Charging & Compact Size (PP-501, Pink)\n",
-      "--------------------------------------------------\n",
-      "URBN 20000mAh Li-Polymer Ultra Compact Type-C Power Bank with 12W Fast Charge, Type C & Micro Input (Black)\n",
-      "--------------------------------------------------\n",
-      "Conekt 10000mAh Li-Polymer Powerbank Zeal Proton Pro (White)\n",
-      "--------------------------------------------------\n",
-      "Zeal PL-10000 10400mAh Power Bank\n",
-      "--------------------------------------------------\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "\n",
     "import pandas as pd\n",
     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
     "from sklearn.cluster import KMeans\n",
     "from sklearn.metrics import silhouette_score\n",
+    "from nltk.sentiment import SentimentIntensityAnalyzer\n",
+    "\n",
+    "# Load NLTK's VADER sentiment analyzer (requires the 'vader_lexicon' resource, e.g. nltk.download('vader_lexicon'))\n",
+    "sid = SentimentIntensityAnalyzer()\n",
     "\n",
     "data = pd.read_csv('Product listing.csv')\n",
     "\n",
@@ -108,6 +44,21 @@
     "# Assign cluster labels to each review\n",
     "data['cluster_label'] = kmeans.labels_\n",
     "\n",
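+    "# Sentiment analysis (applied below to the lowercased 'clean_text' column; VADER also weighs capitalization, so scoring the raw 'product' text may differ)\n",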
+    "def get_sentiment(text):\n",
+    "    # Score the text with NLTK's VADER analyzer; the result dict has 'neg', 'neu', 'pos' and 'compound' keys\n",
+    "    sentiment_scores = sid.polarity_scores(text)\n",
+    "    # Classify by the compound score (range -1 to 1) using the conventional +/-0.05 VADER thresholds\n",
+    "    if sentiment_scores['compound'] >= 0.05:\n",
+    "        return 'Positive'\n",
+    "    elif sentiment_scores['compound'] <= -0.05:\n",
+    "        return 'Negative'\n",
+    "    else:\n",
+    "        return 'Neutral'\n",
+    "    \n",
+    "data['sentiment'] = data['clean_text'].apply(get_sentiment)\n",
+    "\n",
+    "\n",
     "# Evaluate clustering using silhouette score\n",
     "silhouette_avg = silhouette_score(tfidf_matrix, kmeans.labels_)\n",
     "print(f\"Silhouette Score: {silhouette_avg}\")\n",
@@ -118,6 +69,7 @@
     "    print(f\"\\nCluster {cluster_id}:\")\n",
     "    for index, row in cluster_samples.iterrows():\n",
     "        print(row['product'])\n",
+    "        print(\"Sentiment:\", row['sentiment'])\n",
     "        print('-' * 50)\n",
     "\n",
     "# You can further analyze the clusters and refine the process as needed\n"
@@ -126,7 +78,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -140,7 +92,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,