Skip to content

Commit 0e5bad4

Browse files
authored
Merge pull request #55 from MahimaRamireddy/main
Addition of sentiment analysis function
2 parents 564ef59 + ce2b1ad commit 0e5bad4

File tree

2 files changed

+131
-72
lines changed

2 files changed

+131
-72
lines changed
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"\n",
10+
"import pandas as pd\n",
11+
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
12+
"from sklearn.cluster import KMeans\n",
13+
"from sklearn.metrics import silhouette_score\n",
14+
"from nltk.sentiment import SentimentIntensityAnalyzer\n",
15+
"\n",
16+
"# Load NLTK's VADER sentiment analyzer (needs the 'vader_lexicon' resource: nltk.download('vader_lexicon'))\n",
17+
"sid = SentimentIntensityAnalyzer()\n",
18+
"\n",
19+
"data = pd.read_csv('Product listing.csv')\n",
20+
"\n",
21+
"# Data preprocessing\n",
22+
"def preprocess_text(text):\n",
23+
" # Convert text to lowercase\n",
24+
" text = text.lower()\n",
25+
" # Tokenization can be done using regex or libraries like NLTK or spaCy\n",
26+
" # Here, a simple split by space is used\n",
27+
" tokens = text.split()\n",
28+
" # Remove stopwords using the small hard-coded English list below (swap in a fuller list, e.g. NLTK's, if needed)\n",
29+
" stopwords = set(['the', 'and', 'is', 'in', 'to', 'it', 'this', 'of', 'for', 'with', 'as'])\n",
30+
" tokens = [token for token in tokens if token not in stopwords]\n",
31+
" return ' '.join(tokens)\n",
32+
"\n",
33+
"data['clean_text'] = data['product'].apply(preprocess_text)\n",
34+
"\n",
35+
"# TF-IDF vectorization\n",
36+
"tfidf_vectorizer = TfidfVectorizer(max_features=1000) # You can adjust max_features as needed\n",
37+
"tfidf_matrix = tfidf_vectorizer.fit_transform(data['clean_text'])\n",
38+
"\n",
39+
"# Clustering with K-means\n",
40+
"k = 5 # Number of clusters (you can adjust this)\n",
41+
"kmeans = KMeans(n_clusters=k, random_state=42)\n",
42+
"kmeans.fit(tfidf_matrix)\n",
43+
"\n",
44+
"# Assign a cluster label to each product listing\n",
45+
"data['cluster_label'] = kmeans.labels_\n",
46+
"\n",
47+
"# Sentiment Analysis\n",
48+
"def get_sentiment(text):\n",
49+
" # NLTK's sentiment analyzer\n",
50+
" sentiment_scores = sid.polarity_scores(text)\n",
51+
" # Classify sentiment based on compound score\n",
52+
" if sentiment_scores['compound'] >= 0.05:\n",
53+
" return 'Positive'\n",
54+
" elif sentiment_scores['compound'] <= -0.05:\n",
55+
" return 'Negative'\n",
56+
" else:\n",
57+
" return 'Neutral'\n",
58+
" \n",
59+
"data['sentiment'] = data['clean_text'].apply(get_sentiment)\n",
60+
"\n",
61+
"\n",
62+
"# Evaluate clustering using silhouette score\n",
63+
"silhouette_avg = silhouette_score(tfidf_matrix, kmeans.labels_)\n",
64+
"print(f\"Silhouette Score: {silhouette_avg}\")\n",
65+
"\n",
66+
"# Display 5 random product listings from each cluster (sample(5) raises ValueError if a cluster has fewer than 5 rows)\n",
67+
"for cluster_id in range(k):\n",
68+
" cluster_samples = data[data['cluster_label'] == cluster_id].sample(5) # Displaying 5 samples per cluster\n",
69+
" print(f\"\\nCluster {cluster_id}:\")\n",
70+
" for index, row in cluster_samples.iterrows():\n",
71+
" print(row['product'])\n",
72+
" print(\"Sentiment:\", row['sentiment'])\n",
73+
" print('-' * 50)\n",
74+
"\n",
75+
"# You can further analyze the clusters and refine the process as needed\n"
76+
]
77+
},
78+
{
79+
"cell_type": "code",
80+
"execution_count": null,
81+
"metadata": {},
82+
"outputs": [],
83+
"source": []
84+
}
85+
],
86+
"metadata": {
87+
"kernelspec": {
88+
"display_name": "Python 3 (ipykernel)",
89+
"language": "python",
90+
"name": "python3"
91+
},
92+
"language_info": {
93+
"codemirror_mode": {
94+
"name": "ipython",
95+
"version": 3
96+
},
97+
"file_extension": ".py",
98+
"mimetype": "text/x-python",
99+
"name": "python",
100+
"nbconvert_exporter": "python",
101+
"pygments_lexer": "ipython3",
102+
"version": "3.11.4"
103+
}
104+
},
105+
"nbformat": 4,
106+
"nbformat_minor": 2
107+
}

clustering movie review.ipynb

Lines changed: 24 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -2,83 +2,19 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": null,
66
"metadata": {},
7-
"outputs": [
8-
{
9-
"name": "stdout",
10-
"output_type": "stream",
11-
"text": [
12-
"Silhouette Score: 0.057004055728191866\n",
13-
"\n",
14-
"Cluster 0:\n",
15-
"Acer ED322QR 31.5 Inch (80.01 cm) Full HD Curved VA Backlit LED Monitor I 144Hz Refresh Rate I Zero Frame I AMD Free Sync I Eye Care Features I Stereo Speakers\n",
16-
"--------------------------------------------------\n",
17-
"HP 3.1 USB HP 32 GB Flash Drive\n",
18-
"--------------------------------------------------\n",
19-
"Logitech MX Anywhere 3 Compact Performance Mouse – Wireless, Magnetic Scrolling, Ergonomic, 4000DPI Sensor, Custom Buttons, USB-C, Bluetooth, Apple Mac, iPad, Windows PC, Linux, Chrome - Graphite\n",
20-
"--------------------------------------------------\n",
21-
"SanDisk Cruzer Blade 32GB USB Flash Drive\n",
22-
"--------------------------------------------------\n",
23-
"APLT-Portable Slim Wireless Mouse for Laptops 2.4Ghz Silent Wireless Optical Mouse for Laptop, Desktop ( White)\n",
24-
"--------------------------------------------------\n",
25-
"\n",
26-
"Cluster 1:\n",
27-
"Zebronics Zeb-Corolla In Ear Wired Earphone with Mic, 3.5mm Jack, 1.2 Meter Cable, Multi Function Button\n",
28-
"--------------------------------------------------\n",
29-
"MINISO We Bare Bears in-Ear Wired Headphones with Microphone, Comfortable Earbuds Cute Earphones for Mobile Smartphones Apple Xiaomi Realme Oppo Samsung and More - Brown\n",
30-
"--------------------------------------------------\n",
31-
"pTron Tangent Evo with 14Hrs Playback, Bluetooth 5.0 Wireless Headphones with Deep Bass, IPX4 Water Resistance, Ergonomic & Snug-fit, Voice Assistance, Magnetic Earbuds & Built-in HD Mic (Black)\n",
32-
"--------------------------------------------------\n",
33-
"Ambrane Dots 38 True Wireless Earbuds TWS with Pure HD Bass, 16H Playtime, IPX4 Waterproof, Responsive Touch Sensors for Multifunctions, Compact Type-C Charging Case (Green), Normal\n",
34-
"--------------------------------------------------\n",
35-
"Peripage A6 203dpi Thermal Label Printer Inkless Pocket Printer Bluetooth Connection Office Assistant/Life Helper DIY Printing Travel Recorder for iOS/Android/Windows\n",
36-
"--------------------------------------------------\n",
37-
"\n",
38-
"Cluster 2:\n",
39-
"AVITA LIBER V NS14A8INF542-CS Thin and Light 14 inch (35.56cm) Laptop( Intel Core i5-10210U/ 8GB/256GB SSD /Win 10 Home/ Backlit Keyboard/ Fingerprint Sensor/ MSO 365) 1.28kg, Cloud Silver\n",
40-
"--------------------------------------------------\n",
41-
"(Renewed) HP ProBook 7th Gen Core i5 Laptop, 16 GB RAM, 240GB NVME SSD, Intel HD Graphics, 15.6 inch (39.62 cms) FHD Screen, Win 10, MS Office, Backlit Keyboard, Fingerprint sensor, Black\n",
42-
"--------------------------------------------------\n",
43-
"CHIST Gaming Desktop Intel Core i5 8GB,GT 710 2GB Graphic Card, 19 Full HD Monitor, Keyboard Mouse, Wi-Fi Ready to Play (120GB SSD 1TB HDD)\n",
44-
"--------------------------------------------------\n",
45-
"(Renewed) Lenovo ThinkCenter M58 19-inch (48.26 cm) Desktop (Intel Core2 Duo 4 GB 500 GB HDD Windows 7 Professional MS Office), Black\n",
46-
"--------------------------------------------------\n",
47-
"Lenovo ThinkBook 15 Intel 11th Gen Core i5 15.6\" (39.62 cm) FHD IPS 300 nits Antiglare 100% sRGB Thin and Light Laptop (16GB/1TB HDD+128GB SSD/Windows 10/MS Office/Mineral Grey/1.7 Kg), 20VEA0HKIH\n",
48-
"--------------------------------------------------\n",
49-
"\n",
50-
"Cluster 3:\n",
51-
"Mi 80 cm (32 inches) Horizon Edition HD Ready Android Smart LED TV 4A|L32M6-EI (Grey)\n",
52-
"--------------------------------------------------\n",
53-
"Foxsky 127 cm (50 inches) 4K Ultra HD Smart LED TV 50FS-VS (Black) (2021 Model) | With Voice Assistant\n",
54-
"--------------------------------------------------\n",
55-
"Kevin 80 cm (32 Inches) HD Ready Smart LED TV KN32A (Black) (2021 Model) | With Alexa Built-in\n",
56-
"--------------------------------------------------\n",
57-
"Samsung 108 cm (43 inches) 4K Ultra HD Smart QLED TV QA43Q60AAKLXL (Black) (2021 Model)\n",
58-
"--------------------------------------------------\n",
59-
"eAirtec 60 cm (24 Inches) HD Ready Smart Android LED TV 24DJSmart (Black) (2021 Model)\n",
60-
"--------------------------------------------------\n",
61-
"\n",
62-
"Cluster 4:\n",
63-
"Ovista- 10000mAH Digital Display Power Bank with inbuilt 4 in 1 Cable USB Input Port with Fast Charging 10000mAh Slim Power Bank with 5V/2A Fast Charging (Model-PRB035)- Black\n",
64-
"--------------------------------------------------\n",
65-
"Ambrane 5000mAh Li-Polymer Powerbank with Fast Charging & Compact Size (PP-501, Pink)\n",
66-
"--------------------------------------------------\n",
67-
"URBN 20000mAh Li-Polymer Ultra Compact Type-C Power Bank with 12W Fast Charge, Type C & Micro Input (Black)\n",
68-
"--------------------------------------------------\n",
69-
"Conekt 10000mAh Li-Polymer Powerbank Zeal Proton Pro (White)\n",
70-
"--------------------------------------------------\n",
71-
"Zeal PL-10000 10400mAh Power Bank\n",
72-
"--------------------------------------------------\n"
73-
]
74-
}
75-
],
7+
"outputs": [],
768
"source": [
779
"\n",
7810
"import pandas as pd\n",
7911
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
8012
"from sklearn.cluster import KMeans\n",
8113
"from sklearn.metrics import silhouette_score\n",
14+
"from nltk.sentiment import SentimentIntensityAnalyzer\n",
15+
"\n",
16+
"# Load NLTK's VADER sentiment analyzer (needs the 'vader_lexicon' resource: nltk.download('vader_lexicon'))\n",
17+
"sid = SentimentIntensityAnalyzer()\n",
8218
"\n",
8319
"data = pd.read_csv('Product listing.csv')\n",
8420
"\n",
@@ -108,6 +44,21 @@
10844
"# Assign a cluster label to each product listing\n",
10945
"data['cluster_label'] = kmeans.labels_\n",
11046
"\n",
47+
"# Sentiment Analysis\n",
48+
"def get_sentiment(text):\n",
49+
" # NLTK's sentiment analyzer\n",
50+
" sentiment_scores = sid.polarity_scores(text)\n",
51+
" # Classify sentiment based on compound score\n",
52+
" if sentiment_scores['compound'] >= 0.05:\n",
53+
" return 'Positive'\n",
54+
" elif sentiment_scores['compound'] <= -0.05:\n",
55+
" return 'Negative'\n",
56+
" else:\n",
57+
" return 'Neutral'\n",
58+
" \n",
59+
"data['sentiment'] = data['clean_text'].apply(get_sentiment)\n",
60+
"\n",
61+
"\n",
11162
"# Evaluate clustering using silhouette score\n",
11263
"silhouette_avg = silhouette_score(tfidf_matrix, kmeans.labels_)\n",
11364
"print(f\"Silhouette Score: {silhouette_avg}\")\n",
@@ -118,6 +69,7 @@
11869
" print(f\"\\nCluster {cluster_id}:\")\n",
11970
" for index, row in cluster_samples.iterrows():\n",
12071
" print(row['product'])\n",
72+
" print(\"Sentiment:\", row['sentiment'])\n",
12173
" print('-' * 50)\n",
12274
"\n",
12375
"# You can further analyze the clusters and refine the process as needed\n"
@@ -126,7 +78,7 @@
12678
],
12779
"metadata": {
12880
"kernelspec": {
129-
"display_name": "Python 3",
81+
"display_name": "Python 3 (ipykernel)",
13082
"language": "python",
13183
"name": "python3"
13284
},
@@ -140,7 +92,7 @@
14092
"name": "python",
14193
"nbconvert_exporter": "python",
14294
"pygments_lexer": "ipython3",
143-
"version": "3.11.9"
95+
"version": "3.11.4"
14496
}
14597
},
14698
"nbformat": 4,

0 commit comments

Comments
 (0)