added automatic topic labelling

code2k13 · code2k13 · commit e7411438c9e4 · 2022-06-18T18:57:02.000Z
diff --git a/README.md b/README.md
@@ -51,7 +51,8 @@ Now, we need to create a config file for Feed Visualizer. The config file contai
     "pretrained_model": "all-mpnet-base-v2",
     "clust_dist_threshold": 4,
     "tsne_iter": 8000,
-    "text_max_length": 2048
+    "text_max_length": 2048,
+    "topic_str_min_df": 0.25
 }
 ```
 
@@ -77,7 +78,8 @@ Here is some information on what each config setting does:
     "pretrained_model": "name of pretrained model. Here is list of all valid model names https://www.sbert.net/docs/pretrained_models.html#model-overview",
     "clust_dist_threshold": "Integer representing maximum radius of cluster. There is no correct value here. Experiment !",
     "tsne_iter": "Integer representing number of iterations for TSNE (higher is better)",
-    "text_max_length": "Integer representing number of characters to read from content/description for semantic encoding."
+    "text_max_length": "Integer representing number of characters to read from content/description for semantic encoding.",
+    "topic_str_min_df": "A float. For example, a value of 0.25 means that only phrases present in 25% or more of the items in a cluster are considered as candidates for the name of the cluster."
 }
 ```
 
diff --git a/config.json b/config.json
@@ -1,9 +1,10 @@
 {
-    "input_directory": "feeds",
+    "input_directory": "nasa",
     "output_directory": "feeds_output",
     "pretrained_model": "all-mpnet-base-v2",
-    "clust_dist_threshold":1,
+    "clust_dist_threshold":0.5,
     "tsne_iter": 8000,
     "text_max_length": 8048,
-    "random_state": 45
+    "random_state": 45,
+    "topic_str_min_df": 0.25
 }
diff --git a/visualization.html b/visualization.html
@@ -57,7 +57,9 @@
         function makeplot() {
             d3.csv("data.csv" + '?' + Math.floor(Math.random() * 1000)).then((d) => {
                 csv_data = d
-                let clusterNumbers = d.map(a => parseInt(a.cluster))
+                let clusterNumbers = []
+                let topics = {}
+                d.forEach(a => {topics[a.cluster] = a.topic;clusterNumbers.push(parseInt(a.cluster))})
                 cluster_count = Math.max(...clusterNumbers) + 1
                 d3.select('#clusters')
                     .selectAll('span')
@@ -69,17 +71,17 @@
                     .style("border", "1px solid grey")
                     .style("min-width", "25px")
                     .style("display", "inline-block")
-                    .style("color", "white")
-                    .style("text-shadow", "1px 1px grey")
+                    .style("color", function (d) {    return (d < (cluster_count*.3) || d > (cluster_count*.7))? 'white':'black'})
+                    //.style("text-shadow", "1px 1px grey")
                     .style("margin", "1px")
                     .style("border-radius", "2px")
                     .attr("data-clusterId", function (d) { return d })
                     .on("mouseover", function (e, d) {
                         //console.log(this.data.cluserId)
                         //let currentClusterId = this.getAttribute("data-clusterId")
                         let newTrace = JSON.parse(JSON.stringify(trace1));
-                        let new_colors = newTrace.marker.color.map(function (c,idx) {                          
-                            return csv_data[idx].cluster == d ? color(d / cluster_count)  : "#e0eeeeee"
+                        let new_colors = newTrace.marker.color.map(function (c, idx) {
+                            return csv_data[idx].cluster == d ? color(d / cluster_count) : "#e0eeeeee"
                         })
                         newTrace.marker.color = new_colors
                         drawPlot('myDiv', [newTrace], layout)
@@ -89,7 +91,7 @@
                         drawPlot('myDiv', [trace1], layout)
                     })
                     .text(function (d) {
-                        return d;
+                        return d + " - " + topics[d];
                     });
 
                 d.forEach(element => {
diff --git a/visualize.py b/visualize.py
@@ -13,6 +13,7 @@
 from bs4 import BeautifulSoup, SoupStrainer
 from sentence_transformers import SentenceTransformer
 from sklearn.cluster import AgglomerativeClustering
+from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.manifold import TSNE
 from tqdm import tqdm
 
@@ -82,6 +83,22 @@ def get_coordinates(entries):
     clusters = clustering_model.fit_predict(tsne_output)
     return [x[0] for x in tsne.fit_transform(X)], [x[1] for x in tsne.fit_transform(X)], clusters
 
+def find_topics(df):
+    """Return a list of topic labels indexed by cluster id ("NA" if none qualifies)."""
+    topics = []
+    for i in range(df["cluster"].max() + 1):
+        df_text = df[df['cluster'] == i]["label"]
+        vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=config["topic_str_min_df"], stop_words='english')
+        try:
+            # Only the vocabulary is needed, so fit() replaces fit_transform().
+            vectorizer.fit(df_text)
+        except ValueError:
+            # scikit-learn raises ValueError when no phrase survives stop-word/min_df pruning.
+            topics.append("NA")
+            continue
+        # Name the cluster after its longest surviving 1- or 2-word phrase.
+        topics.append(max(vectorizer.get_feature_names_out(), key=len))
+    return topics
 
 def main():
     all_entries = get_all_entries(config["input_directory"])
@@ -106,7 +123,9 @@ def main():
     df = pd.DataFrame({'x': x, 'y': y, 'label': labels,
                     'count': counts, 'url': entries.keys(), 'cluster': cluster_info})
 
-
+    topics = find_topics(df)
+    df["topic"] = df["cluster"].apply(lambda x : topics[x])
+    print('Assigning cluster names !')
     if not os.path.exists(config["output_directory"]):
         os.makedirs(config["output_directory"])
     df.to_csv(config["output_directory"]+"/data.csv")

Original file line number	Diff line number	Diff line change
`@@ -1,9 +1,10 @@`
`1`	`1`	`{`
`2`		`- "input_directory": "feeds",`
	`2`	`+ "input_directory": "nasa",`
`3`	`3`	`"output_directory": "feeds_output",`
`4`	`4`	`"pretrained_model": "all-mpnet-base-v2",`
`5`		`- "clust_dist_threshold":1,`
	`5`	`+ "clust_dist_threshold":0.5,`
`6`	`6`	`"tsne_iter": 8000,`
`7`	`7`	`"text_max_length": 8048,`
`8`		`- "random_state": 45`
	`8`	`+ "random_state": 45,`
	`9`	`+ "topic_str_min_df": 0.25`
`9`	`10`	`}`