from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from tqdm import tqdm
+from scipy.spatial import ConvexHull

-parser = argparse.ArgumentParser(description='Generates cool visualization from Atom/RSS feeds!')
+parser = argparse.ArgumentParser(
+    description='Generates cool visualization from Atom/RSS feeds!')
parser.add_argument('-c', '--configuration', required=True,
                    help='location of configuration file.')
args = parser.parse_args()

with open(args.configuration, 'r') as config_file:
    config = json.load(config_file)
-
+
semantic_encoder_model = SentenceTransformer(config["pretrained_model"])

-def get_all_entries(path):
+
+def get_all_entries(path):
    all_entries = {}
-    files = glob.glob(path + "/**/**/*.*",recursive=True)
+    files = glob.glob(path + "/**/**/*.*", recursive=True)
    for file in tqdm(files, desc='Reading posts from files'):
-
+
        feed = feedparser.parse(file)
-        for entry in feed['entries']:
+        for entry in feed['entries']:
            if 'summary' in entry:
                all_entries[entry['link']] = [
                    entry['title'], entry['title'] + " " + entry['summary']]
@@ -43,7 +46,7 @@ def get_all_entries(path):
    return all_entries


-def generate_text_for_entry(raw_text,entry_counts):
+def generate_text_for_entry(raw_text, entry_counts):
    output = []
    raw_text = raw_text.replace("\n", " ")
    soup = BeautifulSoup(raw_text, features="html.parser")
@@ -59,9 +62,9 @@ def generate_text_for_entry(raw_text,entry_counts):
    return ' '.join(output)


-def generate_embeddings(entries,entry_counts):
+def generate_embeddings(entries, entry_counts):
    sentences = [generate_text_for_entry(
-        entries[a][1][0:config["text_max_length"]],entry_counts) for a in entries]
+        entries[a][1][0:config["text_max_length"]], entry_counts) for a in entries]
    print('Generating embeddings...')
    embeddings = semantic_encoder_model.encode(sentences)
    print('Generating embeddings... Done!')
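
Note: semantic_encoder_model.encode comes from the sentence-transformers package and maps a list of strings to one fixed-length vector per string. A minimal sketch, assuming the illustrative checkpoint name 'all-MiniLM-L6-v2' (the script reads the real name from config["pretrained_model"]):

    from sentence_transformers import SentenceTransformer

    # Illustrative model choice; the script takes the real name from the config.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = model.encode(["first post text", "second post text"])
    print(vectors.shape)  # (2, 384) for this particular checkpoint
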
@@ -75,31 +78,54 @@ def generate_embeddings(entries,entry_counts):
def get_coordinates(entries):
    X = [entries[e][-1] for e in entries]
    X = np.array(X)
-    tsne = TSNE(n_iter=config["tsne_iter"],init='pca',learning_rate='auto',random_state=config["random_state"])
-    clustering_model = AgglomerativeClustering(distance_threshold=config["clust_dist_threshold"], n_clusters=None)
+    tsne = TSNE(n_iter=config["tsne_iter"], init='pca',
+                learning_rate='auto', random_state=config["random_state"])
+    clustering_model = AgglomerativeClustering(
+        distance_threshold=config["clust_dist_threshold"], n_clusters=None)
    tsne_output = tsne.fit_transform(X)
-    tsne_output = (tsne_output-tsne_output.min())/(tsne_output.max()-tsne_output.min())
-    #tsne_output = (tsne_output-tsne_output.mean())/tsne_output.std()
+    tsne_output = (tsne_output - tsne_output.min()) / \
+        (tsne_output.max()-tsne_output.min())
+    # tsne_output = (tsne_output-tsne_output.mean())/tsne_output.std()
    clusters = clustering_model.fit_predict(tsne_output)
    # Reuse the min-max-scaled embedding computed above so the returned
    # coordinates match the points the clusters were fitted on.
    return [x[0] for x in tsne_output], [x[1] for x in tsne_output], clusters

+
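Note: passing n_clusters=None with a distance_threshold makes AgglomerativeClustering infer the cluster count from the data: merging stops once the linkage distance would exceed the threshold, which is presumably why the t-SNE output is min-max scaled to [0, 1] first. A self-contained sketch with made-up points:

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    # Two tight, well-separated groups of invented 2-D points.
    pts = np.array([[0.0, 0.0], [0.1, 0.0], [0.0, 0.1],
                    [5.0, 5.0], [5.1, 5.0], [5.0, 5.1]])
    model = AgglomerativeClustering(distance_threshold=1.0, n_clusters=None)
    labels = model.fit_predict(pts)
    print(model.n_clusters_)  # 2 -- chosen by the threshold, not the caller
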
def find_topics(df):
    topics = []
-    for i in range(0,df["cluster"].max()+1):
-        try:
-            df_text = df[df['cluster']==i]["label"]
-            vectorizer = CountVectorizer(ngram_range=(1,2),min_df=config["topic_str_min_df"],stop_words='english')
+    for i in range(0, df["cluster"].max()+1):
+        try:
+            df_text = df[df['cluster'] == i]["label"]
+            vectorizer = CountVectorizer(ngram_range=(
+                1, 2), min_df=config["topic_str_min_df"], stop_words='english')
            X = vectorizer.fit_transform(df_text)
            possible_topics = vectorizer.get_feature_names_out()
-            idx_topic = np.argmax([len(a) for a in possible_topics])
+            idx_topic = np.argmax([len(a) for a in possible_topics])
            topics.append(possible_topics[idx_topic])
-            #x,y = np.argmax(np.max(X, axis=1)),np.argmax(np.max(X, axis=0))
-            #topics.append(vectorizer.get_feature_names_out()[y])
+            # x,y = np.argmax(np.max(X, axis=1)),np.argmax(np.max(X, axis=0))
+            # topics.append(vectorizer.get_feature_names_out()[y])
        except:
            topics.append("NA")
            pass
    return topics

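Note: find_topics names each cluster with the longest unigram or bigram left in CountVectorizer's vocabulary after English stop-word removal and the min_df document-frequency cut-off. A toy run with invented titles:

    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer

    # Invented post titles standing in for one cluster's labels.
    titles = ["intro to rust async", "rust async patterns", "async rust pitfalls"]
    vec = CountVectorizer(ngram_range=(1, 2), min_df=2, stop_words='english')
    vec.fit_transform(titles)
    names = vec.get_feature_names_out()
    print(names[np.argmax([len(n) for n in names])])  # 'rust async'

Picking the longest surviving n-gram favors bigrams, which tend to make more readable cluster labels than single words.
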
+
+def get_convex_hulls(df):
+    convex_hulls = []
+    cluster_labels = df['cluster'].unique()
+    cluster_labels.sort()
+    polygon_traces = []
+    for label in cluster_labels:
+        cluster_data = df.loc[df['cluster'] == label]
+        x = cluster_data['x'].values
+        y = cluster_data['y'].values
+        points = np.column_stack((x, y))
+        hull = ConvexHull(points)
+        hull_points = np.append(hull.vertices, hull.vertices[0])
+        convex_hulls.append(
+            {"x": x[hull_points].tolist(), "y": y[hull_points].tolist()})
+    return convex_hulls
+
+
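Note: ConvexHull.vertices holds the indices of the hull's corner points (in counterclockwise order for 2-D input), so appending the first vertex again closes the polygon for plotting; a cluster with fewer than three non-collinear points would raise a QhullError, which get_convex_hulls does not guard against. A standalone sketch with made-up coordinates:

    import numpy as np
    from scipy.spatial import ConvexHull

    # Invented cluster coordinates; the interior point is not a hull vertex.
    points = np.array([[0.0, 0.0], [1.0, 0.2], [0.8, 1.0],
                       [0.1, 0.9], [0.5, 0.5]])
    hull = ConvexHull(points)
    closed = np.append(hull.vertices, hull.vertices[0])  # close the loop
    print({"x": points[closed, 0].tolist(), "y": points[closed, 1].tolist()})
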
def main():
    all_entries = get_all_entries(config["input_directory"])
    entry_counts = {}
@@ -111,27 +137,27 @@ def main():
        entry_texts.append(all_entries[k][0])

    all_entries = disinct_entries
-    entries = generate_embeddings(all_entries,entry_counts)
+    entries = generate_embeddings(all_entries, entry_counts)
    print('Creating clusters...')
    x, y, cluster_info = get_coordinates(entries)
    print('Creating clusters... Done!')
    labels = [entries[k][0] for k in entries]
-
-
    counts = [entry_counts[k] if k in entry_counts else 0 for k in entries]
-
    df = pd.DataFrame({'x': x, 'y': y, 'label': labels,
-                       'count': counts, 'url': entries.keys(), 'cluster': cluster_info})
+                      'count': counts, 'url': entries.keys(), 'cluster': cluster_info})

    topics = find_topics(df)
-    df["topic"] = df["cluster"].apply(lambda x:topics[x])
+    df["topic"] = df["cluster"].apply(lambda x: topics[x])
    print('Assigning cluster names!')
    if not os.path.exists(config["output_directory"]):
        os.makedirs(config["output_directory"])
    df.to_csv(config["output_directory"]+"/data.csv")
-
+    convex_hulls = get_convex_hulls(df)
+    with open(config["output_directory"] + '/convex_hulls.json', 'w') as f:
+        f.write(json.dumps(convex_hulls))
    shutil.copy('visualization.html', config["output_directory"])
    print('Visualization generation is complete!!')

+
if __name__ == "__main__":
    main()
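
Note: the configuration file loaded at the top is plain JSON. The keys below are the ones this script reads; the values are illustrative guesses, not defaults shipped with the project:

    {
        "pretrained_model": "all-MiniLM-L6-v2",
        "input_directory": "feeds",
        "output_directory": "output",
        "text_max_length": 1000,
        "tsne_iter": 1000,
        "random_state": 42,
        "clust_dist_threshold": 0.1,
        "topic_str_min_df": 2
    }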