Skip to content

Commit e407730

Browse files
committed
moved convex hull calculations to main code
1 parent 9837246 commit e407730

File tree

2 files changed

+104
-72
lines changed

2 files changed

+104
-72
lines changed

visualization.html

+51-45
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
<head>
55
<title>Feed Visualizer</title>
6-
<!-- Load plotly.js into the DOM -->
76
<script src="https://d3js.org/d3.v7.min.js"></script>
87
<script src='https://cdn.plot.ly/plotly-2.11.1.min.js'></script>
98
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet"
@@ -45,17 +44,18 @@
4544
Add information about the visualization here !
4645
</p>
4746
</div>
47+
<div id="chk" class="lead">Enable Hulls: <input type="checkbox" onchange="update_hulls(this.checked)"
48+
value='true' /></div>
4849
<div id="clusters"> </div>
4950
<div id='myDiv' style="width:100%;height:700px"> </div>
5051
</div>
5152
<script>
52-
53+
let hulls_enabled = false;
5354
let color = d3.scaleOrdinal(d3.schemeSet2);
5455
let clusters = {}
55-
//let color = d3.scaleSequential(d3.interpolateRainbow);
56-
//let color = d3.schemePastel1;
5756
let cluster_count = 1
5857
let csv_data = null;
58+
let selected_cluster = -1;
5959

6060
let trace1 = {
6161
x: [],
@@ -81,12 +81,9 @@
8181
xaxis: {
8282
showgrid: false,
8383
zeroline: false,
84-
85-
8684
},
8785
yaxis: {
8886
showgrid: false,
89-
9087
zeroline: false
9188
},
9289
font: {
@@ -99,32 +96,43 @@
9996
}
10097

10198
/**
 * Bring the convex-hull overlay on the 'myDiv' plot in line with the current
 * `hulls_enabled` and `selected_cluster` state.
 *
 * Disabled: every trace after index 0 is removed from the plot.
 * Enabled: hull outlines are fetched from convex_hulls.json and one outline
 * trace per entry is appended (only the entry whose index equals
 * `selected_cluster` when that is > -1).
 *
 * @param {Object} [clusters] - Unused; kept so existing callers that pass
 *     the cluster map keep working.
 */
function draw_convex_hull(clusters) {
    if (!hulls_enabled) {
        const plot = document.getElementById('myDiv');
        // The plot may not have been drawn yet (e.g. data still loading).
        const traceCount = (plot && plot.data) ? plot.data.length : 0;
        const hullIndices = [];
        for (let i = 1; i < traceCount; i++) {
            hullIndices.push(i);
        }
        // Remove all overlay traces in one call instead of one redraw each.
        if (hullIndices.length > 0) {
            Plotly.deleteTraces('myDiv', hullIndices);
        }
        return;
    }

    // The random query string defeats browser caching of the JSON file.
    d3.json("convex_hulls.json" + '?' + Math.floor(Math.random() * 1000))
        .then((hulls) => {
            // The checkbox may have been cleared while the request was in flight.
            if (!hulls_enabled) {
                return;
            }
            const hullTraces = [];
            hulls.forEach((hull, idx) => {
                // Loose comparison on purpose: selected_cluster is assigned
                // elsewhere and its type is not visible here.
                if (selected_cluster > -1 && idx != selected_cluster) {
                    return;
                }
                const fillColor = d3.color(color(idx / cluster_count));
                fillColor.opacity = 0.2;
                hullTraces.push({
                    type: 'polygon',
                    x: hull.x,
                    y: hull.y,
                    mode: 'lines',
                    line: {
                        color: "#999",
                        width: 1,
                        shape: 'spline',
                        dash: 'dot'
                    },
                    fillcolor: fillColor.toString(),
                    showlegend: false,
                    hoverinfo: 'skip'
                });
            });
            // Append every hull in a single call so the plot redraws once.
            if (hullTraces.length > 0) {
                Plotly.addTraces('myDiv', hullTraces);
            }
        })
        .catch((err) => {
            // Surface load/parse failures instead of leaving an unhandled rejection.
            console.error('Unable to draw convex hulls:', err);
        });
}
129137

130138
function drawPlot(placeholder, data, layout) {
@@ -137,6 +145,10 @@
137145
}
138146

139147
function makeplot() {
148+
trace1.x = [];
149+
trace1.y = []
150+
document.getElementById("clusters").innerHTML = ""
151+
document.getElementById("myDiv").innerHTML = ""
140152
d3.csv("data.csv" + '?' + Math.floor(Math.random() * 1000)).then((d) => {
141153
csv_data = d
142154
let clusterNumbers = []
@@ -152,21 +164,16 @@
152164
.style("padding-left", "4px")
153165
.style("padding-right", "4px")
154166
.style("font-size", "small")
155-
//.style("border", "1px solid grey")
156167
.style("min-width", "25px")
157168
.style("display", "inline-block")
158-
//.style("color", function (d) { return (d < (cluster_count*.3) || d > (cluster_count*.7))? 'white':'black'})
159169
.style("color", "white")
160-
//.style("text-shadow", "1px 1px grey")
161170
.style("margin", "1px")
162171
.style("border-radius", "4px")
163172
.attr("data-clusterId", function (d) { return d })
164173
.on("mouseover", function (e, d) {
165-
//console.log(this.data.cluserId)
166-
//let currentClusterId = this.getAttribute("data-clusterId")
167174
let newTrace = JSON.parse(JSON.stringify(trace1));
168175
let new_colors = newTrace.marker.color.map(function (c, idx) {
169-
return csv_data[idx].cluster == d ? color(d / cluster_count) : "#dee2e6"
176+
return (csv_data[idx] && csv_data[idx].cluster == d) ? color(d / cluster_count) : "#dee2e6"
170177
})
171178
newTrace.marker.color = new_colors
172179
drawPlot('myDiv', [newTrace], layout)
@@ -185,17 +192,13 @@
185192
});
186193
drawPlot('myDiv', data, layout)
187194

188-
189195
}).then(a => {
190196
draw_convex_hull(clusters)
191197

192198
})
193199

194200
};
195201

196-
197-
198-
199202
function processData(row) {
200203
trace1.x.push(row.x)
201204
trace1.y.push(row.y)
@@ -207,12 +210,15 @@
207210
clusters[cluster_key] = []
208211
}
209212
clusters[cluster_key].push([row.x, row.y])
210-
console.log(cluster_key)
211213
}
212214

215+
/**
 * Change handler for the "Enable Hulls" checkbox: stores the new state and
 * refreshes the hull overlay to match it.
 *
 * @param {boolean} b - The checkbox's checked state.
 */
function update_hulls(b) {
    hulls_enabled = b;
    draw_convex_hull();
}
213220

214221
makeplot()
215-
216222
</script>
217223
</body>
218224

visualize.py

+53-27
Original file line numberDiff line numberDiff line change
@@ -16,24 +16,27 @@
1616
from sklearn.feature_extraction.text import CountVectorizer
1717
from sklearn.manifold import TSNE
1818
from tqdm import tqdm
19+
from scipy.spatial import ConvexHull
1920

20-
# Command-line interface: the only option is the path to the JSON
# configuration file, and it is mandatory.
parser = argparse.ArgumentParser(
    description='Generates cool visualization from Atom/RSS feeds !')
parser.add_argument('-c', '--configuration', required=True,
                    help='location of configuration file.')
args = parser.parse_args()

# Read the configuration as UTF-8 explicitly so the result does not depend on
# the platform's default text encoding.
with open(args.configuration, 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Sentence-embedding model named by the "pretrained_model" configuration key.
semantic_encoder_model = SentenceTransformer(config["pretrained_model"])
2931

30-
def get_all_entries(path):
32+
33+
def get_all_entries(path):
3134
all_entries = {}
32-
files = glob.glob(path+"/**/**/*.*",recursive=True)
35+
files = glob.glob(path+"/**/**/*.*", recursive=True)
3336
for file in tqdm(files, desc='Reading posts from files'):
34-
37+
3538
feed = feedparser.parse(file)
36-
for entry in feed['entries']:
39+
for entry in feed['entries']:
3740
if 'summary' in entry:
3841
all_entries[entry['link']] = [
3942
entry['title'], entry['title'] + " " + entry['summary']]
@@ -43,7 +46,7 @@ def get_all_entries(path):
4346
return all_entries
4447

4548

46-
def generate_text_for_entry(raw_text,entry_counts):
49+
def generate_text_for_entry(raw_text, entry_counts):
4750
output = []
4851
raw_text = raw_text.replace("\n", " ")
4952
soup = BeautifulSoup(raw_text, features="html.parser")
@@ -59,9 +62,9 @@ def generate_text_for_entry(raw_text,entry_counts):
5962
return ' ' .join(output)
6063

6164

62-
def generate_embeddings(entries,entry_counts):
65+
def generate_embeddings(entries, entry_counts):
6366
sentences = [generate_text_for_entry(
64-
entries[a][1][0:config["text_max_length"]],entry_counts) for a in entries]
67+
entries[a][1][0:config["text_max_length"]], entry_counts) for a in entries]
6568
print('Generating embeddings ...')
6669
embeddings = semantic_encoder_model.encode(sentences)
6770
print('Generating embeddings ... Done !')
@@ -75,31 +78,54 @@ def generate_embeddings(entries,entry_counts):
7578
def get_coordinates(entries):
    """Project the entries to 2-D with t-SNE and cluster the projection.

    Args:
        entries: mapping whose values are sequences; the last element of each
            value is used as the feature vector (presumably the embedding
            produced by generate_embeddings -- confirm against the caller).

    Returns:
        A tuple ``(x, y, clusters)``: the raw (un-normalised) t-SNE x and y
        coordinates as lists, and the cluster label of every entry as returned
        by ``AgglomerativeClustering.fit_predict`` on the min-max normalised
        projection.
    """
    X = np.array([entries[e][-1] for e in entries])
    tsne = TSNE(n_iter=config["tsne_iter"], init='pca',
                learning_rate='auto', random_state=config["random_state"])
    clustering_model = AgglomerativeClustering(
        distance_threshold=config["clust_dist_threshold"], n_clusters=None)

    # Fit t-SNE exactly once and reuse the result. Re-fitting for each return
    # value is expensive, and without a fixed random_state separate fits would
    # give x, y and the clusters from different layouts.
    tsne_output = tsne.fit_transform(X)

    # Clustering runs on the [0, 1]-scaled projection so the distance
    # threshold does not depend on the absolute scale of the t-SNE output.
    normalized = (tsne_output - tsne_output.min()) / \
        (tsne_output.max() - tsne_output.min())
    clusters = clustering_model.fit_predict(normalized)

    return list(tsne_output[:, 0]), list(tsne_output[:, 1]), clusters
8591

92+
8693
def find_topics(df):
    """Pick one topic string for every cluster id from 0 to the largest id.

    For each cluster the labels of its rows are vectorised into English
    uni-/bi-grams (minimum document frequency from
    ``config["topic_str_min_df"]``) and the longest n-gram, by character
    count, becomes the topic.

    Args:
        df: DataFrame with an integer ``cluster`` column and a ``label``
            column of text.

    Returns:
        A list indexed by cluster id; ``"NA"`` is used for clusters where no
        topic could be extracted (e.g. no rows or an empty vocabulary).
    """
    topics = []
    for i in range(0, df["cluster"].max()+1):
        try:
            df_text = df[df['cluster'] == i]["label"]
            vectorizer = CountVectorizer(
                ngram_range=(1, 2), min_df=config["topic_str_min_df"],
                stop_words='english')
            vectorizer.fit_transform(df_text)
            possible_topics = vectorizer.get_feature_names_out()
            idx_topic = np.argmax([len(a) for a in possible_topics])
            topics.append(possible_topics[idx_topic])
        except Exception:
            # Best effort: a cluster without a usable vocabulary still needs
            # an entry so that list positions keep matching cluster ids.
            # Catching Exception (not a bare except) lets Ctrl-C and
            # SystemExit propagate.
            topics.append("NA")
    return topics
102110

111+
112+
def get_convex_hulls(df):
    """Compute a closed convex-hull outline for every cluster.

    Args:
        df: DataFrame with numeric ``x`` and ``y`` columns and a ``cluster``
            column.

    Returns:
        A list with one ``{"x": [...], "y": [...]}`` dict per distinct cluster
        label, in ascending label order. Each outline repeats its first point
        at the end so it is closed. Clusters that have no 2-D hull (fewer
        than three points, or all points on one line) yield the degenerate
        outline ``first -> last -> first`` of their extreme points, so that
        list positions stay aligned with the sorted cluster labels.
    """
    convex_hulls = []
    cluster_labels = np.sort(df['cluster'].unique())
    for label in cluster_labels:
        cluster_data = df.loc[df['cluster'] == label]
        x = cluster_data['x'].values
        y = cluster_data['y'].values
        points = np.column_stack((x, y))

        hull_points = None
        if len(points) >= 3:
            try:
                hull = ConvexHull(points)
                hull_points = np.append(hull.vertices, hull.vertices[0])
            except RuntimeError:
                # Qhull signals degenerate (e.g. collinear) input through
                # QhullError, a RuntimeError subclass; fall back below.
                hull_points = None

        if hull_points is None:
            # Sort by x, then y: the first and last points are the two
            # extremes of a collinear (or 1-/2-point) cluster.
            order = np.lexsort((y, x))
            hull_points = np.array([order[0], order[-1], order[0]])

        convex_hulls.append(
            {"x": x[hull_points].tolist(), "y": y[hull_points].tolist()})
    return convex_hulls
127+
128+
103129
def main():
104130
all_entries = get_all_entries(config["input_directory"])
105131
entry_counts = {}
@@ -111,27 +137,27 @@ def main():
111137
entry_texts.append(all_entries[k][0])
112138

113139
all_entries = disinct_entries
114-
entries = generate_embeddings(all_entries,entry_counts)
140+
entries = generate_embeddings(all_entries, entry_counts)
115141
print('Creating clusters ...')
116142
x, y, cluster_info = get_coordinates(entries)
117143
print('Creating clusters ... Done !')
118144
labels = [entries[k][0] for k in entries]
119-
120-
121145
counts = [entry_counts[k] if k in entry_counts else 0 for k in entries]
122-
123146
df = pd.DataFrame({'x': x, 'y': y, 'label': labels,
124-
'count': counts, 'url': entries.keys(), 'cluster': cluster_info})
147+
'count': counts, 'url': entries.keys(), 'cluster': cluster_info})
125148

126149
topics = find_topics(df)
127-
df["topic"] = df["cluster"].apply(lambda x : topics[x])
150+
df["topic"] = df["cluster"].apply(lambda x: topics[x])
128151
print('Assigning cluster names !')
129152
if not os.path.exists(config["output_directory"]):
130153
os.makedirs(config["output_directory"])
131154
df.to_csv(config["output_directory"]+"/data.csv")
132-
155+
convex_hulls = get_convex_hulls(df)
156+
with open(config["output_directory"] + '/convex_hulls.json', 'w') as f:
157+
f.write(json.dumps(convex_hulls))
133158
shutil.copy('visualization.html', config["output_directory"])
134159
print('Vizualization generation is complete !!')
135160

161+
136162
if __name__ == "__main__":
137163
main()

0 commit comments

Comments
 (0)