from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from tqdm import tqdm
+from scipy.spatial import ConvexHull

-parser = argparse.ArgumentParser(description='Generates cool visualization from Atom/RSS feeds!')
+parser = argparse.ArgumentParser(
+    description='Generates cool visualization from Atom/RSS feeds!')
parser.add_argument('-c', '--configuration', required=True,
                    help='location of configuration file.')
args = parser.parse_args()

with open(args.configuration, 'r') as config_file:
    config = json.load(config_file)
-
+
semantic_encoder_model = SentenceTransformer(config["pretrained_model"])

-def get_all_entries(path):
+
+def get_all_entries(path):
    all_entries = {}
-    files = glob.glob(path + "/**/**/*.*",recursive=True)
+    files = glob.glob(path + "/**/**/*.*", recursive=True)
    for file in tqdm(files, desc='Reading posts from files'):
-
+
        feed = feedparser.parse(file)
-        for entry in feed['entries']:
+        for entry in feed['entries']:
            if 'summary' in entry:
                all_entries[entry['link']] = [
                    entry['title'], entry['title'] + " " + entry['summary']]
@@ -43,7 +46,7 @@ def get_all_entries(path):
    return all_entries


-def generate_text_for_entry(raw_text,entry_counts):
+def generate_text_for_entry(raw_text, entry_counts):
    output = []
    raw_text = raw_text.replace("\n", " ")
    soup = BeautifulSoup(raw_text, features="html.parser")
@@ -59,9 +62,9 @@ def generate_text_for_entry(raw_text,entry_counts):
    return ' '.join(output)


-def generate_embeddings(entries,entry_counts):
+def generate_embeddings(entries, entry_counts):
    sentences = [generate_text_for_entry(
-        entries[a][1][0:config["text_max_length"]],entry_counts) for a in entries]
+        entries[a][1][0:config["text_max_length"]], entry_counts) for a in entries]
    print('Generating embeddings...')
    embeddings = semantic_encoder_model.encode(sentences)
    print('Generating embeddings... Done!')
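
Note: semantic_encoder_model.encode comes from the sentence-transformers package and maps a list of strings to one fixed-length vector per string. A minimal sketch, assuming the illustrative checkpoint name 'all-MiniLM-L6-v2' (the script reads the real name from config["pretrained_model"]):

    from sentence_transformers import SentenceTransformer

    # Illustrative model choice; the script takes the real name from the config.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = model.encode(["first post text", "second post text"])
    print(vectors.shape)  # (2, 384) for this particular checkpoint
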
@@ -75,31 +78,54 @@ def generate_embeddings(entries,entry_counts):
def get_coordinates(entries):
    X = [entries[e][-1] for e in entries]
    X = np.array(X)
-    tsne = TSNE(n_iter=config["tsne_iter"],init='pca',learning_rate='auto',random_state=config["random_state"])
-    clustering_model = AgglomerativeClustering(distance_threshold=config["clust_dist_threshold"], n_clusters=None)
+    tsne = TSNE(n_iter=config["tsne_iter"], init='pca',
+                learning_rate='auto', random_state=config["random_state"])
+    clustering_model = AgglomerativeClustering(
+        distance_threshold=config["clust_dist_threshold"], n_clusters=None)
    tsne_output = tsne.fit_transform(X)
-    tsne_output = (tsne_output-tsne_output.min())/(tsne_output.max()-tsne_output.min())
-    #tsne_output = (tsne_output-tsne_output.mean())/tsne_output.std()
+    tsne_output = (tsne_output - tsne_output.min()) / \
+        (tsne_output.max()-tsne_output.min())
+    # tsne_output = (tsne_output-tsne_output.mean())/tsne_output.std()
    clusters = clustering_model.fit_predict(tsne_output)
    # Reuse the min-max-scaled embedding computed above so the returned
    # coordinates match the points the clusters were fitted on.
    return [x[0] for x in tsne_output], [x[1] for x in tsne_output], clusters

+
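Note: passing n_clusters=None with a distance_threshold makes AgglomerativeClustering infer the cluster count from the data: merging stops once the linkage distance would exceed the threshold, which is presumably why the t-SNE output is min-max scaled to [0, 1] first. A self-contained sketch with made-up points:

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    # Two tight, well-separated groups of invented 2-D points.
    pts = np.array([[0.0, 0.0], [0.1, 0.0], [0.0, 0.1],
                    [5.0, 5.0], [5.1, 5.0], [5.0, 5.1]])
    model = AgglomerativeClustering(distance_threshold=1.0, n_clusters=None)
    labels = model.fit_predict(pts)
    print(model.n_clusters_)  # 2 -- chosen by the threshold, not the caller
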
def find_topics(df):
    topics = []
-    for i in range(0,df["cluster"].max()+1):
-        try:
-            df_text = df[df['cluster']==i]["label"]
-            vectorizer = CountVectorizer(ngram_range=(1,2),min_df=config["topic_str_min_df"],stop_words='english')
+    for i in range(0, df["cluster"].max()+1):
+        try:
+            df_text = df[df['cluster'] == i]["label"]
+            vectorizer = CountVectorizer(ngram_range=(
+                1, 2), min_df=config["topic_str_min_df"], stop_words='english')
            X = vectorizer.fit_transform(df_text)
            possible_topics = vectorizer.get_feature_names_out()
-            idx_topic = np.argmax([len(a) for a in possible_topics])
+            idx_topic = np.argmax([len(a) for a in possible_topics])
            topics.append(possible_topics[idx_topic])
-            #x,y = np.argmax(np.max(X, axis=1)),np.argmax(np.max(X, axis=0))
-            #topics.append(vectorizer.get_feature_names_out()[y])
+            # x,y = np.argmax(np.max(X, axis=1)),np.argmax(np.max(X, axis=0))
+            # topics.append(vectorizer.get_feature_names_out()[y])
        except:
            topics.append("NA")
            pass
    return topics

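Note: find_topics names each cluster with the longest unigram or bigram left in CountVectorizer's vocabulary after English stop-word removal and the min_df document-frequency cut-off. A toy run with invented titles:

    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer

    # Invented post titles standing in for one cluster's labels.
    titles = ["intro to rust async", "rust async patterns", "async rust pitfalls"]
    vec = CountVectorizer(ngram_range=(1, 2), min_df=2, stop_words='english')
    vec.fit_transform(titles)
    names = vec.get_feature_names_out()
    print(names[np.argmax([len(n) for n in names])])  # 'rust async'

Picking the longest surviving n-gram favors bigrams, which tend to make more readable cluster labels than single words.
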
+
+def get_convex_hulls(df):
+    convex_hulls = []
+    cluster_labels = df['cluster'].unique()
+    cluster_labels.sort()
+    polygon_traces = []
+    for label in cluster_labels:
+        cluster_data = df.loc[df['cluster'] == label]
+        x = cluster_data['x'].values
+        y = cluster_data['y'].values
+        points = np.column_stack((x, y))
+        hull = ConvexHull(points)
+        hull_points = np.append(hull.vertices, hull.vertices[0])
+        convex_hulls.append(
+            {"x": x[hull_points].tolist(), "y": y[hull_points].tolist()})
+    return convex_hulls
+
+
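Note: ConvexHull.vertices holds the indices of the hull's corner points (in counterclockwise order for 2-D input), so appending the first vertex again closes the polygon for plotting; a cluster with fewer than three non-collinear points would raise a QhullError, which get_convex_hulls does not guard against. A standalone sketch with made-up coordinates:

    import numpy as np
    from scipy.spatial import ConvexHull

    # Invented cluster coordinates; the interior point is not a hull vertex.
    points = np.array([[0.0, 0.0], [1.0, 0.2], [0.8, 1.0],
                       [0.1, 0.9], [0.5, 0.5]])
    hull = ConvexHull(points)
    closed = np.append(hull.vertices, hull.vertices[0])  # close the loop
    print({"x": points[closed, 0].tolist(), "y": points[closed, 1].tolist()})
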
def main():
    all_entries = get_all_entries(config["input_directory"])
    entry_counts = {}
@@ -111,27 +137,27 @@ def main():
        entry_texts.append(all_entries[k][0])

    all_entries = disinct_entries
-    entries = generate_embeddings(all_entries,entry_counts)
+    entries = generate_embeddings(all_entries, entry_counts)
    print('Creating clusters...')
    x, y, cluster_info = get_coordinates(entries)
    print('Creating clusters... Done!')
    labels = [entries[k][0] for k in entries]
-
-
    counts = [entry_counts[k] if k in entry_counts else 0 for k in entries]
-
    df = pd.DataFrame({'x': x, 'y': y, 'label': labels,
-                       'count': counts, 'url': entries.keys(), 'cluster': cluster_info})
+                      'count': counts, 'url': entries.keys(), 'cluster': cluster_info})

    topics = find_topics(df)
-    df["topic"] = df["cluster"].apply(lambda x:topics[x])
+    df["topic"] = df["cluster"].apply(lambda x: topics[x])
    print('Assigning cluster names!')
    if not os.path.exists(config["output_directory"]):
        os.makedirs(config["output_directory"])
    df.to_csv(config["output_directory"]+"/data.csv")
-
+    convex_hulls = get_convex_hulls(df)
+    with open(config["output_directory"] + '/convex_hulls.json', 'w') as f:
+        f.write(json.dumps(convex_hulls))
    shutil.copy('visualization.html', config["output_directory"])
    print('Visualization generation is complete!!')

+
if __name__ == "__main__":
    main()
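
Note: the configuration file loaded at the top is plain JSON. The keys below are the ones this script reads; the values are illustrative guesses, not defaults shipped with the project:

    {
        "pretrained_model": "all-MiniLM-L6-v2",
        "input_directory": "feeds",
        "output_directory": "output",
        "text_max_length": 1000,
        "tsne_iter": 1000,
        "random_state": 42,
        "clust_dist_threshold": 0.1,
        "topic_str_min_df": 2
    }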