
Commit eef62ae

committed: viz stuff moved to ukko
1 parent 9a3e104

12 files changed (+130 −65 lines)

dump_events_to_json.py

Lines changed: 6 additions & 4 deletions
@@ -44,15 +44,17 @@ def run(candidate_tree_path,
 
         e.node[n]['body'] = id2interaction[n]['body']
         e.node[n]['root'] = (n == root)
+        e.node[n]['datetime'] = str(e.node[n]['datetime'])
 
-    # some simple clustering
-    assignment = greedy_clustering_on_graph(e)
-    for n in e.nodes_iter():
-        e.node[n]['cluster_label'] = assignment[n]
+    # # some simple clustering
+    # assignment = greedy_clustering_on_graph(e)
+    # for n in e.nodes_iter():
+    #     e.node[n]['cluster_label'] = assignment[n]
 
     if to_original_graph:
        events = map(convert_to_original_graph,
                     events)
+        # import pdb; pdb.set_trace()
 
     d3_events = [to_d3_graph(e)
                  for e in events]
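
Note on the new str() cast: Python's json module cannot serialize datetime objects, so stringifying the 'datetime' attribute here presumably keeps the graph dumpable later in this script. A minimal repro of the failure and the fix (Python 2, matching the codebase):

    import json
    from datetime import datetime

    try:
        json.dumps({'datetime': datetime(2016, 1, 1)})
    except TypeError as e:
        print e  # "... is not JSON serializable"

    # stringify first, as the changed line now does
    print json.dumps({'datetime': str(datetime(2016, 1, 1))})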

dump_vis_timeline_data.py

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@ def run(cand_trees, k, summary_kws, undirected):
         'id': group_id,
         'terms': summ['topics']['topic_terms'],
         # 'terms': summ['frequent_terms'],
+        # 'terms': summ['tdidf_terms'],
         'participants': dict(
             summ['participants']['participant_count']
         ),

interactions.py

Lines changed: 5 additions & 0 deletions
@@ -9,6 +9,7 @@
 import networkx as nx
 import pandas as pd
 
+import cPickle as pkl
 from itertools import izip
 from datetime import datetime as dt
 from memory_profiler import profile
@@ -354,6 +355,10 @@ def add_bow_to_graph(cls, g, dictionary):
             if i % 1000 == 0:
                 logger.debug('adding BoW: {} / {}'.format(i, N))
             g.node[n]['bow'] = tfidf_mat[node2row[n], :]
+
+        print('dumping tfidf vectorizer')
+        pkl.dump(tfidf, open('/cs/home/hxiao/code/lst/tmp/tfidf.pkl', 'w'))
+
         return g
 
     @classmethod
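
The vectorizer pickled here is read back by the new tfidf_terms method in meta_graph_stat.py (same hard-coded path). A minimal sketch of that round-trip, assuming tfidf is a fitted scikit-learn transformer object (the commit never shows how it is built); binary mode plus an explicit protocol is safer than the text-mode 'w' used above:

    import cPickle as pkl

    TFIDF_PATH = '/cs/home/hxiao/code/lst/tmp/tfidf.pkl'  # path hard-coded in the commit

    def dump_tfidf(tfidf, path=TFIDF_PATH):
        # binary mode + highest protocol avoids text-mode pickle issues
        with open(path, 'wb') as f:
            pkl.dump(tfidf, f, pkl.HIGHEST_PROTOCOL)

    def load_tfidf(path=TFIDF_PATH):
        with open(path, 'rb') as f:
            return pkl.load(f)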

lda/corpora.py

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ def __len__(self):
 for path in arg.stoplist_paths:
     CorpusEnron.stoplist |= load_items_by_line(path.strip())
 print('new length of stoplist: {}'.format(len(CorpusEnron.stoplist)))
-if True:
+if False:
     print('WARN: stoplist empty!')
     CorpusEnron.stoplist = set()

meta_graph.py

Lines changed: 6 additions & 0 deletions
@@ -141,6 +141,12 @@ def convert_to_meta_graph_undirected(node_names, participants, timestamps,
 
 def convert_to_original_graph(mg):
     g = nt.DiGraph()
+    for m in mg.nodes():
+        s = mg.node[m]['sender']
+        g.add_node(s['id'], s)
+        for r in mg.node[m]['recipients']:
+            g.add_node(r['id'], r)
+
     for n in mg.nodes():
         sender = mg.node[n]['sender_id']
         for recipient in mg.node[n]['recipient_ids']:
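
The added g.add_node(s['id'], s) calls rely on the networkx 1.x signature add_node(n, attr_dict=None, **attr): a dict passed positionally becomes the node's attribute dict (this form was removed in networkx 2.0). A hypothetical illustration:

    import networkx as nx  # assumes networkx 1.x, consistent with nodes_iter()/g.node usage elsewhere

    g = nx.DiGraph()
    sender = {'id': 'alice@example.com', 'name': 'Alice'}  # made-up person record
    g.add_node(sender['id'], sender)   # attr_dict as the 2nd positional argument
    print g.node['alice@example.com']  # {'id': 'alice@example.com', 'name': 'Alice'}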

meta_graph_stat.py

Lines changed: 43 additions & 10 deletions
@@ -1,4 +1,5 @@
 import scipy
+import cPickle as pkl
 import numpy as np
 import itertools
 import networkx as nx
@@ -145,9 +146,12 @@ def topics(self, interactions, dictionary, lda, top_k=10):
         #     minimum_probability=0
         # )
         topic_dist = np.asarray([v for _, v in topic_dist])
-
+
         # topic_terms
-        beta = lda.state.get_lambda()
+        if not hasattr(lda, 'wordtopics'):
+            lda.load_word_topics()
+        beta = lda.wordtopics
+        # beta = lda.state.get_lambda()
 
         # normalize and weight by beta dist
         weighted_terms = (
@@ -158,11 +162,15 @@ def topics(self, interactions, dictionary, lda, top_k=10):
 
         topic_terms = [lda.id2word[id] for id in bestn]
 
-        topic_divergence = self._topic_divergence(message_ids, id2msg,
-                                                  dictionary, lda)
-        return {'topic_dist': topic_dist,
+        top_topics = np.argsort(topic_dist)[::-1][:3]
+        print('top_topics', top_topics)
+        # topic_divergence = self._topic_divergence(message_ids, id2msg,
+        #                                           dictionary, lda)
+        return {# 'topic_dist': topic_dist,
                 'topic_terms': topic_terms,
-                'topic_divergence': topic_divergence}
+                'top_topics': top_topics
+                # 'topic_divergence': topic_divergence
+                }
 
     def frequent_terms(self, interactions, top_k=10):
         id2msg = {}
@@ -181,6 +189,25 @@ def frequent_terms(self, interactions, top_k=10):
         print 'frequent_terms', terms
         return terms
 
+    def tfidf_terms(self, interactions, dictionary, top_k=10):
+        text = '\n'.join(['{} {}'.format(m['subject'], m['body'])
+                          for m in interactions])
+        tfidf_vec = pkl.load(open('/cs/home/hxiao/code/lst/tmp/tfidf.pkl'))
+        counts = dictionary.doc2bow(
+            IU.tokenize_document(text)
+        )
+        raw_vect = np.zeros(len(dictionary.keys()))
+        for word, cnt in counts:
+            raw_vect[word] = cnt
+
+        vect = tfidf_vec.transform([raw_vect])
+        vect = np.asarray(vect.todense()).flatten()
+
+        tfidf_terms = [dictionary[i]
+                       for i in np.argsort(vect)[::-1][:top_k]]
+        print 'tfidf_terms', tfidf_terms
+        return tfidf_terms
+
     def hashtags(self):
         tags = itertools.chain(
             *[self.g.node[n]['hashtags']
@@ -308,6 +335,7 @@ def build_default_summary_kws(interactions, people_info,
     summary_kws = {
         'basic_structure_stats': {},
         'time_span': {},
+        # Deprecated
         'topics': {
             'interactions': interactions,
             'dictionary': dictionary,
@@ -329,10 +357,15 @@ def build_default_summary_kws(interactions, people_info,
             'interactions': interactions,
             'undirected': undirected
         },
-        'frequent_terms': {
-            'interactions': interactions,
-            'top_k': 10
-        }
+        # 'frequent_terms': {
+        #     'interactions': interactions,
+        #     'top_k': 10
+        # },
+        # 'tfidf_terms': {
+        #     'interactions': interactions,
+        #     'dictionary': dictionary,
+        #     'top_k': 10
+        # }
     }
     return summary_kws
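
The count-vector -> tf-idf -> top-k step inside tfidf_terms, shown on toy data. This assumes the unpickled object behaves like a scikit-learn TfidfTransformer (dense 2-D input, sparse output), which the transform([raw_vect]) / todense() calls suggest; the real dictionary and transformer come from gensim and the pickle written in interactions.py:

    import numpy as np
    from sklearn.feature_extraction.text import TfidfTransformer

    counts = np.array([[3, 0, 1, 2]])          # one document over a 4-word vocabulary
    vocab = ['tax', 'letter', 'ship', 'war']   # hypothetical id -> word mapping

    tfidf = TfidfTransformer().fit(counts)     # stand-in for the pickled vectorizer
    vect = np.asarray(tfidf.transform(counts).todense()).flatten()

    top_k = 2
    print [vocab[i] for i in np.argsort(vect)[::-1][:top_k]]  # ['tax', 'war']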

print_topics.py

Lines changed: 7 additions & 4 deletions
@@ -24,9 +24,12 @@ def get_topic_terms(model, topicid, topn, id2token):
 # for i in xrange(m.num_topics):
 #     print(' '.join(get_topic_terms(m, i, 10, m.id2word)))
 
-s = '\n'.join([' '.join([w for _, w in lst])
-               for lst in m.show_topics(num_topics=-1, formatted=False)]
-)
+for i, lst in enumerate(m.show_topics(num_topics=-1, formatted=False)):
+    print '{}: {}'.format(i, ' '.join([w for _, w in lst]))
 
-print s
+# s = '\n'.join([' '.join([w for _, w in lst])
+#                for lst in m.show_topics(num_topics=-1, formatted=False)]
+# )
+
+# print s

scripts/make_json_data_for_d3.sh

mode changed: 100644 → 100755
Lines changed: 45 additions & 37 deletions
@@ -11,59 +11,67 @@
 # - original graph is not allowed for undirected case
 
 if [ -z $1 ]; then
-    echo "dataset name is not given"
-    exit -1
+    echo "dataset name is not given"
+    exit -1
 fi
 
 
 dataset=$1
 pickle_dir=tmp/${dataset}
 extra=$2
 
-output_dir="/cs/home/hxiao/public_html/event_html/data/${dataset}"
 metadata_dir="/cs/home/hxiao/public_html/event_html/data/${dataset}"
 
-if [ ! -d $output_dir ]; then
-    mkdir -p ${output_dir}
+output_dir="html/data/${dataset}"
+remote_host="shell.cs.helsinki.fi"
+remote_dir="/cs/home/hxiao/public_html/event_html/data/${dataset}"
+
+if [ ! -d "${output_dir}/event/original_graph" ]; then
+    mkdir -p ${output_dir}/event/original_graph
+fi
+
+if [ ! -d "${output_dir}/event/meta_graph" ]; then
+    mkdir -p ${output_dir}/event/meta_graph
 fi
 
 for p in $(ls ${pickle_dir}/result-*.pkl); do
-    echo "${p}"
-    # just events
-    output_name=$(basename ${p})
-    output_name="${output_name%.*}.json"
+    echo "${p}"
+    output_name=$(basename ${p})
+    output_name="${output_name%.*}.json"
 
-    if [ ! -f "${output_dir}/event/original_graph/${output_name}" ]; then
-        echo 'dumping event to original graph'
-        python dump_events_to_json.py \
-            --candidate_tree_path ${p} \
-            --dirname "${output_dir}/event/original_graph" \
-            --interactions_path data/${dataset}/interactions.* \
-            --people_path data/${dataset}/people.* \
-            --to_original_graph \
-            -k 5 \
-            ${extra}
-    else
-        echo "original graph exists"
-    fi
+    if [ ! -f "${output_dir}/event/original_graph/${output_name}" ]; then
+        echo 'dumping event to original graph'
+        python dump_events_to_json.py \
+            --candidate_tree_path ${p} \
+            --dirname "${output_dir}/event/original_graph" \
+            --interactions_path data/${dataset}/interactions.* \
+            --people_path data/${dataset}/people.* \
+            --to_original_graph \
+            -k 10 \
+            ${extra}
+    else
+        echo "original graph exists"
+    fi
 
-    if [ ! -f "${output_dir}/event/meta_graph/${output_name}" ]; then
-        echo 'dumping event to meta graph'
-        python dump_events_to_json.py \
-            --candidate_tree_path ${p} \
-            --dirname "${output_dir}/event/meta_graph" \
-            --interactions_path data/${dataset}/interactions.* \
-            --people_path data/${dataset}/people.* \
-            -k 5 \
-            ${extra}
-    else
-        echo "meta graph exists"
-    fi
+    if [ ! -f "${output_dir}/event/meta_graph/${output_name}" ]; then
+        echo 'dumping event to meta graph'
+        python dump_events_to_json.py \
+            --candidate_tree_path ${p} \
+            --dirname "${output_dir}/event/meta_graph" \
+            --interactions_path data/${dataset}/interactions.* \
+            --people_path data/${dataset}/people.* \
+            -k 10 \
+            ${extra}
+    else
+        echo "meta graph exists"
+    fi
 done
 
 echo "dumping event names..."
 python dump_all_event_json_names.py \
-    ${output_dir}/event/meta_graph \
-    ${output_dir}/event_names.json
+    ${output_dir}/event/meta_graph \
+    ${output_dir}/event_names.json
+
+rsync -vr ${output_dir}/ ${remote_host}:${remote_dir}/
 
-chmod -R a+rx ${output_dir}
+ssh ${remote_host} "chmod -R a+rx ${remote_dir}"

scripts/make_json_data_for_timeline.sh

mode changed: 100644 → 100755
Lines changed: 10 additions & 4 deletions
@@ -7,7 +7,10 @@ dataset=$1
 pickle_dir="tmp/${dataset}"
 extra=$2
 
-output_dir="/cs/home/hxiao/public_html/event_html/data/${dataset}"
+# output_dir="/cs/home/hxiao/public_html/event_html/data/${dataset}"
+output_dir="html/data/${dataset}"
+remote_host="shell.cs.helsinki.fi"
+remote_dir="/cs/home/hxiao/public_html/event_html/data/${dataset}"
 
 if [ ! -d "${output_dir}/timeline" ]; then
     mkdir -p "${output_dir}/timeline"
@@ -36,7 +39,10 @@ done
 
 echo "dumping timeline names..."
 python dump_all_event_json_names.py \
-    ${output_dir}/timeline \
-    ${output_dir}/timeline_names.json
+    ${output_dir}/timeline \
+    ${output_dir}/timeline_names.json
 
-chmod -R a+rx /cs/home/hxiao/public_html/event_html/data
+# chmod -R a+rx /cs/home/hxiao/public_html/event_html/data
+rsync -vr ${output_dir}/timeline ${remote_host}:${remote_dir}/
+rsync -v ${output_dir}/timeline_names.json ${remote_host}:${remote_dir}/
+ssh ${remote_host} "chmod -R a+rx ${remote_dir}"

scripts/train_letter_lda.sh

Lines changed: 1 addition & 2 deletions
@@ -13,5 +13,4 @@ if [ -z $1 ]; then
 else
     arg="dump_msg"
 fi
-./scripts/train_lda.sh letter_18c $arg 20 200 -1
-# "-s /cs/home/hxiao/code/lst/data/letter_18c_stopwords.txt"
+./scripts/train_lda.sh letter $arg 15 500 -1 "-s /cs/home/hxiao/code/lst/data/letter_18c_stopwords.txt"

util.py

Lines changed: 4 additions & 3 deletions
@@ -93,9 +93,10 @@ def load_summary_related_data(interactions_path, people_path,
     dictionary = gensim.corpora.dictionary.Dictionary.load(
         corpus_dict_path
     )
-    lda = gensim.models.ldamodel.LdaModel.load(
-        lda_model_path
-    )
+    # lda = gensim.models.ldamodel.LdaModel.load(
+    #     lda_model_path
+    # )
+    lda = gensim.models.wrappers.LdaMallet.load(lda_model_path)
     return interactions, people_info, dictionary, lda
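
This swap is what forces the topics() change in meta_graph_stat.py above: a Mallet-wrapped model has no .state, so the topic-word matrix has to come from load_word_topics()/wordtopics instead of state.get_lambda(). A minimal sketch, assuming an old gensim with the Mallet wrapper and a model previously saved via LdaMallet.save (the path is hypothetical):

    import gensim

    lda = gensim.models.wrappers.LdaMallet.load('/path/to/mallet_lda')  # hypothetical path
    if not hasattr(lda, 'wordtopics'):
        lda.load_word_topics()  # re-reads the Mallet state file
    beta = lda.wordtopics       # num_topics x vocab_size topic-word weight matrix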

viz_util.py

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ def to_d3_graph(g):
     data = {'nodes': [], 'edges': []}
     for n in g.nodes_iter():
         node = g.node[n]
+        # print('node', node)
         for f in ('topics', 'bow', 'hashtag_bow'):
             if f in node:
                 del node[f]
