
Commit eef62ae

committed: viz stuff moved to ukko
1 parent 9a3e104

12 files changed (+130 −65 lines)

dump_events_to_json.py

Lines changed: 6 additions & 4 deletions
@@ -44,15 +44,17 @@ def run(candidate_tree_path,
 
         e.node[n]['body'] = id2interaction[n]['body']
         e.node[n]['root'] = (n == root)
+        e.node[n]['datetime'] = str(e.node[n]['datetime'])
 
-    # some simple clustering
-    assignment = greedy_clustering_on_graph(e)
-    for n in e.nodes_iter():
-        e.node[n]['cluster_label'] = assignment[n]
+    # # some simple clustering
+    # assignment = greedy_clustering_on_graph(e)
+    # for n in e.nodes_iter():
+    #     e.node[n]['cluster_label'] = assignment[n]
 
     if to_original_graph:
        events = map(convert_to_original_graph,
                     events)
+        # import pdb; pdb.set_trace()
 
     d3_events = [to_d3_graph(e)
                  for e in events]
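
Note on the new str() cast: Python's json module cannot serialize datetime objects, so stringifying the 'datetime' attribute here presumably keeps the graph dumpable later in this script. A minimal repro of the failure and the fix (Python 2, matching the codebase):

    import json
    from datetime import datetime

    try:
        json.dumps({'datetime': datetime(2016, 1, 1)})
    except TypeError as e:
        print e  # "... is not JSON serializable"

    # stringify first, as the changed line now does
    print json.dumps({'datetime': str(datetime(2016, 1, 1))})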

dump_vis_timeline_data.py

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@ def run(cand_trees, k, summary_kws, undirected):
         'id': group_id,
         'terms': summ['topics']['topic_terms'],
         # 'terms': summ['frequent_terms'],
+        # 'terms': summ['tdidf_terms'],
         'participants': dict(
             summ['participants']['participant_count']
         ),

interactions.py

Lines changed: 5 additions & 0 deletions
@@ -9,6 +9,7 @@
 import networkx as nx
 import pandas as pd
 
+import cPickle as pkl
 from itertools import izip
 from datetime import datetime as dt
 from memory_profiler import profile
@@ -354,6 +355,10 @@ def add_bow_to_graph(cls, g, dictionary):
             if i % 1000 == 0:
                 logger.debug('adding BoW: {} / {}'.format(i, N))
             g.node[n]['bow'] = tfidf_mat[node2row[n], :]
+
+        print('dumping tfidf vectorizer')
+        pkl.dump(tfidf, open('/cs/home/hxiao/code/lst/tmp/tfidf.pkl', 'w'))
+
         return g
 
     @classmethod
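
The vectorizer pickled here is read back by the new tfidf_terms method in meta_graph_stat.py (same hard-coded path). A minimal sketch of that round-trip, assuming tfidf is a fitted scikit-learn transformer object (the commit never shows how it is built); binary mode plus an explicit protocol is safer than the text-mode 'w' used above:

    import cPickle as pkl

    TFIDF_PATH = '/cs/home/hxiao/code/lst/tmp/tfidf.pkl'  # path hard-coded in the commit

    def dump_tfidf(tfidf, path=TFIDF_PATH):
        # binary mode + highest protocol avoids text-mode pickle issues
        with open(path, 'wb') as f:
            pkl.dump(tfidf, f, pkl.HIGHEST_PROTOCOL)

    def load_tfidf(path=TFIDF_PATH):
        with open(path, 'rb') as f:
            return pkl.load(f)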

lda/corpora.py

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ def __len__(self):
 for path in arg.stoplist_paths:
     CorpusEnron.stoplist |= load_items_by_line(path.strip())
 print('new length of stoplist: {}'.format(len(CorpusEnron.stoplist)))
-if True:
+if False:
     print('WARN: stoplist empty!')
     CorpusEnron.stoplist = set()

meta_graph.py

Lines changed: 6 additions & 0 deletions
@@ -141,6 +141,12 @@ def convert_to_meta_graph_undirected(node_names, participants, timestamps,
 
 def convert_to_original_graph(mg):
     g = nt.DiGraph()
+    for m in mg.nodes():
+        s = mg.node[m]['sender']
+        g.add_node(s['id'], s)
+        for r in mg.node[m]['recipients']:
+            g.add_node(r['id'], r)
+
     for n in mg.nodes():
         sender = mg.node[n]['sender_id']
         for recipient in mg.node[n]['recipient_ids']:
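
The added g.add_node(s['id'], s) calls rely on the networkx 1.x signature add_node(n, attr_dict=None, **attr): a dict passed positionally becomes the node's attribute dict (this form was removed in networkx 2.0). A hypothetical illustration:

    import networkx as nx  # assumes networkx 1.x, consistent with nodes_iter()/g.node usage elsewhere

    g = nx.DiGraph()
    sender = {'id': 'alice@example.com', 'name': 'Alice'}  # made-up person record
    g.add_node(sender['id'], sender)   # attr_dict as the 2nd positional argument
    print g.node['alice@example.com']  # {'id': 'alice@example.com', 'name': 'Alice'}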

meta_graph_stat.py

Lines changed: 43 additions & 10 deletions
@@ -1,4 +1,5 @@
 import scipy
+import cPickle as pkl
 import numpy as np
 import itertools
 import networkx as nx
@@ -145,9 +146,12 @@ def topics(self, interactions, dictionary, lda, top_k=10):
         #     minimum_probability=0
         # )
         topic_dist = np.asarray([v for _, v in topic_dist])
-
+
         # topic_terms
-        beta = lda.state.get_lambda()
+        if not hasattr(lda, 'wordtopics'):
+            lda.load_word_topics()
+        beta = lda.wordtopics
+        # beta = lda.state.get_lambda()
 
         # normalize and weight by beta dist
         weighted_terms = (
@@ -158,11 +162,15 @@ def topics(self, interactions, dictionary, lda, top_k=10):
 
         topic_terms = [lda.id2word[id] for id in bestn]
 
-        topic_divergence = self._topic_divergence(message_ids, id2msg,
-                                                  dictionary, lda)
-        return {'topic_dist': topic_dist,
+        top_topics = np.argsort(topic_dist)[::-1][:3]
+        print('top_topics', top_topics)
+        # topic_divergence = self._topic_divergence(message_ids, id2msg,
+        #                                           dictionary, lda)
+        return {# 'topic_dist': topic_dist,
                 'topic_terms': topic_terms,
-                'topic_divergence': topic_divergence}
+                'top_topics': top_topics
+                # 'topic_divergence': topic_divergence
+                }
 
     def frequent_terms(self, interactions, top_k=10):
         id2msg = {}
@@ -181,6 +189,25 @@ def frequent_terms(self, interactions, top_k=10):
         print 'frequent_terms', terms
         return terms
 
+    def tfidf_terms(self, interactions, dictionary, top_k=10):
+        text = '\n'.join(['{} {}'.format(m['subject'], m['body'])
+                          for m in interactions])
+        tfidf_vec = pkl.load(open('/cs/home/hxiao/code/lst/tmp/tfidf.pkl'))
+        counts = dictionary.doc2bow(
+            IU.tokenize_document(text)
+        )
+        raw_vect = np.zeros(len(dictionary.keys()))
+        for word, cnt in counts:
+            raw_vect[word] = cnt
+
+        vect = tfidf_vec.transform([raw_vect])
+        vect = np.asarray(vect.todense()).flatten()
+
+        tfidf_terms = [dictionary[i]
+                       for i in np.argsort(vect)[::-1][:top_k]]
+        print 'tfidf_terms', tfidf_terms
+        return tfidf_terms
+
     def hashtags(self):
         tags = itertools.chain(
             *[self.g.node[n]['hashtags']
@@ -308,6 +335,7 @@ def build_default_summary_kws(interactions, people_info,
     summary_kws = {
         'basic_structure_stats': {},
         'time_span': {},
+        # Deprecated
         'topics': {
             'interactions': interactions,
             'dictionary': dictionary,
@@ -329,10 +357,15 @@ def build_default_summary_kws(interactions, people_info,
             'interactions': interactions,
             'undirected': undirected
         },
-        'frequent_terms': {
-            'interactions': interactions,
-            'top_k': 10
-        }
+        # 'frequent_terms': {
+        #     'interactions': interactions,
+        #     'top_k': 10
+        # },
+        # 'tfidf_terms': {
+        #     'interactions': interactions,
+        #     'dictionary': dictionary,
+        #     'top_k': 10
+        # }
     }
     return summary_kws
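
The count-vector -> tf-idf -> top-k step inside tfidf_terms, shown on toy data. This assumes the unpickled object behaves like a scikit-learn TfidfTransformer (dense 2-D input, sparse output), which the transform([raw_vect]) / todense() calls suggest; the real dictionary and transformer come from gensim and the pickle written in interactions.py:

    import numpy as np
    from sklearn.feature_extraction.text import TfidfTransformer

    counts = np.array([[3, 0, 1, 2]])          # one document over a 4-word vocabulary
    vocab = ['tax', 'letter', 'ship', 'war']   # hypothetical id -> word mapping

    tfidf = TfidfTransformer().fit(counts)     # stand-in for the pickled vectorizer
    vect = np.asarray(tfidf.transform(counts).todense()).flatten()

    top_k = 2
    print [vocab[i] for i in np.argsort(vect)[::-1][:top_k]]  # ['tax', 'war']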

print_topics.py

Lines changed: 7 additions & 4 deletions
@@ -24,9 +24,12 @@ def get_topic_terms(model, topicid, topn, id2token):
 # for i in xrange(m.num_topics):
 #     print(' '.join(get_topic_terms(m, i, 10, m.id2word)))
 
-s = '\n'.join([' '.join([w for _, w in lst])
-               for lst in m.show_topics(num_topics=-1, formatted=False)]
-)
+for i, lst in enumerate(m.show_topics(num_topics=-1, formatted=False)):
+    print '{}: {}'.format(i, ' '.join([w for _, w in lst]))
 
-print s
+# s = '\n'.join([' '.join([w for _, w in lst])
+#                for lst in m.show_topics(num_topics=-1, formatted=False)]
+# )
+
+# print s

scripts/make_json_data_for_d3.sh

mode changed: 100644 → 100755
Lines changed: 45 additions & 37 deletions
@@ -11,59 +11,67 @@
 # - original graph is not allowed for undirected case
 
 if [ -z $1 ]; then
-    echo "dataset name is not given"
-    exit -1
+    echo "dataset name is not given"
+    exit -1
 fi
 
 
 dataset=$1
 pickle_dir=tmp/${dataset}
 extra=$2
 
-output_dir="/cs/home/hxiao/public_html/event_html/data/${dataset}"
 metadata_dir="/cs/home/hxiao/public_html/event_html/data/${dataset}"
 
-if [ ! -d $output_dir ]; then
-    mkdir -p ${output_dir}
+output_dir="html/data/${dataset}"
+remote_host="shell.cs.helsinki.fi"
+remote_dir="/cs/home/hxiao/public_html/event_html/data/${dataset}"
+
+if [ ! -d "${output_dir}/event/original_graph" ]; then
+    mkdir -p ${output_dir}/event/original_graph
+fi
+
+if [ ! -d "${output_dir}/event/meta_graph" ]; then
+    mkdir -p ${output_dir}/event/meta_graph
 fi
 
 for p in $(ls ${pickle_dir}/result-*.pkl); do
-    echo "${p}"
-    # just events
-    output_name=$(basename ${p})
-    output_name="${output_name%.*}.json"
+    echo "${p}"
+    output_name=$(basename ${p})
+    output_name="${output_name%.*}.json"
 
-    if [ ! -f "${output_dir}/event/original_graph/${output_name}" ]; then
-        echo 'dumping event to original graph'
-        python dump_events_to_json.py \
-            --candidate_tree_path ${p} \
-            --dirname "${output_dir}/event/original_graph" \
-            --interactions_path data/${dataset}/interactions.* \
-            --people_path data/${dataset}/people.* \
-            --to_original_graph \
-            -k 5 \
-            ${extra}
-    else
-        echo "original graph exists"
-    fi
+    if [ ! -f "${output_dir}/event/original_graph/${output_name}" ]; then
+        echo 'dumping event to original graph'
+        python dump_events_to_json.py \
+            --candidate_tree_path ${p} \
+            --dirname "${output_dir}/event/original_graph" \
+            --interactions_path data/${dataset}/interactions.* \
+            --people_path data/${dataset}/people.* \
+            --to_original_graph \
+            -k 10 \
+            ${extra}
+    else
+        echo "original graph exists"
+    fi
 
-    if [ ! -f "${output_dir}/event/meta_graph/${output_name}" ]; then
-        echo 'dumping event to meta graph'
-        python dump_events_to_json.py \
-            --candidate_tree_path ${p} \
-            --dirname "${output_dir}/event/meta_graph" \
-            --interactions_path data/${dataset}/interactions.* \
-            --people_path data/${dataset}/people.* \
-            -k 5 \
-            ${extra}
-    else
-        echo "meta graph exists"
-    fi
+    if [ ! -f "${output_dir}/event/meta_graph/${output_name}" ]; then
+        echo 'dumping event to meta graph'
+        python dump_events_to_json.py \
+            --candidate_tree_path ${p} \
+            --dirname "${output_dir}/event/meta_graph" \
+            --interactions_path data/${dataset}/interactions.* \
+            --people_path data/${dataset}/people.* \
+            -k 10 \
+            ${extra}
+    else
+        echo "meta graph exists"
+    fi
 done
 
 echo "dumping event names..."
 python dump_all_event_json_names.py \
-    ${output_dir}/event/meta_graph \
-    ${output_dir}/event_names.json
+    ${output_dir}/event/meta_graph \
+    ${output_dir}/event_names.json
+
+rsync -vr ${output_dir}/ ${remote_host}:${remote_dir}/
 
-chmod -R a+rx ${output_dir}
+ssh ${remote_host} "chmod -R a+rx ${remote_dir}"

scripts/make_json_data_for_timeline.sh

mode changed: 100644 → 100755
Lines changed: 10 additions & 4 deletions
@@ -7,7 +7,10 @@ dataset=$1
 pickle_dir="tmp/${dataset}"
 extra=$2
 
-output_dir="/cs/home/hxiao/public_html/event_html/data/${dataset}"
+# output_dir="/cs/home/hxiao/public_html/event_html/data/${dataset}"
+output_dir="html/data/${dataset}"
+remote_host="shell.cs.helsinki.fi"
+remote_dir="/cs/home/hxiao/public_html/event_html/data/${dataset}"
 
 if [ ! -d "${output_dir}/timeline" ]; then
     mkdir -p "${output_dir}/timeline"
@@ -36,7 +39,10 @@ done
 
 echo "dumping timeline names..."
 python dump_all_event_json_names.py \
-    ${output_dir}/timeline \
-    ${output_dir}/timeline_names.json
+    ${output_dir}/timeline \
+    ${output_dir}/timeline_names.json
 
-chmod -R a+rx /cs/home/hxiao/public_html/event_html/data
+# chmod -R a+rx /cs/home/hxiao/public_html/event_html/data
+rsync -vr ${output_dir}/timeline ${remote_host}:${remote_dir}/
+rsync -v ${output_dir}/timeline_names.json ${remote_host}:${remote_dir}/
+ssh ${remote_host} "chmod -R a+rx ${remote_dir}"

scripts/train_letter_lda.sh

Lines changed: 1 addition & 2 deletions
@@ -13,5 +13,4 @@ if [ -z $1 ]; then
 else
     arg="dump_msg"
 fi
-./scripts/train_lda.sh letter_18c $arg 20 200 -1
-# "-s /cs/home/hxiao/code/lst/data/letter_18c_stopwords.txt"
+./scripts/train_lda.sh letter $arg 15 500 -1 "-s /cs/home/hxiao/code/lst/data/letter_18c_stopwords.txt"

util.py

Lines changed: 4 additions & 3 deletions
@@ -93,9 +93,10 @@ def load_summary_related_data(interactions_path, people_path,
     dictionary = gensim.corpora.dictionary.Dictionary.load(
         corpus_dict_path
     )
-    lda = gensim.models.ldamodel.LdaModel.load(
-        lda_model_path
-    )
+    # lda = gensim.models.ldamodel.LdaModel.load(
+    #     lda_model_path
+    # )
+    lda = gensim.models.wrappers.LdaMallet.load(lda_model_path)
     return interactions, people_info, dictionary, lda
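
This swap is what forces the topics() change in meta_graph_stat.py above: a Mallet-wrapped model has no .state, so the topic-word matrix has to come from load_word_topics()/wordtopics instead of state.get_lambda(). A minimal sketch, assuming an old gensim with the Mallet wrapper and a model previously saved via LdaMallet.save (the path is hypothetical):

    import gensim

    lda = gensim.models.wrappers.LdaMallet.load('/path/to/mallet_lda')  # hypothetical path
    if not hasattr(lda, 'wordtopics'):
        lda.load_word_topics()  # re-reads the Mallet state file
    beta = lda.wordtopics       # num_topics x vocab_size topic-word weight matrix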

viz_util.py

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ def to_d3_graph(g):
     data = {'nodes': [], 'edges': []}
     for n in g.nodes_iter():
         node = g.node[n]
+        # print('node', node)
         for f in ('topics', 'bow', 'hashtag_bow'):
             if f in node:
                 del node[f]
