# pairwise_buildsets.py
import argparse
import glob
import math
import os
import sys
from itertools import combinations

import numpy as np
import pandas as pd

from utilities.common import rbind_all, pairs_to_sets

# Python does not perform tail call optimization, so the recursive set
# construction can exhaust the default recursion limit on large inputs.
# Raising the limit works around this; rewriting the recursion as a loop
# would be the cleaner long-term fix.
sys.setrecursionlimit(2000000)
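# rbind_all and pairs_to_sets come from this project's utilities package. A
# rough sketch of the assumed contract (an assumption, not the actual code):
# rbind_all(frames) behaves like pd.concat(frames, ignore_index=True), and
# pairs_to_sets(pairs, min_size, max_size) unions the (low, high) gene pairs
# into gene sets whose sizes fall within [min_size, max_size].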
parser = argparse.ArgumentParser(
    description='Build gene sets from scores')
parser.add_argument('--low', type=int, default=15,
                    help='Smallest allowable size of gene set')
parser.add_argument('--high', type=int, default=200,
                    help='Largest allowable size of gene set')
parser.add_argument('--count', type=int, default=10,
                    help='Number of sets to attempt to generate')
parser.add_argument('--title', type=str, required=True,
                    help='Title string')
parser.add_argument('--name', type=str, required=True,
                    help='Name of cell type to target')
parser.add_argument('--type', type=str, required=True,
                    help='General or specific cell type')
parser.add_argument('--input', type=str, required=True,
                    help='Path to input score files')
parser.add_argument('--output', type=str, required=True,
                    help='Path to output set id files')
parser.add_argument('--feedback', type=str, required=True,
                    help='Path to feedback file (list of created sets)')
args = parser.parse_args()
# Example arguments for interactive debugging:
# args = parser.parse_args([
#     '--low', '100',
#     '--high', '200',
#     '--count', '300',
#     '--input', 'results/100_200_main',
#     '--title', '100_200_main',
#     '--type', 'General_Cell_Type',
#     '--name', '"B cell"',
#     '--output', 'results/100_200_main',
#     '--feedback', 'results/feedback.sets.tsv'
# ])
# Strip shell quotes and replace spaces with underscores, since some cell
# names (e.g. "B cell") contain spaces.
cell_name = args.name.replace('"', '').replace(' ', '_')
root_filename = '{}.score.*.{}.{}.tsv'.format(
    args.title, args.type, cell_name)
root_glob = os.path.join(args.input, root_filename)
score_paths = glob.glob(root_glob)
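# Fail fast when the glob matched nothing; an empty score_files list would
# otherwise surface later as an opaque error inside rbind_all.
if not score_paths:
    print('No score files matched', root_glob, file=sys.stderr)
    sys.exit(1)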
def temp_read(path):
    """Read one score table; warn and return None on a malformed file."""
    try:
        return pd.read_table(path)
    except Exception:
        print('Failed to read score file:', path, file=sys.stderr)
        return None


# Drop files that failed to parse so rbind_all never sees a None.
score_files = [df for df in map(temp_read, score_paths) if df is not None]
score_data = rbind_all(score_files)
paired_scores = score_data.assign(weighted_score=score_data.frequency * score_data.score) \
    .groupby(['low', 'high'], as_index=False) \
    .agg({'weighted_score': 'sum'}) \
    .sort_values('weighted_score', ascending=False) \
    .reset_index(drop=True)
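# Worked example for the aggregation above: if two score files both contain
# the pair (low='GENE_A', high='GENE_B'), once with frequency=3, score=0.5 and
# once with frequency=1, score=0.2, the pair collapses to a single row with
# weighted_score = 3 * 0.5 + 1 * 0.2 = 1.7, and pairs are ranked by that sum.
# (GENE_A and GENE_B are illustrative names, not values from the data.)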
def set_values(paired_scores, min_size, max_size, head_size):
    """For top head_size paired scores, generate sets that conform to min/max"""
    values = paired_scores.head(head_size)
    scores = values.weighted_score
    current_sets = pairs_to_sets(values, min_size, max_size)
    print("Sizes: ", head_size, list(map(len, current_sets)))
    feedback_frames = []
    for gene_set in current_sets:
        feedback_frames.append(pd.DataFrame({
            'genes_considered': [head_size],
            'genes_in_set': [len(gene_set)],
            'score_max': [np.max(scores)],
            'score_min': [np.min(scores)],
            'score_avg': [np.mean(scores)],
            'score_std': [np.std(scores)]
        }))
    return feedback_frames, current_sets
def unique_sets(existing_sets, dfs, new_sets):
    """Keep only gene sets (and their feedback rows) not already in existing_sets."""
    sets_confirmed = []
    dfs_confirmed = []
    for gene_set, df in zip(new_sets, dfs):
        if gene_set not in existing_sets:
            sets_confirmed.append(gene_set)
            dfs_confirmed.append(df)
    return dfs_confirmed, sets_confirmed
all_sets = []
all_dfs = []
head_sizes = np.arange(paired_scores.shape[0] // args.low) * args.low + args.low
head_sizes = list(filter(lambda head_size: head_size < 70 * args.high, head_sizes))
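# With the defaults (--low 15, --high 200) this tries head sizes 15, 30, 45,
# ... up to (but not including) 70 * 200 = 14000 top-ranked pairs; the 70x
# cap presumably bounds runtime on very large score tables.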
print(head_sizes)
print("number of iterations:", len(head_sizes))
empty_sets = 0
# For each head size, generate candidate sets; keep only sets not seen before.
for idx, head_size in enumerate(head_sizes):
    print(" - ", idx, " ", math.floor(100 * idx / len(head_sizes)), "%")
    df, gene_sets = set_values(
        paired_scores, args.low, args.high, head_size)
    df, gene_sets = unique_sets(all_sets, df, gene_sets)
    if len(gene_sets) < 1:
        # Count iterations that add nothing once some sets exist; after more
        # than five of them, larger head sizes are unlikely to help, so stop.
        if len(all_sets) > 0:
            empty_sets += 1
        if empty_sets > 5:
            print("Stopping early: recent iterations produced no new sets")
            break
        continue
    all_dfs = all_dfs + df
    all_sets = all_sets + gene_sets
    print("Found ", len(all_sets), "sets")
    if len(all_sets) > args.count:
        break
if len(all_dfs) < 1:
    print("No valid gene sets generated")
    sys.exit(1)
feedback = rbind_all(all_dfs)
feedback = feedback.assign(
    cell_type=args.type, cell_name=cell_name, title=args.title)
feedback.index.name = 'index'
feedback.to_csv(args.feedback, sep='\t',
                encoding='utf-8')
for idx, gene_set in enumerate(all_sets):
    gene_set_lst = list(gene_set)
    frame = pd.DataFrame({'gene': gene_set_lst})
    filename_set = '{}.set.{}.{}.{:05d}.tsv'.format(
        args.title, args.type, cell_name, idx)
    path_set = os.path.join(args.output, filename_set)
    frame.to_csv(path_set, sep='\t',
                 encoding='utf-8', index=False)
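# Example invocation (values are illustrative, echoing the debug block above):
#   python pairwise_buildsets.py \
#       --low 100 --high 200 --count 300 \
#       --title 100_200_main --type General_Cell_Type --name 'B cell' \
#       --input results/100_200_main --output results/100_200_main \
#       --feedback results/feedback.sets.tsv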