Skip to content

Commit 7cd747a

Browse files
committed
Initial commit.
0 parents  commit 7cd747a

38 files changed

+1231
-0
lines changed

AddHeaderToSedef.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/usr/bin/env python
2+
import sys
3+
print("#"+"\t".join(["chr1", "start1", "end1","chr2", "start2","end2","name","score","strand1","strand2","max_len","aln_len", "comment","aln_len","indel_a","indel_b","alnB","matchB","mismatchB","transitionsB","transversions","fracMatch","fracMatchIndel","jck","k2K","aln_gaps","uppercaseA","uppercaseB","uppercaseMatches","aln_matches","aln_mismatches","aln_gaps","aln_gap_bases","cigar","filter_score"]))
4+
for line in sys.stdin:
5+
sys.stdout.write(line)

AlignIsoforms.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/usr/bin/env bash
2+
bedFile=$1
3+
gencode=$2
4+
genome=$3
5+
#set -v
6+
#set -x
7+
mkdir -p testing
8+
for gene in `cut -f 4 $bedFile | uniq`; do
9+
res=$gene
10+
for rgn in `grep "$gene" $bedFile | awk '{ print $13"\t"$14"\t"$15; print $16"\t"$17"\t"$18;}' | bedtools sort | bedtools merge | awk '{ print $1":"$2"-"$3;}'`; do
11+
12+
samtools faidx $gencode $gene > testing/transcript.fa
13+
samtools faidx $genome $rgn > testing/genome.fa
14+
dv=`minimap2 -x splice -O 8,20 testing/genome.fa testing/transcript.fa | awk '{ for (i=1; i <= NF; ++i) { if (substr($i,0,2) == "dv") print(substr($i,6)) } }'`
15+
res="$res\t$rgn\t $dv"
16+
done
17+
echo -e $res
18+
done
19+
20+

BamToFreq.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/usr/bin/env bash
2+
rgn=`echo $2 | tr ":-" "__"`
3+
4+
5+
bamToFreq $1 $2 $3 --covbed | \
6+
awk '{ if ($3 > 5) print;}' | \
7+
awk '{ print $1"\t"$2"\t"$2+1"\t"$3;}' | \
8+
bedtools merge -c 4,4,4 -o mean,stdev,mode > $4.$rgn.rgn 2> /dev/null
9+
10+
true

BedToCoverage.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#!/usr/bin/env python
2+
3+
import numpy as np
4+
import argparse
5+
import sys
6+
ap = argparse.ArgumentParser(description="transform a bed into cvoerage.")
7+
8+
ap.add_argument("bed", help="Input bed file.")
9+
ap.add_argument("bin", help="Bin size", type=int)
10+
ap.add_argument("fai", help="Genome fai file.")
11+
ap.add_argument("--out", help="Output file", default="/dev/stdout")
12+
ap.add_argument("--minLength", help="Only consider alignments of this length", type=int, default=0)
13+
ap.add_argument("--counts", help="Print bin counts, not whole bed.", default=False, action='store_true')
14+
ap.add_argument("--totals", help="Print totals in bins, not coverage.", default=False, action='store_true')
15+
args=ap.parse_args()
16+
17+
faiFile = open(args.fai)
18+
bedFile = open(args.bed)
19+
outFile = open(args.out, 'w')
20+
21+
fai = { line.split()[0]: int(line.split()[1]) for line in faiFile }
22+
23+
chroms = {}
24+
for chrom in fai.keys():
25+
nBins = fai[chrom]/args.bin + np.ceil(float(fai[chrom]%args.bin)/args.bin) + 1
26+
if nBins > 0:
27+
chroms[chrom] = np.zeros(int(nBins), dtype=np.int32)
28+
29+
nReads = 0
30+
31+
for bedLine in bedFile:
32+
vals = bedLine.split()
33+
chrom = vals[0]
34+
if (chrom not in chroms):
35+
continue
36+
gStart = int(vals[1])
37+
gEnd = int(vals[2])
38+
start = int(gStart/args.bin)
39+
end = int(gEnd/args.bin + 1)
40+
if (end + 1 >= len(chroms[chrom])):
41+
continue
42+
if (gEnd - gStart > args.minLength):
43+
if (args.totals == False):
44+
chroms[chrom][start] +=1
45+
chroms[chrom][end+1] -=1
46+
nReads +=1
47+
ck = list(chroms.keys())
48+
if (args.totals == False):
49+
50+
for cki in ck:
51+
chrom = chroms[cki]
52+
cov = 0
53+
curInc = 0
54+
for i in range(0,len(chrom)-1):
55+
curInc = chrom[i]
56+
cov += curInc
57+
chrom[i] = cov
58+
59+
ck.sort()
60+
61+
if (args.counts):
62+
counts = {}
63+
for chrom in ck:
64+
chromBins = chroms[chrom]
65+
for i in range(0,len(chromBins)-1):
66+
if (chromBins[i] not in counts):
67+
counts[chromBins[i]] = 1
68+
else:
69+
counts[chromBins[i]]+=1
70+
for c,v in counts.iteritems():
71+
outFile.write("{}\t{}\n".format(c,v))
72+
sys.exit(0)
73+
74+
75+
for chrom in ck:
76+
nBins = len(chroms[chrom])-1
77+
lines = ""
78+
chromBins = chroms[chrom]
79+
for i in range(0,nBins):
80+
lines += "{}\t{}\t{}\t{}\n".format(chrom, i*args.bin, min((i+1)*args.bin, fai[chrom]), chromBins[i])
81+
if ( (i+1)%100000 == 0 or i == nBins-1):
82+
outFile.write(lines)
83+
lines = ""
84+
# sys.stderr.write("{} {}\n".format(chrom, i+1))

CombineCoverage.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/usr/bin/env python
2+
import sys
3+
import argparse
4+
ap=argparse.ArgumentParser(description="Combine multiple rnaseq peaks into one file.")
5+
ap.add_argument("--out", help="Output file", required=True)
6+
ap.add_argument("--input", help="Input files", required=True, nargs="+")
7+
args=ap.parse_args()
8+
9+
inFile = [open(v) for v in args.input ]
10+
lines = [f.readlines() for f in inFile ]
11+
12+
13+
outFile = open(args.out, 'w')
14+
15+
16+

CombineMask.cpp

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#include <cinttypes>
2+
3+
#include "htslib/kseq.h"
4+
5+
#include <zlib.h>
6+
7+
KSEQ_INIT(gzFile, gzread);
8+
9+
#include <vector>
10+
#include <string>
11+
#include <iostream>
12+
#include <fstream>
13+
14+
using namespace std;
15+
int main(int argc, char* argv[]) {
16+
if (argc == 1) {
17+
cout << "usage: comask output.fasta input1.fa input2.fa ..." << endl;
18+
exit(1);
19+
}
20+
21+
string outFileName = argv[1];
22+
ifstream testFile(outFileName.c_str());
23+
if (testFile.good()) {
24+
cout << "ERROR, output file " << outFileName << " already exists." << endl;
25+
exit(1);
26+
}
27+
ofstream outFile(outFileName.c_str());
28+
vector< string > fileNames;
29+
vector< gzFile > fastaFiles;
30+
vector< kseq_t *> ks;
31+
32+
for (int i = 2; i < argc; i++) {
33+
fileNames.push_back(argv[i]);
34+
fastaFiles.push_back(gzopen(argv[i], "r"));
35+
ks.push_back(kseq_init(fastaFiles[fastaFiles.size()-1]));
36+
}
37+
38+
if (ks.size() == 0) {
39+
cout << "No input files " << endl;
40+
exit(1);
41+
}
42+
while (kseq_read(ks[0]) >= 0) {
43+
cerr << ks[0]->name.s<< endl;
44+
for (int j = 1; j < ks.size(); j++) {
45+
if (kseq_read(ks[j]) == 0) {
46+
cout << "ERROR, could not fetch sequence " << ks[0]->name.s << " from " << fastaFiles[j] << endl;
47+
exit(0);
48+
}
49+
if (strcmp(ks[j]->name.s, ks[0]->name.s) != 0) {
50+
cout << "ERROR, input file " << fileNames[j] << " is out of sync with " << fileNames[0] << endl;
51+
exit(0);
52+
}
53+
}
54+
for (int p=0; p < ks[0]->seq.l; p ++) {
55+
for (int j=1; j < ks.size(); j++) {
56+
if (ks[j]->seq.s[p] >= 'a' && ks[j]->seq.s[p] <= 'z') {
57+
ks[0]->seq.s[p] = ks[j]->seq.s[p];
58+
}
59+
}
60+
}
61+
outFile << ">" << ks[0]->name.s << endl;
62+
int p=0;
63+
while (p < ks[0]->seq.l) {
64+
int end = min(p+60, (int) ks[0]->seq.l);
65+
string sub(&ks[0]->seq.s[p], end-p);
66+
outFile << sub << endl;
67+
p=end;
68+
}
69+
}
70+
71+
}

CombineMasked.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#!/usr/bin/env python
2+
import sys
3+
fileList=sys.argv[1]
4+
originalFai=sys.argv[2]
5+
outputFile=sys.argv[3]
6+
7+
origFaiFile=open(originalFai)
8+
origOrder = [ l.split()[0] for l in origFaiFile]
9+
10+
11+
chroms={}
12+
13+
ifl=open(fileList)
14+
inFiles=ifl.readlines()
15+
16+
for line in inFiles:
17+
18+
19+
tmp=open(line.rstrip())
20+
header=tmp.readline()[1:]
21+
vals=header.split("/")
22+
23+
pre="/".join(vals[:-1])
24+
suf=int(vals[-1])
25+
tmp.close()
26+
27+
if pre not in chroms:
28+
chroms[pre] = {}
29+
chroms[pre][suf] = line.rstrip()
30+
31+
outFile=open(outputFile,'w')
32+
33+
34+
for origContig in origOrder:
35+
if origContig not in chroms:
36+
sys.stderr.write("ERROR: Missing contig: " + origContig + "\n")
37+
sys.exit(1)
38+
pre=origContig
39+
outFile.write(">" + pre + "\n")
40+
nParts=max(chroms[pre].keys())
41+
seq=""
42+
for i in range(0,nParts+1):
43+
partFile=open(chroms[pre][i])
44+
partFile.readline()
45+
seq+="".join([l.strip() for l in partFile.readlines()])
46+
47+
print("chrom " + pre + " has length " + str(len(seq)))
48+
last=int(len(seq)/60)
49+
chromSeq="\n".join([seq[j*60:(j+1)*60] for j in range(0,last) ])
50+
if len(seq) % 60 != 0:
51+
chromSeq += "\n" + seq[last*60:]
52+
chromSeq += "\n"
53+
54+
outFile.write(chromSeq)
55+
56+

CombinedMasked.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/usr/bin/env python
2+
import sys
3+
fileList=sys.argv[1]
4+
outputFile=sys.argv[2]
5+
6+
chroms={}
7+
8+
ifl=open(fileList)
9+
inFiles=ifl.readlines()
10+
11+
for line in inFiles:
12+
13+
14+
tmp=open(line.rstrip())
15+
header=tmp.readline()[1:]
16+
vals=header.split("/")
17+
pre="/".join(vals[:-1])
18+
suf=int(vals[-1])
19+
print(suf)
20+
tmp.close()
21+
22+
if pre not in chroms:
23+
chroms[pre] = {}
24+
chroms[pre][suf] = line.rstrip()
25+
26+
outFile=open(outputFile,'w')
27+
28+
29+
for pre in chroms:
30+
outFile.write(">" + pre + "\n")
31+
nParts=max(chroms[pre].keys())
32+
seq=""
33+
for i in range(0,nParts+1):
34+
partFile=open(chroms[pre][i])
35+
print("appending " + str(pre) + " " + str(i))
36+
partFile.readline()
37+
seq+="".join([l.strip() for l in partFile.readlines()])
38+
print("chrom " + pre + " has length " + str(len(seq)))
39+
40+
last=int(len(seq)/60)
41+
chromSeq="\n".join([seq[j*60:(j+1)*60] for j in range(0,last) ])
42+
if len(seq) % 60 != 0:
43+
chromSeq += "\n" + seq[last*60:]
44+
chromSeq += "\n"
45+
46+
outFile.write(chromSeq)
47+
48+

Compar2.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/usr/bin/env bash
2+
3+
read -r line
4+
echo $line
5+
ra=`echo $line | awk '{ print $1":"$2"-"$3;}'`
6+
rb=`echo $line | awk '{ print $4":"$5"-"$6;}'`
7+
echo $line > testing/rep.bed
8+
samtools faidx assembly.repeat_masked.sd.fasta $ra > testing/a.fasta
9+
samtools faidx assembly.repeat_masked.sd.fasta $rb > testing/b.fasta
10+
11+
minimap2 --cs=long testing/a.fasta testing/b.fasta | paftools.js view -
12+

CountRep.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#include <iostream>
2+
#include <string>
3+
#include <vector>
4+
using namespace std;
5+
int main() {
6+
int nl=0, nh=0, other=0;
7+
vector<int> low(256,0), high(256,0);
8+
string prevTitle = "";
9+
while (cin) {
10+
string line;
11+
cin >> line;
12+
if (line.size() > 0 and line[0] == '>') {
13+
if (prevTitle != "") {
14+
int s=nl+nh;
15+
if (s > 0) {
16+
cout << prevTitle << "\t"<< nl / ((float)s) << "\t" << nl << "\t" << nh << endl;
17+
}
18+
else {
19+
cout << prevTitle << "\t0\t0\t0" << endl;
20+
}
21+
}
22+
nl=0; nh=0;
23+
prevTitle = line;
24+
}
25+
for (int i=0; i < line.size(); i++) {
26+
if (line[i] >= 'a' && line[i] <= 'z') {
27+
nl+=1;
28+
}
29+
else {
30+
nh+=1;
31+
}
32+
}
33+
}
34+
int s=nl+nh;
35+
if (s > 0) {
36+
cout << prevTitle << "\t"<< nl / ((float)s) << "\t" << nl << "\t" << nh<< endl;
37+
}
38+
else {
39+
cout << prevTitle << "\t0\t0\t0" << endl;
40+
}
41+
}
42+

0 commit comments

Comments
 (0)