-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvirtree_multihit.py
136 lines (111 loc) · 4.88 KB
/
virtree_multihit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python
import sys, os, re, shlex, subprocess, pandas, argparse
from collections import defaultdict
#hmmdb = "hmm/vog_05p.hmm"
# run HMMER3
def run_hmmer(input_file, cpus, evalue, database):
output_file = re.sub(".faa", ".hmmout", input_file)
cmd = "hmmsearch -E "+ str(evalue) +" --cpu "+ cpus +" --tblout "+ output_file +" "+ database +" "+ input_file
print(cmd)
cmd2 = shlex.split(cmd)
subprocess.call(cmd2, stdout=open("out.txt", "w"), stderr=open("err.txt", "w"))
os.remove("out.txt")
return output_file
# define function for parsing HMMER3 output
def parse_hmmout(hmmout):
input = open(hmmout, "r")
final_dict = defaultdict(int)
hit_dict = {}
#hit_dict = defaultdict(lambda:"no_annot")
#hit_dict = defaultdict(list)
bit_dict = defaultdict(float)
for i in input.readlines():
line = i.rstrip()
if line.startswith("#"):
pass
else:
newline = re.sub("\s+", "\t", line)
tabs = newline.split("\t")
protein = tabs[0]
hit = tabs[2]
eval = float(tabs[4])
score = float(tabs[5])
if score > 30:
# if score > bit_dict[protein]:
if protein in hit_dict:
new_entry = hit_dict[protein] +";"+ hit
hit_dict[protein] = new_entry
else:
bit_dict[protein] = score
hit_dict[protein] = hit
# else:
# pass
else:
pass
for i in hit_dict:
vog = hit_dict[i]
final_dict[vog] +=1
return final_dict
# main function that runs the program
def run_program(inputdir, project, database, minhit, evalue, cpus, redo):
df = pandas.DataFrame()
for i in os.listdir(inputdir):
if i.endswith(".faa"):
#name = re.sub("_genomic.fna.faa", "", i)
inputfile = os.path.join(inputdir, i)
#print(inputfile)
#protein_file = predict_proteins(inputfile, inputdir)
hmmout = run_hmmer(inputfile, cpus, evalue, database)
hit_dict = parse_hmmout(hmmout)
# fulllist = i.split(".")
# genomelist = fulllist[0:-2]
# genome = ".".join(genomelist)
genome = re.sub(".faa", "", i)
#print(genome)
s1 = pandas.DataFrame(pandas.Series(hit_dict, name = genome))
df = pandas.concat([df, s1], axis=1, sort=True)
df = df.fillna(0)
df2 = df.clip(0,1)
prof_tbl = project + "_profile.tsv"
df2.to_csv(prof_tbl,sep="\t")
#if BINARY == 1:
# df2 = df.clip(upper=1)
#else:
# df2 = df.clip(upper=9)
outputfile = project + "_bin.fna"
o = open(outputfile, "w")
for (columnName, columnData) in df2.items():
vogsum = sum([float(n) for n in columnData])
if vogsum >= minhit:
#print(columnName, vogsum)
string = "".join([str(int(n)) for n in columnData])
o.write(">"+ columnName +"\n"+ string +"\n")
########################################################################
##### use argparse to run through the command line options given #######
########################################################################
def main(argv=None):
args_parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description="hmmtree: Part of a workflow for making trees based on protein family presence/absence \nFrank O. Aylward, Virginia Tech Department of Biological Sciences <faylward at vt dot edu>", epilog='*******************************************************************\n\n*******************************************************************')
args_parser.add_argument('-i', '--inputdir', required=True, help='Input folder of protein FASTA files (ending in .faa)')
args_parser.add_argument('-p', '--project', required=True, help='project name for outputs')
args_parser.add_argument('-db', '--database', required=False, default="hmm/vog_05p.hmm", help='PATH to HMM database to use. Default is hmm/vog_05p.hmm (default), vog_025p.hmm, and vog_005p.hmm. See README for details')
args_parser.add_argument('-g', '--minhit', required=False, default=int(1), help='minimum number of VOG hits that each viral region must have to be reported (default=4)')
args_parser.add_argument('-e', '--evalue', required=False, default=str(1e-20), help='e-value that is passed to HMMER3 for the VOG hmmsearch (default=1e-10)')
args_parser.add_argument('-t', '--cpus', required=False, default=str(1), help='number of cpus to use for the HMMER3 search')
args_parser.add_argument('-r', '--redo', type=bool, default=False, const=True, nargs='?', help='run without re-launching prodigal and HMMER3 (for quickly re-calculating outputs with different parameters if you have already run once)')
args_parser.add_argument('-v', '--version', action='version', version='ViralRecall v. 2.1')
args_parser = args_parser.parse_args()
# set up object names for input/output/database folders
inputdir = args_parser.inputdir
project = args_parser.project
database = args_parser.database
minhit = int(args_parser.minhit)
evalue = str(args_parser.evalue)
cpus = args_parser.cpus
redo = args_parser.redo
project = project.rstrip("/")
run_program(inputdir, project, database, minhit, evalue, cpus, redo)
return 0
if __name__ == '__main__':
status = main()
sys.exit(status)
# end