-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathlastp_aai.py
142 lines (115 loc) · 4.45 KB
/
lastp_aai.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
############################################################################################
##### This script computes pairwise homology between proteins in two genomes using LAST ####
##### It requires that LAST is installed and can be called from the working directory. ####
##### Usage: pairwise_lastp.py <input_folder> <output_folder> ####
############################################################################################
import os, sys, re, subprocess, shlex
from collections import defaultdict
from Bio import SeqIO
import numpy as np
input_folder = sys.argv[1]
output_folder = sys.argv[2]
length_dict = {}
proteins_in_genome = defaultdict(list)
# define lastout parsing function
def parse_lastout(lastout):
bit_dict = defaultdict(float)
hit_dict = defaultdict(list)
perc_dict = defaultdict(float)
done = {}
handle = open(lastout, "r")
for j in handle.readlines():
if j.startswith("#"):
pass
else:
line = j.rstrip()
tabs = line.split("\t")
query = tabs[0]
hit = tabs[1]
perc = float(tabs[2])
evalue = float(tabs[10])
bit = float(tabs[11])
aln = float(tabs[3])
# if query in done:
# pass
# else:
done[query] = query
if evalue < 1e-3:
if bit >= bit_dict[query]:
hit_dict[query].append(hit)
bit_dict[query] = bit
perc_dict[query] = perc
else:
pass
return hit_dict, bit_dict, perc_dict
# Iterate through .faa files and make sure that the LAST index files have been computed.
folders = os.listdir(input_folder)
print "Formatting LAST databases..."
for faa in folders:
if faa.endswith(".faa"):
prefix = re.sub(".faa", "", faa)
db = os.path.join(input_folder, prefix)
filepath = os.path.join(input_folder, faa)
# get length of each protein and put it in a dictionary. This is used for computing alingment lengths later.
for prot in SeqIO.parse(filepath, "fasta"):
length_dict[prot.id] = len(prot.seq)
proteins_in_genome[prefix].append(prot.id)
lastpath = os.path.join(input_folder, prefix+".suf")
cmd = "lastdb -p "+ db +" "+ filepath
cmd2 = shlex.split(cmd)
subprocess.call(cmd2, stdin=open("stdout.txt", "w"), stdout=open("stderr.txt", "w"))
done = {}
for faa in folders:
if faa.endswith(".faa"):
prefix = re.sub(".faa", "", faa)
db = os.path.join(input_folder, prefix)
filepath = os.path.join(input_folder, faa)
for faa2 in folders:
if faa2.endswith(".faa"):
prefix2 = re.sub(".faa", "", faa2)
db2 = os.path.join(input_folder, prefix2)
filepath2 = os.path.join(input_folder, faa2)
if prefix +"__"+ prefix2 in done:
pass
elif prefix == prefix2:
pass
else:
done[prefix +"__"+ prefix2] = prefix +"__"+ prefix2
done[prefix2 +"__"+ prefix] = prefix2 +"__"+ prefix
# run first lastout
output1 = os.path.join(output_folder, prefix +"__"+ prefix2 +".lastout")
cmd = "lastal -P 8 -m 500 "+ db2 +" "+ filepath +" -f BlastTab"
cmd2 = shlex.split(cmd)
# subprocess.call(cmd2, stdin=open("stderr.txt", "w"), stdout=open(output1, "w"))
# now run reciprocal
output2 = os.path.join(output_folder, prefix2 +"__"+ prefix +".lastout")
cmd = "lastal -P 8 -m 500 "+ db +" "+ filepath2 +" -f BlastTab"
cmd2 = shlex.split(cmd)
# subprocess.call(cmd2, stdin=open("stderr.txt", "w"), stdout=open(output2, "w"))
hit_dict1, bit_dict1, perc_dict1 = parse_lastout(output1)
hit_dict2, bit_dict2, perc_dict2 = parse_lastout(output2)
aai_dict = defaultdict(list)
for query1 in hit_dict1:
hit_list1 = hit_dict1[query1]
hit_list2 = []
for hit1 in hit_list1:
if query1 in hit_dict2[hit1]:
#if hit1 in hit_dict2:
#print query1, hit1
#hit2 = hit_dict2[hit1]
# query2 = hit1
# hit2 = hit_dict2[query2]
#print hit1, hit2, query1, query2
# if query1 == hit2 and query2 == hit1:
meanval = np.mean([perc_dict1[query1], perc_dict2[hit1]])
aai_dict[prefix +"__"+ prefix2].append(meanval)
break
#else:
# print query1, query2, hit1, hit2
aai = np.mean(aai_dict[prefix +"__"+ prefix2])
num_hits = float(len(aai_dict[prefix +"__"+ prefix2]))
num_prots1 = float(len(proteins_in_genome[prefix]))
num_prots2 = float(len(proteins_in_genome[prefix2]))
af1 = 100*(num_hits / num_prots1)
af2 = 100*(num_hits / num_prots2)
print prefix, prefix2, aai, num_hits, num_prots1, num_prots2, af1, af2