Skip to content

Commit c48f5da

Browse files
author
emptyewer
committed
+ modified bqp output
- removed redundant bqa files + changed query blast gui functionality to use threading and to read modified bqp files
1 parent 6e201d6 commit c48f5da

File tree

4 files changed

+93
-84
lines changed

4 files changed

+93
-84
lines changed

functions/junctionf_gui.py

Lines changed: 34 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,13 @@ def junction_search(self, directory, junction_folder, input_data_folder, blast_r
5454
print ">>> The primary, secondary, and tertiary sequences searched are:"
5555
sys.stdout.flush()
5656
unmapped_sam_files = self.fileio.get_sam_filelist(directory, input_data_folder)
57-
processed_file_list = self.fileio.get_file_list(directory, blast_results_query, ".bqa")
58-
unmapped_sam_files = self._get_unprocessed_files(unmapped_sam_files, ".sam", processed_file_list, ".bqa")
5957

6058
print '>>> Starting junction search.'
6159
sys.stdout.flush()
6260

6361
for f in unmapped_sam_files:
64-
open(filename)
62+
print '>>> File: ', f
63+
sys.stdout.flush()
6564
filename = os.path.join(directory, input_data_folder, f)
6665
input_filehandle = open(filename)
6766
input_file_size = os.path.getsize(filename)
@@ -71,12 +70,11 @@ def junction_search(self, directory, junction_folder, input_data_folder, blast_r
7170
self._search_for_junctions(input_filehandle, jseqs, exclusion_sequence, output_filehandle, f, input_file_size)
7271
output_filehandle.close()
7372
input_filehandle.close()
74-
self._multi_convert(directory, junction_folder, blast_results_folder, blast_results_query)
73+
self._multi_convert(directory, junction_folder, blast_results_folder)
7574

76-
def _multi_convert(self, directory, infolder, outfolder, blast_results_query):
75+
def _multi_convert(self, directory, infolder, outfolder):
7776
file_list = self.fileio.get_file_list(directory, infolder, ".txt")
78-
processed_file_list = self.fileio.get_file_list(directory, blast_results_query, ".bqa")
79-
file_list = self._get_unprocessed_files(file_list, ".junctions.txt", processed_file_list, ".bqa")
77+
8078
print ' '
8179
for f in file_list:
8280
self.fileio.make_FASTA(os.path.join(directory, infolder, f),
@@ -107,8 +105,7 @@ def blast_search(self, directory, db_name, blast_results_folder, blast_results_q
107105
print ">>> Selected Blast DB: %s" % db_name
108106
sys.stdout.flush()
109107
file_list = self.fileio.get_file_list(directory, blast_results_folder, ".fa")
110-
processed_file_list = self.fileio.get_file_list(directory, blast_results_query, ".bqa")
111-
file_list = self._get_unprocessed_files(file_list, ".junctions.txt", processed_file_list, ".bqa")
108+
112109
for file_name in file_list:
113110
output_file = os.path.join(directory, blast_results_folder, file_name.replace(".junctions.fa", '.blast.txt'))
114111
blast_command_list = [os.path.join(blast_path, 'blastn' + suffix),
@@ -124,39 +121,31 @@ def blast_search(self, directory, db_name, blast_results_folder, blast_results_q
124121

125122
def generate_tabulated_blast_results(self, directory, blast_results_folder, blast_results_query_folder, gene_list_file):
126123
blast_list = self.fileio.get_file_list(directory, blast_results_folder, ".txt")
127-
processed_file_list = self.fileio.get_file_list(directory, blast_results_query_folder, ".bqa")
128-
blast_list = self._get_unprocessed_files(blast_list, ".blast.txt", processed_file_list, ".bqa")
129124

130125
for blasttxt in blast_list:
131126
print ">>> Parsing BLAST results file %s ..." % blasttxt
132-
blast_dict, accession_dict, gene_dict = self._blast_parser(directory, blast_results_folder,
133-
blasttxt, gene_list_file)
134-
# for key in blast_dict.keys():
135-
# if key not in ['total', 'pos_que']:
136-
# stats = {'in_orf' : 0, 'in_frame': 0, 'downstream': 0,
137-
# 'upstream': 0, 'not_in_frame': 0,
138-
# 'intron' : 0, 'backwards': 0, 'frame_orf': 0, 'total': 0
139-
# }
140-
# for nm in blast_dict[key].keys():
141-
# blast_dict[key][nm] = list(set(blast_dict[key][nm]))
142-
# for j in blast_dict[key][nm]:
143-
# j.ppm = blast_dict['pos_que'][j.pos_que] * 1000000 / blast_dict['total']
144-
# stats[j.frame] += 1
145-
# stats[j.orf] += 1
146-
# if j.frame_orf:
147-
# stats["frame_orf"] += 1
148-
# stats['total'] += 1
149-
# blast_dict[key]['stats'] = stats
127+
blast_dict, gene_dict = self._blast_parser(directory, blast_results_folder,
128+
blasttxt, gene_list_file)
129+
for gene in blast_dict.keys():
130+
if gene not in ['total', 'pos_que']:
131+
stats = {'in_orf' : 0, 'in_frame': 0, 'downstream': 0,
132+
'upstream': 0, 'not_in_frame': 0,
133+
'intron' : 0, 'backwards': 0, 'frame_orf': 0, 'total': 0
134+
}
135+
for nm in blast_dict[gene].keys():
136+
for j in blast_dict[gene][nm]:
137+
j.ppm = j.count * 1000000 / blast_dict['total']
138+
stats[j.frame] += 1
139+
stats[j.orf] += 1
140+
if j.frame_orf:
141+
stats["frame_orf"] += 1
142+
stats['total'] += 1
143+
blast_dict[gene]['stats'] = stats
150144

151-
blast_dict.pop('pos_que')
152145
blast_query_p = open(os.path.join(directory, blast_results_query_folder,
153146
blasttxt.replace(".blast.txt", ".bqp")), "wb")
154-
lists_p = open(os.path.join(directory, blast_results_query_folder,
155-
blasttxt.replace(".blast.txt", ".bqa")), "wb")
156147
cPickle.dump(blast_dict, blast_query_p)
157-
cPickle.dump([accession_dict, gene_dict], lists_p)
158148
blast_query_p.close()
159-
lists_p.close()
160149
self.fileio.remove_file(directory, blast_results_folder,
161150
self.fileio.get_file_list(directory, blast_results_folder, ".fa"))
162151

@@ -230,10 +219,8 @@ def _blast_parser(self, directory, infolder, fileName, gene_list_file):
230219
print_counter = 0
231220
previous_bitscore = 0
232221
results_dictionary = {}
233-
accession_dict = {}
234222
gene_dict = {}
235223
collect_results = 'n'
236-
pos_que = []
237224
for line in blast_results_handle.readlines():
238225
line.strip()
239226
split = line.split()
@@ -251,18 +238,14 @@ def _blast_parser(self, directory, infolder, fileName, gene_list_file):
251238
previous_bitscore = float(split[11]) * 0.98
252239
nm_number = split[1]
253240
gene_name = gene_list[nm_number]['gene_name']
254-
accession_dict[nm_number] = gene_list[nm_number]['gene_name']
255241
if gene_name not in gene_dict.keys():
256242
gene_dict[gene_name] = [nm_number]
257243
else:
258244
gene_dict[gene_name].append(nm_number)
259245

260-
pq = nm_number + "-" + split[8] + "-" + split[6]
261246
j = sts.jcnt()
262247
j.position = int(split[8])
263248
j.query_start = int(split[6])
264-
j.pos_que = pq
265-
pos_que.append(pq)
266249
fudge_factor = j.query_start - 1
267250
frame = j.position - gene_list[nm_number]['orf_start'] - fudge_factor
268251
if frame % 3 == 0 or frame == 0:
@@ -291,15 +274,23 @@ def _blast_parser(self, directory, infolder, fileName, gene_list_file):
291274
results_dictionary[gene_name][nm_number] = [j]
292275
else:
293276
if nm_number not in results_dictionary[gene_name].keys():
294-
results_dictionary[gene_name][nm_number] = [j]
277+
results_dictionary[gene_name][nm_number] = []
278+
279+
junction_present = False
280+
junction_index = 0
281+
for index, pj in enumerate(results_dictionary[gene_name][nm_number]):
282+
if pj.position == j.position and pj.query_start == j.query_start:
283+
junction_index = index
284+
junction_present = True
285+
if junction_present:
286+
results_dictionary[gene_name][nm_number][junction_index].count += 1
295287
else:
296288
results_dictionary[gene_name][nm_number].append(j)
297289
else:
298290
collect_results = 'n'
299291
results_dictionary['total'] = blast_results_count
300-
results_dictionary['pos_que'] = Counter(pos_que)
301292
blast_results_handle.close()
302-
return results_dictionary, accession_dict, gene_dict
293+
return results_dictionary, gene_dict
303294

304295
# def _search_junctions(self, infile, junction_sequence, outfile):
305296
# def longest_common_substring(s1, s2):

functions/structures.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,16 @@ def __init__(self):
55
self.frame = ''
66
self.ppm = 0.0
77
self.orf = ''
8-
self.pos_que = ''
98
self.frame_orf = False
9+
self.count = 1
1010

1111
def __repr__(self):
12-
string = "<Junction pos:%d, q.start:%d, ppm:%.3f, frame:%s, orf:%s, pos_que:%s>" % (self.position,
13-
self.query_start,
14-
self.ppm,
15-
self.frame,
16-
self.orf,
17-
self.pos_que)
12+
string = "<Junction pos:%d, " \
13+
"q.start:%d, ppm:%.3f, " \
14+
"frame:%s, orf:%s, " \
15+
"count:%d>" % (self.position, self.query_start, self.ppm, self.frame, self.orf, self.count)
1816
return string
1917

20-
def __eq__(self, other):
21-
return self.pos_que == other.pos_que
22-
2318
def __ne__(self, other):
2419
return not self.__eq__(other)
2520

library_quality.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
filename = filelist[int(sel)]
2121
output_filename = os.path.join(directory, os.path.splitext(os.path.basename(filename))[0] + "_library_quality.csv")
2222
output_handle = open(output_filename, 'w')
23-
output_handle.write("Gene_Name,NM_Number,In_Frame,Out_of_Frame,Downstream,In_ORF,Upstream,Backwards\n,,\n")
23+
output_handle.write("Gene_Name,NM_Number,In_Frame,Out_of_Frame,Downstream,In_ORF,Upstream,Backwards,In_ORF_Frame,Upstream_Inframe\n,,\n")
2424
bqp = cPickle.load(open(filename, 'rb'))
2525
for gene_name in bqp:
2626
if gene_name != 'total':
@@ -32,7 +32,9 @@
3232
'in_orf': 0,
3333
'upstream': 0,
3434
'downstream': 0,
35-
'backwards' : 0
35+
'backwards': 0,
36+
'orf_frame': 0,
37+
'up_frame': 0
3638
}
3739
if nm_number != 'stats':
3840
if first_line_nm:
@@ -47,6 +49,10 @@
4749
if key not in junctions_property:
4850
stats[j.frame] += 1
4951
stats[j.orf] += 1
52+
if j.frame == 'in_frame' and j.orf == 'in_orf':
53+
stats['orf_frame'] += 1
54+
if j.frame =='in_frame' and j.orf == 'upstream':
55+
stats['up_frame'] += 1
5056
# if first_line_junction:
5157
# output_handle.write("%d, %d, %s, %s\n" % (j.position, j.query_start, j.frame, j.orf))
5258
# first_line_junction = False
@@ -55,19 +61,23 @@
5561
else:
5662
pass
5763
if first_line_junction:
58-
output_handle.write("%d,%d,%d,%d,%d,%d\n" % (stats['not_in_frame'],
64+
output_handle.write("%d,%d,%d,%d,%d,%d,%d,%d\n" % (stats['not_in_frame'],
5965
stats['in_frame'],
6066
stats['downstream'],
6167
stats['in_orf'],
6268
stats['upstream'],
63-
stats['backwards']))
69+
stats['backwards'],
70+
stats['orf_frame'],
71+
stats['up_frame']))
6472
first_line_junction = False
6573
else:
66-
output_handle.write(",,%d,%d,%d,%d,%d,%d\n" % (stats['not_in_frame'],
74+
output_handle.write(",,%d,%d,%d,%d,%d,%d,%d,%d\n" % (stats['not_in_frame'],
6775
stats['in_frame'],
6876
stats['downstream'],
6977
stats['in_orf'],
7078
stats['upstream'],
71-
stats['backwards']))
79+
stats['backwards'],
80+
stats['orf_frame'],
81+
stats['up_frame']))
7282
output_handle.write(",,\n")
7383
output_handle.close()

0 commit comments

Comments
 (0)