@@ -54,14 +54,13 @@ def junction_search(self, directory, junction_folder, input_data_folder, blast_r
54
54
print ">>> The primary, secondary, and tertiary sequences searched are:"
55
55
sys .stdout .flush ()
56
56
unmapped_sam_files = self .fileio .get_sam_filelist (directory , input_data_folder )
57
- processed_file_list = self .fileio .get_file_list (directory , blast_results_query , ".bqa" )
58
- unmapped_sam_files = self ._get_unprocessed_files (unmapped_sam_files , ".sam" , processed_file_list , ".bqa" )
59
57
60
58
print '>>> Starting junction search.'
61
59
sys .stdout .flush ()
62
60
63
61
for f in unmapped_sam_files :
64
- open (filename )
62
+ print '>>> File: ' , f
63
+ sys .stdout .flush ()
65
64
filename = os .path .join (directory , input_data_folder , f )
66
65
input_filehandle = open (filename )
67
66
input_file_size = os .path .getsize (filename )
@@ -71,12 +70,11 @@ def junction_search(self, directory, junction_folder, input_data_folder, blast_r
71
70
self ._search_for_junctions (input_filehandle , jseqs , exclusion_sequence , output_filehandle , f , input_file_size )
72
71
output_filehandle .close ()
73
72
input_filehandle .close ()
74
- self ._multi_convert (directory , junction_folder , blast_results_folder , blast_results_query )
73
+ self ._multi_convert (directory , junction_folder , blast_results_folder )
75
74
76
- def _multi_convert (self , directory , infolder , outfolder , blast_results_query ):
75
+ def _multi_convert (self , directory , infolder , outfolder ):
77
76
file_list = self .fileio .get_file_list (directory , infolder , ".txt" )
78
- processed_file_list = self .fileio .get_file_list (directory , blast_results_query , ".bqa" )
79
- file_list = self ._get_unprocessed_files (file_list , ".junctions.txt" , processed_file_list , ".bqa" )
77
+
80
78
print ' '
81
79
for f in file_list :
82
80
self .fileio .make_FASTA (os .path .join (directory , infolder , f ),
@@ -107,8 +105,7 @@ def blast_search(self, directory, db_name, blast_results_folder, blast_results_q
107
105
print ">>> Selected Blast DB: %s" % db_name
108
106
sys .stdout .flush ()
109
107
file_list = self .fileio .get_file_list (directory , blast_results_folder , ".fa" )
110
- processed_file_list = self .fileio .get_file_list (directory , blast_results_query , ".bqa" )
111
- file_list = self ._get_unprocessed_files (file_list , ".junctions.txt" , processed_file_list , ".bqa" )
108
+
112
109
for file_name in file_list :
113
110
output_file = os .path .join (directory , blast_results_folder , file_name .replace (".junctions.fa" , '.blast.txt' ))
114
111
blast_command_list = [os .path .join (blast_path , 'blastn' + suffix ),
@@ -124,39 +121,31 @@ def blast_search(self, directory, db_name, blast_results_folder, blast_results_q
124
121
125
122
def generate_tabulated_blast_results (self , directory , blast_results_folder , blast_results_query_folder , gene_list_file ):
126
123
blast_list = self .fileio .get_file_list (directory , blast_results_folder , ".txt" )
127
- processed_file_list = self .fileio .get_file_list (directory , blast_results_query_folder , ".bqa" )
128
- blast_list = self ._get_unprocessed_files (blast_list , ".blast.txt" , processed_file_list , ".bqa" )
129
124
130
125
for blasttxt in blast_list :
131
126
print ">>> Parsing BLAST results file %s ..." % blasttxt
132
- blast_dict , accession_dict , gene_dict = self ._blast_parser (directory , blast_results_folder ,
133
- blasttxt , gene_list_file )
134
- # for key in blast_dict.keys():
135
- # if key not in ['total', 'pos_que']:
136
- # stats = {'in_orf' : 0, 'in_frame': 0, 'downstream': 0,
137
- # 'upstream': 0, 'not_in_frame': 0,
138
- # 'intron' : 0, 'backwards': 0, 'frame_orf': 0, 'total': 0
139
- # }
140
- # for nm in blast_dict[key].keys():
141
- # blast_dict[key][nm] = list(set(blast_dict[key][nm]))
142
- # for j in blast_dict[key][nm]:
143
- # j.ppm = blast_dict['pos_que'][j.pos_que] * 1000000 / blast_dict['total']
144
- # stats[j.frame] += 1
145
- # stats[j.orf] += 1
146
- # if j.frame_orf:
147
- # stats["frame_orf"] += 1
148
- # stats['total'] += 1
149
- # blast_dict[key]['stats'] = stats
127
+ blast_dict , gene_dict = self ._blast_parser (directory , blast_results_folder ,
128
+ blasttxt , gene_list_file )
129
+ for gene in blast_dict .keys ():
130
+ if gene not in ['total' , 'pos_que' ]:
131
+ stats = {'in_orf' : 0 , 'in_frame' : 0 , 'downstream' : 0 ,
132
+ 'upstream' : 0 , 'not_in_frame' : 0 ,
133
+ 'intron' : 0 , 'backwards' : 0 , 'frame_orf' : 0 , 'total' : 0
134
+ }
135
+ for nm in blast_dict [gene ].keys ():
136
+ for j in blast_dict [gene ][nm ]:
137
+ j .ppm = j .count * 1000000 / blast_dict ['total' ]
138
+ stats [j .frame ] += 1
139
+ stats [j .orf ] += 1
140
+ if j .frame_orf :
141
+ stats ["frame_orf" ] += 1
142
+ stats ['total' ] += 1
143
+ blast_dict [gene ]['stats' ] = stats
150
144
151
- blast_dict .pop ('pos_que' )
152
145
blast_query_p = open (os .path .join (directory , blast_results_query_folder ,
153
146
blasttxt .replace (".blast.txt" , ".bqp" )), "wb" )
154
- lists_p = open (os .path .join (directory , blast_results_query_folder ,
155
- blasttxt .replace (".blast.txt" , ".bqa" )), "wb" )
156
147
cPickle .dump (blast_dict , blast_query_p )
157
- cPickle .dump ([accession_dict , gene_dict ], lists_p )
158
148
blast_query_p .close ()
159
- lists_p .close ()
160
149
self .fileio .remove_file (directory , blast_results_folder ,
161
150
self .fileio .get_file_list (directory , blast_results_folder , ".fa" ))
162
151
@@ -230,10 +219,8 @@ def _blast_parser(self, directory, infolder, fileName, gene_list_file):
230
219
print_counter = 0
231
220
previous_bitscore = 0
232
221
results_dictionary = {}
233
- accession_dict = {}
234
222
gene_dict = {}
235
223
collect_results = 'n'
236
- pos_que = []
237
224
for line in blast_results_handle .readlines ():
238
225
line .strip ()
239
226
split = line .split ()
@@ -251,18 +238,14 @@ def _blast_parser(self, directory, infolder, fileName, gene_list_file):
251
238
previous_bitscore = float (split [11 ]) * 0.98
252
239
nm_number = split [1 ]
253
240
gene_name = gene_list [nm_number ]['gene_name' ]
254
- accession_dict [nm_number ] = gene_list [nm_number ]['gene_name' ]
255
241
if gene_name not in gene_dict .keys ():
256
242
gene_dict [gene_name ] = [nm_number ]
257
243
else :
258
244
gene_dict [gene_name ].append (nm_number )
259
245
260
- pq = nm_number + "-" + split [8 ] + "-" + split [6 ]
261
246
j = sts .jcnt ()
262
247
j .position = int (split [8 ])
263
248
j .query_start = int (split [6 ])
264
- j .pos_que = pq
265
- pos_que .append (pq )
266
249
fudge_factor = j .query_start - 1
267
250
frame = j .position - gene_list [nm_number ]['orf_start' ] - fudge_factor
268
251
if frame % 3 == 0 or frame == 0 :
@@ -291,15 +274,23 @@ def _blast_parser(self, directory, infolder, fileName, gene_list_file):
291
274
results_dictionary [gene_name ][nm_number ] = [j ]
292
275
else :
293
276
if nm_number not in results_dictionary [gene_name ].keys ():
294
- results_dictionary [gene_name ][nm_number ] = [j ]
277
+ results_dictionary [gene_name ][nm_number ] = []
278
+
279
+ junction_present = False
280
+ junction_index = 0
281
+ for index , pj in enumerate (results_dictionary [gene_name ][nm_number ]):
282
+ if pj .position == j .position and pj .query_start == j .query_start :
283
+ junction_index = index
284
+ junction_present = True
285
+ if junction_present :
286
+ results_dictionary [gene_name ][nm_number ][junction_index ].count += 1
295
287
else :
296
288
results_dictionary [gene_name ][nm_number ].append (j )
297
289
else :
298
290
collect_results = 'n'
299
291
results_dictionary ['total' ] = blast_results_count
300
- results_dictionary ['pos_que' ] = Counter (pos_que )
301
292
blast_results_handle .close ()
302
- return results_dictionary , accession_dict , gene_dict
293
+ return results_dictionary , gene_dict
303
294
304
295
# def _search_junctions(self, infile, junction_sequence, outfile):
305
296
# def longest_common_substring(s1, s2):
0 commit comments