@@ -4,6 +4,7 @@ use strict;
4
4
use warnings;
5
5
use Exporter qw( import) ;
6
6
use File::Basename qw/ fileparse basename dirname/ ;
7
+ use File::Which qw/ which/ ;
7
8
use Data::Dumper;
8
9
use List::Util qw/ shuffle/ ;
9
10
use Scalar::Util qw/ looks_like_number/ ;
@@ -14,9 +15,11 @@ use threads::shared;
14
15
use lib dirname($INC {" Mashtree.pm" });
15
16
use Bio::Matrix::IO;
16
17
use Bio::TreeIO;
18
+ use Bio::Sketch::Mash;
19
+ use Bio::AlignIO;
17
20
18
21
# Symbols that can be imported on request through Exporter.
# The list holds subroutines, the filename-extension arrays and the
# version scalar; each name appears exactly once.
our @EXPORT_OK = qw(
           logmsg openFastq _truncateFilename distancesToPhylip createTreeFromPhylip sortNames treeDist mashDist mashHashes raw_mash_distance raw_mash_distance_unequal_sizes sketchesToAlignment createTreeFromBinaryAlignment
           @fastqExt @fastaExt @bamExt @vcfExt @richseqExt @mshExt
           $MASHTREE_VERSION
         );
@@ -26,7 +29,7 @@ local $0=basename $0;
26
29
######
# CONSTANTS

# Version of this module. $MASHTREE_VERSION carries the same value
# under a name that is listed in @EXPORT_OK.
our $VERSION = "2.0";
our $MASHTREE_VERSION=$VERSION;
# Filename extensions for fastq and fasta files, compressed or not.
our @fastqExt=qw(.fastq.gz .fastq .fq .fq.gz);
our @fastaExt=qw(.fasta .fna .faa .mfa .fas .fsa .fa);
@@ -101,6 +104,68 @@ sub _truncateFilename{
101
104
return $name ;
102
105
}
103
106
107
# Read sketches and create an alignment (phylip format by default):
#   1. Make a presence/absence perl hash of min-hashes
#   2. Create a pseudo-"sequence" for each sketch: one column per
#      distinct min-hash, columns ordered by ascending hash value
#   3. Write the MSA of these sequences with Bio::AlignIO
# Arguments:
#   $sketches  array ref of mash sketch filenames
#   $outdir    directory that receives the alignment file
#   $settings  hash ref; it is also handed to _truncateFilename.
#              Keys read here:
#     presence: the character that stands for presence. Default: "1"
#     absence:  the character that stands for absence.  Default: "0"
#     format:   the format of the output alignment.     Default: "phylip"
#     A false value ("", "0", undef) for any key selects its default.
# Returns the path of the alignment, "$outdir/aln.$format".
# Dies if presence and absence resolve to the same character or if
# the sketches contain no min-hashes at all.
sub sketchesToAlignment{
  my($sketches, $outdir, $settings) = @_;
  $settings ||= {};

  my $presence = $$settings{presence} || 1;
  my $absence  = $$settings{absence}  || 0;
  my $format   = $$settings{format}   || "phylip";

  # Identical characters would make every column uninformative.
  if($presence eq $absence){
    die "ERROR: the presence and absence characters are both '$presence'";
  }

  my $outfile = "$outdir/aln.$format";

  ## Presence/absence of hashes
  # $p{hash}{file} is 1 when the sketch of that file holds the hash
  my %p;
  for my $file(@$sketches){
    my $msh = Bio::Sketch::Mash->new($file);
    # Only the first sketch inside each file is read
    my $hashes = $$msh{sketches}[0]{hashes} || [];
    for my $h(@$hashes){
      $p{$h}{$file}=1;
    }
  }

  # Sort to help keep output stable
  my @hashInt = sort {$a <=> $b} keys(%p);
  if(!@hashInt){
    die "ERROR: no min-hashes were found in the sketches; cannot create an alignment";
  }

  ## Create pseudo-sequences and render them as a fasta string
  my $alnStr = "";
  for my $file(@$sketches){
    # One character per hash: present or absent in this file
    my $seq  = join("", map { $p{$_}{$file} ? $presence : $absence } @hashInt);
    my $name = _truncateFilename($file, $settings);
    $alnStr .= ">$name\n";
    $alnStr .= $seq."\n";
  }

  ## Convert alignment to the requested format
  my $alnin  = Bio::AlignIO->new(-string=>$alnStr, -format=>"fasta");
  my $alnout = Bio::AlignIO->new(-file=>">".$outfile, -format=>$format);
  while(my $aln = $alnin->next_aln){
    $alnout->write_aln($aln);
  }

  return $outfile;
}
168
+
104
169
105
170
# 1. Read the mash distances
106
171
# 2. Create a phylip file
@@ -187,6 +252,51 @@ sub sortNames{
187
252
return @sorted ;
188
253
}
189
254
255
# Create a tree from a binary (presence/absence) alignment with RAxML.
# Arguments:
#   $aln       path to the alignment file. If it is missing or does
#              not name an existing file, "$outdir/aln.phylip" is
#              used instead.
#   $outdir    directory in which RAxML is run; its output files, the
#              log raxml.log and the resulting tree.dnd are put there
#   $settings  hash ref with keys:
#     numcpus: number of threads for RAxML (-T). Default: 1
#     seed:    random seed for -p and -x.        Default: 42
# Returns the path of the tree file, "$outdir/tree.dnd".
# Dies if no raxml executable is in the path, if RAxML fails (its log
# is printed first) or if the best tree cannot be placed at the
# returned path.
sub createTreeFromBinaryAlignment{
  my($aln, $outdir, $settings) = @_;
  $settings ||= {};

  # Loaded here so that this sub does not depend on the import list
  # at the top of the file.
  require File::Copy;
  require File::Spec;

  my $outfile = "$outdir/tree.dnd";

  my $raxml = which("raxmlHPC") || which("raxmlHPC-PTHREADS");
  if(!$raxml){
    die "ERROR: could not find raxml in your path.";
  }

  # Honor the alignment that the caller supplied, but keep the
  # historical default for callers that pass something else.
  my $defaultAln = "$outdir/aln.phylip";
  if(!defined($aln) || $aln eq ""){
    $aln = $defaultAln;
  }
  elsif(!-f $aln){
    logmsg "WARNING: alignment $aln is not a file; using $defaultAln";
    $aln = $defaultAln;
  }
  # RAxML is started from inside $outdir, so a relative path would
  # no longer resolve: make it absolute.
  my $alnPath = File::Spec->rel2abs($aln);

  my $numcpus = $$settings{numcpus} || 1;
  my $seed    = defined($$settings{seed}) && $$settings{seed} ne "" ? $$settings{seed} : 42;

  # Avoid raxml crashing because original files were there.
  # readdir is used instead of glob so that whitespace in $outdir
  # cannot split the pattern.
  opendir(my $dh, $outdir) or die "ERROR: could not read directory $outdir: $!";
  my @oldFile = grep { /\ARAxML.*\.raxml\z/s } readdir($dh);
  closedir($dh);
  for my $filename(@oldFile){
    unlink("$outdir/$filename")
      or logmsg "WARNING: could not remove $outdir/$filename: $!";
  }

  # Single-quote a word for the shell
  my $shellQuote = sub{
    my($str) = @_;
    $str =~ s/'/'\\''/g;
    return "'$str'";
  };

  # Run raxml from in the output directory
  #   -f a          bootstrapping algorithm
  #   -s            the alignment
  #   -n            suffix for output files is raxml
  #   -T            number of threads
  #   -p and -x     are seeds
  #   -N 10         only ten bootstraps for fast analysis
  #   -m BINGAMMA   binary gamma model
  my $raxmllog = "$outdir/raxml.log";
  logmsg "RAxML log will be in $raxmllog";
  my $command = join(" ",
    "cd", $shellQuote->($outdir), "&&",
    $shellQuote->($raxml), "-f a",
    "-s", $shellQuote->($alnPath),
    "-n raxml",
    "-T", $shellQuote->($numcpus),
    "-p", $shellQuote->($seed),
    "-x", $shellQuote->($seed),
    "-N 10 -m BINGAMMA",
    ">", $shellQuote->(basename($raxmllog)), "2>&1",
  );
  system($command);
  my $status = $?;
  if($status){
    # $! is only meaningful when the command could not be started
    my $reason;
    if($status == -1){
      $reason = "could not execute: $!";
    }
    elsif($status & 127){
      $reason = "died with signal ".($status & 127);
    }
    else{
      $reason = "exited with value ".($status >> 8);
    }

    # Show the RAxML log to help with troubleshooting
    if(open(my $logfh, "<", $raxmllog)){
      while(my $line = <$logfh>){
        print $line;
      }
      close $logfh;
    }
    else{
      logmsg "ERROR opening log file $raxmllog: $!";
    }

    die "ERROR running raxml: $reason";
  }

  # Since we're not too interested in bootstrapping here,
  # just return the best tree.
  my $bestTree = "$outdir/RAxML_bestTree.raxml";
  if(!-e $bestTree){
    die "ERROR: raxml finished but $bestTree was not found";
  }
  # link() refuses to overwrite, so clear a tree left by an earlier run
  if(-e $outfile || -l $outfile){
    unlink($outfile) or die "ERROR: could not remove old tree $outfile: $!";
  }
  # Fall back to a copy where hard links are not supported
  link($bestTree, $outfile)
    or File::Copy::copy($bestTree, $outfile)
    or die "ERROR: could not link or copy $bestTree to $outfile: $!";

  return $outfile;
}
299
+
190
300
# Create tree file with Quicktree but bioperl
191
301
# as a backup.
192
302
sub createTreeFromPhylip{
0 commit comments