Skip to content

Commit c77b6cf

Browse files
committed
the start of version 2: using binary alignment for a pseudophylogenetic method
1 parent b966948 commit c77b6cf

File tree

3 files changed

+127
-12
lines changed

3 files changed

+127
-12
lines changed

Makefile.PL

+2-1
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,11 @@ WriteMakefile1(
5454
'DBD::SQLite' => 0,
5555
'Bio::SeqIO' => 0,
5656
'Bio::TreeIO' => 0,
57+
'Bio::AlignIO' => 0,
5758
'Bio::Matrix::IO'=> 0,
5859
'Bio::Tree::Statistics'=> 0,
5960
'Bio::Tree::DistanceFactory'=> 0,
60-
'Bio::Sketch::Mash' => 0, # ensures that Mash gets loaded
61+
"Bio::Sketch::Mash" => 0.23,
6162
#'Bio::Kmer' => 0.19,
6263
#'Graph::Dijkstra'=> 0,
6364
#'Readonly' => 0,

bin/mashtree

+13-9
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use FindBin;
2121
use lib "$FindBin::RealBin/../lib";
2222
use lib "$FindBin::RealBin/../lib/perl5";
2323
use File::Which qw/which/;
24-
use Mashtree qw/logmsg @fastqExt @fastaExt @mshExt @richseqExt _truncateFilename createTreeFromPhylip $MASHTREE_VERSION/;
24+
use Mashtree qw/logmsg @fastqExt @fastaExt @mshExt @richseqExt _truncateFilename sketchesToAlignment createTreeFromBinaryAlignment $MASHTREE_VERSION/;
2525
use Mashtree::Db;
2626
use Bio::Tree::DistanceFactory;
2727
use Bio::Matrix::IO;
@@ -74,11 +74,13 @@ sub main{
7474
die "need more arguments\n".usage() if(@reads < 1);
7575

7676
# Check for prereq executables.
77-
for my $exe(qw(mash quicktree)){
77+
for my $exe(qw(mash)){
7878
if(!which($exe)){
7979
die "ERROR: could not find $exe in your PATH";
8080
}
8181
}
82+
which("raxmlHPC") || which("raxmlHPC-PTHREADS")
83+
or die "ERROR: need raxml in your PATH";
8284

8385
# Check mash version
8486
my $mashVersion = `mash --version`;
@@ -143,18 +145,20 @@ sub main{
143145

144146
my $sketches=sketchAll(\@reads,"$$settings{tempdir}",$settings);
145147

146-
my $phylip = mashDistance($sketches,\@reads,$$settings{tempdir},$settings);
148+
my $aln = sketchesToAlignment($sketches,"$$settings{tempdir}",$settings);
147149

148-
my $treeObj = createTreeFromPhylip($phylip,$$settings{tempdir},$settings);
150+
my $tree = createTreeFromBinaryAlignment($aln,$$settings{tempdir}, $settings);
149151

150152
# Write the tree
151153
if($$settings{outtree}){
152-
open(my $treeFh, ">", $$settings{outtree}) or die "ERROR writing tree to $$settings{outtree}: $!";
153-
print $treeFh $treeObj->as_text('newick');
154-
close $treeFh
154+
cp($tree, $$settings{outtree}) or die "ERROR: could not copy $tree => $$settings{outtree}: $!";
155155
} else {
156-
print $treeObj->as_text('newick');
157-
print "\n";
156+
open(my $treeFh, $tree) or die "ERROR: could not read from $tree: $!";
157+
while(<$treeFh>){
158+
print;
159+
}
160+
close $treeFh;
161+
print "\n"; # ensure a newline after the tree
158162
}
159163

160164
return 0;

lib/Mashtree.pm

+112-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use strict;
44
use warnings;
55
use Exporter qw(import);
66
use File::Basename qw/fileparse basename dirname/;
7+
use File::Which qw/which/;
78
use Data::Dumper;
89
use List::Util qw/shuffle/;
910
use Scalar::Util qw/looks_like_number/;
@@ -14,9 +15,11 @@ use threads::shared;
1415
use lib dirname($INC{"Mashtree.pm"});
1516
use Bio::Matrix::IO;
1617
use Bio::TreeIO;
18+
use Bio::Sketch::Mash;
19+
use Bio::AlignIO;
1720

1821
our @EXPORT_OK = qw(
19-
logmsg openFastq _truncateFilename distancesToPhylip createTreeFromPhylip sortNames treeDist mashDist mashHashes raw_mash_distance raw_mash_distance_unequal_sizes
22+
logmsg openFastq _truncateFilename distancesToPhylip createTreeFromPhylip sortNames treeDist mashDist mashHashes raw_mash_distance raw_mash_distance_unequal_sizes sketchesToAlignment createTreeFromBinaryAlignment
2023
@fastqExt @fastaExt @bamExt @vcfExt @richseqExt @mshExt
2124
$MASHTREE_VERSION
2225
);
@@ -26,7 +29,7 @@ local $0=basename $0;
2629
######
2730
# CONSTANTS
2831

29-
our $VERSION = "1.1.2";
32+
our $VERSION = "2.0";
3033
our $MASHTREE_VERSION=$VERSION;
3134
our @fastqExt=qw(.fastq.gz .fastq .fq .fq.gz);
3235
our @fastaExt=qw(.fasta .fna .faa .mfa .fas .fsa .fa);
@@ -101,6 +104,68 @@ sub _truncateFilename{
101104
return $name;
102105
}
103106

107+
# Read sketches and create an alignment in phylip format:
108+
# 1. Make a presence/absence perl hash of min-hashes
109+
# 2. Create a "sequence" for each sketch
110+
# 3. Create the MSA of these sequences
111+
# This function uses $settings with keys:
112+
# presence: the nucleotide that stands for presence. Default: "1"
113+
# absence: the nucleotide that stands for absence. Default: "0"
114+
# format: the format of the output alignment. Default: "phylip"
115+
sub sketchesToAlignment{
116+
my($sketches, $outdir, $settings) = @_;
117+
118+
my $presence = $$settings{presence} || 1;
119+
my $absence = $$settings{absence} || 0;
120+
my $format = $$settings{format} || "phylip";
121+
122+
my $outfile = "$outdir/aln.$format";
123+
124+
## Presence/absence of hashes
125+
my %p; # presence/absence
126+
for my $file(@$sketches){
127+
my $msh = Bio::Sketch::Mash->new($file);
128+
my $sketches=$$msh{sketches}[0]{hashes};
129+
for my $s(@$sketches){
130+
$p{$s}{$file}=1;
131+
}
132+
}
133+
134+
## Create pseudo-sequences
135+
my %sampleSeq;
136+
# Sort to help keep output stable
137+
my @hashInt = sort{$a<=>$b} keys(%p);
138+
for my $h(@hashInt){
139+
for my $file(@$sketches){
140+
# If the hash is present, then give the "present" nucleotide
141+
if($p{$h}{$file}){
142+
$sampleSeq{$file} .= $presence;
143+
}
144+
# If the hash is not present, then give the "absent" nucleotide
145+
else{
146+
$sampleSeq{$file} .= $absence;
147+
}
148+
}
149+
}
150+
151+
## Make alignment to string
152+
my $alnStr = "";
153+
for my $file(@$sketches){
154+
my $name = _truncateFilename($file, $settings);
155+
$alnStr .= ">$name\n";
156+
$alnStr .= $sampleSeq{$file}."\n";
157+
}
158+
159+
## Convert alignment to phylip
160+
my $alnin = Bio::AlignIO->new(-string=>$alnStr, -format=>"fasta");
161+
my $alnout= Bio::AlignIO->new(-file=>">".$outfile,-format=>$format);
162+
while(my $aln = $alnin->next_aln){
163+
$alnout->write_aln($aln);
164+
}
165+
166+
return $outfile;
167+
}
168+
104169

105170
# 1. Read the mash distances
106171
# 2. Create a phylip file
@@ -187,6 +252,51 @@ sub sortNames{
187252
return @sorted;
188253
}
189254

255+
# Create a tree from a binary alignment
256+
sub createTreeFromBinaryAlignment{
257+
my($aln, $outdir, $settings) = @_;
258+
259+
my $outfile = "$outdir/tree.dnd";
260+
261+
my $raxml = which("raxmlHPC") || which("raxmlHPC-PTHREADS");
262+
if(!$raxml){
263+
die "ERROR: could not find raxml in your path.";
264+
}
265+
266+
# Avoid raxml crashing because original files were there
267+
for my $filename (glob("$outdir/RAxML*.raxml")){
268+
unlink $filename;
269+
}
270+
271+
# Run raxml from in the output directory
272+
# -f a bootstrapping algorithm
273+
# -s the alignment
274+
# -n suffix for output files is raxml
275+
# -T number of threads
276+
# -p and -x are seeds
277+
# -N 10 only ten bootstraps for fast analysis
278+
# -m BINGAMMA binary gamma model
279+
my $raxmllog = "$outdir/raxml.log";
280+
logmsg "RAxML log will be in $raxmllog";
281+
system("cd $outdir && $raxml -f a -s aln.phylip -n raxml -T $$settings{numcpus} -p $$settings{seed} -x $$settings{seed} -N 10 -m BINGAMMA > ".basename($raxmllog)." 2>&1");
282+
if($?){
283+
my $raxmlErr = $!;
284+
open(my $logfh, $raxmllog) or logmsg "ERROR opening log file $raxmllog: $!";
285+
while(<$logfh>){
286+
print;
287+
}
288+
close $logfh;
289+
290+
die "ERROR running raxml: $raxmlErr" if $?;
291+
}
292+
293+
# Since we're not too interested in bootstrapping here,
294+
# just return the best tree.
295+
link("$outdir/RAxML_bestTree.raxml", $outfile);
296+
297+
return $outfile;
298+
}
299+
190300
# Create tree file with Quicktree but bioperl
191301
# as a backup.
192302
sub createTreeFromPhylip{

0 commit comments

Comments
 (0)