Skip to content

Commit 05fe16e

Browse files
committed
determining minimum depth using the new abundance filter
1 parent a7b265b commit 05fe16e

File tree

2 files changed

+296
-3
lines changed

2 files changed

+296
-3
lines changed

bin/mashtree.pl

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ sub main{
3737

3838
# Mash-specific options
3939
$$settings{genomesize}||=5000000;
40-
$$settings{mindepth}||=5;
40+
$$settings{mindepth}||=0;
4141
$$settings{kmerlength}||=21;
4242
$$settings{'sketch-size'}||=10000;
4343

@@ -111,7 +111,8 @@ sub mashSketch{
111111
# Do different things depending on fastq vs fasta
112112
my $sketchXopts="";
113113
if(grep {$_ eq $fileExt} @fastqExt){
114-
$sketchXopts.="-m $$settings{mindepth} -g $$settings{genomesize} ";
114+
my $minDepth=determineMinimumDepth($fastq,$$settings{mindepth},$$settings{kmerlength},$settings);
115+
$sketchXopts.="-m $minDepth -g $$settings{genomesize} ";
115116
} elsif(grep {$_ eq $fileExt} @fastaExt) {
116117
$sketchXopts.=" ";
117118
} else {
@@ -196,6 +197,26 @@ sub mashDist{
196197
return \@dist;
197198
}
198199

200+
sub determineMinimumDepth{
201+
my($fastq,$mindepth,$kmerlength,$settings)=@_;
202+
203+
return $mindepth if($mindepth > 0);
204+
205+
my $basename=basename($fastq,@fastqExt);
206+
207+
# Run the min abundance finder to find the valleys
208+
my $histFile="$$settings{tempdir}/$basename.hist.tsv";
209+
my @valley=`min_abundance_finder.pl $fastq --kmer $kmerlength --valleys --nopeaks --hist $histFile`;
210+
die "ERROR with min_abundance_finder.pl on $fastq: $!" if($?);
211+
chomp(@valley);
212+
213+
# Discard the header but keep the first line
214+
my($minKmerCount, $countOfCounts)=split(/\t/,$valley[1]);
215+
216+
logmsg "Setting the min depth as $minKmerCount for $fastq";
217+
218+
return $minKmerCount;
219+
}
199220

200221
sub usage{
201222
"$0: use distances from Mash (min-hash algorithm) to make a NJ tree
@@ -214,7 +235,9 @@ sub usage{
214235
215236
MASH SKETCH OPTIONS
216237
--genomesize 5000000
217-
--mindepth 5
238+
--mindepth 0 If mindepth is zero, then it will be
239+
chosen in a smart but slower method,
240+
to discard lower-abundance kmers.
218241
--kmerlength 21
219242
--sketch-size 10000
220243
"

bin/min_abundance_finder.pl

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
#!/usr/bin/env perl
2+
# Find the minimum abundance of kmers
3+
# Original script was in Python at
4+
# https://gist.github.com/alexjironkin/4ed43412878723491240814a0d5a6ed6/223dea45d70c9136703a4afaab0178cdbfbd2042
5+
# Original author of python script: @alexjironkin, Public Health Englad
6+
# I wanted to increase capatibility with Perl and have a more
7+
# standalone script instead of relying on the khmer package.
8+
# Author: Lee Katz <lkatz@cdc.gov>
9+
10+
use strict;
11+
use warnings;
12+
use Getopt::Long qw/GetOptions/;
13+
use Data::Dumper qw/Dumper/;
14+
use File::Basename qw/basename fileparse/;
15+
use List::Util qw/max/;
16+
use IO::Uncompress::Gunzip qw/gunzip/;
17+
18+
# http://perldoc.perl.org/perlop.html#Symbolic-Unary-Operators
19+
# +Inf and -Inf will be a binary complement of all zeros
20+
use constant MAXINT => ~0;
21+
use constant MININT => -~0;
22+
23+
local $0=basename $0;
24+
sub logmsg{print STDERR "$0: @_\n";}
25+
26+
exit main();
27+
28+
sub main{
29+
my $settings={};
30+
GetOptions($settings,qw(help kmerlength|kmer=i delta=i gt|greaterthan=i hist|histogram=s valleys! peaks!)) or die $!;
31+
$$settings{kmerlength}||=21;
32+
$$settings{delta} ||=100;
33+
$$settings{gt} ||=0;
34+
35+
$$settings{peaks} //=0;
36+
$$settings{valleys} //=1;
37+
38+
my($fastq)=@ARGV;
39+
die usage() if(!$fastq || $$settings{help});
40+
die "ERROR: I could not find fastq at $fastq" if(!-e $fastq);
41+
42+
# Find peaks and valleys, but use the histogram file if it exists
43+
# and if the user specified it.
44+
my $histogram=[];
45+
if($$settings{hist} && -e $$settings{hist}){
46+
# Read the cached histogram file
47+
$histogram=readHistogram($$settings{hist},$settings);
48+
} else {
49+
# Count kmers and make a histogram. kmerToHist()
50+
# will optionally write the histogram file.
51+
my $kmercount=countKmers($fastq,$$settings{kmerlength},$settings);
52+
$histogram=kmerToHist($kmercount,$settings);
53+
}
54+
# Find peaks and valleys using the simplified
55+
# 'delta' algorithm.
56+
my $peaks=findThePeaksAndValleys($histogram,$$settings{delta},$settings);
57+
58+
# Configure the output
59+
my @outputPos=();
60+
push(@outputPos, @{$$peaks{valleys}}) if($$settings{valleys});
61+
push(@outputPos, @{$$peaks{peaks}}) if($$settings{peaks});
62+
@outputPos=sort {$$a[0] <=> $$b[0]} @outputPos;
63+
64+
if(!@outputPos){
65+
logmsg "WARNING: no peaks or valleys were reported";
66+
}
67+
68+
# Header for output table
69+
print join("\t",qw(kmer count))."\n";
70+
# Finish off the table of output.
71+
for my $position (@outputPos){
72+
print join("\t",@$position)."\n";
73+
}
74+
75+
return 0;
76+
}
77+
78+
sub readHistogram{
79+
my($infile,$settings)=@_;
80+
my @hist=(0);
81+
open(HIST,$infile) or die "ERROR: could not read $infile: $!";
82+
logmsg "Reading histogram from $infile";
83+
while(<HIST>){
84+
chomp;
85+
my($count,$countOfCounts)=split /\t/;
86+
$hist[$count]=$countOfCounts;
87+
}
88+
close HIST;
89+
90+
# ensure defined values
91+
for(my $i=0;$i<@hist;$i++){
92+
$hist[$i] //= 0;
93+
}
94+
95+
return \@hist;
96+
}
97+
98+
sub countKmers{
99+
my($fastq,$kmerlength,$settings)=@_;
100+
my %kmer=();
101+
102+
# Pure perl to make this standalone... the only reason
103+
# we are counting kmers in Perl instead of C.
104+
my $fastqFh=openFastq($fastq,$settings);
105+
my $i=0;
106+
while(<$fastqFh>){ # burn the read ID line
107+
my $seq=<$fastqFh>;
108+
chomp($seq);
109+
my $numKmersInRead=length($seq)-$kmerlength+1;
110+
111+
# Count kmers in a sliding window.
112+
# We must keep this loop optimized for speed.
113+
for(my $j=0;$j<$numKmersInRead;$j++){
114+
$kmer{substr($seq,$j,$kmerlength)}++;
115+
}
116+
117+
# Burn the quality score lines
118+
<$fastqFh>;
119+
<$fastqFh>;
120+
121+
}
122+
close $fastqFh;
123+
124+
return \%kmer;
125+
}
126+
127+
sub kmerToHist{
128+
my($kmercountHash,$settings)=@_;
129+
my %hist=();
130+
my @hist=(0);
131+
132+
for my $kmercount(values(%$kmercountHash)){
133+
$hist{$kmercount}++;
134+
}
135+
136+
# Turn this hash into an array
137+
for(1..max(keys(%hist))){
138+
$hist[$_] = $hist{$_} || 0;
139+
}
140+
141+
if($$settings{hist}){
142+
logmsg "Writing histogram to $$settings{hist}";
143+
open(HIST, ">", $$settings{hist}) or die "ERROR: could not write histogram to $$settings{hist}: $!";
144+
for(my $i=0;$i<@hist;$i++){
145+
print HIST "$i\t$hist[$i]\n";
146+
}
147+
close HIST;
148+
}
149+
150+
return \@hist;
151+
}
152+
153+
sub findThePeaksAndValleys{
154+
my($hist, $delta, $settings)=@_;
155+
156+
my($min,$max)=(MAXINT,MININT);
157+
my($minPos,$maxPos)=(0,0);
158+
my @maxTab=();
159+
my @minTab=();
160+
161+
my $lookForMax=1;
162+
163+
my $numZeros=0; # If we see too many counts of zero, then exit.
164+
165+
for(my $kmerCount=$$settings{gt}+1;$kmerCount<@$hist;$kmerCount++){
166+
my $countOfCounts=$$hist[$kmerCount];
167+
if($countOfCounts == 0){
168+
$numZeros++;
169+
}
170+
if($countOfCounts > $max){
171+
$max=$countOfCounts;
172+
$maxPos=$kmerCount;
173+
}
174+
if($countOfCounts < $min){
175+
$min=$countOfCounts;
176+
$minPos=$kmerCount;
177+
}
178+
179+
if($lookForMax){
180+
if($countOfCounts < $max - $delta){
181+
push(@maxTab,[$maxPos,$max]);
182+
$min=$countOfCounts;
183+
$minPos=$kmerCount;
184+
$lookForMax=0;
185+
}
186+
}
187+
else{
188+
if($countOfCounts > $min + $delta){
189+
push(@minTab,[$minPos,$min]);
190+
$max=$countOfCounts;
191+
$maxPos=$kmerCount;
192+
$lookForMax=1;
193+
}
194+
}
195+
196+
last if($numZeros > 3);
197+
}
198+
199+
return {peaks=>\@maxTab, valleys=>\@minTab};
200+
}
201+
202+
sub findTheValley{
203+
my($peak1,$peak2,$hist,$settings)=@_;
204+
205+
my $valley=$$peak1[1];
206+
my $kmerCount=$$peak1[0];
207+
for(my $i=$$peak1[0]+1;$i<=$$peak2[0];$i++){
208+
if($valley < $$hist[$i]){
209+
$valley=$$hist[$i];
210+
$kmerCount=$i;
211+
} else {
212+
last;
213+
}
214+
}
215+
216+
return [$kmerCount,$valley];
217+
}
218+
219+
# Opens a fastq file in a smart way
220+
sub openFastq{
221+
my($fastq,$settings)=@_;
222+
223+
my $fh;
224+
225+
my @fastqExt=qw(.fastq.gz .fastq .fq.gz .fq);
226+
my($name,$dir,$ext)=fileparse($fastq,@fastqExt);
227+
228+
# Open the file in different ways, depending on if it
229+
# is gzipped or if the user has gzip installed.
230+
if($ext =~/\.gz$/){
231+
# use binary gzip if we can... why not take advantage
232+
# of the compiled binary's speedup?
233+
if(-e "/usr/bin/gzip"){
234+
open($fh,"gzip -cd $fastq | ") or die "ERROR: could not open $fastq for reading!: $!";
235+
}else{
236+
$fh=new IO::Uncompress::Gunzip($fastq) or die "ERROR: could not read $fastq: $!";
237+
}
238+
} else {
239+
open($fh,"<",$fastq) or die "ERROR: could not open $fastq for reading!: $!";
240+
}
241+
return $fh;
242+
}
243+
244+
245+
sub usage{
246+
"
247+
$0:
248+
Find the valley between two peaks on a set of kmers
249+
such that you can discard the kmers that are
250+
likely representative of contamination.
251+
This script does not require any dependencies.
252+
253+
Usage: $0 file.fastq[.gz]
254+
--gt 0 Look for the first peak at this kmer count
255+
and then the next valley.
256+
--kmer 21 kmer length
257+
--delta 100 How different the counts have to be to
258+
detect a valley or peak
259+
260+
OUTPUT
261+
--hist '' A file to write the histogram, or a file
262+
to read the histogram if it already exists.
263+
Useful if you want to rerun this script.
264+
--valleys, Valleys will be in the output by default
265+
--novalleys
266+
--peaks, Maximum peaks will not be in the output
267+
--nopeaks by default
268+
"
269+
}
270+

0 commit comments

Comments
 (0)