determining minimum depth using the new abundance filter

lskatz · lskatz · commit 05fe16e2ea0c · 2016-10-20T11:42:55.000-04:00
diff --git a/bin/mashtree.pl b/bin/mashtree.pl
@@ -37,7 +37,7 @@ sub main{
 
   # Mash-specific options
   $$settings{genomesize}||=5000000;
-  $$settings{mindepth}||=5;
+  $$settings{mindepth}||=0;
   $$settings{kmerlength}||=21;
   $$settings{'sketch-size'}||=10000;
 
@@ -111,7 +111,8 @@ sub mashSketch{
     # Do different things depending on fastq vs fasta
     my $sketchXopts="";
     if(grep {$_ eq $fileExt} @fastqExt){
-      $sketchXopts.="-m $$settings{mindepth} -g $$settings{genomesize} ";
+      my $minDepth=determineMinimumDepth($fastq,$$settings{mindepth},$$settings{kmerlength},$settings);
+      $sketchXopts.="-m $minDepth -g $$settings{genomesize} ";
     } elsif(grep {$_ eq $fileExt} @fastaExt) {
       $sketchXopts.=" ";
     } else {
@@ -196,6 +197,26 @@ sub mashDist{
   return \@dist;
 }
 
+sub determineMinimumDepth{
+  my($fastq,$mindepth,$kmerlength,$settings)=@_;
+
+  return $mindepth if($mindepth > 0);
+
+  my $basename=basename($fastq,@fastqExt);
+  
+  # Run the min abundance finder to find the valleys
+  my $histFile="$$settings{tempdir}/$basename.hist.tsv";
+  my @valley=`min_abundance_finder.pl $fastq --kmer $kmerlength --valleys --nopeaks --hist $histFile`;
+  die "ERROR with min_abundance_finder.pl on $fastq: $!" if($?);
+  chomp(@valley);
+  
+  # Discard the header but keep the first line
+  my($minKmerCount, $countOfCounts)=split(/\t/,$valley[1]);
+
+  logmsg "Setting the min depth as $minKmerCount for $fastq";
+
+  return $minKmerCount;
+}
 
 sub usage{
   "$0: use distances from Mash (min-hash algorithm) to make a NJ tree
@@ -214,7 +235,9 @@ sub usage{
 
   MASH SKETCH OPTIONS
   --genomesize         5000000
-  --mindepth           5     
+  --mindepth           0    If mindepth is zero, then it will be
+                            chosen in a smart but slower method,
+                            to discard lower-abundance kmers.
   --kmerlength         21
   --sketch-size        10000
   "
diff --git a/bin/min_abundance_finder.pl b/bin/min_abundance_finder.pl
@@ -0,0 +1,270 @@
+#!/usr/bin/env perl
+# Find the minimum abundance of kmers
+# Original script was in Python at 
+#   https://gist.github.com/alexjironkin/4ed43412878723491240814a0d5a6ed6/223dea45d70c9136703a4afaab0178cdbfbd2042
+# Original author of python script: @alexjironkin, Public Health Englad
+# I wanted to increase capatibility with Perl and have a more
+# standalone script instead of relying on the khmer package.
+# Author: Lee Katz <lkatz@cdc.gov>
+
+use strict;
+use warnings;
+use Getopt::Long qw/GetOptions/;
+use Data::Dumper qw/Dumper/;
+use File::Basename qw/basename fileparse/;
+use List::Util qw/max/;
+use IO::Uncompress::Gunzip qw/gunzip/;
+
+# http://perldoc.perl.org/perlop.html#Symbolic-Unary-Operators
+# +Inf and -Inf will be a binary complement of all zeros
+use constant MAXINT =>  ~0;
+use constant MININT => -~0;
+
+local $0=basename $0;
+sub logmsg{print STDERR "$0: @_\n";}
+
+exit main();
+
+sub main{
+  my $settings={};
+  GetOptions($settings,qw(help kmerlength|kmer=i delta=i gt|greaterthan=i hist|histogram=s valleys! peaks!)) or die $!;
+  $$settings{kmerlength}||=21;
+  $$settings{delta}     ||=100;
+  $$settings{gt}        ||=0;
+
+  $$settings{peaks}     //=0;
+  $$settings{valleys}   //=1;
+
+  my($fastq)=@ARGV;
+  die usage() if(!$fastq || $$settings{help});
+  die "ERROR: I could not find fastq at $fastq" if(!-e $fastq);
+
+  # Find peaks and valleys, but use the histogram file if it exists
+  # and if the user specified it.
+  my $histogram=[];
+  if($$settings{hist} && -e $$settings{hist}){
+    # Read the cached histogram file
+    $histogram=readHistogram($$settings{hist},$settings);
+  } else {
+    # Count kmers and make a histogram.  kmerToHist()
+    # will optionally write the histogram file.
+    my $kmercount=countKmers($fastq,$$settings{kmerlength},$settings);
+    $histogram=kmerToHist($kmercount,$settings);
+  }
+  # Find peaks and valleys using the simplified 
+  # 'delta' algorithm.
+  my $peaks=findThePeaksAndValleys($histogram,$$settings{delta},$settings);
+
+  # Configure the output
+  my @outputPos=();
+  push(@outputPos, @{$$peaks{valleys}}) if($$settings{valleys});
+  push(@outputPos, @{$$peaks{peaks}})   if($$settings{peaks});
+  @outputPos=sort {$$a[0] <=> $$b[0]} @outputPos;
+
+  if(!@outputPos){
+    logmsg "WARNING: no peaks or valleys were reported";
+  }
+
+  # Header for output table
+  print join("\t",qw(kmer count))."\n";
+  # Finish off the table of output.
+  for my $position (@outputPos){
+    print join("\t",@$position)."\n";
+  }
+
+  return 0;
+}
+
+sub readHistogram{
+  my($infile,$settings)=@_;
+  my @hist=(0);
+  open(HIST,$infile) or die "ERROR: could not read $infile: $!";
+  logmsg "Reading histogram from $infile";
+  while(<HIST>){
+    chomp;
+    my($count,$countOfCounts)=split /\t/;
+    $hist[$count]=$countOfCounts;
+  }
+  close HIST;
+
+  # ensure defined values
+  for(my $i=0;$i<@hist;$i++){
+    $hist[$i] //= 0;
+  }
+
+  return \@hist;
+}
+
+sub countKmers{
+  my($fastq,$kmerlength,$settings)=@_;
+  my %kmer=();
+
+  # Pure perl to make this standalone... the only reason
+  # we are counting kmers in Perl instead of C.
+  my $fastqFh=openFastq($fastq,$settings);
+  my $i=0;
+  while(<$fastqFh>){ # burn the read ID line
+    my $seq=<$fastqFh>;
+    chomp($seq);
+    my $numKmersInRead=length($seq)-$kmerlength+1;
+
+    # Count kmers in a sliding window.
+    # We must keep this loop optimized for speed.
+    for(my $j=0;$j<$numKmersInRead;$j++){
+      $kmer{substr($seq,$j,$kmerlength)}++;
+    }
+
+    # Burn the quality score lines
+    <$fastqFh>;
+    <$fastqFh>;
+
+  }
+  close $fastqFh;
+
+  return \%kmer;
+}
+
+sub kmerToHist{
+  my($kmercountHash,$settings)=@_;
+  my %hist=();
+  my @hist=(0);
+
+  for my $kmercount(values(%$kmercountHash)){
+    $hist{$kmercount}++;
+  }
+
+  # Turn this hash into an array
+  for(1..max(keys(%hist))){
+    $hist[$_] = $hist{$_} || 0;
+  }
+
+  if($$settings{hist}){
+    logmsg "Writing histogram to $$settings{hist}";
+    open(HIST, ">", $$settings{hist}) or die "ERROR: could not write histogram to $$settings{hist}: $!";
+    for(my $i=0;$i<@hist;$i++){
+      print HIST "$i\t$hist[$i]\n";
+    }
+    close HIST;
+  }
+
+  return \@hist;
+}
+
+sub findThePeaksAndValleys{
+  my($hist, $delta, $settings)=@_;
+
+  my($min,$max)=(MAXINT,MININT);
+  my($minPos,$maxPos)=(0,0);
+  my @maxTab=();
+  my @minTab=();
+
+  my $lookForMax=1;
+
+  my $numZeros=0; # If we see too many counts of zero, then exit.
+  
+  for(my $kmerCount=$$settings{gt}+1;$kmerCount<@$hist;$kmerCount++){
+    my $countOfCounts=$$hist[$kmerCount];
+    if($countOfCounts == 0){ 
+      $numZeros++;
+    }
+    if($countOfCounts > $max){
+      $max=$countOfCounts;
+      $maxPos=$kmerCount;
+    }
+    if($countOfCounts < $min){
+      $min=$countOfCounts;
+      $minPos=$kmerCount;
+    }
+
+    if($lookForMax){
+      if($countOfCounts < $max - $delta){
+        push(@maxTab,[$maxPos,$max]);
+        $min=$countOfCounts;
+        $minPos=$kmerCount;
+        $lookForMax=0;
+      }
+    }
+    else{
+      if($countOfCounts > $min + $delta){
+        push(@minTab,[$minPos,$min]);
+        $max=$countOfCounts;
+        $maxPos=$kmerCount;
+        $lookForMax=1;
+      }
+    }
+
+    last if($numZeros > 3);
+  }
+
+  return {peaks=>\@maxTab, valleys=>\@minTab};
+}
+
+sub findTheValley{
+  my($peak1,$peak2,$hist,$settings)=@_;
+
+  my $valley=$$peak1[1];
+  my $kmerCount=$$peak1[0];
+  for(my $i=$$peak1[0]+1;$i<=$$peak2[0];$i++){
+    if($valley < $$hist[$i]){
+      $valley=$$hist[$i];
+      $kmerCount=$i;
+    } else {
+      last;
+    }
+  }
+  
+  return [$kmerCount,$valley];
+}
+
+# Opens a fastq file in a smart way
+sub openFastq{
+  my($fastq,$settings)=@_;
+
+  my $fh;
+
+  my @fastqExt=qw(.fastq.gz .fastq .fq.gz .fq);
+  my($name,$dir,$ext)=fileparse($fastq,@fastqExt);
+
+  # Open the file in different ways, depending on if it
+  # is gzipped or if the user has gzip installed.
+  if($ext =~/\.gz$/){
+    # use binary gzip if we can... why not take advantage
+    # of the compiled binary's speedup?
+    if(-e "/usr/bin/gzip"){
+      open($fh,"gzip -cd $fastq | ") or die "ERROR: could not open $fastq for reading!: $!";
+    }else{
+      $fh=new IO::Uncompress::Gunzip($fastq) or die "ERROR: could not read $fastq: $!";
+    }
+  } else {
+    open($fh,"<",$fastq) or die "ERROR: could not open $fastq for reading!: $!";
+  }
+  return $fh;
+}
+
+
+sub usage{
+  "
+  $0: 
+       Find the valley between two peaks on a set of kmers
+       such that you can discard the kmers that are 
+       likely representative of contamination.
+       This script does not require any dependencies.
+
+  Usage: $0 file.fastq[.gz]
+  --gt     0   Look for the first peak at this kmer count
+               and then the next valley.
+  --kmer   21  kmer length
+  --delta  100 How different the counts have to be to
+               detect a valley or peak
+
+  OUTPUT
+  --hist   ''  A file to write the histogram, or a file
+               to read the histogram if it already exists.
+               Useful if you want to rerun this script.
+  --valleys,   Valleys will be in the output by default
+  --novalleys
+  --peaks,     Maximum peaks will not be in the output
+  --nopeaks    by default
+  "
+}
+