|
| 1 | +#!/usr/bin/env perl |
| 2 | +# Find the minimum abundance of kmers |
| 3 | +# Original script was in Python at |
| 4 | +# https://gist.github.com/alexjironkin/4ed43412878723491240814a0d5a6ed6/223dea45d70c9136703a4afaab0178cdbfbd2042 |
| 5 | +# Original author of python script: @alexjironkin, Public Health Englad |
| 6 | +# I wanted to increase capatibility with Perl and have a more |
| 7 | +# standalone script instead of relying on the khmer package. |
| 8 | +# Author: Lee Katz <lkatz@cdc.gov> |
| 9 | + |
| 10 | +use strict; |
| 11 | +use warnings; |
| 12 | +use Getopt::Long qw/GetOptions/; |
| 13 | +use Data::Dumper qw/Dumper/; |
| 14 | +use File::Basename qw/basename fileparse/; |
| 15 | +use List::Util qw/max/; |
| 16 | +use IO::Uncompress::Gunzip qw/gunzip/; |
| 17 | + |
| 18 | +# http://perldoc.perl.org/perlop.html#Symbolic-Unary-Operators |
| 19 | +# +Inf and -Inf will be a binary complement of all zeros |
| 20 | +use constant MAXINT => ~0; |
| 21 | +use constant MININT => -~0; |
| 22 | + |
| 23 | +local $0=basename $0; |
| 24 | +sub logmsg{print STDERR "$0: @_\n";} |
| 25 | + |
| 26 | +exit main(); |
| 27 | + |
| 28 | +sub main{ |
| 29 | + my $settings={}; |
| 30 | + GetOptions($settings,qw(help kmerlength|kmer=i delta=i gt|greaterthan=i hist|histogram=s valleys! peaks!)) or die $!; |
| 31 | + $$settings{kmerlength}||=21; |
| 32 | + $$settings{delta} ||=100; |
| 33 | + $$settings{gt} ||=0; |
| 34 | + |
| 35 | + $$settings{peaks} //=0; |
| 36 | + $$settings{valleys} //=1; |
| 37 | + |
| 38 | + my($fastq)=@ARGV; |
| 39 | + die usage() if(!$fastq || $$settings{help}); |
| 40 | + die "ERROR: I could not find fastq at $fastq" if(!-e $fastq); |
| 41 | + |
| 42 | + # Find peaks and valleys, but use the histogram file if it exists |
| 43 | + # and if the user specified it. |
| 44 | + my $histogram=[]; |
| 45 | + if($$settings{hist} && -e $$settings{hist}){ |
| 46 | + # Read the cached histogram file |
| 47 | + $histogram=readHistogram($$settings{hist},$settings); |
| 48 | + } else { |
| 49 | + # Count kmers and make a histogram. kmerToHist() |
| 50 | + # will optionally write the histogram file. |
| 51 | + my $kmercount=countKmers($fastq,$$settings{kmerlength},$settings); |
| 52 | + $histogram=kmerToHist($kmercount,$settings); |
| 53 | + } |
| 54 | + # Find peaks and valleys using the simplified |
| 55 | + # 'delta' algorithm. |
| 56 | + my $peaks=findThePeaksAndValleys($histogram,$$settings{delta},$settings); |
| 57 | + |
| 58 | + # Configure the output |
| 59 | + my @outputPos=(); |
| 60 | + push(@outputPos, @{$$peaks{valleys}}) if($$settings{valleys}); |
| 61 | + push(@outputPos, @{$$peaks{peaks}}) if($$settings{peaks}); |
| 62 | + @outputPos=sort {$$a[0] <=> $$b[0]} @outputPos; |
| 63 | + |
| 64 | + if(!@outputPos){ |
| 65 | + logmsg "WARNING: no peaks or valleys were reported"; |
| 66 | + } |
| 67 | + |
| 68 | + # Header for output table |
| 69 | + print join("\t",qw(kmer count))."\n"; |
| 70 | + # Finish off the table of output. |
| 71 | + for my $position (@outputPos){ |
| 72 | + print join("\t",@$position)."\n"; |
| 73 | + } |
| 74 | + |
| 75 | + return 0; |
| 76 | +} |
| 77 | + |
| 78 | +sub readHistogram{ |
| 79 | + my($infile,$settings)=@_; |
| 80 | + my @hist=(0); |
| 81 | + open(HIST,$infile) or die "ERROR: could not read $infile: $!"; |
| 82 | + logmsg "Reading histogram from $infile"; |
| 83 | + while(<HIST>){ |
| 84 | + chomp; |
| 85 | + my($count,$countOfCounts)=split /\t/; |
| 86 | + $hist[$count]=$countOfCounts; |
| 87 | + } |
| 88 | + close HIST; |
| 89 | + |
| 90 | + # ensure defined values |
| 91 | + for(my $i=0;$i<@hist;$i++){ |
| 92 | + $hist[$i] //= 0; |
| 93 | + } |
| 94 | + |
| 95 | + return \@hist; |
| 96 | +} |
| 97 | + |
| 98 | +sub countKmers{ |
| 99 | + my($fastq,$kmerlength,$settings)=@_; |
| 100 | + my %kmer=(); |
| 101 | + |
| 102 | + # Pure perl to make this standalone... the only reason |
| 103 | + # we are counting kmers in Perl instead of C. |
| 104 | + my $fastqFh=openFastq($fastq,$settings); |
| 105 | + my $i=0; |
| 106 | + while(<$fastqFh>){ # burn the read ID line |
| 107 | + my $seq=<$fastqFh>; |
| 108 | + chomp($seq); |
| 109 | + my $numKmersInRead=length($seq)-$kmerlength+1; |
| 110 | + |
| 111 | + # Count kmers in a sliding window. |
| 112 | + # We must keep this loop optimized for speed. |
| 113 | + for(my $j=0;$j<$numKmersInRead;$j++){ |
| 114 | + $kmer{substr($seq,$j,$kmerlength)}++; |
| 115 | + } |
| 116 | + |
| 117 | + # Burn the quality score lines |
| 118 | + <$fastqFh>; |
| 119 | + <$fastqFh>; |
| 120 | + |
| 121 | + } |
| 122 | + close $fastqFh; |
| 123 | + |
| 124 | + return \%kmer; |
| 125 | +} |
| 126 | + |
| 127 | +sub kmerToHist{ |
| 128 | + my($kmercountHash,$settings)=@_; |
| 129 | + my %hist=(); |
| 130 | + my @hist=(0); |
| 131 | + |
| 132 | + for my $kmercount(values(%$kmercountHash)){ |
| 133 | + $hist{$kmercount}++; |
| 134 | + } |
| 135 | + |
| 136 | + # Turn this hash into an array |
| 137 | + for(1..max(keys(%hist))){ |
| 138 | + $hist[$_] = $hist{$_} || 0; |
| 139 | + } |
| 140 | + |
| 141 | + if($$settings{hist}){ |
| 142 | + logmsg "Writing histogram to $$settings{hist}"; |
| 143 | + open(HIST, ">", $$settings{hist}) or die "ERROR: could not write histogram to $$settings{hist}: $!"; |
| 144 | + for(my $i=0;$i<@hist;$i++){ |
| 145 | + print HIST "$i\t$hist[$i]\n"; |
| 146 | + } |
| 147 | + close HIST; |
| 148 | + } |
| 149 | + |
| 150 | + return \@hist; |
| 151 | +} |
| 152 | + |
| 153 | +sub findThePeaksAndValleys{ |
| 154 | + my($hist, $delta, $settings)=@_; |
| 155 | + |
| 156 | + my($min,$max)=(MAXINT,MININT); |
| 157 | + my($minPos,$maxPos)=(0,0); |
| 158 | + my @maxTab=(); |
| 159 | + my @minTab=(); |
| 160 | + |
| 161 | + my $lookForMax=1; |
| 162 | + |
| 163 | + my $numZeros=0; # If we see too many counts of zero, then exit. |
| 164 | + |
| 165 | + for(my $kmerCount=$$settings{gt}+1;$kmerCount<@$hist;$kmerCount++){ |
| 166 | + my $countOfCounts=$$hist[$kmerCount]; |
| 167 | + if($countOfCounts == 0){ |
| 168 | + $numZeros++; |
| 169 | + } |
| 170 | + if($countOfCounts > $max){ |
| 171 | + $max=$countOfCounts; |
| 172 | + $maxPos=$kmerCount; |
| 173 | + } |
| 174 | + if($countOfCounts < $min){ |
| 175 | + $min=$countOfCounts; |
| 176 | + $minPos=$kmerCount; |
| 177 | + } |
| 178 | + |
| 179 | + if($lookForMax){ |
| 180 | + if($countOfCounts < $max - $delta){ |
| 181 | + push(@maxTab,[$maxPos,$max]); |
| 182 | + $min=$countOfCounts; |
| 183 | + $minPos=$kmerCount; |
| 184 | + $lookForMax=0; |
| 185 | + } |
| 186 | + } |
| 187 | + else{ |
| 188 | + if($countOfCounts > $min + $delta){ |
| 189 | + push(@minTab,[$minPos,$min]); |
| 190 | + $max=$countOfCounts; |
| 191 | + $maxPos=$kmerCount; |
| 192 | + $lookForMax=1; |
| 193 | + } |
| 194 | + } |
| 195 | + |
| 196 | + last if($numZeros > 3); |
| 197 | + } |
| 198 | + |
| 199 | + return {peaks=>\@maxTab, valleys=>\@minTab}; |
| 200 | +} |
| 201 | + |
| 202 | +sub findTheValley{ |
| 203 | + my($peak1,$peak2,$hist,$settings)=@_; |
| 204 | + |
| 205 | + my $valley=$$peak1[1]; |
| 206 | + my $kmerCount=$$peak1[0]; |
| 207 | + for(my $i=$$peak1[0]+1;$i<=$$peak2[0];$i++){ |
| 208 | + if($valley < $$hist[$i]){ |
| 209 | + $valley=$$hist[$i]; |
| 210 | + $kmerCount=$i; |
| 211 | + } else { |
| 212 | + last; |
| 213 | + } |
| 214 | + } |
| 215 | + |
| 216 | + return [$kmerCount,$valley]; |
| 217 | +} |
| 218 | + |
| 219 | +# Opens a fastq file in a smart way |
| 220 | +sub openFastq{ |
| 221 | + my($fastq,$settings)=@_; |
| 222 | + |
| 223 | + my $fh; |
| 224 | + |
| 225 | + my @fastqExt=qw(.fastq.gz .fastq .fq.gz .fq); |
| 226 | + my($name,$dir,$ext)=fileparse($fastq,@fastqExt); |
| 227 | + |
| 228 | + # Open the file in different ways, depending on if it |
| 229 | + # is gzipped or if the user has gzip installed. |
| 230 | + if($ext =~/\.gz$/){ |
| 231 | + # use binary gzip if we can... why not take advantage |
| 232 | + # of the compiled binary's speedup? |
| 233 | + if(-e "/usr/bin/gzip"){ |
| 234 | + open($fh,"gzip -cd $fastq | ") or die "ERROR: could not open $fastq for reading!: $!"; |
| 235 | + }else{ |
| 236 | + $fh=new IO::Uncompress::Gunzip($fastq) or die "ERROR: could not read $fastq: $!"; |
| 237 | + } |
| 238 | + } else { |
| 239 | + open($fh,"<",$fastq) or die "ERROR: could not open $fastq for reading!: $!"; |
| 240 | + } |
| 241 | + return $fh; |
| 242 | +} |
| 243 | + |
| 244 | + |
| 245 | +sub usage{ |
| 246 | + " |
| 247 | + $0: |
| 248 | + Find the valley between two peaks on a set of kmers |
| 249 | + such that you can discard the kmers that are |
| 250 | + likely representative of contamination. |
| 251 | + This script does not require any dependencies. |
| 252 | +
|
| 253 | + Usage: $0 file.fastq[.gz] |
| 254 | + --gt 0 Look for the first peak at this kmer count |
| 255 | + and then the next valley. |
| 256 | + --kmer 21 kmer length |
| 257 | + --delta 100 How different the counts have to be to |
| 258 | + detect a valley or peak |
| 259 | +
|
| 260 | + OUTPUT |
| 261 | + --hist '' A file to write the histogram, or a file |
| 262 | + to read the histogram if it already exists. |
| 263 | + Useful if you want to rerun this script. |
| 264 | + --valleys, Valleys will be in the output by default |
| 265 | + --novalleys |
| 266 | + --peaks, Maximum peaks will not be in the output |
| 267 | + --nopeaks by default |
| 268 | + " |
| 269 | +} |
| 270 | + |
0 commit comments