Skip to content

Commit 20d33b2

Browse files
committed
more documentation; fixed unit test for jackknife
1 parent b48bfd6 commit 20d33b2

File tree

6 files changed

+324
-39
lines changed

6 files changed

+324
-39
lines changed

README.md

+4
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,10 @@ with a random seed.
9797

9898
## Further documentation
9999

100+
For Perl library help, run `perldoc` on a `.pm` file, e.g., `perldoc lib/Mashtree/Db.pm`.
101+
102+
For executable help, run the executable with `--help`, e.g., `mashtree_bootstrap.pl --help`.
103+
100104
For more information and help please see the [docs folder](docs/)
101105

102106
For more information on plugins, see the [plugins folder](plugins). (in development)

bin/mashtree

+1-1
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,7 @@ sub mashDistance{
366366
close $mshListFh;
367367

368368
# Instantiate the database and create the table before the threads get to it
369-
my $mashtreeDbFilename="$outdir/distances.sqlite";
369+
my $mashtreeDbFilename="$outdir/distances.db.tsv";
370370
my $mashtreeDb=Mashtree::Db->new($mashtreeDbFilename,{significant_figures=>$$settings{sigfigs}});
371371

372372
# Make an array of distance files for each thread.

bin/mashtree_jackknife.pl

+2-2
Original file line numberDiff line numberDiff line change
@@ -268,8 +268,8 @@ sub subsampleMashSketchesWorker{
268268
}
269269

270270
# Add distances to database
271-
print $logFh "Creating database, $subsampleDir/distances.sqlite\n";
272-
my $mashtreeDb = Mashtree::Db->new("$subsampleDir/distances.sqlite");
271+
print $logFh "Creating database, $subsampleDir/distances.db.tsv\n";
272+
my $mashtreeDb = Mashtree::Db->new("$subsampleDir/distances.db.tsv");
273273
$mashtreeDb->addDistancesFromHash(\%dist);
274274
# Convert to Phylip
275275
my $phylipFile = "$subsampleDir/distances.phylip";

lib/Mashtree.pm

+155-16
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,47 @@ our @EXPORT_OK = qw(
2323

2424
local $0=basename $0;
2525

26+
=pod
27+
28+
=head1 NAME

Mashtree - make rapid trees for genomes
29+
30+
=head1 SYNOPSIS
31+
32+
Helps run a mashtree analysis to make rapid trees for genomes.
33+
Please see github.com/lskatz/Mashtree for more information.
34+
35+
=head1 VARIABLES
36+
37+
=over
38+
39+
=item $VERSION
40+
41+
=item $MASHTREE_VERSION (same value as $VERSION)
42+
43+
=item @fastqExt = qw(.fastq.gz .fastq .fq .fq.gz)
44+
45+
=item @fastaExt = qw(.fasta .fna .faa .mfa .fas .fsa .fa)
46+
47+
=item @bamExt = qw(.sorted.bam .bam)
48+
49+
=item @vcfExt = qw(.vcf.gz .vcf)
50+
51+
=item @mshExt = qw(.msh)
52+
53+
=item @richseqExt = qw(.gb .gbank .genbank .gbk .gbs .gbf .embl .ebl .emb .dat .swiss .sp)
54+
55+
=item $fhStick :shared
56+
57+
Used to mark whether a file is being read, so that Mashtree limits disk I/O
58+
59+
=back
60+
61+
=cut
62+
2663
######
2764
# CONSTANTS
2865

29-
our $VERSION = "1.3.1";
66+
our $VERSION = "1.4.0";
3067
our $MASHTREE_VERSION=$VERSION;
3168
our @fastqExt=qw(.fastq.gz .fastq .fq .fq.gz);
3269
our @fastaExt=qw(.fasta .fna .faa .mfa .fas .fsa .fa);
@@ -41,6 +78,16 @@ our @richseqExt=qw(.gb .gbank .genbank .gbk .gbs .gbf .embl .ebl .emb .dat .swis
4178
# Helpful things
4279
my $fhStick :shared; # A thread can only open a fastq file if it has the talking stick.
4380

81+
=head1 METHODS
82+
83+
=over
84+
85+
=item $SIG{'__DIE__'}
86+
87+
Overrides how C<die> works, so that the error message names the calling subroutine.
88+
89+
=cut
90+
4491
#################################################
4592
### COMMON SUBS/TOOLS (not object subroutines) ##
4693
#################################################
@@ -53,6 +100,15 @@ $SIG{'__DIE__'} = sub {
53100
$e =~ s/(at [^\s]+? line \d+\.$)/\nStopped $1/;
54101
die("$0: $callerSub: $e");
55102
};
103+
104+
=pod
105+
106+
=item logmsg
107+
108+
Prints a message to STDERR with the thread number and the program name, with a trailing newline.
109+
110+
=cut
111+
56112
# Centralized logmsg
57113
#sub logmsg {print STDERR "$0: ".(caller(1))[3].": @_\n";}
58114
sub logmsg {
@@ -69,7 +125,14 @@ sub logmsg {
69125
print STDERR $msg;
70126
}
71127

72-
# Opens a fastq file in a thread-safe way.
128+
=pod
129+
130+
=item openFastq
131+
132+
Opens a fastq file in a thread-safe way.
133+
134+
=cut
135+
73136
sub openFastq{
74137
my($fastq,$settings)=@_;
75138

@@ -87,7 +150,14 @@ sub openFastq{
87150
return $fh;
88151
}
89152

90-
# Removes fastq extension, removes directory name,
153+
=pod
154+
155+
=item _truncateFilename
156+
157+
Removes known file extensions (e.g., fastq, msh) and the directory name.
158+
159+
=cut
160+
91161
sub _truncateFilename{
92162
my($file,$settings)=@_;
93163
# strip off msh and any other known extensions
@@ -101,9 +171,17 @@ sub _truncateFilename{
101171
return $name;
102172
}
103173

174+
=pod
175+
176+
=item distancesToPhylip
177+
178+
1. Read the mash distances
179+
2. Create a phylip file
180+
181+
Arguments: hash of distances, output directory, settings hash
182+
183+
=cut
104184

105-
# 1. Read the mash distances
106-
# 2. Create a phylip file
107185
sub distancesToPhylip{
108186
my($distances,$outdir,$settings)=@_;
109187

@@ -172,6 +250,20 @@ sub distancesToPhylip{
172250
return $phylip;
173251
}
174252

253+
=pod
254+
255+
=item sortNames
256+
257+
Sorts names.
258+
259+
Arguments:
260+
261+
1. $name - array of names
262+
2. $settings - options
263+
* $$settings{'sort-order'} is one of "abc", "random", or "input-order"
264+
265+
=cut
266+
175267
sub sortNames{
176268
my($name,$settings)=@_;
177269
my @sorted;
@@ -187,8 +279,15 @@ sub sortNames{
187279
return @sorted;
188280
}
189281

190-
# Create tree file with Quicktree but bioperl
191-
# as a backup.
282+
=pod
283+
284+
=item createTreeFromPhylip($phylip, $outdir, $settings)
285+
286+
Creates a tree file with Quicktree, using BioPerl
287+
as a backup.
288+
289+
=cut
290+
192291
sub createTreeFromPhylip{
193292
my($phylip,$outdir,$settings)=@_;
194293

@@ -225,8 +324,15 @@ sub createTreeFromPhylip{
225324

226325
}
227326

228-
# Lee's implementation of a tree distance. The objective
229-
# is to return zero if two trees are the same.
327+
=pod
328+
329+
=item treeDist($treeObj1, $treeObj2)
330+
331+
Lee's implementation of a tree distance. The objective
332+
is to return zero if two trees are the same.
333+
334+
=cut
335+
230336
sub treeDist{
231337
my($treeObj1,$treeObj2)=@_;
232338

@@ -309,8 +415,15 @@ sub treeDist{
309415
return $euclideanDistance;
310416
}
311417

312-
# Find the distance between two mash sketch files
313-
# Alternatively: two hash lists.
418+
=pod
419+
420+
=item mashDist($file1, $file2, $k, $settings)
421+
422+
Find the distance between two mash sketch files.
423+
Alternatively: two hash lists.
424+
425+
=cut
426+
314427
sub mashDist{
315428
my($file1, $file2, $k, $settings)=@_;
316429

@@ -354,6 +467,14 @@ sub mashDist{
354467
return $mash_distance;
355468
}
356469

470+
=pod
471+
472+
=item mashHashes($sketch)
473+
474+
Return an array of hashes, the kmer length, and the genome estimated length
475+
476+
=cut
477+
357478
sub mashHashes{
358479
my($sketch)=@_;
359480
my @hash;
@@ -393,9 +514,16 @@ sub mashHashes{
393514
return (\@hash, $kmer, $length);
394515
}
395516

396-
# Compare unequal sized hashes. Treat the first
397-
# set of hashes as the reference (denominator)
398-
# set.
517+
=pod
518+
519+
=item raw_mash_distance_unequal_sizes($hashes1, $hashes2)
520+
521+
Compare unequal sized hashes. Treat the first
522+
set of hashes as the reference (denominator)
523+
set.
524+
525+
=cut
526+
399527
sub raw_mash_distance_unequal_sizes{
400528
my($hashes1, $hashes2) = @_;
401529

@@ -416,7 +544,15 @@ sub raw_mash_distance_unequal_sizes{
416544
return($common,$total);
417545
}
418546

419-
# https://github.com/onecodex/finch-rs/blob/master/src/distance.rs#L34
547+
=pod
548+
549+
=item raw_mash_distance($hashes1, $hashes2)
550+
551+
Returns the number of k-mers in common and the total number compared. Inspired by
552+
https://github.com/onecodex/finch-rs/blob/master/src/distance.rs#L34
553+
554+
=cut
555+
420556
sub raw_mash_distance{
421557
my($hashes1, $hashes2) = @_;
422558

@@ -472,7 +608,7 @@ sub raw_mash_distance{
472608
# The only difference is that it isn't an object method
473609
# and that it is called without an OO implementation.
474610

475-
=head2 transfer_bootstrap_expectation
611+
=item transfer_bootstrap_expectation
476612
477613
Title : transfer_bootstrap_expectation
478614
Usage : my $tree_with_bs = transfer_bootstrap_expectation(\@bs_trees,$guide_tree);
@@ -482,6 +618,9 @@ sub raw_mash_distance{
482618
Returns : L<Bio::Tree::TreeI>
483619
Args : Arrayref of L<Bio::Tree::TreeI>s
484620
Guide tree, L<Bio::Tree::TreeI>s
621+
622+
=back
623+
485624
=cut
486625

487626
sub transfer_bootstrap_expectation{

0 commit comments

Comments
 (0)