edits and readme

taylorreiter · taylorreiter · commit 59ff86724562 · 2023-03-29T15:27:23.000-04:00
diff --git a/README.md b/README.md
@@ -4,4 +4,33 @@ NCBI now provides [a clustered nr database](https://ncbiinsights.ncbi.nlm.nih.go
 We were interested in using this database to reduce search times and to increase the taxonomic diversity of returned sequences when doing BLAST searches.
 However, as of March 2023, the database is not available for download.
 Therefore, we re-made this database ourselves.
-The [README.sh](./README.sh) file in this repository documents how we performed the clustering and created a taxonomy sheet that annotates the lowest common ancestor for each protein cluster.
+The [Snakefile](./Snakefile) in this repository documents how we performed the clustering and created a taxonomy sheet that annotates the lowest common ancestor for each protein cluster.
+
+## Getting started with this repository
+
+This repository uses snakemake to run the pipeline and conda to manage software environments and installations.
+You can find operating system-specific instructions for installing miniconda [here](https://docs.conda.io/en/latest/miniconda.html).
+We executed the pipeline on AWS EC2 with an Ubuntu image (ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-20230208).
+
+```
+curl -JLO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh # download the miniconda installation script
+bash Miniconda3-latest-Linux-x86_64.sh # run the miniconda installation script. Accept the license and follow the defaults.
+source ~/.bashrc # source the .bashrc for miniconda to be available in the environment
+# configure miniconda channel order
+conda config --add channels defaults
+conda config --add channels bioconda
+conda config --add channels conda-forge
+conda config --set channel_priority strict # make channel priority strict so snakemake doesn't yell at you
+conda install mamba # install mamba for faster software installation.
+
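+conda env create -n nr -f environment.yml # create the "nr" environment; run this from inside the cloned repository, where environment.yml lives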
+conda activate nr
+```
+
+After cloning the repository and activating the `nr` environment, you can run the Snakefile from the repository root with:
+
+```
+snakemake -j 1 --use-conda --rerun-incomplete -k -n
+```
+
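+where `-j` specifies the maximum number of CPU cores (jobs) to run in parallel, `--use-conda` tells snakemake to use conda to manage software environments, `--rerun-incomplete` re-runs any jobs whose output files are incomplete, `-k` tells the pipeline to continue with independent steps when one step fails, and `-n` performs a dry run that lists the steps that would be executed without running them. Remove `-n` to actually run the pipeline.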
diff --git a/Snakefile b/Snakefile
@@ -6,9 +6,6 @@ rule all:
         "nr_cluster_taxid_formatted_final.tsv.gz",
         "nr_cluster_uniq_reps_line_count.txt"
     
-# rules to add:
-# 3. add header sequences to nr_cluster.tsv    
-
 #############################################################################
 ## cluster NR with mmseqs2
 #############################################################################
@@ -125,6 +122,7 @@ rule get_lca_taxid_for_each_cluster:
     output: "nr_cluster_taxid_lca.tsv"
     params: datadir="inputs/taxdump/"
     #conda: "envs/taxonkit.yml" # needs to be 0.14.2, which isn't on conda yet
+    # in the meantime, download the executable from https://github.com/shenwei356/taxonkit/files/11073880/taxonkit_linux_amd64.tar.gz and extract it into the working directory (the shell command runs ./taxonkit)
     shell:'''
     ./taxonkit lca --data-dir {params.datadir} -i 2 -s ";" -o {output} {input.tsv} --buffer-size 1G
     '''
diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - snakemake=7.25.0
diff --git a/envs/mmseqs2.yml b/envs/mmseqs2.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - mmseqs2=14.7e284