|
1 | 1 | #!/usr/bin/env python3
|
2 | 2 |
|
3 | 3 | import collections
|
| 4 | +import csv |
4 | 5 | import datetime
|
5 | 6 | import gzip
|
6 | 7 | import os
|
@@ -29,8 +30,8 @@ def run(self):
|
29 | 30 |
|
30 | 31 | # download genome files for each database
|
31 | 32 | for subset in ("species", "subspecies", "geneflow", "typestrains"):
|
32 |
| - list = os.path.join(btyper3_path, "seq_ani_db", subset, "{}.txt".format(subset)) |
33 |
| - self.download_genomes(btyper3_path, list, subset) |
| 33 | + db = os.path.join(btyper3_path, "seq_ani_db", subset, "{}.tsv".format(subset)) |
| 34 | + self.download_genomes(btyper3_path, db, subset) |
34 | 35 |
|
35 | 36 | def download(self, url, dest, append=False, decompress=False):
|
36 | 37 | print("downloading {!r} to {!r}".format(url, dest))
|
@@ -66,13 +67,16 @@ def download_pubmlst(self, btyper3_path):
|
66 | 67 |
|
def download_genomes(self, btyper3_path, genome_list, ani_directory):
    """Download every genome listed in a TSV index that is not on disk yet.

    The index is a tab-separated file whose first row is a header that
    contains at least the columns ``id`` (file name of the genome) and
    ``url`` (where to fetch it from). Each genome is stored as
    ``<btyper3_path>/seq_ani_db/<ani_directory>/<id>``; files that already
    exist are left untouched.

    Args:
        btyper3_path: Root folder under which ``seq_ani_db`` lives.
        genome_list: Path to the TSV index file.
        ani_directory: Name of the sub-folder of ``seq_ani_db`` to fill.

    Raises:
        ValueError: If the index is empty, or if its header has no ``id``
            or ``url`` column.
    """
    # newline="" lets the csv module handle line endings itself, as the
    # csv documentation requires for file objects.
    with open(genome_list, newline="") as genomes:
        reader = csv.reader(genomes, dialect="excel-tab")

        # Report an empty listing explicitly instead of leaking a bare
        # StopIteration out of next().
        header = next(reader, None)
        if header is None:
            raise ValueError("empty genome list: {!r}".format(genome_list))
        id_col = header.index("id")
        url_col = header.index("url")

        # The destination folder does not depend on the row.
        target_dir = os.path.join(btyper3_path, "seq_ani_db", ani_directory)

        for row in reader:
            # csv.reader yields [] for blank lines; indexing it would fail.
            if not row:
                continue
            gfile = os.path.join(target_dir, row[id_col])
            if not os.path.isfile(gfile):
                self.download(url=row[url_col], dest=gfile)
76 | 80 |
|
77 | 81 |
|
78 | 82 | setuptools.setup(cmdclass={"build_py": build_py})
|
0 commit comments