
New dataset #3


Open: wants to merge 9 commits into base: master
29 changes: 22 additions & 7 deletions README.md
@@ -33,27 +33,36 @@ Leader-board for the ZeroSpeech 2020 challenge can be found [here](https://zeros

3. Preprocess audio and extract train/test log-Mel spectrograms:
```
python preprocess.py in_dir=/path/to/dataset dataset=[2019/english or 2019/surprise]
python preprocess.py in_dir=/path/to/dataset dataset=[2019/english or 2019/surprise or buckeye]
```
Note: `in_dir` must be the path to the `2019` folder.
For `dataset` choose between `2019/english` or `2019/surprise`.
Other datasets will be added in the future.
For `dataset` choose between `2019/english`, `2019/surprise` or `buckeye`.
Note: `in_dir` must be the path to the `2019` folder or the original
Buckeye dataset directory. Other datasets will be added in the future.

Example usage:
```
python preprocess.py in_dir=../datasets/2020/2019 dataset=2019/english
```
or
```
python preprocess.py in_dir=/home/kamperh/endgame/projects/stellenbosch/vqseg/datasets/swb300-wavs/ dataset=swbd preprocessing=8khz
```

## Training

1. Train the VQ-CPC model (or download pretrained weights [here](https://github.com/bshall/VectorQuantizedCPC/releases/tag/v0.1)):
```
python train_cpc.py checkpoint_dir=path/to/checkpoint_dir dataset=[2019/english or 2019/surprise]
python train_cpc.py checkpoint_dir=path/to/checkpoint_dir dataset=[2019/english or 2019/surprise or buckeye]
```
Example usage:
```
python train_cpc.py checkpoint_dir=checkpoints/cpc/2019english dataset=2019/english
```
or
```
python train_cpc.py checkpoint_dir=checkpoints/cpc/buckeye dataset=buckeye training.sample_frames=64 model.encoder.n_embeddings=256 training.scheduler.warmup_epochs=250
python train_cpc.py checkpoint_dir=checkpoints/cpc/swbd1 dataset=swbd training.sample_frames=64 preprocessing=8khz
```

2. Train the vocoder:
```
@@ -95,10 +104,16 @@ Voice conversion samples are available [here](https://bshall.github.io/VectorQua

1. Encode test data for evaluation:
```
python encode.py checkpoint=path/to/checkpoint out_dir=path/to/out_dir dataset=[2019/english or 2019/surprise]
python encode.py checkpoint=path/to/checkpoint out_dir=path/to/out_dir dataset=[2019/english or 2019/surprise or buckeye]
```
Example usage:
```
python encode.py checkpoint=checkpoints/cpc/english2019/model.ckpt-22000.pt out_dir=submission/2019/english/test dataset=2019/english
```
or
```
e.g. python encode.py checkpoint=checkpoints/2019english/model.ckpt-500000.pt out_dir=submission/2019/english/test dataset=2019/english
python encode.py checkpoint=checkpoints/cpc/english2019/model.ckpt-22000.pt split=val save_indices=True out_dir=outputs/buckeye/val_zs2019/ dataset=buckeye
python encode.py checkpoint=checkpoints/cpc/swbd1/model.ckpt-22000.pt split=val save_indices=True out_dir=outputs/swbd/val_swbd1/ dataset=swbd preprocessing=8khz
```

2. Run ABX evaluation script (see [bootphon/zerospeech2020](https://github.com/bootphon/zerospeech2020)).
2 changes: 0 additions & 2 deletions config/dataset/2019/english.yaml
@@ -1,5 +1,3 @@
dataset:
dataset: 2019
language: english
path: 2019/english
n_speakers: 102
2 changes: 0 additions & 2 deletions config/dataset/2019/surprise.yaml
@@ -1,5 +1,3 @@
dataset:
dataset: 2019
language: surprise
path: 2019/surprise
n_speakers: 113
3 changes: 3 additions & 0 deletions config/dataset/buckeye.yaml
@@ -0,0 +1,3 @@
dataset:
path: buckeye
n_speakers: 32
3 changes: 3 additions & 0 deletions config/dataset/swbd.yaml
@@ -0,0 +1,3 @@
dataset:
path: swbd
n_speakers: 1131
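
The two new files above are Hydra dataset config groups: passing `dataset=buckeye` or `dataset=swbd` on the command line merges them into the composed config, and the scripts then read `cfg.dataset.path` and `cfg.dataset.n_speakers`. A minimal sketch of that access pattern, mirroring the repo's decorator style (the function name and prints are illustrative, not part of the repo):

```
import hydra
from hydra import utils
from pathlib import Path


@hydra.main(config_path="config/preprocessing.yaml")
def show_dataset_config(cfg):
    # With `dataset=buckeye`, Hydra merges config/dataset/buckeye.yaml into cfg.
    root = Path(utils.to_absolute_path("datasets")) / str(cfg.dataset.path)
    print("dataset root:", root)
    print("number of speakers:", cfg.dataset.n_speakers)


if __name__ == "__main__":
    show_dataset_config()
```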
5 changes: 4 additions & 1 deletion config/encode.yaml
@@ -3,6 +3,9 @@ defaults:
- preprocessing: default
- model: default

split: test
checkpoint: ???
out_dir: ???
save_auxiliary: False
save_auxiliary: False
save_indices: False
save_embedding: False
10 changes: 10 additions & 0 deletions config/preprocessing/8khz.yaml
@@ -0,0 +1,10 @@
preprocessing:
sr: 8000
n_fft: 1024
n_mels: 40
fmin: 50
preemph: 0.97
top_db: 80
hop_length: 80
win_length: 200
bits: 8
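
At 8 kHz these settings give 10 ms hops and 25 ms windows. As a rough sketch only, log-Mel extraction with these values might look like the snippet below; it is not the repo's `process_wav`, which presumably also uses the `preemph`, `top_db` and `bits` keys for pre-emphasis, trimming and quantisation.

```
import librosa
import numpy as np

# Sketch of log-Mel extraction with the 8khz.yaml values; not the repo's
# process_wav(), which also takes preemph (the config also carries top_db/bits).
wav, _ = librosa.load("example.wav", sr=8000)
mel = librosa.feature.melspectrogram(
    y=wav, sr=8000, n_fft=1024, hop_length=80, win_length=200,
    n_mels=40, fmin=50, power=1.0)
logmel = librosa.amplitude_to_db(mel, top_db=80) / 80.0
np.save("example.mel.npy", logmel.astype(np.float32))
```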
2 changes: 1 addition & 1 deletion dataset.py
@@ -37,7 +37,7 @@ def __getitem__(self, index):
mels = list()
paths = random.sample(paths, self.n_utterances_per_speaker)
for path in paths:
path = self.root.parent / path
path = self.root / path
mel = np.load(path.with_suffix(".mel.npy"))
pos = random.randint(0, mel.shape[1] - self.n_sample_frames)
mel = mel[:, pos:pos + self.n_sample_frames]
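The one-line change above resolves the relative paths stored in the split JSON against the dataset root itself instead of its parent. For illustration only, with assumed paths:

```
from pathlib import Path

# Illustration with assumed paths, not code from the repo.
root = Path("datasets/buckeye")
entry = "train/s01_01a_000"  # hypothetical metadata entry

print(root.parent / entry)   # datasets/train/s01_01a_000         (old behaviour)
print(root / entry)          # datasets/buckeye/train/s01_01a_000 (new behaviour)
```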
21 changes: 17 additions & 4 deletions encode.py
@@ -17,7 +17,7 @@ def encode_dataset(cfg):
out_dir.mkdir(exist_ok=True, parents=True)

root_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
with open(root_path / "test.json") as file:
with open((root_path / cfg.split).with_suffix(".json")) as file:
metadata = json.load(file)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -32,6 +32,10 @@

encoder.eval()

if cfg.save_embedding:
embedding_path = out_dir / "embedding.npy"
np.save(embedding_path, encoder.codebook.embedding.cpu().numpy())

if cfg.save_auxiliary:
auxiliary = []

@@ -41,7 +45,7 @@ def hook(module, input, output):
encoder.encoder[-1].register_forward_hook(hook)

for _, _, _, path in tqdm(metadata):
path = root_path.parent / path
path = root_path / path
mel = torch.from_numpy(np.load(path.with_suffix(".mel.npy"))).unsqueeze(0).to(device)
with torch.no_grad():
z, c, indices = encoder.encode(mel)
@@ -52,15 +56,24 @@ def hook(module, input, output):
with open(out_path.with_suffix(".txt"), "w") as file:
np.savetxt(file, z, fmt="%.16f")

if cfg.save_indices:
indices_path = out_dir / "indices"
indices_path.mkdir(exist_ok=True, parents=True)
out_path = indices_path / path.stem
indices = indices.squeeze().cpu().numpy()
if not indices.shape==():
with open(out_path.with_suffix(".txt"), "w") as file:
np.savetxt(file, indices, fmt="%d")

if cfg.save_auxiliary:
aux_path = out_dir.parent / "auxiliary_embedding1"
aux_path = out_dir / "auxiliary_embedding1"
aux_path.mkdir(exist_ok=True, parents=True)
out_path = aux_path / path.stem
c = c.squeeze().cpu().numpy()
with open(out_path.with_suffix(".txt"), "w") as file:
np.savetxt(file, c, fmt="%.16f")

aux_path = out_dir.parent / "auxiliary_embedding2"
aux_path = out_dir / "auxiliary_embedding2"
aux_path.mkdir(exist_ok=True, parents=True)
out_path = aux_path / path.stem
aux = auxiliary.pop().squeeze().cpu().numpy()
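With the new `save_embedding` and `save_indices` options, a run writes a single codebook array plus one text file of integer code indices per utterance. A sketch of loading them downstream (the output directory is assumed, matching the README example):

```
import numpy as np
from pathlib import Path

out_dir = Path("outputs/buckeye/val_zs2019")  # assumed, as in the README example

# Codebook written once per run by save_embedding
# (expected shape: n_embeddings x embedding_dim).
codebook = np.load(out_dir / "embedding.npy")

# One file of integer code indices per utterance from save_indices.
for txt in sorted((out_dir / "indices").glob("*.txt")):
    indices = np.loadtxt(txt, dtype=int)
    z = codebook[indices]  # recover the quantised frame sequence
    print(txt.stem, indices.shape, z.shape)
```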
11 changes: 8 additions & 3 deletions preprocess.py
@@ -54,18 +54,23 @@ def process_wav(wav_path, out_path, sr=160000, preemph=0.97, n_fft=2048, n_mels=
@hydra.main(config_path="config/preprocessing.yaml")
def preprocess_dataset(cfg):
in_dir = Path(utils.to_absolute_path(cfg.in_dir))
out_dir = Path(utils.to_absolute_path("datasets")) / str(cfg.dataset.dataset)
out_dir = Path(utils.to_absolute_path("datasets")) / str(cfg.dataset.path)
out_dir.mkdir(parents=True, exist_ok=True)

executor = ProcessPoolExecutor(max_workers=cpu_count())
for split in ["train", "test"]:
for split in ["train", "test", "val"]:
print("Extracting features for {} set".format(split))
futures = []
split_path = out_dir / cfg.dataset.language / split
split_path = out_dir / split
if not split_path.with_suffix(".json").exists():
print("Skipping {} (no json file)".format(split))
continue
with open(split_path.with_suffix(".json")) as file:
metadata = json.load(file)
for in_path, start, duration, out_path in metadata:
wav_path = in_dir / in_path
assert wav_path.with_suffix(".wav").exists(), "'{}' does not exist".format(
wav_path.with_suffix(".wav"))
out_path = out_dir / out_path
out_path.parent.mkdir(parents=True, exist_ok=True)
futures.append(executor.submit(
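For reference, each `<split>.json` is a list of `[in_path, start, duration, out_path]` entries, the same four fields unpacked above and in `encode.py`. A small sketch of inspecting one (the file location follows the code above; the example entry in the comment is hypothetical):

```
import json
from pathlib import Path

# preprocess.py looks for datasets/<cfg.dataset.path>/<split>.json.
split_json = Path("datasets/buckeye/val.json")

with open(split_json) as file:
    metadata = json.load(file)

# Each entry is [in_path, start, duration, out_path], e.g. (hypothetical):
# ["s01/s0101a", 12.34, 0.56, "val/s01_0101a_0123"]
for in_path, start, duration, out_path in metadata[:3]:
    print(in_path, start, duration, out_path)
```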