Hi,
In your paper, you say the "next-utterance classification" task and the "language modeling" task are trained in a multi-task learning setting, and train.py indeed contains a function that loads the PersonaChat dataset as described. However, there is a logical gap I cannot fully close: it looks to me as if the wrong candidates (i.e. the distractor replies) are also built as training inputs for the language modeling task. Can anyone explain this to me?
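To make the question concrete, here is how I understand the combined objective from the paper. This is a minimal sketch in plain PyTorch, not the repo's exact code: the function name `multitask_loss` and the coefficient defaults are my own guesses (the repo exposes them as args.lm_coef / args.mc_coef), and the next-token shift of logits vs. labels is omitted for brevity.

```python
import torch
import torch.nn.functional as F

def multitask_loss(lm_logits, mc_logits, lm_labels, mc_labels,
                   lm_coef=1.0, mc_coef=1.0):
    """Sketch of the two-headed objective (names/defaults are my assumptions).

    lm_logits: (batch, n_candidates, seq_len, vocab) -- LM head over every candidate
    mc_logits: (batch, n_candidates)                 -- classification head scores
    lm_labels: (batch, n_candidates, seq_len)        -- token ids, with -1 on every
               position that should not contribute to the LM loss
    mc_labels: (batch,)                              -- index of the gold candidate
    """
    # Language modeling loss: positions labeled -1 are skipped by cross_entropy,
    # so whatever is masked there gets no LM gradient.
    lm_loss = F.cross_entropy(lm_logits.view(-1, lm_logits.size(-1)),
                              lm_labels.view(-1), ignore_index=-1)
    # Next-utterance classification loss over the candidate set.
    mc_loss = F.cross_entropy(mc_logits, mc_labels)
    return lm_coef * lm_loss + mc_coef * mc_loss
```

And here is the data-loading function from train.py: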
```python
# Excerpt from train.py (context: defaultdict from collections, torch,
# TensorDataset/DataLoader from torch.utils.data, plus the repo helpers
# get_dataset, build_input_from_segments, pad_dataset, SPECIAL_TOKENS, MODEL_INPUTS).
def get_data_loaders(args, tokenizer):
    """ Prepare the dataset for training and evaluation """
    personachat = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for dataset_name, dataset in personachat.items():
        num_candidates = len(dataset[0]["utterances"][0]["candidates"])
        if args.num_candidates > 0 and dataset_name == 'train':
            num_candidates = min(args.num_candidates, num_candidates)
        for dialog in dataset:
            persona = dialog["personality"].copy()
            for _ in range(args.personality_permutations):
                for utterance in dialog["utterances"]:
                    history = utterance["history"][-(2 * args.max_history + 1):]
                    # The gold reply is the LAST candidate; the others are distractors.
                    for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
                        # lm_labels is True only for the gold reply (j == num_candidates - 1),
                        # so only that instance is flagged for language-modeling labels.
                        lm_labels = bool(j == num_candidates - 1)
                        instance = build_input_from_segments(persona, history, candidate, tokenizer, lm_labels)
                        for input_name, input_array in instance.items():
                            datasets[dataset_name][input_name].append(input_array)
                    # The classification target is the index of the gold reply.
                    datasets[dataset_name]["mc_labels"].append(num_candidates - 1)
                    datasets[dataset_name]["n_candidates"] = num_candidates
                persona = [persona[-1]] + persona[:-1]  # permuted personalities

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            if input_name != "mc_labels":
                # Group the flat list of instances into (batch, n_candidates, ...).
                tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
            tensor_datasets[dataset_name].append(tensor)

    logger.info("Build train and validation dataloaders")
    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, shuffle=(not args.distributed))
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, shuffle=False)

    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
    return train_loader, valid_loader, train_sampler, valid_sampler
```
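The crux seems to be what that `lm_labels` flag does downstream. Paraphrasing the relevant part of `build_input_from_segments` from the same file (a sketch from my reading of it, not a verbatim quote): `sequence` is the list of token segments `[persona, history..., reply]`, where the reply is the last segment with its speaker token at position 0.

```python
# Paraphrased sketch of the label-building step inside build_input_from_segments:
instance["lm_labels"] = [-1] * len(instance["input_ids"])  # default: every position ignored
if lm_labels:  # only True for the last candidate, i.e. the gold reply
    # Unmask just the reply tokens; persona, history, and the reply's speaker
    # token stay at -1 and are skipped by cross_entropy(ignore_index=-1).
    instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) \
                            + [-1] + sequence[-1][1:]
```

If I read this correctly, the distractor candidates do pass through the transformer (they are needed for the classification head), but all of their LM label positions are -1, so they should receive no language-modeling gradient. Is that the intended behavior, or is the LM loss really computed on the wrong candidates somewhere?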