有了数据后,我们需要将音频转换为Mel频谱图,并将每个音频样本的字符标签映射为整数标签:
class TextTransform:
    """Map characters to integer labels and back.

    The vocabulary is the apostrophe, the space (spelled ``<SPACE>`` in the
    table below) and the lowercase letters ``a``-``z``, numbered 0-27.
    """

    def __init__(self):
        """Build the char -> index and index -> char lookup tables."""
        char_map_str = """
        ' 0
        <SPACE> 1
        a 2
        b 3
        c 4
        d 5
        e 6
        f 7
        g 8
        h 9
        i 10
        j 11
        k 12
        l 13
        m 14
        n 15
        o 16
        p 17
        q 18
        r 19
        s 20
        t 21
        u 22
        v 23
        w 24
        x 25
        y 26
        z 27
        """
        self.char_map = {}
        self.index_map = {}
        # Each table row is "<token> <index>"; split() also drops the
        # leading indentation of the literal.
        for line in char_map_str.strip().split('\n'):
            ch, index = line.split()
            self.char_map[ch] = int(index)
            self.index_map[int(index)] = ch
        # Decode index 1 directly to a real space instead of the "<SPACE>"
        # placeholder token used in the table.
        self.index_map[1] = ' '

    def text_to_int(self, text):
        """Convert text to a list of integer labels using the character map.

        Args:
            text: String made only of characters in the vocabulary
                (apostrophe, space, lowercase ``a``-``z``).

        Returns:
            A list with one integer label per character of ``text``.

        Raises:
            KeyError: If ``text`` contains a character outside the vocabulary.
        """
        # A literal space is stored in the map under the "<SPACE>" token.
        return [self.char_map['<SPACE>' if c == ' ' else c] for c in text]

    def int_to_text(self, labels):
        """Convert a sequence of integer labels back to text.

        Args:
            labels: Iterable of labels in the range 0-27. Integral floats
                (e.g. values read back from a float tensor) are accepted.

        Returns:
            The decoded string.

        Raises:
            KeyError: If a label is not in the index map.
        """
        decoded = ''.join(self.index_map[int(i)] for i in labels)
        return decoded.replace('<SPACE>', ' ')
# Training front end: Mel spectrogram followed by frequency and time masking.
train_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35),
)

# Validation front end: the same Mel spectrogram without any masking.
# It must be an *instance* (not the class) so it can be called on a waveform;
# the parameters are spelled out so the features match the training front end.
valid_audio_transforms = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000, n_mels=128
)

# Shared character <-> integer mapper; instantiated so its methods are bound.
text_transform = TextTransform()
def data_processing(data, data_type="train"):
    """Collate a batch of samples into padded spectrograms and label tensors.

    Args:
        data: Iterable of 6-tuples whose first element is the waveform and
            whose third element is the transcript string; the other fields
            are ignored.
        data_type: ``"train"`` uses ``train_audio_transforms``; any other
            value uses ``valid_audio_transforms``.

    Returns:
        A tuple ``(spectrograms, labels, input_lengths, label_lengths)``:
        ``spectrograms`` and ``labels`` are zero-padded batch tensors, and the
        two length lists hold one integer per sample.
    """
    # The transform does not change within a batch, so pick it once.
    if data_type == 'train':
        audio_transform = train_audio_transforms
    else:
        audio_transform = valid_audio_transforms

    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []
    for (waveform, _, utterance, _, _, _) in data:
        # assumes waveform is (1, samples), giving spec of (time, n_mels)
        # after squeeze/transpose - TODO confirm against the dataset
        spec = audio_transform(waveform).squeeze(0).transpose(0, 1)
        spectrograms.append(spec)
        # The character map only contains lowercase letters.
        label = torch.Tensor(text_transform.text_to_int(utterance.lower()))
        labels.append(label)
        # NOTE(review): the frame count is halved, presumably because the
        # downstream model downsamples the time axis by 2 - confirm.
        input_lengths.append(spec.shape[0] // 2)
        label_lengths.append(len(label))

    # Pad along time, add a channel axis, then swap the last two axes so the
    # time axis ends up last.
    spectrograms = (
        nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
        .unsqueeze(1)
        .transpose(2, 3)
    )
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)
    return spectrograms, labels, input_lengths, label_lengths
在本教程中,我们使用“贪心”解码方法将模型的输出处理为字符,这些字符可组合成文本。“贪心”解码器接收模型输出,该输出是字符的 softmax 概率矩阵,对于每个时间步长(频谱图帧),它选择概率最高的标签。如果标签是空白标签,则将其从最终的文本中删除。
def GreedyDecoder(output, labels, label_lengths, blank_label=28, collapse_repeated=True):
    """Greedily decode model output into text, alongside the target text.

    For every sample the highest-scoring label is taken at each time step;
    blank labels are dropped and, optionally, a label equal to the one in
    the previous time step is skipped.

    Args:
        output: Score tensor indexed as (batch, time, classes); the argmax
            is taken over dimension 2.
        labels: Per-sample padded label sequences, indexable by sample.
        label_lengths: Per-sample count of valid (unpadded) labels.
        blank_label: Index of the blank label. The default, 28, is the first
            index outside the 0-27 character map.
        collapse_repeated: If true, skip a label that equals the label of
            the immediately preceding time step.

    Returns:
        A tuple ``(decodes, targets)`` of two lists of strings, one entry per
        sample: the decoded predictions and the reference transcripts.
    """
    arg_maxes = torch.argmax(output, dim=2)
    decodes = []
    targets = []
    for i, args in enumerate(arg_maxes):
        # Cut the padding off this sample's target before converting it.
        target_ids = [int(v) for v in labels[i][:label_lengths[i]].tolist()]
        targets.append(text_transform.int_to_text(target_ids))

        # Work on plain Python ints rather than 0-d tensors.
        frame_labels = args.tolist()
        decode = []
        for j, index in enumerate(frame_labels):
            if index == blank_label:
                continue
            if collapse_repeated and j != 0 and index == frame_labels[j - 1]:
                continue
            decode.append(index)
        decodes.append(text_transform.int_to_text(decode))
    return decodes, targets