Vocal Tract Length Normalization (VTLN)

Calculate the Vocal Tract Length

Using Praat

After installing Praat together with the VocalToolkit plugin (https://www.praatvocaltoolkit.com/calculate-vocal-tract-length.html), we can calculate the vocal tract length directly from Praat.


We can open the Praat main panel and hit the Process button:


Here is the result. The formula comes from Johnson, Keith. Acoustic and Auditory Phonetics. 2nd ed. Malden, Mass: Blackwell Pub, 2003, p. 96:

$$
F_{n} = \frac{(2n - 1)c}{4L}, \quad c = 35{,}000\ \mathrm{cm/sec}
$$

Here, $$F_{4}$$ is 3369.558 Hz, so the vocal tract length $$L$$ can be recovered by solving for it:
$$
\begin{aligned}
3369.558\ \mathrm{Hz} &= \frac{(2 \cdot 4 - 1) \cdot 35{,}000\ \mathrm{cm/sec}}{4L} \\
3369.558 \cdot 4L &= 7 \cdot 35{,}000\ \mathrm{cm} \\
L &= \frac{7 \cdot 35{,}000\ \mathrm{cm}}{3369.558 \cdot 4} \\
L &= \frac{245{,}000\ \mathrm{cm}}{13{,}478.232} \\
L &\approx 18.177\ \mathrm{cm}
\end{aligned}
$$
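
As a quick sanity check, the same arithmetic can be done in a few lines of Python (a minimal sketch; the value 3369.558 Hz is the $$F_{4}$$ reading from the Praat result above):

c = 35000.0     # speed of sound in cm/sec
f4 = 3369.558   # measured F4 in Hz
n = 4           # formant number

# F_n = (2n - 1) * c / (4L)  =>  L = (2n - 1) * c / (4 * F_n)
L = (2 * n - 1) * c / (4 * f4)
print(f"L = {L:.3f} cm")    # -> L = 18.177 cm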

Python Code Work

from __future__ import unicode_literals
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import lfilter
from scipy.io import wavfile

fs, data = wavfile.read("./1_u_sox.wav")
x = data.copy()
x = x / 32767            # normalize 16-bit PCM samples to [-1, 1]

# Pre-emphasis filter to boost the high frequencies
u = lfilter([1, -0.99], [1], x)


wlen2 = len(u) // 2      # keep only half of the spectrum: the FFT of a
                         # real signal is symmetric around Nyquist

fft_val = np.fft.fft(u)             # complex FFT values (real and imaginary parts)

abs_fft = np.abs(fft_val)           # magnitude: sqrt(re**2 + im**2)

nyquist_fft = abs_fft[:wlen2]       # first half of the magnitude spectrum

# Take the log, since our auditory system is nonlinear and roughly logarithmic
log_fft = np.log(np.abs(np.fft.fft(u)[:wlen2]))

# Inverse FFT of the log spectrum gives the cepstrum
Cepst = np.fft.ifft(log_fft)

cepst = np.zeros(wlen2, dtype=complex)   # wlen2 complex zeros

# Lifter: keep only the first cepstL cepstral coefficients
cepstL = 30

cepst[:cepstL] = Cepst[:cepstL]
cepst[-cepstL + 1:] = Cepst[-cepstL + 1:]


# FFT of the liftered cepstrum gives the smoothed spectral envelope
spec = np.real(np.fft.fft(cepst))



def local_maxium(x):
    """Return the local maxima of x and their indices."""
    d = np.diff(x)
    l_d = len(d)
    maxium = []
    loc = []
    for i in range(l_d - 1):
        if d[i] > 0 and d[i + 1] <= 0:
            maxium.append(x[i + 1])
            loc.append(i + 1)
    return maxium, loc




val, loc = local_maxium(spec)   # peaks of the envelope are the formant candidates

#print("################## This is the val #################")
#for i in val:
#    print(i)
#print("################## This is the location #################")
#for i in loc:
#    print(i)


wlen = len(u)
wlen2 = wlen // 2
freq = [i * fs / wlen for i in range(wlen2)]   # frequency axis in Hz





color_spectrum = "#1f165b"
color_envelope = "#141414"
color_text_label = "#f82912"




plt.plot(freq, log_fft, color=color_spectrum)   # raw log-magnitude spectrum
#plt.title('Spectrum')
#plt.savefig('spectrum.png', bbox_inches='tight', dpi=300)


#plt.rcParams.update({
# "text.usetex": True,
# "font.family": "serif"
#})

plt.rcParams['text.usetex'] = True


plt.plot(freq, spec, color=color_envelope)      # smoothed cepstral envelope
plt.title('$Cepstrum_{Formants}$')
plt.legend(("spectrum", "envelope"),
           shadow=True, loc=(1.05, 0.38), handlelength=1.5, fontsize=16)

plt.xlabel("$Frequency$", color="C0", fontsize=20)
plt.ylabel("$dB$", color="C0", fontsize=20)



# Extract only the first four formants

formant_list = []
for i in range(4):
    plt.plot([freq[loc[i]], freq[loc[i]]], [np.min(spec), spec[loc[i]]], '-.k')
    plt.text(freq[loc[i]], spec[loc[i]], '$F_{}={}$'.format(i + 1, int(freq[loc[i]])),
             color="green", horizontalalignment='center', verticalalignment='center', fontsize=12)
    plt.text(28000, i + 1, '$F_{}={}$'.format(i + 1, int(freq[loc[i]])),
             color=color_text_label, horizontalalignment='center', verticalalignment='center', fontsize=12)
    formant_list.append(freq[loc[i]])



# Average the length estimates from the first four formants:
# L_n = (2n - 1) * c / (4 * F_n), with c = 35,000 cm/sec
VTL = ((1 * (35000 / (4 * formant_list[0])))
       + (3 * (35000 / (4 * formant_list[1])))
       + (5 * (35000 / (4 * formant_list[2])))
       + (7 * (35000 / (4 * formant_list[3])))) / 4





plt.text(28000, -4, '$VTL={}cm$'.format(round(VTL)), color=color_text_label,
         horizontalalignment='center', verticalalignment='center', fontsize=12)
plt.savefig('Cepstrum_Formants_demo.png', bbox_inches='tight', dpi=600)

This will be the final visualization output:

[Figure: Cepstrum_Formants_demo.png]
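
With a vocal tract length in hand, one common way to use it for normalization is to warp the frequency axis by the ratio of the speaker's length to a reference length. Here is a minimal sketch; the reference length of 17.5 cm and the simple linear warp are assumptions for illustration, not part of the original script:

L_REF = 17.5                 # assumed reference vocal tract length in cm
alpha = VTL / L_REF          # warp factor: a longer tract gives alpha > 1

def warp_frequency(f, alpha):
    # Simple linear VTLN warp: scale each frequency by the warp factor.
    # Real systems often use a piecewise-linear or bilinear warp instead.
    return alpha * f

# A longer tract lowers the formants, so alpha > 1 raises them back
# toward the reference speaker's frequency range.
warped_formants = [warp_frequency(f, alpha) for f in formant_list]
print(warped_formants)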

Hamming Windowing

This adds a Hamming window to the signal before taking the FFT:

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import lfilter
from scipy.io import wavfile
#import os
import scipy.signal as signal

file_name = "./schwa_sample_sox.wav"

def read_file(file_name):
    fs, data = wavfile.read(file_name)
    x = data.copy()
    x = x / 32767                       # normalize 16-bit PCM to [-1, 1]
    u = lfilter([1, -0.99], [1], x)     # pre-emphasis
    wlen2 = len(u) // 2
    freq = [i * fs / len(u) for i in range(wlen2)]
    return u, wlen2, freq




fs, x = wavfile.read(file_name)

N = len(x)
wlen = N // 2

# Build a Hamming window of the same length as the signal
w = signal.get_window('hamming', N)

# Window the signal before the FFT to reduce spectral leakage
X = np.fft.fft(x * w)[:wlen]
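
To see what the window buys us, we can plot the windowed and unwindowed magnitude spectra side by side. This is a small sketch building on the variables above; the 1e-12 offset is only there to avoid log(0):

freq_axis = [i * fs / N for i in range(wlen)]

X_rect = np.fft.fft(x)[:wlen]            # no window (rectangular)

plt.plot(freq_axis, np.log(np.abs(X_rect) + 1e-12), label='rectangular')
plt.plot(freq_axis, np.log(np.abs(X) + 1e-12), label='hamming')
plt.xlabel('Frequency (Hz)')
plt.ylabel('log magnitude')
plt.legend()
plt.show()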

IFFT Work

This is an IFFT demo; the figure is saved as an EPS file.

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import lfilter
from scipy.io import wavfile


fs, data = wavfile.read("./schwa_sample_sox.wav")
x = data.copy()
x = x / 32767                      # normalize 16-bit PCM to [-1, 1]


wlen = len(x)
wlen2 = len(x) // 2

# Time axis for the signal
length = data.shape[0] / fs
t = np.linspace(0., length, data.shape[0])

freq = [i * fs / wlen for i in range(wlen2)]

# Do a Fourier transform on the signal

tx = np.fft.fft(x)

# Do an inverse Fourier transform; this should reconstruct the original signal

itx = np.fft.ifft(tx)


# Plot the reconstructed signal (take the real part; the imaginary
# part is only floating-point noise for a real input)

plt.plot(t, itx.real)


plt.xlabel('Time')

plt.ylabel('Amplitude')

plt.grid(True)

plt.savefig('scatter.eps', dpi=600, format='eps')

[Figure: IFFT demo output (scatter.eps)]
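
A quick numerical check that the round trip really is lossless (a minimal sketch using the arrays from the script above):

# The IFFT of the FFT should match the input up to floating-point error
print(np.allclose(x, np.fft.ifft(np.fft.fft(x)).real))   # True

# The residual imaginary part is tiny
print(np.max(np.abs(np.fft.ifft(np.fft.fft(x)).imag)))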

Cut Audio Chunks

from pydub import AudioSegment
import os
from pydub.silence import split_on_silence

if not os.path.isdir("splitaudio"):
    os.mkdir("splitaudio")

audio = AudioSegment.from_file("Wk4 vowels_BOR.wav")
lengthaudio = len(audio)
print("Length of Audio File (ms):", lengthaudio)


chunks = split_on_silence(
    # Use the loaded audio.
    audio,
    # A silent stretch must be at least 80 ms long to count as a split point.
    min_silence_len = 80,
    # Consider a chunk silent if it's quieter than -18 dBFS.
    # (You may want to adjust this parameter.)
    silence_thresh = -18
)


# Process each chunk with your parameters
for i, chunk in enumerate(chunks):
    # Create a silence chunk that's 0.5 seconds (500 ms) long for padding.
    silence_chunk = AudioSegment.silent(duration=500)

    # Add the padding to the beginning and end of the chunk.
    audio_chunk = silence_chunk + chunk + silence_chunk

    # Normalize the entire chunk.
    # normalized_chunk = match_target_amplitude(audio_chunk, -20.0)

    filename = f'splitaudio/chunk{i}.wav'
    audio_chunk.export(filename, format="wav")
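
The commented-out normalization step refers to a helper that isn't defined in the snippet; a common pydub idiom for it looks like this (a sketch; match_target_amplitude is not part of pydub itself):

def match_target_amplitude(sound, target_dBFS):
    # Shift the gain so the chunk's average loudness hits target_dBFS.
    change_in_dBFS = target_dBFS - sound.dBFS
    return sound.apply_gain(change_in_dBFS)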

Source Filter Theory

Some references on source-filter theory:

https://slideplayer.com/slide/8271737/

https://www2.ims.uni-stuttgart.de/EGG/frmst1.htm


https://www.phon.ucl.ac.uk/courses/spsci/acoustics/week2-3.pdf


Vocal Tract Length Normalization (VTLN)
http://xiaos.site/2022/09/12/Vocal-Tract-Length-Normalization-VTLN/
Author
Xiao Zhang
Posted on
September 12, 2022
Licensed under