Vocal Tract Length Normalization (VTLN)

Calculate the Vocal Tract Length

Using Praat

After installing Praat together with the VocalToolkit plugin (https://www.praatvocaltoolkit.com/calculate-vocal-tract-length.html), we can calculate the vocal tract length directly from Praat.


We can open the Praat main panel and hit the Process button:


Here is the result. The formula comes from Johnson, Keith. Acoustic and Auditory Phonetics. 2nd ed. Malden, Mass: Blackwell Pub, 2003, p. 96:

$$
F_{n} = \frac{(2n - 1)c}{4L}, \quad c = 35{,}000\ \mathrm{cm/sec}
$$

Here, $$F_{4}$$ is 3369.558 Hz, so the vocal tract length $$L$$ can be recovered by solving for it:
$$
\begin{aligned}
3369.558\ \mathrm{Hz} &= \frac{(2 \cdot 4 - 1) \cdot 35{,}000\ \mathrm{cm/sec}}{4L} \\
3369.558 \cdot 4L &= 7 \cdot 35{,}000\ \mathrm{cm} \\
L &= \frac{7 \cdot 35{,}000\ \mathrm{cm}}{3369.558 \cdot 4} \\
L &= \frac{245{,}000\ \mathrm{cm}}{13{,}478.232} \\
L &\approx 18.177\ \mathrm{cm}
\end{aligned}
$$
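
As a quick sanity check, the same arithmetic can be done in a few lines of Python (a minimal sketch; the value 3369.558 Hz is the $$F_{4}$$ reading from the Praat result above):

c = 35000.0     # speed of sound in cm/sec
f4 = 3369.558   # measured F4 in Hz
n = 4           # formant number

# F_n = (2n - 1) * c / (4L)  =>  L = (2n - 1) * c / (4 * F_n)
L = (2 * n - 1) * c / (4 * f4)
print(f"L = {L:.3f} cm")    # -> L = 18.177 cm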

Python Code Work

from __future__ import unicode_literals
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import lfilter
from scipy.io import wavfile

fs, data = wavfile.read("./1_u_sox.wav")
x = data.copy()
x = x / 32767            # normalize 16-bit PCM samples to [-1, 1]

# Pre-emphasis filter to boost the high frequencies
u = lfilter([1, -0.99], [1], x)


wlen2 = len(u) // 2      # keep only half of the spectrum: the FFT of a
                         # real signal is symmetric around Nyquist

fft_val = np.fft.fft(u)             # complex FFT values (real and imaginary parts)

abs_fft = np.abs(fft_val)           # magnitude: sqrt(re**2 + im**2)

nyquist_fft = abs_fft[:wlen2]       # first half of the magnitude spectrum

# Take the log, since our auditory system is nonlinear and roughly logarithmic
log_fft = np.log(np.abs(np.fft.fft(u)[:wlen2]))

# Inverse FFT of the log spectrum gives the cepstrum
Cepst = np.fft.ifft(log_fft)

cepst = np.zeros(wlen2, dtype=complex)   # wlen2 complex zeros

# Lifter: keep only the first cepstL cepstral coefficients
cepstL = 30

cepst[:cepstL] = Cepst[:cepstL]
cepst[-cepstL + 1:] = Cepst[-cepstL + 1:]


# FFT of the liftered cepstrum gives the smoothed spectral envelope
spec = np.real(np.fft.fft(cepst))



def local_maxium(x):
    """Return the local maxima of x and their indices."""
    d = np.diff(x)
    l_d = len(d)
    maxium = []
    loc = []
    for i in range(l_d - 1):
        if d[i] > 0 and d[i + 1] <= 0:
            maxium.append(x[i + 1])
            loc.append(i + 1)
    return maxium, loc




val, loc = local_maxium(spec)   # peaks of the envelope are the formant candidates

#print("################## This is the val #################")
#for i in val:
#    print(i)
#print("################## This is the location #################")
#for i in loc:
#    print(i)


wlen = len(u)
wlen2 = wlen // 2
freq = [i * fs / wlen for i in range(wlen2)]   # frequency axis in Hz





color_spectrum = "#1f165b"
color_envelope = "#141414"
color_text_label = "#f82912"




plt.plot(freq, log_fft, color=color_spectrum)   # raw log-magnitude spectrum
#plt.title('Spectrum')
#plt.savefig('spectrum.png', bbox_inches='tight', dpi=300)


#plt.rcParams.update({
# "text.usetex": True,
# "font.family": "serif"
#})

plt.rcParams['text.usetex'] = True


plt.plot(freq, spec, color=color_envelope)      # smoothed cepstral envelope
plt.title('$Cepstrum_{Formants}$')
plt.legend(("spectrum", "envelope"),
           shadow=True, loc=(1.05, 0.38), handlelength=1.5, fontsize=16)

plt.xlabel("$Frequency$", color="C0", fontsize=20)
plt.ylabel("$dB$", color="C0", fontsize=20)



# Extract only the first four formants

formant_list = []
for i in range(4):
    plt.plot([freq[loc[i]], freq[loc[i]]], [np.min(spec), spec[loc[i]]], '-.k')
    plt.text(freq[loc[i]], spec[loc[i]], '$F_{}={}$'.format(i + 1, int(freq[loc[i]])),
             color="green", horizontalalignment='center', verticalalignment='center', fontsize=12)
    plt.text(28000, i + 1, '$F_{}={}$'.format(i + 1, int(freq[loc[i]])),
             color=color_text_label, horizontalalignment='center', verticalalignment='center', fontsize=12)
    formant_list.append(freq[loc[i]])



# Average the length estimates from the first four formants:
# L_n = (2n - 1) * c / (4 * F_n), with c = 35,000 cm/sec
VTL = ((1 * (35000 / (4 * formant_list[0])))
       + (3 * (35000 / (4 * formant_list[1])))
       + (5 * (35000 / (4 * formant_list[2])))
       + (7 * (35000 / (4 * formant_list[3])))) / 4





plt.text(28000, -4, '$VTL={}cm$'.format(round(VTL)), color=color_text_label,
         horizontalalignment='center', verticalalignment='center', fontsize=12)
plt.savefig('Cepstrum_Formants_demo.png', bbox_inches='tight', dpi=600)

This will be the final visualization output:

[Figure: Cepstrum_Formants_demo.png]
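
With a vocal tract length in hand, one common way to use it for normalization is to warp the frequency axis by the ratio of the speaker's length to a reference length. Here is a minimal sketch; the reference length of 17.5 cm and the simple linear warp are assumptions for illustration, not part of the original script:

L_REF = 17.5                 # assumed reference vocal tract length in cm
alpha = VTL / L_REF          # warp factor: a longer tract gives alpha > 1

def warp_frequency(f, alpha):
    # Simple linear VTLN warp: scale each frequency by the warp factor.
    # Real systems often use a piecewise-linear or bilinear warp instead.
    return alpha * f

# A longer tract lowers the formants, so alpha > 1 raises them back
# toward the reference speaker's frequency range.
warped_formants = [warp_frequency(f, alpha) for f in formant_list]
print(warped_formants)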

Hamming Windowing

This adds a Hamming window to the signal before taking the FFT:

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import lfilter
from scipy.io import wavfile
#import os
import scipy.signal as signal

file_name = "./schwa_sample_sox.wav"

def read_file(file_name):
    fs, data = wavfile.read(file_name)
    x = data.copy()
    x = x / 32767                       # normalize 16-bit PCM to [-1, 1]
    u = lfilter([1, -0.99], [1], x)     # pre-emphasis
    wlen2 = len(u) // 2
    freq = [i * fs / len(u) for i in range(wlen2)]
    return u, wlen2, freq




fs, x = wavfile.read(file_name)

N = len(x)
wlen = N // 2

# Build a Hamming window of the same length as the signal
w = signal.get_window('hamming', N)

# Window the signal before the FFT to reduce spectral leakage
X = np.fft.fft(x * w)[:wlen]
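
To see what the window buys us, we can plot the windowed and unwindowed magnitude spectra side by side. This is a small sketch building on the variables above; the 1e-12 offset is only there to avoid log(0):

freq_axis = [i * fs / N for i in range(wlen)]

X_rect = np.fft.fft(x)[:wlen]            # no window (rectangular)

plt.plot(freq_axis, np.log(np.abs(X_rect) + 1e-12), label='rectangular')
plt.plot(freq_axis, np.log(np.abs(X) + 1e-12), label='hamming')
plt.xlabel('Frequency (Hz)')
plt.ylabel('log magnitude')
plt.legend()
plt.show()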

IFFT Work

This is an IFFT demo; the figure is saved as an EPS file.

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import lfilter
from scipy.io import wavfile


fs, data = wavfile.read("./schwa_sample_sox.wav")
x = data.copy()
x = x / 32767                      # normalize 16-bit PCM to [-1, 1]


wlen = len(x)
wlen2 = len(x) // 2

# Time axis for the signal
length = data.shape[0] / fs
t = np.linspace(0., length, data.shape[0])

freq = [i * fs / wlen for i in range(wlen2)]

# Do a Fourier transform on the signal

tx = np.fft.fft(x)

# Do an inverse Fourier transform; this should reconstruct the original signal

itx = np.fft.ifft(tx)


# Plot the reconstructed signal (take the real part; the imaginary
# part is only floating-point noise for a real input)

plt.plot(t, itx.real)


plt.xlabel('Time')

plt.ylabel('Amplitude')

plt.grid(True)

plt.savefig('scatter.eps', dpi=600, format='eps')

[Figure: IFFT demo output (scatter.eps)]
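
A quick numerical check that the round trip really is lossless (a minimal sketch using the arrays from the script above):

# The IFFT of the FFT should match the input up to floating-point error
print(np.allclose(x, np.fft.ifft(np.fft.fft(x)).real))   # True

# The residual imaginary part is tiny
print(np.max(np.abs(np.fft.ifft(np.fft.fft(x)).imag)))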

Cut Audio Chunks

from pydub import AudioSegment
import os
from pydub.silence import split_on_silence

if not os.path.isdir("splitaudio"):
    os.mkdir("splitaudio")

audio = AudioSegment.from_file("Wk4 vowels_BOR.wav")
lengthaudio = len(audio)
print("Length of Audio File (ms):", lengthaudio)


chunks = split_on_silence(
    # Use the loaded audio.
    audio,
    # A silent stretch must be at least 80 ms long to count as a split point.
    min_silence_len = 80,
    # Consider a chunk silent if it's quieter than -18 dBFS.
    # (You may want to adjust this parameter.)
    silence_thresh = -18
)


# Process each chunk with your parameters
for i, chunk in enumerate(chunks):
    # Create a silence chunk that's 0.5 seconds (500 ms) long for padding.
    silence_chunk = AudioSegment.silent(duration=500)

    # Add the padding to the beginning and end of the chunk.
    audio_chunk = silence_chunk + chunk + silence_chunk

    # Normalize the entire chunk.
    # normalized_chunk = match_target_amplitude(audio_chunk, -20.0)

    filename = f'splitaudio/chunk{i}.wav'
    audio_chunk.export(filename, format="wav")
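
The commented-out normalization step refers to a helper that isn't defined in the snippet; a common pydub idiom for it looks like this (a sketch; match_target_amplitude is not part of pydub itself):

def match_target_amplitude(sound, target_dBFS):
    # Shift the gain so the chunk's average loudness hits target_dBFS.
    change_in_dBFS = target_dBFS - sound.dBFS
    return sound.apply_gain(change_in_dBFS)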

Source Filter Theory

Some references on source-filter theory:

https://slideplayer.com/slide/8271737/

https://www2.ims.uni-stuttgart.de/EGG/frmst1.htm


https://www.phon.ucl.ac.uk/courses/spsci/acoustics/week2-3.pdf


Vocal Tract Length Normalization (VTLN)
http://xiaos.site/2022/09/12/Vocal-Tract-Length-Normalization-VTLN/
Author
Xiao Zhang
Posted on
September 12, 2022
Licensed under