Contents
The SMAC front end
%The SMAC speech feature extraction front end % [SMACS, FB, SM1, SM1C] = SMAC(S, FS, FB, NC, CMS, WL, FP, PR) % Input parameters: % S input speech signal % FS sampling frequency (dafault 8000) % FB filter bank parameters: [NumberOfBands, FFTLength, Bandwidth] % or an [NumberOfBands x FFTLength/2] matrix, (dafault [12 1024 4]) % NC number of augmenting cepstral coefficients (dafault 2) % CMS ceptral mean subtraction (only on CCs) (dafault 1) % WL length of window in sec (dafault 0.025) % FP frame period in sec (dafault 0.010) % PR preemphasis coefficients (dafault 0.97) % % Output values: % SMACS the SMAC feature vector matrix [NumberOfBands x NumberOfFrames] % FB the filterbank matrix (can also be used in subsequent calls) % SM1 first spectral moment matrix % SM1C first central spectral moment (the SM part of SMAC) % % Examples: % [s fs] = wavread('input.wav'); % [smacs, FB, sm1] = smac(s,fs); % extract spectral moments % figure; specgram(s,1024,fs,100,80); % plot spectrum % hold; plot(0:0.01:0.01*(size(sm1,2)-1), sm1'); % plot spectral moment % % % batch feature extraction from a list of input files and target files % [wavs feats] = textread('target.scp', '%s%s'); % FB = 12; NC = 2; % the length of the final vector is FB+NC = 14 % for fi=1:length(wavs) % [s fs] = wavread(wavs{fi}); disp(wavs{fi}); % read wav file % s = resample(s,8000,fs); % resample at 8 kHz % [smacs, FB] = smac(s,8000,FB,NC); % keep filter bank in FB % htkwrite(feats{fi}, smacs, 100000, 6); % save as HTK MFCC kind % end % %For more details see: %P. Tsiakoulis, A. Potamianos, and D. Dimitriadis, %Spectral moment features augmented by low order cepstral coefficients for robust ASR, %IEEE Signal Processing Letters, June 2010. % %P. Tsiakoulis, Sep. 2009, (modified June 2012) function [smacs, FB, sm1, sm1c, sm0] = smac(S, FS, FB, NC, CMS, WL, FP, PR) % default paramters count = 2; if nargin<count, FS = 8000; end; count = count + 1; if nargin<count, FB = [12 1024 4]; end; count = count + 1; if nargin<count, NC = 2; end; count = count + 1; if nargin<count, CMS = 1; end; count = count + 1; if nargin<count, WL = 0.025; end; count = count + 1; if nargin<count, FP = 0.010; end; count = count + 1; if nargin<count, PR = 0.97; end; count = count + 1; % initilize filterbank if length(FB)>3 [NB FL] = size(FB); FL = 2*FL; else NB = FB(1); if length(FB)>=2, FL = FB(2); else FL = 1024; end if length(FB)>=3, BM = FB(3); else BM = 4; end FB = gaborfb(FS, NB, FL, BM); end [m mi] = max(FB,[],2); cfreq = FS*mi/FL; % init W = round(WL*FS); F = round(FP*FS); linsp = linspace(0, FS/2, FL/2); % normalize S = S/max(S); % make vector S = reshape(S, [1 length(S)]); % Preemphasize here if PR>0, S(2:end) = S(2:end) - PR*S(1:end-1); end NFrames = length(1:F:(length(S)-W)); sm0 = zeros(NB, NFrames); sm1 = zeros(NB, NFrames); sm1c = zeros(NB, NFrames); % frame analysis for fri = 1:NFrames fridx = (F*(fri-1)+1):(F*(fri-1)+W); zz = abs(fft(S(fridx), FL)); for g=1:NB z2 = zz(1:FL/2).*FB(g,:); z2 = z2.^2; sz2 = sum(z2); sm0(g,fri) = sz2; sm1(g,fri) = sum((linsp).*z2)/sz2; sm1c(g,fri) = sm1(g,fri) - cfreq(g); end end %sm1 = 2*pi*sm1/FS; %sm1c = 2*pi*sm1c/FS; % cep sm0 = 10*log10(sm0); sm0(isnan(sm0) | isinf(sm0)) = 0; sm0 = mfb2cep(sm0', NC, 1)'; %cms if CMS, m_sm0 = mean(sm0, 2); sm0 = sm0 - repmat(m_sm0, 1, size(sm0,2)); end % smacs if NC > 0 smacs = [sm1c; sm0(end:-1:1,:)]; else smacs = sm1c; end smacs(isnan(smacs) | isinf(smacs)) = 0;
Helper functions
% gabor.m % GABOR Gabor Filter % Creates a Gabor filter described by F(frequency), A(bandwith) and S % The impulse resonse is truncated to N = (4/beta) + 1 points % (each side) wher beta = A/S [ S is the sampling frequency ]. % The frequency response is sampled at 512 points (range [0,pi]). % corresponding to 1024 in the [0,2*pi] range % % % [h, H] = GABOR(F, A, S, N) % %h impulse response exp(-[(A/S)*t]^2)*cos(2*pi*F/S*t) %H frequency response % sqrt(pi)/(2*A/S) { exp (-[(f-F)*pi/A]^2) + exp(...) } %F,A the center frquency (Hz) and the bandwidth parameter (Hz) %S the sampling frequency % A Potamianos, 8 Mar 94, GaTech % P Tsiakoulis, 3 Jul 09, NTUA -> added nb function [h, H] = gabor(fr, bw, sfr, nb) if nargin<4, nb = 512; end b = bw / sfr; w_c = 2*pi*fr/sfr; N = ceil((4/b) + 1) ; n = -N:1:N; h = exp(-(b*n).^2).*cos(w_c*n) ; n = linspace(0,pi,nb); H = sqrt(pi)/(2*b)*(exp(-((n-w_c)/(2*b)).^2)+exp(-((n+w_c)/(2*b)).^2)); %n = linspace(0,sfr/2,512); %H = sqrt(pi)/(2*bw/sfr)*(exp(-((n-fr)*pi/bw).^2)+exp(-((n+fr)*pi/bw).^2)); % gaborfb.m % P Tsiakoulis, 3 Jul 09 function [H, F, h] = gaborfb(FS, NB, FL, BM) F = zeros(1, NB); f_mel_max = log10(1+(FS/2)/700); % mel warped sfr/2 F(1, 1:NB)=(10.^(f_mel_max*(1:NB)/(NB+1))-1)*700; bwtmp = []; bwtmp(1, 2:NB+1) = F; bwtmp = diff(bwtmp, 1, 2); % filter spacing bw = BM*bwtmp; H = zeros(NB, FL/2); h = cell(NB); for g=1:NB [hg, Hg] = gabor(F(g),bw(g),FS, FL/2); H(g,:) = Hg; h{g} = hg; end % mfb2cep.m % MFB2CEP Tranform from spectral envelope to cepstrum domain [potam] % % USAGE: c = mfb2cep(m,nc) % % where c(t,bin) is the cepstrum coefficients time-frequency representation % (bin=[1...], i.e., NO c[0]) and m(t,f) is the (Mel) spectral envelope % time-frequency representation (nc is the number of Cep coefficients % INCLUDING c[0] (default: 13)) % % see also: LOADCEP, LOADMFB, CEP2MFB % Alexandros Potamianos, AT&T Bell Labs, Dec 11, 1995 function c = mfb2cep(m,nc,c0) %%%%% I/O %%%%% if (nargin < 1) help mfb2cep;return; end if (nargin < 2) nc = 13; end if (nargin < 3) c0 = 0; end nf = size(m,2); N = size(m,1); if (nc>nf) disp(['Number of cepstrum coefficients reduced to ',num2str(nf)]); nc = nf-1; end; %%%%% TRANSFORM %%%%% %%%% copy from ricks icostrans() %%%%% init w = pi/nf; tmp = ones(nf,nc); for i = 1:nc-1 tmp(:,i+1) = (1/nf)*cos(i*[0.5:nf-0.5]*w)'; end if c0 c = zeros(N,nc); else c = zeros(N,nc-1); end %%%% do the deed for i = 1:N tt = (tmp'*m(i,:)'); %%%% equivalent implementation with FFT %%%% (almost the same as above apart from the half-sample computation) %tt = ifft([0 m(i,:) m(i,(nf-1):-1:1)]); if c0 c(i,:) = real(tt(1:nc)'); %%%% keep c0 and truncate else c(i,:) = real(tt(2:nc)'); %%%% throw away c0 and truncate end end