THE AUDITORY MODELING TOOLBOX

This documentation page applies to an outdated major AMT version. We show it for archival purposes only.
Click here for the documentation menu and here to download the latest AMT (1.6.0).

View the help

Go to function

JOERGENSEN2011 - the speech-based envelope power spectrum model

Program code:

function output = joergensen2011(x,y,fs_input,IO_param)
%JOERGENSEN2011  the speech-based envelope power spectrum model
%   Usage: output = joergensen2011(x,y,fs_input,IO_param)
%
%   output = JOERGENSEN2011(x,y,fs_input,IO_param) returns the output of
%   signal-to-noise envelope-power (SNRenv) ratio using the 
%   multi-resolution speech-based envelope spectrum model (mr-sEPSM)
%   described in Joergensen et al. (2013)
% 
%   Input parameters:
%     x:           noisy speech mixture 
%     y:           noise alone
%     fs:          sample rate in Hz
%     IO_param:    (optional) vector with parameters for the ideal observer 
%                  that converts the SNRenv to probability of correct, assuming a
%                  given speech material. It contains four parameters of the ideal observer 
%                  formatted as [k q m sigma_s].
%
%   Output parameters:
%     output.SNRenv:     The SNRenv
%     output.P_correct:  The probability of correct given the
%                        SNRenv. This field is only included if IO_param is specified.
%                        Its calculation requires the Statistics ToolBox. 
%
%   The model consists of the following stages:
%
%   1) A gammatone bandpass filterbank to simulate the auditory filters
%
%   2) An envelope extraction stage via the Hilbert Transform
%
%   3) A modulation filterbank
%  
%   4) Computation of the long-term envelope power (*output.SNRenv*)
%
%   5) A decision mechanism based on a statistically ideal observer
%   (*output.P_correct*)
%
%   See also: joergensen2013 demo_joergensen2013
%
%   References:
%     S. Joergensen and T. Dau. Predicting speech intelligibility based on
%     the signal-to-noise envelope power ratio after modulation-frequency
%     selective processing. J. Acoust. Soc. Am., 130(3):1475-1487, 2011.
%     
%
%   Url: http://amtoolbox.sourceforge.net/amt-0.9.9/doc/models/joergensen2011.php

% Copyright (C) 2009-2015 Piotr Majdak and the AMT team.
% This file is part of Auditory Modeling Toolbox (AMT) version 0.9.9
%
% This program is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation, either version 3 of the License, or
% (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program.  If not, see <http://www.gnu.org/licenses/>.

if nargin < 3
    error('Too few input arguments.');
end

if length(x)~=length(y)
    error('x and y should have the same length');
end

% initialization
x           = x(:)';                             % Noisy speech row vector
y           = y(:)';                             % Noise alone speech row vector
fs          = 22050;
cf_aud = [63    80   100  125  160  200    250    315  400   500  630 800  1000  1250  1600  2000 2500 3150 4000 5000 6300  8000]; % centerfrequencies of the gammatone filters
cf_mod = [1 2 4 8 16 32 64 ];% centerfrequencies of the modulation filters
HT_diffuse = [37.5 31.5  26.5  22.1 17.9 14.4 11.4 8.4 5.8  3.8  2.1  1.0  0.8  1.9  0.5 -1.5 -3.1 -4.0 -3.8 -1.8 2.5   6.8 ]; % Diffuse field hearing threshold in quiet: ISO 389-7:2005
N = length(x);

if fs_input ~= fs
    x	= resample(x, fs, fs_input);
    y 	= resample(y, fs, fs_input);
end

%% Filtering through gammatone filterbank
g = gammatonefir(cf_aud,fs,'complex');
X  = 2*real(ufilterbank(x,g,1));
Y  = 2*real(ufilterbank(y,g,1));

%% ------------------ determining which frequency bands that are above
%% the hearing threshold.
% The spectrum levels (in SPL) of the stimuli are determined from a 1/3-octave analysis, since the threshold to compare with
% is spectrum levels.

mix_rms = thirdOctRMSAnalysis(x,fs,cf_aud);
mixRMS_dB =  20*log10(mix_rms);
bands = find(mixRMS_dB>HT_diffuse(1:length(cf_aud))); %     The bands to process further are the bands where the mix has spectrum levels above the hearing threshold.

%% ------------------ calculating envelopes of temporal outputs
%  lowpass filtering at 150 Hz
[bb, aa] = butter(1, 150*2/fs);

x_env = zeros(N,length(cf_aud));
y_env = zeros(N,length(cf_aud));

x_env(:,bands) = abs(hilbert(X(:,bands))); %  envelope
y_env(:,bands) = abs(hilbert(Y(:,bands)));

x_env = filter(bb,aa,x_env);
y_env = filter(bb,aa,y_env);

%downsampling
D = 10;
fsNew = fs/D;

x_env = resample(x_env,fsNew,fs);
y_env = resample(y_env,fsNew,fs);
%
SNRenv_n_p = zeros(length(cf_mod),length(cf_aud));

%% ----------------- Analysis via modulation filterbank
    P_env_xx = zeros(length(cf_mod),length(cf_aud));
    P_env_yy = P_env_xx;
    
for p = bands% For every audio channel
        
    
    P_env_xx(:,p)  = modFbank_v2(x_env(:,p),fsNew,cf_mod);
    P_env_yy(:,p)  = modFbank_v2(y_env(:,p),fsNew,cf_mod);
    
    
    if sum(sum(isnan(P_env_xx(:,p))))
        P_env_xx(isnan(P_env_xx(:,p)),n,p) = 0;
    end
    
    if sum(sum(isnan(P_env_yy(:,p))))
        P_env_yy(isnan(P_env_yy(:,p)),n,p) = 0;
    end
    
    P_env_yy(:,p) = min(P_env_xx(:,p),P_env_yy(:,p));    %     The envelope power of the noise is the minimum of Penv of the mixture
    %     or the noise.
   
    threshold = 0.01;          % The envelope power cannot go below 0.001 (-30 dB) reflecting
    % our minimum threshold of sensitivity to modulation detection
    
    P_env_yy(:,p) = max( P_env_yy(:,p),threshold);
    P_env_xx(:,p)= max(P_env_xx(:,p),threshold);
    
    SNRenvs_tmp = (P_env_xx(:,p)-P_env_yy(:,p)) ./P_env_yy(:,p); % calculation of SNRenv
    
    SNRenv_n_p(:,p) = max(0.001,SNRenvs_tmp); % Truncated at -30 dB for numerical reasons.
 
    
end

% ----------------- Integrating the SNRenv across audio and modulation bands---------------------------------------------------

SNRenv_p = sqrt(sum(SNRenv_n_p.^2,1));   %   Combine across modulation filters:  %SNRenv(p) eq. (4)

SNRenv = (sqrt(sum(SNRenv_p.^2)));%         Combine across audio filters: eq. (5)
output.SNRenv = SNRenv;

if nargin < 4
elseif nargin < 5
    P_correct = IdealObserver_v1(SNRenv,IO_param);
    output.P_correct = P_correct;
end
% ---------------------------------------------------------------------------------------------

end


function y = rms(x)
L = length(x);
y=norm(x)/sqrt(L);
end


function rms_out = thirdOctRMSAnalysis(x,fs,midfreq)

if nargin<3
    midfreq=[63 80 100 125 160 200 250 315 400 500 630 800 1000 1250 1600 2000 2500 3150 4000 5000 6300 8000 ];
end

N = length(x);
X = (fft(x));
X_mag  = abs(X) ;
X_power = X_mag.^2/N ;% power spectrum.
X_power_pos = X_power(1:fix(N/2)+1) ;
X_power_pos(2:end) = X_power_pos(2:end).* (2)  ; %take positive frequencies only and mulitply by two-squared to get the same total energy(used since the integration is only performed for positive freqiencies)

freq= linspace(0,fs/2,length(X_power_pos));

%resolution of data
resol=freq(2)-freq(1);

crossfreq(1)=midfreq(1)/(2^(1/6));
crossfreq(2:length(midfreq)+1)=midfreq*(2^(1/6));

%cross-over indicies
y=crossfreq/resol;

%rounding up
crosselem=round(y);
for n=1:length(y)
    if crosselem(n)<y(n)
        crosselem(n)=crosselem(n)+1;
    end
end

nn=1;
rms_out(1:length(midfreq)) = 0;
while crossfreq(nn+1)<=freq(end)% for nn =1:length(crossfreq)-1
 
    rms_out(nn) = sqrt(sum(X_power_pos(crosselem(nn):crosselem(nn+1)-1))/N);
    nn=nn+1;
    if 1+nn > length(crossfreq)
        break
    end
end

end


function P_env = modFbank_v2(Env,fs,cf_mod)
%
% This function is an implementation of a modulation filterbank similar to the EPSM-filterbank
% as presented by Ewert & Dau 2000. This implementation consists of a lowpass
% filter with a cutoff at 1 Hz, in parallel with 8 bandpass filters with
% octave spacing. the Center-frequencies of the bandpass filters are lower
% than the original from Ewert & Dau (2000).
%
% Inputs:
%   Env:  The envelope to be filtered
%   fs: sampling frequency of the envelope
%   cf_mod:  centerfrequencies of the modulation filters
%
% Outputs:
%   P_env:  Envelope power for each of the modulation filters
%
%
% Created by Søren Jørgensen jan 2010
% last update 02 may 2014
% Copyright Søren Jørgensen

if nargin<3
    %band center frequencies
    cf_mod=[1 2 4 8 16 32 64];
end

if size(Env,1) > 1
    Env = Env';
end

if mod(length( Env),2) == 0
    %number is even
    Env =  Env(1:end-1);
else
    %number is odd
end

Q = 1;
N = length(Env);
X = fft(Env);
X_mag  = abs(X) ;
X_power = X_mag.^2/N ;% power spectrum.

X_power_pos = X_power(1:fix(N/2)+1) ;
X_power_pos(2:end) = X_power_pos(2:end).* (2)  ; %take positive frequencies only and mulitply by two-squared to get the same total energy(used since the integration is only performed for positive freqiencies)
pos_freqs= linspace(0,fs/2,length(X_power_pos));
freqs = [pos_freqs -1*fliplr(pos_freqs(2:end))];

% Initialize transfer function
TFs = zeros(length(cf_mod),length(freqs));

% Calculating frequency-domain transferfunction for each center frequency:
for k = 2:length(cf_mod)
    TFs(k,1:end) = 1./(1+ (1j*Q*(freqs(1:end)./cf_mod(k) - cf_mod(k)./freqs(1:end)))); % p287 Hambley.
end
% squared filter magnitude transfer functions
Wcf = (abs(TFs)).^2;

fcut = 1;% cutoff frequency of lowpassfilter:
n = 3;% order:
% Lowpass filter squared transfer function:
Wcf(1,:) =  1./(1+((2*pi*freqs/(2*pi*fcut)).^(2*n))); % third order butterworth filter TF from: http://en.wikipedia.org/wiki/Butterworth_filter

% ------------ DC-power, --------------------------
% %here devided by two such that a fully modulated tone has an AC-power of 1.
DC_power =  X_power_pos(1)/ N /2;

X_power_pos = repmat(X_power_pos,length(cf_mod),1);

Vout = X_power_pos.*Wcf(:,1:floor(end/2)+1);
% Integration estimated as a sum from f > 0
% integrate envelope power in the
% passband of the filter. Index goes from 2:end since
P_env =  sum(Vout(:,2:end),2)/ N / DC_power;
% integration is for f>0


end


function Pcorrect  = IdealObserver_v1(SNRenv_lin,parameters)
%%
% IdealObserver: Converts the overall SNRenv to percent correct.
%
% Usage: [Pcorrect SNRenv ] = IdealObserver(SNRenv_lin,parameters)
%
% SNRenv_lin :  vector with the SNRenv values (not in dB) for each input SNR
% Parameters :  vector with the parameters for the ideal Observer formatted as [k q m sigma_s]
%
% Søren Jørgensen august 2010
% last update 10-June 2013
% Copyright Søren Jørgensen
%%

if nargin < 2
    error('You have to specify the k,q,m,sigma_s parameters for the IdealObserver')
end
k = parameters(1);
q = parameters(2);
m = parameters(3);
sigma_s = parameters(4);


% ---------- Converting from SNRenv to d_prime  --------------
d_prime = k*(SNRenv_lin).^q;

%----------- Converting from d_prime to Percent correct, Green and Birdsall (1964)----------
Un = 1*norminv(1-(1/m));
mn = Un + (.577 /Un);% F^(-1)[1/n] Basically gives the value that would be drawn from a normal destribution with probability p = 1/n.
sig_n=  1.28255/Un;
Pcorrect = normcdf(d_prime,mn,sqrt(sigma_s.^2+sig_n.^2))*100;


% Green, D. M. and Birdsall, T. G. (1964). "The effect of vocabulary size",
% In Signal Detection and Recognition by Human Observers,
% edited by John A. Swets (John Wiley & Sons, New York)
end