%DEMO_HAUTH2020 framework to test the binaural speech intelligibility model hauth2020
%
% Url: http://amtoolbox.org/amt-1.1.0/doc/demos/demo_hauth2020.php
% Copyright (C) 2009-2021 Piotr Majdak, Clara Hollomey, and the AMT team.
% This file is part of Auditory Modeling Toolbox (AMT) version 1.1.0
%
% This program is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation, either version 3 of the License, or
% (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program. If not, see <http://www.gnu.org/licenses/>.
% hauth2020 predicts the SRTs from the Anechoic Experiment of Beutelmann and Brand
% (2006). In this experiment, SRTs are simulated for speech located at 0 deg in
% the horizontal plane, and noise located at different positions [angles].
% The binaural processing in this model works blindly and only requires the
% mixture of speech and noise [required signal]. The back-end used here is the SII.
% SII values are produced for different signal-to-noise ratios. Depending
% on the back-end you want to use, optional signals have to be used. They
% are processed in the same way as the required signal. In this Demo, the
% optional signals are clean speech and noise.
%
%% .. figure::
%
% Predicted speech intelligibility index as a function of SNR
%
% #Author: Christopher Hauth <christopher.hauth@uni-oldenburg.de>
% #Author: Dr. Thomas Brand <thomas.brand@uni-oldenburg.de>
% Date : 22.10.2020
% Version: 1.0
%-------------------------------------------------------------------------%
fast = 1; % set to 1 to get faster results, set to 0 to get more accurate results
display_level = 'no_debug'; % set to 'debug' to see more information, set to 'no_debug' to have less mess on your display
% Clean speech material available for this demo (loaded via amt_load below)
list_clean(1).name = '0_05413.wav';
list_clean(2).name = '0_31601.wav';
list_clean(3).name = '0_46393.wav';
%---------------Experimental Conditions-----------------------------------%
% Define your experimental conditions, number of Monte Carlo simulations
% (binaural processing) and number of sentences (statistics across sentences)
% For Matrix type sentences it is recommended to use 10 sentences, where each word
% of the test appears once.
if fast
    amt_disp('Fast mode activated - results may be inaccurate.');
    vSNR_test = -20:5:0;
    iNumofSentences = 1;
    iNumofMonteCarlo = 3;
else
    amt_disp('Accurate mode activated - please be patient.');
    vSNR_test = -20:0;
    iNumofSentences = 10;
    iNumofMonteCarlo = 10;
end
fs_model = 44100; % sampling rate expected by the model [Hz]
sentences_clean = {list_clean.name};
% Fix: never request more sentences than wav files listed above; otherwise
% randomizer(ll) would index out of bounds (accurate mode asks for 10
% sentences while only 3 files are available in list_clean).
iNumofSentences = min(iNumofSentences, numel(sentences_clean));
sentence_choose_clean = sentences_clean;
randomizer = randperm(length(sentence_choose_clean)); % random sentence order
vangles_test = 45; % noise azimuth [deg]; the speech source is at 0 deg
% Vector of input SNRs. For each SNR, (iNumofSentences x iNumofMonteCarlo) SII values are obtained.
% Make sure to test several different SNRs in order to be able to map the
% SII to an SRT, e.g.:
% If you want to know the SII of a single SNR, use only one value:
%vSNR_test = -18;
%% Calibration
% The calibration factor is the mean level between the ears.
% The calibration can be adjusted for your needs. For the SII, which is used
% here, 65 dB FS (relative to full scale) is assumed to be 65 dB SPL.
% However, if you only aim for the resynthesized output, please adjust this
% level to avoid clipping.
lev_desired = 65; % the value 65 is required here for correct use of the SII
% Use the co-located noise condition to calibrate the input (the same noise
% recording also serves as the level reference for the speech signal).
[calibnoise, fs] = amt_load('hauth2020', '0_speaker_reference_olnoise.wav');
calibnoise = calibnoise(1:end-round(1.5*fs),:); % discard the trailing 1.5 s
% Broadband rms level per ear in dB re full scale, averaged over both ears
lev_measured = mean(20*log10(rms(calibnoise)));
% Speech calibration: dB gain from measured to desired level, then linear
Delta_L_speech = lev_desired - lev_measured;
Delta_L_speech_lin = 10.^(Delta_L_speech/20);
% Noise calibration: same reference recording, hence the identical gain
Delta_L_noise = lev_desired - lev_measured;
Delta_L_noise_lin = 10.^(Delta_L_noise/20);
%-------------------------------------------------------------------------%
clear sii_min_all sii_max_all sii_syn_all sii_L_all sii_R_all
% Iterate through all SNRs
for kk = 1:length(vSNR_test)
    amt_disp(['Processing SNR ' num2str(kk) ' out of ' num2str(length(vSNR_test)) '.']);
    % Iterate through the different sentences
    for ll = 1:iNumofSentences
        amt_disp(['Processing sentence ' num2str(ll) ' out of ' num2str(iNumofSentences) '.']);
        sentence_clean = sentence_choose_clean{randomizer(ll)};
        % Read sentence and noise from wav files
        [speech_clean, fs_s] = amt_load('hauth2020', sentence_clean);
        [noise, fs_n] = amt_load('hauth2020', sprintf('%d_speaker_reference_olnoise.wav', vangles_test));
        % Resample signals to the model rate if necessary.
        % Fix: the original compared fs_s against 'fs' (the calibration
        % file's rate) and resampled the noise with the speech file's rate;
        % each signal is now checked against fs_model with its own rate.
        if fs_s ~= fs_model
            speech_clean = resample(speech_clean, fs_model, fs_s);
        end
        if fs_n ~= fs_model
            noise = resample(noise, fs_model, fs_n);
        end
        % Truncate noise to the speech length (assumes the noise recording
        % is at least as long as the sentence -- true for this demo data).
        lenSpeech = length(speech_clean);
        noise = noise(1:lenSpeech,:);
        speech_clean = speech_clean(1:lenSpeech,:);
        % Apply the calibration gains from the calibration section
        speech_clean = Delta_L_speech_lin.*speech_clean;
        noise = Delta_L_noise_lin.*noise;
        % Adjust SNR of the mixed input signal (speech + noise).
        % This is the required (blind) input of the model:
        mixed_input = 10.^(vSNR_test(kk)/20).*speech_clean + noise;
        % Clean speech scaled to the same SNR, used as an optional input:
        speech_clean_proc = 10.^(vSNR_test(kk)/20).*speech_clean;
        % All optional signals are arranged in a matrix.
        % Here: [S_l(:) S_r(:) N_l(:) N_r(:)]
        OptionalSignals = [speech_clean_proc noise];
        % Monte Carlo simulations model the binaural uncertainty
        for oo = 1:iNumofMonteCarlo
            amt_disp(['Processing Monte Carlo ' num2str(oo) ' out of ' num2str(iNumofMonteCarlo) '...'],'volatile');
            % Binaural processing: out_struct contains the processed mixed
            % signal, the processed optional signals and, for the SII
            % back-end, the frequency-specific levels.
            % Fix: pass fs_model (the rate the signals were resampled to)
            % instead of the calibration file's fs.
            out_struct = hauth2020(mixed_input, fs_model,'OptSigs',OptionalSignals,display_level);
            % Speech intelligibility back-end (SII in this example).
            % Use your own speech intelligibility back-end here if desired.
            noThreshold = -Inf*ones(30,1); % hearing threshold disabled (normal hearing)
            sii_min_all(ll,kk,oo) = hauth2020_sii(out_struct.levels.LevelOptSig1min,out_struct.levels.LevelOptSig2min,noThreshold,2,0);
            sii_max_all(ll,kk,oo) = hauth2020_sii(out_struct.levels.LevelOptSig1max,out_struct.levels.LevelOptSig2max,noThreshold,2,0);
            sii_syn_all(ll,kk,oo) = hauth2020_sii(out_struct.levels.LevelOptSig1syn,out_struct.levels.LevelOptSig2syn,noThreshold,2,0);
            sii_L_all(ll,kk,oo)   = hauth2020_sii(out_struct.levels.LevelOptSig1L,out_struct.levels.LevelOptSig2L,noThreshold,2,0);
            sii_R_all(ll,kk,oo)   = hauth2020_sii(out_struct.levels.LevelOptSig1R,out_struct.levels.LevelOptSig2R,noThreshold,2,0);
        end
    end
end
amt_disp();
%% Plotting
% Average the SII values across sentences; each result is (SNR x MonteCarlo).
sii_min_all_squeezed = squeeze(mean(sii_min_all,1));
sii_max_all_squeezed = squeeze(mean(sii_max_all,1));
sii_syn_all_squeezed = squeeze(mean(sii_syn_all,1));
sii_L_all_squeezed   = squeeze(mean(sii_L_all,1));
sii_R_all_squeezed   = squeeze(mean(sii_R_all,1));
% One tick label per tested SNR
xlab = num2cell(vSNR_test);
nSNR = numel(vSNR_test);
figure
% --- Subplot 1: minimum SII ---
subplot(3,1,1)
plot(max(sii_min_all_squeezed.'),'k','linewidth',2) % envelope: best Monte Carlo run
hold on
plot(min(sii_min_all_squeezed.'),'k','linewidth',2) % envelope: worst Monte Carlo run
plot(sii_min_all_squeezed)                          % all individual runs
xlim([1 nSNR])
grid on
xlabel('SNR [dB]')
ylabel('Speech Intelligibility Index')
set(gca,'xtick',1:nSNR,'xticklabels',xlab);
title('Minimum Speech Intelligibility Index')
% --- Subplot 2: maximum SII ---
subplot(3,1,2)
plot(max(sii_max_all_squeezed.'),'k','linewidth',2)
hold on
plot(min(sii_max_all_squeezed.'),'k','linewidth',2)
plot(sii_max_all_squeezed)
xlim([1 nSNR])
grid on
xlabel('SNR [dB]')
ylabel('Speech Intelligibility Index')
set(gca,'xtick',1:nSNR,'xticklabels',xlab);
title('Maximum Speech Intelligibility Index')
% --- Subplot 3: left/right ear SII ---
subplot(3,1,3)
plot(max(sii_L_all_squeezed.'),'linewidth',2)      % left ear, max over Monte Carlo runs
hold on
plot(min(sii_R_all_squeezed.'),'r','linewidth',2)  % right ear, min over Monte Carlo runs
xlim([1 nSNR])
grid on
xlabel('SNR [dB]')
ylabel('Speech Intelligibility Index')
set(gca,'xtick',1:nSNR,'xticklabels',xlab);
legend('Left ear SII', 'Right ear SII','location', 'southeast')
title('L/R Speech Intelligibility Index')