%demo_hauth2020 Demonstration of the binaural speech intelligibility model
%
% DEMO_HAUTH2020 predicts the speech reception thresholds (SRTs) from the
% anechoic experiment of Beutelmann and Brand (2006). In that experiment,
% SRTs were simulated for speech located at 0 degrees in
% the horizontal plane, and noise located at different angles.
% The binaural processing in this model works blindly and only requires the
% mixture of speech and noise [required signal]. The back-end used here is the
% speech intelligibility index (SII).
%
% In this demonstration, SIIs are produced for various signal-to-noise ratios (SNRs)
% of the optional speech and noise. In the code, depending
% on the requested back-end, optional signals can be used. They
% will be processed in the same way as the required signal.
%
% Figure 1: Predicted speech intelligibility index (SII) as a function of signal-to-noise ratio (SNR)
%
% See also: hauth2020
%
% Url: http://amtoolbox.org/amt-1.6.0/doc/demos/demo_hauth2020.php
% #Author: Christopher Hauth (2020): original implementation.
% #Author: Thomas Brand (2020): original implementation.
% #Author: Clara Hollomey (2021): integration in the AMT.
% #Author: Piotr Majdak (2024): code and documentation improvements for the AMT 1.6.
% #Author: Michael Mihocic (2024): code adapted for Octave.
% This file is licensed under the GNU General Public License (GPL) either
% version 3 of the license, or any later version as published by the Free Software
% Foundation. Details of the GPLv3 can be found in the AMT directory "licences" and
% at <https://www.gnu.org/licenses/gpl-3.0.html>.
% You can redistribute this file and/or modify it under the terms of the GPLv3.
% This file is distributed without any warranty; without even the implied warranty
% of merchantability or fitness for a particular purpose.
%-------------------------------------------------------------------------%
fast = 1; % 1: quick run (coarse SNR grid, few repetitions); 0: slower run (1-dB SNR grid, more repetitions)
display_level = 'no_debug'; % 'debug' shows more information, 'no_debug' keeps the display tidy

% Clean speech recordings available to this demo (file names handed to amt_load).
list_clean(1).name = '0_05413.wav';
list_clean(2).name = '0_31601.wav';
list_clean(3).name = '0_46393.wav';

%---------------Experimental Conditions-----------------------------------%
% Define the experimental conditions: the SNRs to test, the number of
% sentences (statistics across sentences) and the number of Monte Carlo
% simulations (binaural processing).
% For Matrix type sentences it is recommended to use 10 sentences, where each
% word of the test appears once.
if fast
    amt_disp('Fast mode activated - results may be inaccurate.');
    vSNR_test = -20:5:0;
    iNumofSentences = 1;
    iNumofMonteCarlo = 3;
else
    amt_disp('Accurate mode activated - please be patient.');
    vSNR_test = -20:0;
    iNumofSentences = 10;
    iNumofMonteCarlo = 10;
end

% vSNR_test is the vector of input SNRs. For each SNR,
% (iNumofSentences x iNumofMonteCarlo) SII values are obtained.
% Make sure to test several different SNRs in order to be able to map the
% SII to an SRT. If you want to know the SII of a single SNR only, use one
% value, e.g.:
%vSNR_test = -18;

fs_model = 44100; % sampling rate (Hz) the signals are resampled to

sentences_clean = {list_clean.name};
sentence_choose_clean = sentences_clean;

% The sentence loop indexes randomizer(1:iNumofSentences), so requesting more
% sentences than there are files would index past the end of the permutation.
% Limit the request to what is actually available.
nAvailableSentences = numel(sentence_choose_clean);
if iNumofSentences > nAvailableSentences
    amt_disp(['Only ' num2str(nAvailableSentences) ' sentences available - using ' ...
        num2str(nAvailableSentences) ' instead of ' num2str(iNumofSentences) '.']);
    iNumofSentences = nAvailableSentences;
end

% Random order in which the sentences are drawn.
randomizer = randperm(length(sentence_choose_clean));

% Number inserted into the noise file name ('<value>_speaker_reference_olnoise.wav');
% presumably the noise azimuth in degrees.
vangles_test = 45;
%% Calibration
% The calibration reference is the mean level across the two ears.
% Adjust the target level to your needs. For the SII, which is used here,
% 65 dB FS (relative to full scale) is assumed to be 65 dB SPL.
% If you are only interested in the resynthesized output, adjust this level
% to avoid clipping.
lev_desired = 65; % the value 65 is required here for correct use of the SII

% The co-located noise recording serves as calibration signal for BOTH the
% speech and the noise gain (the noise is used for the speech signal as well).
[calibnoise, fs] = amt_load('hauth2020', '0_speaker_reference_olnoise.wav');
nTrim = round(1.5*fs);                  % number of samples in 1.5 s
calibnoise = calibnoise(1:end-nTrim,:); % discard the final 1.5 s of the recording

dB2lin = @(gain_dB) 10.^(gain_dB/20);   % dB gain -> linear amplitude factor

% Speech gain: desired level minus the actual rms level averaged over the ears.
lev_Speech = 20*log10(rms(calibnoise)); % actual rms level per ear
lev_S = mean(lev_Speech);               % mean between both ears
Delta_L_speech = lev_desired - lev_S;
Delta_L_speech_lin = dB2lin(Delta_L_speech);

% Noise gain: same computation on the same calibration signal.
lev_Noise = mean(20*log10(rms(calibnoise))); % reference is the MEAN level between the two ears
Delta_L_noise = lev_desired - lev_Noise;
Delta_L_noise_lin = dB2lin(Delta_L_noise);
%-------------------------------------------------------------------------%
% Main simulation: for every SNR and every sentence, build the speech+noise
% mixture, run the binaural model iNumofMonteCarlo times and store the SII of
% each run in sii_*_all(sentence, SNR, MonteCarloRun).

nSNR = length(vSNR_test);

% Preallocate the result arrays. Besides avoiding growth inside the loops,
% this guarantees that arrays left in the workspace by an earlier run with a
% larger grid cannot leak stale values into the plots.
sii_min_all = zeros(iNumofSentences, nSNR, iNumofMonteCarlo);
sii_max_all = zeros(iNumofSentences, nSNR, iNumofMonteCarlo);
sii_syn_all = zeros(iNumofSentences, nSNR, iNumofMonteCarlo);
sii_L_all   = zeros(iNumofSentences, nSNR, iNumofMonteCarlo);
sii_R_all   = zeros(iNumofSentences, nSNR, iNumofMonteCarlo);

% The noise recording is the same for every SNR and sentence: load, resample
% and calibrate it once instead of once per iteration.
[noise_full, fs_n] = amt_load('hauth2020', sprintf('%d_speaker_reference_olnoise.wav', vangles_test));
if fs_n ~= fs_model
    noise_full = resample(noise_full, fs_model, fs_n);
end
noise_full = Delta_L_noise_lin.*noise_full; % adjust level of noise

% Third argument handed unchanged to every hauth2020_sii call.
% NOTE(review): presumably a per-band threshold vector - confirm against hauth2020_sii.
levelArg3 = -Inf*ones(30,1);

% Iterate through all SNRs
amt_disp('');
for kk = 1:nSNR
    amt_disp(['Processing SNR ' num2str(kk) ' out of ' num2str(length(vSNR_test)) '.']);
    snrGain = 10.^((vSNR_test(kk))/20); % linear speech gain realizing the current SNR

    % Iterate through the different sentences
    for ll = 1:iNumofSentences
        amt_disp(['Processing sentence ' num2str(ll) ' out of ' num2str(iNumofSentences) '.']);
        sentence_clean = sentence_choose_clean{randomizer(ll)};

        % Read the sentence from its wav file and bring it to the model rate.
        % Each signal is checked against fs_model on its own, so that the
        % rate passed to the model below always matches the signals.
        [speech_clean, fs_s] = amt_load('hauth2020', sentence_clean);
        if fs_s ~= fs_model
            speech_clean = resample(speech_clean, fs_model, fs_s);
        end

        % Crop speech and noise to their common length (in samples). Using
        % the minimum avoids an out-of-range index when the noise recording
        % is shorter than the sentence.
        lenSpeech = size(speech_clean, 1);
        lenNoise  = size(noise_full, 1);
        lenCommon = min(lenSpeech, lenNoise);
        noise = noise_full(1:lenCommon, :);
        speech_clean = speech_clean(1:lenCommon, :);

        % adjust level of speech:
        speech_clean = Delta_L_speech_lin.*speech_clean;

        % Mixed input signal (speech + noise) at the requested SNR.
        % This is a required signal:
        mixed_input = snrGain.*speech_clean + noise;

        % Clean speech at the same gain, used as an optional input.
        speech_clean_proc = snrGain.*speech_clean;

        % All optional signals are arranged in a matrix.
        % Here: [S_l(:) S_r(:) N_l(:) N_r(:)]
        OptionalSignals = [speech_clean_proc noise];

        % Apply the binaural model to the mixed signal
        % (and optionally to the clean speech and noise).
        % Monte Carlo simulations are used to model the binaural uncertainty.
        for oo = 1:iNumofMonteCarlo
            % Do binaural processing:
            % out_struct contains the processed mixed signal as well as
            % the processed optional signals. Moreover, as the SII is
            % used as back-end, it also contains the frequency-specific
            % levels.
            amt_disp(['Processing Monte Carlo ' num2str(oo) ' out of ' num2str(iNumofMonteCarlo) '...'],'volatile');
            out_struct = hauth2020(mixed_input, fs_model,'OptSigs',OptionalSignals,display_level);
            lv = out_struct.levels;

            % Speech Intelligibility back-end: (SII in this example)
            % Use your speech intelligibility back-end here.
            % Only the first output (the SII value) is needed.
            sii_min_all(ll,kk,oo) = hauth2020_sii(lv.LevelOptSig1min, lv.LevelOptSig2min, levelArg3, 2, 0);
            sii_max_all(ll,kk,oo) = hauth2020_sii(lv.LevelOptSig1max, lv.LevelOptSig2max, levelArg3, 2, 0);
            sii_syn_all(ll,kk,oo) = hauth2020_sii(lv.LevelOptSig1syn, lv.LevelOptSig2syn, levelArg3, 2, 0);
            sii_L_all(ll,kk,oo)   = hauth2020_sii(lv.LevelOptSig1L,   lv.LevelOptSig2L,   levelArg3, 2, 0);
            sii_R_all(ll,kk,oo)   = hauth2020_sii(lv.LevelOptSig1R,   lv.LevelOptSig2R,   levelArg3, 2, 0);
        end
    end
end
amt_disp();
%% Plotting
% Three stacked panels, all drawn against the index of the tested SNR (the
% ticks are labelled with the SNR values):
%   1) "min" SII: upper/lower envelope across Monte Carlo runs (thick black)
%      plus one thin line per Monte Carlo run,
%   2) the same for the "max" SII,
%   3) the left-ear and right-ear SII.

% Average across sentences (dimension 1) and return an explicit
% (numSNR x numMonteCarlo) matrix. reshape is used instead of squeeze so the
% orientation stays correct when only one SNR or one Monte Carlo run exists.
avgOverSentences = @(x) reshape(mean(x,1), size(x,2), size(x,3));
sii_min_all_squeezed = avgOverSentences(sii_min_all);
sii_max_all_squeezed = avgOverSentences(sii_max_all);
sii_syn_all_squeezed = avgOverSentences(sii_syn_all); % kept for inspection; not plotted below
sii_L_all_squeezed   = avgOverSentences(sii_L_all);
sii_R_all_squeezed   = avgOverSentences(sii_R_all);

nSNR = numel(vSNR_test);
xIdx = 1:nSNR;
% Tick labels as strings: the 'xticklabel' property with a cell array of
% strings is understood by both MATLAB and Octave, so no branching is needed.
xlab = arrayfun(@num2str, vSNR_test, 'UniformOutput', false);

% A single SNR yields isolated points, which are invisible without a marker.
if nSNR == 1
    pointMarker = 'o';
else
    pointMarker = 'none';
end

envelopeData = {sii_min_all_squeezed, sii_max_all_squeezed};
panelTitle = {'Minimum Speech Intelligibility Index', ...
              'Maximum Speech Intelligibility Index', ...
              'L/R Speech Intelligibility Index'};

for pp = 1:3
    subplot(3,1,pp)
    if pp < 3
        % Envelope across Monte Carlo runs (columns) and the individual runs.
        plot(xIdx, max(envelopeData{pp},[],2), 'k', 'linewidth', 2, 'marker', pointMarker)
        hold on
        plot(xIdx, min(envelopeData{pp},[],2), 'k', 'linewidth', 2, 'marker', pointMarker)
        plot(xIdx, envelopeData{pp}, 'marker', pointMarker)
    else
        % NOTE(review): the left ear shows the maximum and the right ear the
        % minimum across Monte Carlo runs, as in the original script -
        % confirm that this asymmetry is intended.
        plot(xIdx, max(sii_L_all_squeezed,[],2), 'linewidth', 2, 'marker', pointMarker)
        hold on
        plot(xIdx, min(sii_R_all_squeezed,[],2), 'r', 'linewidth', 2, 'marker', pointMarker)
        legend('Left ear', 'Right ear','location', 'southeast')
    end

    % Common axis formatting.
    if nSNR > 1
        xlim([1 nSNR]) % xlim needs strictly increasing limits, so skip it for a single SNR
    end
    grid on
    xlabel('SNR [dB]')
    ylabel('SII')
    set(gca, 'xtick', xIdx, 'xticklabel', xlab);
    title(panelTitle{pp})
end