% FUNCTION [ PERF, PERF_OVERLAP ] = EVAL_SEGMAT( RESULTSEG, TRUTHSEG, TOTAL_TIME, [ SMALL_TIME ] )
%
% To evaluate speech/silence classification performance
% on multichannel recordings. This is purely based on
% time boundaries, NOT on frames.
%
% EVAL_SEGMAT is particularly useful on spontaneous
% multichannel speech, as there are many overlaps.
%
%
% RESULTSEG and TRUTHSEG are two structures of length NCHANNEL.
% We assume the order of channels is the same in both.
%
% RESULTSEG( a ).seg is a 2 x * matrix of values in seconds,
% describing RESULT speech segment boundaries for channel "a".
%
% TRUTHSEG( a ).seg is a 2 x * matrix of values in seconds,
% describing TRUE speech segment boundaries for channel "a".
%
% For both matrices, for each column:
%   - row 1: start of speech segment
%   - row 2: end of speech segment
%
% A 3 x * matrix is also accepted for either input. In that case
% row 3 holds the classification of each segment (0 means silent
% segment, non-zero means speech) and the matrix is used as is.
%
% TOTAL_TIME is the total duration of the recording in seconds
% (numeric scalar).
%
% SMALL_TIME is a small duration constant in seconds to evaluate
% whether two time values are equal or not ( default 1e-10, also
% used when SMALL_TIME is omitted or empty ).
%
%
% PERF is a structure.
%
% PERF.RESULTSEG, PERF.TRUTHSEG and PERF.TOTAL_TIME are copies of
% the input arguments.
%
% PERF.CHANNEL( a ) is a structure with several fields
% (correctly detected speech, FAR, FRR, PRC, RCL and so on).
%
% PERF.OVERALL is a structure with several fields
% (obtained from early concatenation of all channels).
% Therefore the overall performance figures are not necessarily the
% same as the average of the performance figures obtained for each
% channel.
%
% PERF_OVERLAP is a structure similar to PERF,
% but evaluation is restricted to segments with
% more than one concurrent speaker (found in either
% result or truth or both). If no such segment exists,
% PERF_OVERLAP only contains an empty field OV_SUBSET.
%
% It is easy to find the segments containing errors by looking
% at the PERF.CHANNEL( a ).RGTMAT matrix and the related
% PERF.CHANNEL( a ).XXX_IND fields.
%
% For each column of a RGTMAT matrix:
%   - row 1: start time of the segment in seconds
%   - row 2: end time of the segment in seconds
%   - row 3: RESULT classification: 0 means silent segment, 1 means speech.
%   - row 4: TRUE classification: 0 means silent segment, 1 means speech.
%
% All duration-based performance figures are derived from those
% RGTMAT matrices.
%
% For more info... look at the code:
% dbtype eval_segmat
%
%
% By Guillaume LATHOUD, 2004 - lathoud@idiap.ch

function [ perf, perf_overlap ] = eval_segmat( in_resultseg, in_truthseg, total_time, small_time )

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ( 0 ) Deal with parameters

if nargin < 3
    error( 'eval_segmat: needs at least 3 input parameters' );
end

if nargout < 1
    error( 'eval_segmat: needs at least one output parameter' );
end

% Default tolerance when the argument is omitted or passed as []
if nargin < 4 || isempty( small_time )
    small_time = 1e-10;
end

% We will also return the input arguments
perf.resultseg  = in_resultseg;
perf.truthseg   = in_truthseg;
perf.total_time = total_time;

% Working copies (silent segments get added to them, see below)
resultseg = in_resultseg;
truthseg  = in_truthseg;

% Check them
if ~isstruct( resultseg )
    error( 'eval_segmat: needs a structure for "resultseg"' );
end

if ~isstruct( truthseg )
    error( 'eval_segmat: needs a structure for "truthseg"' );
end

% Check same number of channels
if length( resultseg ) ~= length( truthseg )
    error( 'eval_segmat: inconsistent number of channels!' );
end

nchannel = length( resultseg );

% Non-empty matrices must have 2 rows (speech only) or 3 rows (speech + silence)
for a = 1:nchannel

    if ~isempty( resultseg( a ).seg ) && ~ismember( size( resultseg( a ).seg, 1 ), [2 3] )
        error( 'eval_segmat: needs a 2 x N1 or 3 x N1 matrix for "resultseg(%d).seg"', a );
    end

    if ~isempty( truthseg( a ).seg ) && ~ismember( size( truthseg( a ).seg, 1 ), [2 3] )
        error( 'eval_segmat: needs a 2 x N2 or 3 x N2 matrix for "truthseg(%d).seg"', a );
    end

end

% The message promises a scalar, so enforce it
if ~isnumeric( total_time ) || ~isscalar( total_time )
    error( 'eval_segmat: needs a scalar value for "total_time"' );
end

% If needed, convert 2-row format (speech segments only)
% to 3-row format (speech segments AND silence segments)
for a = 1:nchannel

    if size( resultseg( a ).seg, 1 ) == 2
        % Define silence segments, then remove empty segments
        resultseg( a ).seg = define_silence_in_seg( resultseg( a ).seg, total_time );
        resultseg( a ).seg = remove_empty_in_seg( resultseg( a ).seg );
    end

    if size( truthseg( a ).seg, 1 ) == 2
        % Define silence segments, then remove empty segments
        truthseg( a ).seg = define_silence_in_seg( truthseg( a ).seg, total_time );
        truthseg( a ).seg = remove_empty_in_seg( truthseg( a ).seg );
    end

end

% To store accuracy values
channel = [];   % Structure array, one element per channel
overall = [];   % Structure

for a = 1:nchannel

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % 1) Take care of segments: build the common time grid from
    %    all boundaries found in the result and in the truth

    t = [ resultseg( a ).seg( 1, : ) truthseg( a ).seg( 1, : ) ...
          resultseg( a ).seg( 2, : ) truthseg( a ).seg( 2, : ) ];

    % "unique" returns sorted values, but it is not enough to suppress
    % duplicates: minor errors may appear (e.g. 1e-14), so also drop any
    % boundary closer than small_time to the previous one
    t = unique( t );
    t = t( diff( [ -Inf t ] ) > small_time );

    % The grid must span [ 0, total_time ]; snap both ends exactly
    if abs( t( 1 ) ) > small_time
        error( 'eval_segmat: error #123!' );
    end
    t( 1 ) = 0;

    if abs( t( end ) - total_time ) > small_time
        error( 'eval_segmat: error #456!' );
    end
    t( end ) = total_time;

    rgtmat = [ t( 1:end-1 ); t( 2:end ) ];

    % Check the matrix: consecutive segments must be contiguous
    if any( abs( rgtmat( 1, 2:end ) - rgtmat( 2, 1:end-1 ) ) > 0 )
        error( 'eval_segmat: error in rgtmat!' );
    end

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % 2) Take care of activity: classify each grid segment by looking up
    %    its middle point in the result and in the truth

    seg_middle = mean( rgtmat( 1:2, : ), 1 );

    for b = 1:length( seg_middle )

        t = seg_middle( b );

        % ( 2.1 ) Copy result activity
        ind = find( ( resultseg( a ).seg( 1, : ) <= t ) & ...
                    ( t <= resultseg( a ).seg( 2, : ) ) );

        if isempty( ind )
            % Not covered by any segment: counted as silence
            rgtmat( 3, b ) = 0;
        else
            % Sanity check
            if length( ind ) > 1
                error( 'eval_segmat: insanity #3!' );
            end
            rgtmat( 3, b ) = resultseg( a ).seg( 3, ind );
        end

        % ( 2.2 ) Copy GT activity
        ind = find( ( truthseg( a ).seg( 1, : ) <= t ) & ...
                    ( t <= truthseg( a ).seg( 2, : ) ) );

        if isempty( ind )
            % Not covered by any segment: counted as silence
            rgtmat( 4, b ) = 0;
        else
            rgtmat( 4, b ) = truthseg( a ).seg( 3, ind );
        end

    end

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % 3) Use "rgtmat" to sum:
    %    - total duration of correctly detected speech
    %    - total duration of missed speech
    %    - total duration of correctly detected silence
    %    - total duration of missed silence

    seg_dur = diff( rgtmat( 1:2, : ), [], 1 );

    % Correctly detected speech
    correct_speech_ind = find( rgtmat( 3, : ) & rgtmat( 4, : ) );
    correct_speech     = sum( seg_dur( correct_speech_ind ) );

    % Missed speech
    missed_speech_ind = find( ( ~rgtmat( 3, : ) ) & rgtmat( 4, : ) );
    missed_speech     = sum( seg_dur( missed_speech_ind ) );

    % Correctly detected silence
    correct_silence_ind = find( ( ~rgtmat( 3, : ) ) & ( ~rgtmat( 4, : ) ) );
    correct_silence     = sum( seg_dur( correct_silence_ind ) );

    % Missed silence
    missed_silence_ind = find( rgtmat( 3, : ) & ( ~rgtmat( 4, : ) ) );
    missed_silence     = sum( seg_dur( missed_silence_ind ) );

    % Sanity check: the four categories must add up to the recording duration
    if abs( correct_speech + missed_speech + correct_silence + missed_silence - total_time ) > 1e-10
        error( 'eval_segmat: insanity #6 !' );
    end

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % 4) Count speech segments (rather than durations)

    rseg = simplify_seg( resultseg( a ).seg, small_time );
    result_speech_segments = sum( rseg( 3, : ) );

    tseg = simplify_seg( truthseg( a ).seg, small_time );
    truth_speech_segments = sum( tseg( 3, : ) );

    iseg = intersect_seg( { rseg, tseg }, small_time );

    % Middle points of the segments flagged as speech in the intersection
    % (used twice below, so computed once)
    hit_middle = mean( iseg( 1:2, find( iseg( 3, : ) ) ), 1 );

    % Count the number of correct speech segments in the result
    tmp = zeros( 1, size( rseg, 2 ) );
    for t = hit_middle
        tmp( rseg( 3, : ) & ( rseg( 1, : ) <= t ) & ( t <= rseg( 2, : ) ) ) = 1;
    end
    result_correct_speech_segments = sum( tmp );

    % Count the number of correct speech segments in the truth
    tmp = zeros( 1, size( tseg, 2 ) );
    for t = hit_middle
        tmp( tseg( 3, : ) & ( tseg( 1, : ) <= t ) & ( t <= tseg( 2, : ) ) ) = 1;
    end
    truth_correct_speech_segments = sum( tmp );

    % Store the list of missed truth speech segments (column indices in
    % the simplified truth matrix). Only speech columns can be "missed":
    % silence columns are never marked in "tmp" and must not be listed.
    truth_missed_speech_segments = find( tseg( 3, : ) & ( ~tmp ) );

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % 5) Store values for this channel

    channel( a ).rgtmat = rgtmat;

    % Total durations in seconds
    channel( a ).correct_speech  = correct_speech;
    channel( a ).missed_speech   = missed_speech;
    channel( a ).correct_silence = correct_silence;
    channel( a ).missed_silence  = missed_silence;

    % List of the corresponding columns in "channel( a ).rgtmat"
    channel( a ).correct_speech_ind  = correct_speech_ind;
    channel( a ).missed_speech_ind   = missed_speech_ind;
    channel( a ).correct_silence_ind = correct_silence_ind;
    channel( a ).missed_silence_ind  = missed_silence_ind;

    % Number of segments
    channel( a ).result_speech_segments         = result_speech_segments;
    channel( a ).truth_speech_segments          = truth_speech_segments;
    channel( a ).result_correct_speech_segments = result_correct_speech_segments;
    channel( a ).truth_correct_speech_segments  = truth_correct_speech_segments;
    channel( a ).truth_missed_speech_segments   = truth_missed_speech_segments;

end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% 6) Overall durations: concatenate the channels

overall.total_channel_time = nchannel * total_time;

overall.correct_speech  = sum( [ channel.correct_speech ] );
overall.missed_speech   = sum( [ channel.missed_speech ] );
overall.correct_silence = sum( [ channel.correct_silence ] );
overall.missed_silence  = sum( [ channel.missed_silence ] );

overall.result_speech_segments         = sum( [ channel.result_speech_segments ] );
overall.truth_speech_segments          = sum( [ channel.truth_speech_segments ] );
overall.result_correct_speech_segments = sum( [ channel.result_correct_speech_segments ] );
overall.truth_correct_speech_segments  = sum( [ channel.truth_correct_speech_segments ] );

% The per-channel fields are index lists, so the overall figure is the
% NUMBER of missed segments (summing the indices would be meaningless)
overall.truth_missed_speech_segments = numel( [ channel.truth_missed_speech_segments ] );

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% 7) Compute perf. measures for each channel

for a = 1:nchannel

    % ( 7.1 ) Based on durations
    % Explicit naming, to be safe: "positive" means speech
    tp = channel( a ).correct_speech;
    fp = channel( a ).missed_silence;
    tn = channel( a ).correct_silence;
    fn = channel( a ).missed_speech;

    [ far, frr, hter ] = far_frr( tp, fp, tn, fn );
    [ prc, rcl, F ]    = prc_rcl( tp, fp, tn, fn );

    % Store the results
    channel( a ).far  = far;
    channel( a ).frr  = frr;
    channel( a ).hter = hter;

    channel( a ).prc = prc;
    channel( a ).rcl = rcl;
    channel( a ).F   = F;

    % ( 7.2 ) Based on number of segments ("eps" avoids 0/0)
    channel( a ).seg_prc = channel( a ).result_correct_speech_segments / ( eps + channel( a ).result_speech_segments );
    channel( a ).seg_rcl = channel( a ).truth_correct_speech_segments / ( eps + channel( a ).truth_speech_segments );
    channel( a ).seg_F   = 2 * channel( a ).seg_prc * channel( a ).seg_rcl / ( eps + channel( a ).seg_prc + channel( a ).seg_rcl );

end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% 8) Compute overall perf. measures

% ( 8.1 ) Based on durations
% Explicit naming, to be safe: "positive" means speech
tp = overall.correct_speech;
fp = overall.missed_silence;
tn = overall.correct_silence;
fn = overall.missed_speech;

[ far, frr, hter ] = far_frr( tp, fp, tn, fn );
[ prc, rcl, F ]    = prc_rcl( tp, fp, tn, fn );

% Store the results
overall.far  = far;
overall.frr  = frr;
overall.hter = hter;

overall.prc = prc;
overall.rcl = rcl;
overall.F   = F;

% ( 8.2 ) Based on number of segments
% Same "eps" guard as for the per-channel figures, so that a recording
% without any speech segment yields 0 rather than NaN
overall.seg_prc = overall.result_correct_speech_segments / ( eps + overall.result_speech_segments );
overall.seg_rcl = overall.truth_correct_speech_segments / ( eps + overall.truth_speech_segments );
overall.seg_F   = 2 * overall.seg_prc * overall.seg_rcl / ( eps + overall.seg_prc + overall.seg_rcl );

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% 9) Return values

perf.channel = channel;
perf.overall = overall;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ( 10 ) Optional overlap evaluation

if nargout > 1

    %%%
    % ( 10.1 ) Find overlap segments

    % In the result (speech segments only)
    a_seglist = {};
    for a = 1:length( resultseg )
        seg = resultseg( a ).seg;
        a_seglist{ a } = seg( 1:2, find( seg( 3, : ) ) );
    end
    % The timeline spans the whole recording, hence "total_time"
    rov_seg = define_silence_in_seg( find_overlap( a_seglist ), total_time );

    % In the truth (speech segments only)
    a_seglist = {};
    for a = 1:length( truthseg )
        seg = truthseg( a ).seg;
        a_seglist{ a } = seg( 1:2, find( seg( 3, : ) ) );
    end
    tov_seg = define_silence_in_seg( find_overlap( a_seglist ), total_time );

    % Merge them
    ov_seg = merge_seg( { rov_seg, tov_seg }, small_time );

    %%%
    % ( 10.2 ) For each channel, extract the "ov_seg"
    %          time subset of the result and the truth
    %
    %          -> store it as is
    %          -> also store a (temporary) concatenated version

    ov_subset = ov_seg( 1:2, find( ov_seg( 3, : ) ) );

    if isempty( ov_subset )

        if ~strcmp( warning, 'off' )
            disp( 'eval_segmat: WARNING! empty overlap subset. You may want to check the ground-truth.' );
        end

        perf_overlap = [];
        perf_overlap.ov_subset = [];

        return;

    end

    ov_resultseg = [];
    ov_truthseg  = [];

    for a = 1:nchannel

        % Intersection between "ov_seg" and result
        ov_resultseg( a ).seg = extract_subset_from_seg( resultseg( a ).seg, ov_subset, small_time );

        % Intersection between "ov_seg" and truth
        ov_truthseg( a ).seg = extract_subset_from_seg( truthseg( a ).seg, ov_subset, small_time );

    end

    % Concatenate the overlaps to be able to call "eval_segmat" on a
    % gap-free timeline of duration "ovc_total_time"
    ovc_total_time = sum( diff( ov_subset, [], 1 ) );

    ovc_resultseg = [];
    ovc_truthseg  = [];

    ov_result_timelist  = [];
    ovc_result_timelist = [];

    ov_truth_timelist  = [];
    ovc_truth_timelist = [];

    for a = 1:nchannel

        %%%
        % Process result matrix and store it

        dur = diff( ov_resultseg( a ).seg( 1:2, : ), [], 1 );
        t   = [ 0 cumsum( dur ) ];

        % Sanity check
        if abs( t( end ) - ovc_total_time ) > small_time
            error( 'eval_segmat: insanity #7 !' );
        end

        ovc_resultseg( a ).seg = [ t( 1:end-1 ); t( 2:end ); ov_resultseg( a ).seg( 3, : ) ];

        % Store the time values for later processing of the "rgtmat" matrix
        ov_result_timelist( a ).t  = [ ov_resultseg( a ).seg( 1, : )  ov_resultseg( a ).seg( 2, end ) ];
        ovc_result_timelist( a ).t = [ ovc_resultseg( a ).seg( 1, : ) ovc_resultseg( a ).seg( 2, end ) ];

        % Format for "eval_segmat": list speech segments only
        ovc_resultseg( a ).seg = ovc_resultseg( a ).seg( 1:2, find( ovc_resultseg( a ).seg( 3, : ) ) );

        %%%
        % Process truth matrix and store it

        dur = diff( ov_truthseg( a ).seg( 1:2, : ), [], 1 );
        t   = [ 0 cumsum( dur ) ];

        % Sanity check
        if abs( t( end ) - ovc_total_time ) > small_time
            error( 'eval_segmat: insanity #7 !' );
        end

        ovc_truthseg( a ).seg = [ t( 1:end-1 ); t( 2:end ); ov_truthseg( a ).seg( 3, : ) ];

        % Store the time values for later processing of the "rgtmat" matrix
        ov_truth_timelist( a ).t  = [ ov_truthseg( a ).seg( 1, : )  ov_truthseg( a ).seg( 2, end ) ];
        ovc_truth_timelist( a ).t = [ ovc_truthseg( a ).seg( 1, : ) ovc_truthseg( a ).seg( 2, end ) ];

        % Format for "eval_segmat": list speech segments only
        ovc_truthseg( a ).seg = ovc_truthseg( a ).seg( 1:2, find( ovc_truthseg( a ).seg( 3, : ) ) );

    end

    %%%
    % ( 10.3 ) Call eval_segmat with the concatenated version
    %          (single output requested, so no further recursion)

    a_perf = eval_segmat( ovc_resultseg, ovc_truthseg, ovc_total_time, small_time );

    %%%
    % ( 10.4 ) Put the true time values back in

    a_perf.resultseg = ov_resultseg;
    a_perf.truthseg  = ov_truthseg;

    % Fix the rgtmat matrix of each channel: map every concatenated time
    % value back to the original time value stored at the same position
    for a = 1:nchannel

        ovc_timelist( a ).t = [ ovc_result_timelist( a ).t ovc_truth_timelist( a ).t ];
        ov_timelist( a ).t  = [ ov_result_timelist( a ).t  ov_truth_timelist( a ).t ];

        for b = 1:2
            for c = 1:size( a_perf.channel( a ).rgtmat, 2 )

                t   = a_perf.channel( a ).rgtmat( b, c );
                ind = find( abs( ovc_timelist( a ).t - t ) < small_time );

                % Sanity check: the time value must be a known boundary
                % (otherwise "ind( 1 )" below would fail obscurely)
                if isempty( ind )
                    error( 'eval_segmat: insanity #8!' );
                end

                % Put true time value back in
                a_perf.channel( a ).rgtmat( b, c ) = ov_timelist( a ).t( ind( 1 ) );

            end
        end

    end

    %%%
    % ( 10.5 ) Store the result

    perf_overlap = a_perf;

end % if nargout > 1