function [ multiseg, p ] = segment_multichannel( in_p )
% FUNCTION [ MULTISEG, P ] = SEGMENT_MULTICHANNEL( IN_P )
%
% Segment a multichannel audio recording (e.g. lapels or headsets).
% Produces intermediary ".mat" files as well as a human-readable
% "-multiseg.txt" file. The output is also returned in the form
% of a struct array: MULTISEG (see below).
%
% This is an implementation of the baseline multichannel segmentation
% presented in a paper at the 2004 NIST Meeting Recognition workshop
% (RT-04), titled "Unsupervised Location-Based Segmentation of
% Multi-Party Speech".
%
% IN_P is a structure containing parameters, of which two are mandatory:
%   IN_P.WAVFILE_LIST    = cell array of NCHANNELS strings (filenames).
%   IN_P.OUTPUT_BASENAME = string, the base name for all output files.
%
% Optional fields of IN_P (default in parentheses):
%   OUT_DIR            ('DATA')  directory receiving all output files.
%   OUT_ENERGY_FILE    ([])      energy cache file; when empty:
%                                OUT_DIR/OUTPUT_BASENAME.energy.mat
%   OUT_MULTISEG_FILE  ([])      segmentation cache file; when empty:
%                                OUT_DIR/OUTPUT_BASENAME.multiseg.mat
%   FRAME_LIST         ([-Inf +Inf])  passed on to ENERGY.
%   FRAME_LENGTH_SEC   (0.032)   passed on to ENERGY.
%   FRAME_SHIFT_SEC    ([])      when empty: FRAME_LENGTH_SEC / 2.
%   VERBOSE            (1)       display the parameters; passed on to ENERGY.
%   VERBOSE_TIME       (10)      passed on to ENERGY.
%   SPEECHMINDUR_SEC   (0.100)   minimum duration of a speech segment.
%   DILATION1_SEC      (0.064)   width of the first smoothing step.
%   DILATION2_SEC      (0.032)   width of the final dilation of speech.
%   SILMINDUR_SEC      (1.000)   silences shorter than this are removed.
%
% MULTISEG is a struct array of NCHANNELS elements.
% MULTISEG( i_channel ).SEG is a 2-row matrix. Each column describes
% a speech segment (row 1 = start time in seconds, row 2 = end time
% in seconds).
%
% P is IN_P enriched with default values; in particular it gives
% the names of the output files.
%
% Existing output files are reused rather than recomputed:
%   - an existing energy file is loaded instead of calling ENERGY,
%   - an existing multiseg file is loaded instead of segmenting,
%   - an existing "-multiseg.txt" file is left untouched.
% Delete them to force a fresh computation.
%
% An error is raised when IN_P is missing, when WAVFILE_LIST is empty,
% when the output directory cannot be created or when the text file
% cannot be opened for writing.
%
% NOTE: for some reason this script does not work well with MATLAB 6.5.
% Use MATLAB 6.1 instead.
%
% By Guillaume Lathoud - lathoud@idiap.ch

v = version;
if ~strcmp( v(1:3), '6.1' )
    warning( 'segment_multichannel works best with MATLAB 6.1' );
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ( 1 ) Check parameters

if nargin < 1
    error( 'segment_multichannel needs a "in_p" input argument' )
end

p = in_p;
check_param( { 'wavfile_list', 'output_basename' }, fieldnames( p ) );

p_default.out_dir           = 'DATA';
p_default.out_energy_file   = [];
p_default.out_multiseg_file = [];
p_default.frame_list        = [ -Inf +Inf ];
p_default.frame_length_sec  = 0.032;
p_default.frame_shift_sec   = [];
p_default.verbose           = 1;
p_default.verbose_time      = 10;
p_default.speechmindur_sec  = 0.100;      % seconds
p_default.dilation1_sec     = 4 * 0.016;  % seconds
p_default.dilation2_sec     = 2 * 0.016;  % seconds
p_default.silmindur_sec     = 1.000;      % seconds

p = fill_default( p, p_default );

% Defaults that depend on other parameters
if isempty( p.frame_shift_sec )
    p.frame_shift_sec = p.frame_length_sec / 2;
end
if isempty( p.out_energy_file )
    p.out_energy_file = fullfile( p.out_dir, [ p.output_basename '.energy.mat' ] );
end
if isempty( p.out_multiseg_file )
    p.out_multiseg_file = fullfile( p.out_dir, [ p.output_basename '.multiseg.mat' ] );
end

if p.verbose
    disp( ' ' );
    disp( 'segment_multichannel parameters:' );
    disp( p );
end

% Make sure there is an output dir.
% The built-in MKDIR is tried first (portable, safe with spaces in the
% path). NOTE(review): the quoted "mkdir -p" fallback is kept in case
% the built-in does not create intermediate directories on older
% releases - confirm whether it is still needed.
if ~exist( p.out_dir, 'dir' )
    status = mkdir( p.out_dir );
    if ~exist( p.out_dir, 'dir' )
        system( [ 'mkdir -p "' p.out_dir '"' ] );
    end
    if ~exist( p.out_dir, 'dir' )
        error( [ 'segment_multichannel: could not create output dir "' p.out_dir '"' ] );
    end
end

% Get recording duration (taken from the first channel only)
[ siz, fs ] = wavread( p.wavfile_list{ 1 }, 'size' );
rec_dur_sec = siz( 1 ) / fs;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ( 2 ) Extract energy

if exist( p.out_energy_file, 'file' )

    disp( [ '[ segment_multichannel: file already exists (loaded) : "' p.out_energy_file '" ]' ] );
    load( p.out_energy_file, 'channel_e', 'channel_e_par' );
    nchannel = size( channel_e, 1 );

else

    nchannel = length( p.wavfile_list );
    if nchannel == 0
        error( 'nchannel == 0!' );
    end

    channel_e = [];

    in_q = [];
    in_q.frame_length_sec = p.frame_length_sec;
    in_q.frame_shift_sec  = p.frame_shift_sec;
    in_q.verbose          = p.verbose;
    in_q.verbose_time     = p.verbose_time;
    in_q.frame_list       = p.frame_list;

    % One row of energy values per channel (row 2 of ENERGY's output)
    for a = 1:nchannel
        in_q.in_name = p.wavfile_list{ a };
        [ out, q ] = energy( in_q );
        channel_e = [ channel_e; out( 2, : ) ];
    end

    % Store it (the parameters returned for the last channel are kept)
    channel_e_par = q;
    save( p.out_energy_file, 'channel_e', 'channel_e_par' );
    disp( [ '[ segment_multichannel: wrote out : "' p.out_energy_file '" ]' ] );

end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ( 3 ) Energy-based segmentation
%
% p.out_multiseg_file was resolved in ( 1 ): it must not be rebuilt
% here, otherwise a caller-supplied file name would be ignored.

if exist( p.out_multiseg_file, 'file' )

    disp( [ '[ segment_multichannel: file already exists (loaded) : "' p.out_multiseg_file '" ]' ] );
    load( p.out_multiseg_file, 'multiseg', 'binseg0', 'binseg' );

else

    nframes = size( channel_e, 2 );

    % For each channel, for each frame, classify as
    % speech or silence.
    %
    % Speech frame iff:
    % - energy above a threshold.
    % - channel is the strongest for this frame.

    % Normalized log energies
    disp( sprintf( 'segment_multichannel: %d channels', nchannel ) );

    log_e     = repmat( -Inf, nchannel, nframes );
    has_valid = zeros( 1, nchannel );

    for a = 1:nchannel

        tmp = channel_e( a, : );

        % Frames with a usable energy value. Comparison operators are
        % used on purpose: a logical operator applied directly to the
        % energies raises an error as soon as one of them is NaN.
        % Energies are expected to be non-negative, so "> 0" only
        % discards the zeros.
        frame_ind = find( isfinite( tmp ) & ( tmp > 0 ) );

        % A channel without any usable frame keeps its -Inf row
        if ~isempty( frame_ind )
            log_tmp = log( tmp( frame_ind ) );
            log_e( a, frame_ind ) = log_tmp - min( log_tmp );
            has_valid( a ) = 1;
        end

    end

    % Find the strongest channel for each frame
    [ max_log_e, strongest_channel ] = max( log_e, [], 1 );

    % Convert to a binary matrix
    sl_binmat = zeros( nchannel, nframes );
    sl_binmat( sub2ind( size( sl_binmat ), strongest_channel, 1:nframes ) ) = 1;

    % Threshold on energy for each channel and each frame
    e_binmat = zeros( nchannel, nframes );

    q = [];
    q.model_type  = 'GU';
    q.mindur_type = 'none';
    q.verbose     = 0;

    for a = 1:nchannel
        % Channels without any usable frame are left as all-silence
        % and get reported by the "no speech found" warning below.
        if has_valid( a )
            e_binmat( a, : ) = silence_seg( log_e( a, : ), q );
        end
    end

    % Energy above threshold AND strongest channel
    binseg0 = e_binmat & sl_binmat;

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % Low-pass each channel to obtain usable
    % speech and silence segments
    %
    % Extract the boundaries of those segments

    shift_sec  = channel_e_par.frame_shift_sec;
    length_sec = channel_e_par.frame_length_sec;

    % Minimum durations are forced to an even number of frames
    % because half of each is used below.
    speechmindur_frames = 2 * round( p.speechmindur_sec / ( 2 * shift_sec ) );
    dilation1_frames    = round( p.dilation1_sec / shift_sec );
    dilation2_frames    = round( p.dilation2_sec / shift_sec );
    silmindur_frames    = 2 * round( p.silmindur_sec / ( 2 * shift_sec ) );

    disp( sprintf( 'segment_multichannel: speechmindur_frames = %d, dilation1_frames = %d, dilation2_frames = %d silmindur_frames = %d', speechmindur_frames, dilation1_frames, dilation2_frames, silmindur_frames ) );

    binseg   = binseg0;
    multiseg = [];

    for a = 1:nchannel

        % Smooth a bit
        binseg( a, : ) = erosion( dilation( binseg( a, : ), dilation1_frames ), dilation1_frames );

        % Apply minimum duration
        binseg( a, : ) = dilation( erosion( binseg( a, : ), speechmindur_frames / 2 ), speechmindur_frames / 2 );

        % Dilate it a bit
        binseg( a, : ) = dilation( binseg( a, : ), dilation2_frames );

        % Remove silences that are too small
        binseg( a, : ) = erosion( dilation( binseg( a, : ), silmindur_frames / 2 ), silmindur_frames / 2 );

        % Extract segment boundaries
        % from frame-level activity/silence classification.
        % Zero padding makes segments touching either end of the
        % recording produce a rising and a falling edge as well.
        tmp = [ 0 binseg( a, : ) 0 ];

        % Rising edge -> index of the first frame of a segment
        seg_start = ( find( diff( tmp( 1:end ) ) > 0 ) - 1 ) * shift_sec;

        % Falling edge -> index of the last frame of a segment;
        % a segment ends where its last frame ends.
        seg_end = ( find( diff( tmp( 2:end ) ) < 0 ) - 1 ) * shift_sec + length_sec;

        if length( seg_start ) ~= length( seg_end )
            error( 'segment_multichannel insane!' );
        end

        seg = [ seg_start; seg_end ];

        % Store the result
        multiseg( a ).seg = seg;

        if isempty( seg )
            disp( ' ' );
            disp( sprintf( 'segment_multichannel: WARNING! no speech found on channel %d!  You may want to actually listen to the WAV file.', a ) );
        end

    end

    % Add start time
    % NOTE(review): assumes ENERGY returns a finite first frame index
    % in frame_list( 1 ) - confirm against ENERGY.
    start_time = ( channel_e_par.frame_list( 1 ) - 1 ) * shift_sec;
    for a = 1:nchannel
        multiseg( a ).seg = multiseg( a ).seg + start_time;
    end

    % Save a bit of memory
    binseg0 = uint8( binseg0 );
    binseg  = uint8( binseg );

    % Write out result
    save( p.out_multiseg_file, 'multiseg', 'binseg0', 'binseg' );
    disp( [ '[ segment_multichannel: wrote out : "' p.out_multiseg_file '" ]' ] );

end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ( 4 ) Text output

out_file = fullfile( p.out_dir, [ p.output_basename '-multiseg.txt' ] );

if exist( out_file, 'file' )

    disp( [ '[ segment_multichannel: file already exists, skipped: "' out_file '" ]' ] );

else

    [ fid, msg ] = fopen( out_file, 'wt' );
    if fid < 0
        error( [ 'segment_multichannel: could not open "' out_file '" for writing: ' msg ] );
    end

    for a = 1:length( multiseg )

        % Channel header: the WAV file name when there is one.
        % A cached MULTISEG may hold more channels than the current
        % WAVFILE_LIST, hence the bounds check.
        label = '';
        if a <= length( p.wavfile_list )
            label = p.wavfile_list{ a };
        end

        if ~isempty( label )
            fprintf( fid, '%s\n', label );
        else
            fprintf( fid, '< lapel %d >\n', a );
        end

        fprintf( fid, 'endTime %6.6f\n', rec_dur_sec );

        % One "start end" line per segment
        nseg = size( multiseg( a ).seg, 2 );
        for b = 1:nseg
            fprintf( fid, '%10.6f %10.6f\n', multiseg( a ).seg( :, b ) );
        end

        % A single dot closes the channel
        fprintf( fid, '.\n' );

    end

    fclose( fid );
    disp( [ 'segment_multichannel: wrote out "' out_file '"' ] );

end