function FASTTDE_detect_locate_wrapper( recording_name )
% "FASTTDE" implementation of microphone array detection-localization,
% and MFCC extraction (basic single microphone MFCC, as well as sector-based MFCC).
%
% This function is a wrapper for "FASTTDE_detect_locate.m".
% All parameters are set here, if you need to change them.
%
% For details, see IDIAP RR 06-26 by G. Lathoud.
%
% Implementation by G. Lathoud, including the optimized C code in MEX
% wrappers: SAM-SPARSE-MEAN.
%
% USAGE: FASTTDE_detect_locate_wrapper( recording_name )
%
%        recording_name: name of the recording. It is passed to
%        "seq_av163_modified" to look up the wave files and the array
%        geometry, and it is embedded in the output file name.
%
% INPUT: multiple wave files (microphone array).
%
% PROCESS: detection-localization and MFCC extraction.
%          PAR.BLOCK_SIZE_SEC determines the block size (e.g. 10 seconds).
%          Initialization is done on the first block (offline).
%          After that, each block is processed in a fully online manner:
%          the detection model and the short-term clustering model
%          are updated AFTER processing a block.
%
% OUTPUT: Matlab file containing an object called "result",
%         which itself contains:
%
%   - spherical location estimates obtained from
%     sector-based detection (IDIAP RR 05-52)
%     and GCC-PHAT Time Delay Estimation (Knapp Carter 76, Brandstein 95)
%     within each active sector.
%     -> "result.sphloc_mat", one location estimate per column.
%        row 1: absolute frame index (integer), i.e. from the start of the recording.
%        row 2: sector index (integer).
%        row 3: azimuth in radians (very reliable with a Uniform Circular Array or UCA).
%        row 4: elevation in radians (not very reliable with a UCA).
%        row 5: radius in meters: NaN because not given by FASTTDE.
%        row 6: log posterior probability of activity (following IDIAP RR 05-52).
%        row 7: (optional) tag given by short-term clustering (NaN if none).
%
%   - if PAR.DO_STC = 1:
%     for each location estimate, a cluster tag
%     from short-term clustering of location estimates (Lathoud 04).
%     (online implementation).
%     -> row 7 of "result.sphloc_mat"
%
%   - if PAR.DO_SBMFCC = 1:
%     for each location estimate, a sector-based MFCC vector,
%     using multiple channels
%     -> "result.sbmfcc_mat", one column per location estimate in "result.sphloc_mat":
%        row 1: frame index (integer)
%        row 2: sector index (integer)
%        row 3 to end: MFCC vector
%
%   - if PAR.DO_MFCC = 1:
%     for each time frame, compute classical MFCCs
%     from one microphone only (PAR.MFCC_CHANNEL).
%     -> "result.mfcc_mat", one column per time frame:
%        row 1: frame index (integer)
%        row 2 to end: MFCC vector
%
%   - other miscellaneous parameters, including ALL parameter values set here.
%
% NOTE: if the output file already exists, nothing is computed. Otherwise
%       an empty output file is created right away, as a "lock" for
%       parallel batch runs, and overwritten with the result at the end.
%
% By Guillaume Lathoud, 2006.
% lathoud@idiap.ch

if nargin < 1
    error( [ mfilename ' requires 1 input argument.' ] );
end

% Debugging aid: drop into the debugger on errors and warnings.
% (Legacy shorthand; presumably equivalent to "dbstop if error" and
% "dbstop if warning" -- TODO confirm on your MATLAB version.)
dbstop error
dbstop warning

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ( 0 ) PARAMETERS

% ( 0.1 ) Main parameters

% Where to read the data
PAR.IN_PATH = 'DATA';

% Where to find the "seq_av163_modified.m" function
% that provides meta-information on recordings
addpath( PAR.IN_PATH );

% Where to write the data
PAR.OUT_PATH = 'DATA';

% Sector-based detection: user-defined target FAR value, between 0 and 1
PAR.DETECTION.FAR_T = 0.005; % Means FAR_T = 0.5%

% Sector-based detection: user-defined target FRR value, between 0 and 1
PAR.DETECTION.FRR_T = 0.005; % Means FRR_T = 0.5%

% FASTTDE: Two square subarrays, following the thesis of Brandstein, Section 7.2
% In each 2 x 2 matrix, each row defines a pair of microphones using their indices,
% from 1 to size( in_p.geometry, 2 );
PAR.FASTTDE_SQUARE_SUBARRAY_LIST = { [ 1 5; 3 7 ], [ 2 6; 4 8 ] };

% Upsampling factor for the time-domain GCC-PHAT
PAR.FASTTDE_UPFACTOR = 20;

% Whether we shall extract sector-based MFCC's
PAR.DO_SBMFCC = 1; % 0 if you don't want them

% Whether we shall extract single channel MFCCs
PAR.DO_MFCC = 1;

% Which microphone do we use to extract single channel MFCCs
PAR.MFCC_CHANNEL = 1;

% Dilation in space-time around "sure" speech (found with FAR = FAR_T),
% where we'll search for "not-so-sure" speech (found with FRR = FRR_T).
%   -> PAR.DETECTION.SPACE_TIME_DILATION( 1 ) is a number of sectors.
%   -> PAR.DETECTION.SPACE_TIME_DILATION( 2 ) is a number of time frames.
%
% Sectors are assumed to be circular (last one is a neighbour of the 1st one).
%
% For other geometries of sectors, you'll need to modify the
% "take_detection_decision" function in "FULL_detect_locate".
PAR.DETECTION.SPACE_TIME_DILATION = [ 0 32 ];

% If needed, after all above, we may limit our search to the N-best
PAR.DETECTION.N_SECTORS_MAX = 6;

% To speed up EM fitting, limit the amount of data
PAR.DETECTION.DATAREPR_MAXPREVIOUS_SEC = 3; % In seconds
PAR.DETECTION.DATAREPR_MAXCURRENT_SEC = PAR.DETECTION.DATAREPR_MAXPREVIOUS_SEC; % In seconds

% Also limit the number of iterations
PAR.DETECTION.MIN_ITER = 5;
PAR.DETECTION.MAX_ITER = 7;

% Input file containing SAM-SPARSE-MEAN parameters
% "512" is not innocent: it implies zero-padding, which is vital for
% time-domain gcc-phat estimation through inverse DFT.
% (Otherwise we run into circular convolution issues.)
PAR.IN_SSM_MODEL_FILENAME = 'DATA/ssm_parameters_512_16000_342-pointset_np80.mat';

% Size of a block for fitting the detection model
PAR.BLOCK_SIZE_SEC = 10; % in seconds

% How much data do we use to initialize the detection model,
% when starting to process a recording.
% After that initialization, everything is
% ONLINE, FRAME-BY-FRAME (parameters are updated
% at the END of each block).
PAR.INIT_BLOCK_SIZE_SEC = PAR.BLOCK_SIZE_SEC; % In seconds

% Convert that into a list with one element
PAR.RECLIST( 1 ).NAME = recording_name;
PAR.RECLIST( 1 ).START = -Inf; % in seconds, -Inf means beginning
PAR.RECLIST( 1 ).STOP = +Inf; % in seconds, +Inf means end
PAR.RECLIST( 1 ).CHANNEL_LIST = 1:8;

% ( 0.2 ) Other parameters

% Length of a frame
PAR.FRAMELENGTH_SEC = 0.032;

% Overlap between time frames: 50%
PAR.TIMEFRAME_OVERLAP = ( 32 - 16 ) / 32;

% Determine how we compute the spectrum
PAR.SPECTRUM_DO_ZERO_MEAN = 1;
PAR.SPECTRUM_PREEMP = 0.97;

% Speed of sound in the air in m/s
PAR.C = 342;

%%%%%%%%%%
% For detection
PAR.DETECTION.DEBUG = 0;   % 0 or 1 (to see figures with the fit)
PAR.DETECTION.VERBOSE = 1; % 0 or 1 or 2 (levels of verbosity)

%%%%%%%%%%
% Parameters for short-term clustering
PAR.DO_STC = 1; % do it!
PAR.STC.T_SHORT = 7; % Number of frames ( about 100 ms in our case )
PAR.STC.UPDATE_LOCAL_MAX_ITER = 100; % for the update of the local models
PAR.STC.UPDATE_LOCAL_CV_THR = 1e-5;
PAR.STC.N_PAST_FRAMES = PAR.STC.T_SHORT;

% Pruning not necessary since we have very low complexity,
% because PAR.STC.N_FUTURE_SAMPLES = 1.
% ( Otherwise, a very small threshold to prune out impossible merges would be e.g. -20. )
PAR.STC.LLR_THRESHOLD = -Inf;
PAR.STC.N_FUTURE_SAMPLES = 1;
PAR.STC.VERBOSE = 0;
PAR.STC.DAZ_MAX = 5/180*pi;
PAR.STC.N_GRAPHS_MAX = 10000; % (Step 2. in NIST-RT04s article)
PAR.STC.N_MERGES_MAX = 10000; % (Step 3. in NIST-RT04s article)

% Output filename
PAR.OUT_FILENAME = fullfile( PAR.OUT_PATH, [ mfilename '-' recording_name '-result.mat' ] );

% Verbosity flag (0 or 1)
PAR.VERBOSE = 1;

if PAR.VERBOSE
    disp( [ mfilename ' parameters:' ] );
    disp( PAR );
end

% Check whether the output already exists
if exist( PAR.OUT_FILENAME, 'file' )
    if PAR.VERBOSE
        disp( ' ' );
        disp( [ mfilename ': output file already exists. Skipped.' ] );
        disp( [ '( "' PAR.OUT_FILENAME '" )' ] );
    end
    return;
end

% It does not exist yet, let us "lock" it by creating it.
% This is useful for example when running batch processing in parallel
% on several machines, where a call to us is one line of a script.
fid = fopen( PAR.OUT_FILENAME, 'wt' );
if fid < 0
    error( [ mfilename ' could not write-open "' PAR.OUT_FILENAME '".' ] );
end
fclose( fid );

if PAR.VERBOSE
    disp( ' ' );
    % File names are passed as "%s" arguments (not spliced into the format
    % string) so that any "%" or "\" they contain is printed literally.
    disp( sprintf( '%s will write out the location estimates and sector-based MFCCs into file\n "%s".', ...
                   mfilename, PAR.OUT_FILENAME ) );
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ( 1 ) Load everything we need

if PAR.VERBOSE
    disp( ' ' );
    disp( sprintf( '%s is loading SSM A & B parameters from file\n "%s"...', ...
                   mfilename, PAR.IN_SSM_MODEL_FILENAME ) );
end

load( PAR.IN_SSM_MODEL_FILENAME, 'ssm' );

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ( 2 ) Prepare initialization of the 3-D search within each sector

n_sectors = numel( ssm.sector );

for a = 1:n_sectors

    s = ssm.sector( a );

    % Arbitrary choice: 1 point in a plane close to the lowest elevation
    % Azimuth is in the middle of the sector.
    az_interval = ( s.az_max - s.az_min ) / 2;
    az = s.az_min + ( 1 ) * az_interval;
    el = ( 2 * s.el_min + s.el_max ) / 3;
    r  = s.r_min + ( s.r_max - s.r_min ) * [ 0.1 ];

    % Define and store the point
    ssm.sector( a ).init_sph = [ az; el; r ];

end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ( 3 ) Check the recordings, get some information

clear result;
clear tmp_result;

tmp_result.nothing = [];

% This creates a "tmp_result.list" field
tmp_result = get_rec_info( PAR, ssm, tmp_result );

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ( 4 ) Process the recordings

n_recordings = numel( tmp_result.list );

if PAR.VERBOSE
    disp( ' ' );
    disp( sprintf( '%s: starting to process the %d recordings...', mfilename, n_recordings ) );
end

% Structure where we will store the output
clear result;

for i_rec = 1:n_recordings

    if PAR.VERBOSE
        disp( ' ' );
        disp( sprintf( '%s: Recording %d/%d: "%s"', ...
                       mfilename, i_rec, n_recordings, tmp_result.list( i_rec ).name ) );
    end

    % Now that we have dealt with all the file management stuff,
    % we can call the script which does the job for us.
    result.list( i_rec ) = FASTTDE_detect_locate( tmp_result.list( i_rec ), ssm, PAR );

end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ( 5 ) Write out

% Note that result.PAR = PAR, so that the
% parameters are saved as well.
result.PAR = PAR;

% Get the computer name.
% This, along with result.list( i_rec ).total_clock and total_fasttde_clock,
% can be useful for computational load evaluation.
[ s, w ] = system( 'uname -n' );
if s ~= 0
    % On failure the warning text itself is what gets stored below.
    w = [ mfilename ': warning! Could not get the name of the machine I am running on.' ];
    disp( w );
end
result.machine_name = w;

% Write out
save( PAR.OUT_FILENAME, 'result' );

if PAR.VERBOSE
    disp( ' ' );
    disp( [ mfilename ' wrote out "' PAR.OUT_FILENAME '".' ] );
    disp( [ mfilename ': done.' ] );
end


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% To get some informations about recordings
% (files, durations, fs, etc.)
%
% INPUT:  PAR    - parameter structure built by the wrapper (uses RECLIST,
%                  IN_PATH, IN_SSM_MODEL_FILENAME, VERBOSE).
%         ssm    - SSM parameters (uses the fields pairs, geometry, fs).
%         result - structure to which the "list" field is added.
%
% OUTPUT: result, where result.list( i_rec ) has the fields
%         name, misc_info, wavefile_list, fs, nsamples, duration,
%         start_sec, stop_sec, start_sample, stop_sample.
%
% Raises an error when the channel count, the array geometry or the
% sampling frequency are inconsistent with the SSM parameters, or when
% a wave file has more than one channel.

function result = get_rec_info( PAR, ssm, result )

if nargin < 3
    error( [ mfilename ' needs 3 input arguments.' ] );
end

if PAR.VERBOSE
    disp( ' ' );
end

for i_rec = 1:numel( PAR.RECLIST )

    channel_list = PAR.RECLIST( i_rec ).CHANNEL_LIST;
    nchannels = numel( channel_list );

    % Consistency checks
    if nchannels < min( max( ssm.pairs(:) ), size( ssm.geometry, 2 ) )
        error( [ mfilename '.get_rec_info(): inconsistent number of channels.' ] );
    end

    % Get various info about this recording,
    % including list of microphone array wave files
    result.list( i_rec ).name = PAR.RECLIST( i_rec ).NAME;
    result.list( i_rec ).misc_info = seq_av163_modified( result.list( i_rec ).name, PAR.IN_PATH );

    % Consistency check: after centering on its mean, the geometry of the
    % selected microphones must match the SSM geometry.
    ma_geometry = result.list( i_rec ).misc_info.MA_GEOMETRY( :, PAR.RECLIST( i_rec ).CHANNEL_LIST );
    translation = mean( ma_geometry, 2 );
    ma_geometry = ma_geometry - repmat( translation, 1, nchannels );
    if max( abs( ma_geometry(:) - ssm.geometry(:) ) ) > 1e-10
        error( [ mfilename ': inconsistent geometry.' ] );
    end

    % Access wave files
    % Get the duration of the file
    % Replace the "Inf" in PAR.RECLIST.START and STOP
    result.list( i_rec ).wavefile_list = result.list( i_rec ).misc_info.MA_FILE_LIST( channel_list );

    nsamples_list = zeros( 1, nchannels );
    fs_list = zeros( 1, nchannels );

    for i_channel = 1:nchannels

        [ a_size, a_fs ] = wavread( result.list( i_rec ).wavefile_list{ i_channel }, 'size' );

        nsamples_list( i_channel ) = a_size( 1 );

        if a_size( 2 ) ~= 1
            error( [ mfilename '.get_rec_info(): requires single channel wave files only.' ] );
        end

        fs_list( i_channel ) = a_fs;

    end

    % Check consistency of the sampling frequency
    fs = unique( fs_list );
    if numel( fs ) > 1
        error( [ mfilename '.get_rec_info(): the sampling frequency must be the same for all wave files.' ] );
    end

    if fs ~= ssm.fs
        % The SSM parameter file name is taken from PAR.
        error( [ mfilename ': the sampling frequency of wave files is not consistent with the SSM parameters in "' PAR.IN_SSM_MODEL_FILENAME '".' ] );
    end

    result.list( i_rec ).fs = fs;

    % Find the total duration
    if numel( unique( nsamples_list ) ) > 1
        disp( [ mfilename ': truncating the recordings to the shortest one.' ] );
    end

    nsamples = min( nsamples_list );

    result.list( i_rec ).nsamples = nsamples;
    result.list( i_rec ).duration = nsamples / fs;

    % Replace the "Inf" in PAR.RECLIST( i_rec ).START and STOP
    result.list( i_rec ).start_sec = max( 0, PAR.RECLIST( i_rec ).START );
    result.list( i_rec ).stop_sec  = min( result.list( i_rec ).duration, PAR.RECLIST( i_rec ).STOP );

    % Convert into samples
    result.list( i_rec ).start_sample = max( 1, 1 + round( result.list( i_rec ).start_sec * fs ) );
    result.list( i_rec ).stop_sample  = min( nsamples, 1 + round( result.list( i_rec ).stop_sec * fs ) );

    % Verbosity
    if PAR.VERBOSE
        disp( sprintf( '%s.get_rec_info: information about recording %d/%d', ...
                       mfilename, i_rec, numel( PAR.RECLIST ) ) );
        disp( result.list( i_rec ) );
    end

end