function [ xyzs, p ] = srpphatloc( in_p ) % FUNCTION [ XYZS, P ] = SRPPHATLOC( IN_P ) % % Single audio source localization: % look for the SRP-PHAT maximum over a grid of possible locations. % % By Guillaume Lathoud - lathoud@idiap.ch % % IN_P is a structure containing various parameters, % of which 4 are mandatory: % % IN_P.WAVFILE_LIST: NCHANNELS-long cell array of strings, % % IN_P.GEOMETRY: 3 x NCHANNELS matrix of cartesian coordinates, % each column contains the (x,y,z) location of a microphone. % % IN_P.PAIRS: NPAIRS x 2 matrix of integers in [ 1 NCHANNELS ], % each row defines one microphone pair. % % IN_P.C: scalar, speed of sound in the air in m/s. % % XYZS is a 4 x NFRAMES matrix of location estimates (rows 1 to 3) % and their SRP-PHAT value (row 4). Each column corresponds to one frame. % % P is IN_P with additional fields added, esp. the index of the first % and last frames that were processed. % % For more info, look at the code directly: % dbtype srpphatloc if nargin < 1 error( [ mfilename ' needs one input argument!' ] ); end p = in_p; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % ( 1 ) Check parameters, set default values if necessary required_param = { 'wavfile_list', 'geometry', 'pairs', 'c' }; check_param( required_param, fieldnames( p ) ); % Compute microphone array's center mam = mean( p.geometry, 2 ); if ~all( abs( mam ) < 1e-6 ) warning( 'Microphone array''s center is not (0,0,0). You may want to check "p.celldef.geometry".' ); end clear p_default; p_default.framelength_sec = 0.032; p_default.upfactor = 20; p_default.verbose = 1; p_default.verbose_time = 10; p = fill_default( p, p_default ); clear p_default; p_default.frameshift_sec = p.framelength_sec / 2; p_default.filterorder = p.upfactor * 6; p_default.gridfilename = sprintf( 'grid_fullcircle_up%d.mat', p.upfactor ); p = fill_default( p, p_default ); % Check that the filter order is even if mod( p.filterorder, 2 ) error( [ mfilename ': "p.filterorder" must be even.' ] ); end % Check that the grid file exists if ~exist( p.gridfilename, 'file' ) error( [ mfilename ': file "p.gridfilename" does not exist ( "' p.gridfilename '" ).' ] ); end % Access WAV files to get some more parameters nchannels = size( p.geometry, 2 ); if nchannels ~= length( p.wavfile_list ) error( [ mfilename ': inconsistent number of channels.' ] ); end fs_list = repmat( NaN, nchannels, 1 ); nsamples_list = repmat( NaN, nchannels, 1 ); for a = 1:nchannels if ~exist( p.wavfile_list{ a }, 'file' ) error( sprintf( [ mfilename ': "p.wavfile_list{%d}" does not exist. ( "%s" ).' ], a, p.wavfile_list{ a } ) ); end [ siz, fs ] = wavread( p.wavfile_list{ a }, 'size' ); nsamples_list( a ) = siz( 1 ); fs_list( a ) = fs; end fs = unique( fs_list ); if length( fs ) > 1 error( [ mfilename ': WAV files must all have the SAME sampling frequency.' ] ); end p.fs = fs; p.framelength = round( p.framelength_sec * fs ); p.frameshift = round( p.frameshift_sec * fs ); nsamples = unique( nsamples_list ); if length( nsamples ) > 1 warning( 'WAV files do not all have the same length. Taking the minimum.' ); nsamples = min( nsamples ); end p.nsamples = nsamples; totalduration_sec = ( p.nsamples - 1 ) / p.fs; clear p_default; p_default.starttime_sec = 0; p_default.endtime_sec = totalduration_sec; p = fill_default( p, p_default ); % Check start and end times if p.starttime_sec < 0 error( [ mfilename ': p.starttime_sec must be >=0!' ] ); end if p.endtime_sec > totalduration_sec warning( '"p.endtime_sec" was cut down to the actual duration of the data.' ); p.endtime_sec = totalduration_sec; end % Convert start time and end time into frame indices % ( frame index starts at 1 ) p.firstframe = 1 + floor( p.starttime_sec / p.frameshift_sec ); p.lastframe = 1 + floor( ( p.endtime_sec - p.framelength_sec ) / p.frameshift_sec ); if p.verbose disp( [ mfilename ' parameters:' ] ); disp( p ); end npairs = size( p.pairs, 1 ); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % ( 2 ) Load search grid load( p.gridfilename, 'grid' ); % Convert it into time-delay if p.verbose disp( [ mfilename ': loaded "' p.gridfilename '".' ] ); disp( [ mfilename ': converting the grid into time-delays...' ] ); end % Time delays in samples % ( samples are more convenient than seconds for FFT-based computations ) grid_td = location_to_timedelay( p.geometry, p.pairs, grid, p.fs, p.c ); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % ( 3 ) Prepare low-pass filter for upsampling. % This filter is an approximated sinc filter. % Note that we will only need to interpolate % a small portion of the time-domain GCC-PHAT, % centered around time-delay zero and bounded % by a maximum time-delay corresponding to % the distance between the two microphones of a pair. disp( [ mfilename ': preparing for upsampling...' ] ); B = p.upfactor * fir1( p.filterorder, 1 / p.upfactor ); halforder = floor( p.filterorder / 2 ); % Determine the portion of time-domain GCC-PHAT % that we will look at maxdelay = p.fs / p.c * pair_distance( p.geometry, p.pairs ); halfportion = 1 + ceil( max( maxdelay ) ) + ceil( p.filterorder / p.upfactor ); if halfportion >= p.framelength error( [ mfilename ': maximum time-delay too large, please check "p.geometry" or increase "p.framelength_sec".' ] ); end % In the original time-domain GCC-PHAT % ( in the 2D matrix called "gccphat" further below ) rowzero = 1 + p.framelength; % Where time-delay zero is in the "gccphat" matrix rowstart = rowzero - halfportion; rowend = rowzero + halfportion; nrows = 1 + rowend - rowstart; % After upsampling % ( in the 2D matrix called "up_gccphat" further below ) up_rowzero = halforder + 1 + ( rowzero - rowstart ) * p.upfactor; up_nrows = nrows * p.upfactor; % We also convert the time-delays of the grid into a more usable format % -> 1-dimensional integer index in "up_gccphat(:)" up_grid_index = up_rowzero - round( grid_td * p.upfactor ); up_grid_index = up_grid_index + repmat( up_nrows * (0:npairs-1).', 1, size( up_grid_index, 2 ) ); % Save some memory clear grid_td; m = max( up_grid_index ); for a = [ 8 16 32 ] if m < 2 ^ a up_grid_index = eval( sprintf( 'uint%d( up_grid_index )', a ) ); break; end end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % ( 4 ) Process frames nframes = p.lastframe - p.firstframe + 1; % Where we will store the results (cartesian coordinates + SRP-PHAT value) xyzs = repmat( NaN, 4, nframes ); % For windowing win = sparse( diag( hamming( p.framelength, 'periodic' ) ) ); if p.verbose disp( [ mfilename ': processing frames...' ] ); chrono = chrono_start( nframes, p.verbose_time ); end for i_frame = 1:nframes a_frame = p.firstframe + i_frame - 1; firstsample = 1 + ( a_frame - 1 ) * p.frameshift; lastsample = firstsample + p.framelength - 1; % Load the multichannel waveforms for this frame x = repmat( NaN, p.framelength, nchannels ); for a_channel = 1:nchannels x( :, a_channel ) = wavread( p.wavfile_list{ a_channel }, [ firstsample lastsample ] ); end % Hamming-windowed, zero-padded FFT X = fft( win * x, 2 * p.framelength, 1 ); % Cross-correlation in frequency domain CC = X( :, p.pairs( :, 1 ) ) .* conj( X( :, p.pairs( :, 2 ) ) ); % PHAT weighting % Not that zeroes happen very rarely but they do happen in real data abs_CC = abs( CC ); nonzero = find( abs_CC ); GCCPHAT = CC; GCCPHAT( nonzero ) = CC( nonzero ) ./ abs_CC( nonzero ); % Back to time-domain % "fftshift" so that time-delay zero corresponds % to row "rowzero" of the "gccphat" matrix gccphat = fftshift( real( ifft( GCCPHAT, 2 * p.framelength, 1 ) ), 1 ); % Upsample each column of the matrix up_gccphat = upsample( gccphat( rowstart:rowend, : ), p.upfactor ); up_gccphat = filter( B, 1, up_gccphat, [], 1 ); % Compute SRP-PHAT for each candidate location % = average across pairs of the GCC-PHAT function, % taken at the time-delays corresponding to this location srpphat = mean( up_gccphat( up_grid_index ), 1 ); % Find the global maximum of SRP-PHAT [ srpphat_max, ind_max ] = max( srpphat ); % Store the result xyzs( :, i_frame ) = [ grid( :, ind_max ); srpphat_max ]; if p.verbose chrono = chrono_check( chrono, i_frame ); end end if p.verbose chrono_stop( chrono ); end