function par = uss_filter( par ) if nargin < 1 error( [ mfilename ' needs 1 input argument.' ] ); end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % ( 1 ) Check for mandatory parameters required_fields = { 'input_filename', 'input_format' }; check_param( required_fields, fieldnames( par ) ); par.input_format = lower( par.input_format ); if ~ismember( par.input_format, { 'wav', 'sphere', 'raw-le', 'raw-be' } ) error( [ mfilename ': unknown input format.' ] ); end % Get sampling frequency if ismember( par.input_format, { 'raw-le', 'raw-be' } ) % From the user check_param( { 'fs' }, fieldnames( par ) ); end if strcmp( par.input_format, 'wav' ) % From the file itself [ a_size, par.fs ] = wavread( par.input_filename, 'size' ); end if strcmp( par.input_format, 'sphere' ) % From the file itself sphere_header = read_sphere_header( par.input_filename ); par.fs = sphere_header.sample_rate; end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % ( 2 ) Fill in default parameters %%%%%%%%%% % Main parameters par_default = []; par_default.frameshift_sec = 0.010; par_default.framelength_sec = 0.032; % Size of the block on which we apply CHN / RSE-USS / GMN-USS par_default.block_size_sec = 1.0; % Binary choice: shall we do channel normalization? % ( it comes first ) par_default.do_chn = 1; % Binary choice: shall we do Rayleigh + Shifted Erlang (RSE) % Unsupervised Spectral Subtraction (USS). % ( it comes second ) par_default.do_rse_uss = 1; % Binary choice: shall we do Geometric Mean Normalization (GMN) % Unsupervised Spectral Subtraction (USS). % ( it comes last ) % -> basically a simplified version of RSE-USS par_default.do_gmn_uss = 0; % Optional output: magnitude spectrum in the HTK format par_default.htkout_magspectrum_filename = []; % Optional output: MEL filter bank spectrum in the HTK format par_default.htkout_fbank_filename = []; % Optional output: MFCC in the HTK format par_default.htkout_mfcc_filename = []; % Debugging: show windows and do "keyboard" pauses par_default.debug = 0; % Verbosity par_default.verbose = 0; %%%%%%%%%% % Secondary parameters par_default = fill_second( par_default ); par = fill_default( par, par_default ); % Default verbosity for RSE fitting if isempty( par.rse_em_par.verbose ) par.rse_em_par.verbose = max( 0, par.verbose - 1 ); end % Deduce values in samples par.framelength = round( par.framelength_sec * par.fs ); par.frameshift = round( par.frameshift_sec * par.fs ); % From "par.framelength_sec", deduce the block size in number of frames par.block_size_frames = max( 1, round( par.block_size_sec / par.frameshift_sec ) ); % Parameters for "reduced" data representation if isempty( par.future_nframes_max ) par.future_nframes_max = par.block_size_frames; end if isempty( par.past_nframes_max ) par.past_nframes_max = par.future_nframes_max; end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % ( 3 ) Open input and output files % Input: open and get number of samples % For "raw-le", "raw-be" and "wav", 16-bit samples are assumed fid_in = []; switch par.input_format case 'raw-le' fid_in = fopen( par.input_filename, 'r', 'ieee-le' ); fseek( fid_in, 0, 'eof' ); par.nsamples = floor( ftell( fid_in ) / 2 ); fseek( fid_in, 0, 'bof' ); case 'raw-be' fid_in = fopen( par.input_filename, 'r', 'ieee-be' ); fseek( fid_in, 0, 'eof' ); par.nsamples = floor( ftell( fid_in ) / 2 ); fseek( fid_in, 0, 'bof' ); case 'wav' a_size = wavread( par.input_filename, 'size' ); par.nsamples = a_size( 1 ); case 'sphere' sphere_header = read_sphere_header( par.input_filename ); par.nsamples = sphere_header.sample_count; switch sphere_header.sample_byte_format case '01' fid_in = fopen( par.input_filename, 'r', 'ieee-le' ); case '10' fid_in = fopen( par.input_filename, 'r', 'ieee-be' ); otherwise error( [ mfilename ': unknown sphere_header.sample_byte_format "' sphere_header.sample_byte_format '"' ] ); end otherwise error( [ mfilename ': input format "' par.input_format '" not supported yet.' ] ); end % From "par.nsamples", deduce the number of frames par.nframes = 0; if par.nsamples >= par.framelength par.nframes = 1 + floor( ( par.nsamples - par.framelength ) / par.frameshift ); end par.block_size_frames = min( par.nframes, par.block_size_frames ); % From "par.fs", deduce the band on which we fit the RSE model par.rse_band = max( 1, min( par.framelength, round( par.rse_band_hz / par.fs * 2 * par.framelength ) ) ); % From "par.fs", deduce the band on which we compute GMN par.gmn_band = max( 1, min( par.framelength, round( par.gmn_band_hz / par.fs * 2 * par.framelength ) ) ); % From "par.fs", deduce the band on which we compute filter banks (and MFCCs) par.mfcc_band = max( 1, min( par.framelength, round( par.mfcc_band_hz / par.fs * 2 * par.framelength ) ) ); % Prepare the filter bank matrix to linearly transform % magnitude spectrogram into filterbank values par.fbank_mat = prepare_fbank_mat( par ); % Prepare DCT matrix to linearly transform log filter bank values % into cepstral coefficients par.dct_mat = prepare_dct_mat( par ); % Outputs % Open and write header if ~isempty( par.htkout_magspectrum_filename ) fid_out_magspectrum = fopen( par.htkout_magspectrum_filename, 'w', 'ieee-be' ); write_htk_header( fid_out_magspectrum, par.nframes, par.frameshift_sec, 2 * par.framelength ); end if ~isempty( par.htkout_fbank_filename ) fid_out_fbank = fopen( par.htkout_fbank_filename, 'w', 'ieee-be' ); write_htk_header( fid_out_fbank, par.nframes, par.frameshift_sec, par.mfcc_B ); end if ~isempty( par.htkout_mfcc_filename ) fid_out_mfcc = fopen( par.htkout_mfcc_filename, 'w', 'ieee-be' ); % "1 +" for the C0 coefficient write_htk_header( fid_out_mfcc, par.nframes, par.frameshift_sec, 1 + par.mfcc_N ); end %%%%%%%%%% % Show the parameters if par.verbose disp( [ mfilename ' parameters:' ] ); disp( par ); end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % ( 4 ) Process the input file % Where we'll store the current block a_block.start_frame = 0; a_block.stop_frame = 0; % Special variable to deal with the "all-zero fbank magnitudes" frames % -> we'll replace with the minimum values % -> if no "non-zero fbank magnitude" has been oberved yet, % ( previous_min_fbank_mag == [] ) % we replace with "par.fbank_mag_eps". previous_min_fbank_mag = []; % Initialize the various filters if par.do_chn chn( 'init', [] ); end if par.do_rse_uss rse_uss( 'init', [] ); end if par.do_gmn_uss gmn_uss( 'init', [] ); end if par.verbose chrono = chrono_start( par.nframes ); end % Start looping through blocks of time frames for start_frame = 1:par.block_size_frames:par.nframes % Get the input time domain signal for this block of frames a_block.start_frame = start_frame; a_block.stop_frame = min( par.nframes, start_frame + par.block_size_frames - 1 ); a_block.nframes = a_block.stop_frame - a_block.start_frame + 1; a_block.start_sample = 1 + ( a_block.start_frame - 1 ) * par.frameshift; a_block.stop_sample = par.framelength + ( a_block.stop_frame - 1 ) * par.frameshift; a_block.nsamples = a_block.stop_sample - a_block.start_sample + 1; switch par.input_format case { 'raw-le', 'raw-be' } fseek( fid_in, 2 * ( a_block.start_sample - 1 ), 'bof' ); a_block.x = fread( fid_in, a_block.nsamples, 'int16' ); case 'wav' a_block.x = wavread( par.input_filename, [ a_block.start_sample a_block.stop_sample ] ); case 'sphere' fseek( fid_in, sphere_header.header_size + 2 * ( a_block.start_sample - 1 ), 'bof' ); a_block.x = fread( fid_in, a_block.nsamples, 'int16' ); otherwise error( [ mfilename ': input format "' par.input_format '" not supported yet.' ] ); end if par.debug figure; plot( a_block.start_sample:a_block.stop_sample, a_block.x ); title( strrep( [ mfilename ', file "' par.input_filename '"' ], '_', '\_' ) ); xlabel( 'sample index' ); keyboard end % Move to frequency domain a_block.mag = abs( my_spectrum( a_block.x, par.framelength, par.frameshift, par.preemp, par.do_zero_mean ) ); if par.debug x = a_block.start_frame:a_block.stop_frame; y = 1:size( a_block.mag, 1 ); figure; imagesc( x, y, log( a_block.mag ) ); title( strrep( [ mfilename ', file "' par.input_filename '" original spectrum' ], '_', '\_' ) ); xlabel( 'time frame index' ); ylabel( 'frequency bin' ); keyboard end %%%%%%%%%% % ( 4.1 ) First filter: channel normalization if par.do_chn a_block.mag = chn( a_block.mag, par ); if par.debug x = a_block.start_frame:a_block.stop_frame; y = 1:size( a_block.mag, 1 ); figure; imagesc( x, y, log( a_block.mag ) ); title( strrep( [ mfilename ', file "' par.input_filename '" after CHN' ], '_', '\_' ) ); xlabel( 'time frame index' ); ylabel( 'frequency bin' ); keyboard end end %%%%%%%%%% % ( 4.2 ) Second filter: RSE-USS if par.do_rse_uss a_block.mag = rse_uss( a_block.mag, par ); if par.debug x = a_block.start_frame:a_block.stop_frame; y = 1:size( a_block.mag, 1 ); figure; imagesc( x, y, log( a_block.mag ) ); title( strrep( [ mfilename ', file "' par.input_filename '" after RSE-USS' ], '_', '\_' ) ); xlabel( 'time frame index' ); ylabel( 'frequency bin' ); keyboard end end %%%%%%%%%% % ( 4.3 ) Third filter: GMN-USS if par.do_gmn_uss a_block.mag = gmn_uss( a_block.mag, par ); if par.debug x = a_block.start_frame:a_block.stop_frame; y = 1:size( a_block.mag, 1 ); figure; imagesc( x, y, log( a_block.mag ) ); title( strrep( [ mfilename ', file "' par.input_filename '" after GMN-USS' ], '_', '\_' ) ); xlabel( 'time frame index' ); ylabel( 'frequency bin' ); keyboard end end %%%%%%%%%% % ( 4.4 ) Compute Mel filterbank values if ~isempty( par.htkout_fbank_filename ) | ~isempty( par.htkout_mfcc_filename ) % Apply MEL filterbank filters if ~par.mfcc_usepower % Use magnitude a_block.fbank = par.fbank_mat * a_block.mag; else % Use power a_block.fbank = par.fbank_mat * ( a_block.mag .^ 2 ); end if par.debug x = a_block.start_frame:a_block.stop_frame; y = 1:par.mfcc_B; figure; imagesc( x, y, log( a_block.fbank ) ); title( strrep( [ mfilename ', file "' par.input_filename '" log( fbank )' ], '_', '\_' ) ); xlabel( 'time frame index' ); ylabel( 'filterbank index' ); keyboard end end %%%%%%%%%% % ( 4.5 ) Compute MFCCs if ~isempty( par.htkout_mfcc_filename ) % Update minimum observed non-zero value tmp = a_block.fbank; tmp( find( tmp <= 0 ) ) = +Inf; tmp = min( tmp, [], 2 ); tmp( find( isnan( tmp ) | isinf( tmp ) | ( tmp <= 0 ) ) ) = par.fbank_mag_eps; if isempty( previous_min_fbank_mag ) previous_min_fbank_mag = tmp; else previous_min_fbank_mag = min( previous_min_fbank_mag, tmp ); end % Compute MFCC a_block.mfcc = par.dct_mat * log( max( repmat( previous_min_fbank_mag, 1, a_block.nframes ), a_block.fbank ) ); if par.debug x = a_block.start_frame:a_block.stop_frame; y = 0:par.mfcc_N; figure; imagesc( x, y, a_block.mfcc ); title( strrep( [ mfilename ', file "' par.input_filename '" MFCC' ], '_', '\_' ) ); xlabel( 'time frame index' ); ylabel( 'cepstral coefficient index' ); keyboard end end %%%%%%%%%% % ( 4.6 ) Sanity check if any( isnan( a_block.mag(:) ) | isinf( a_block.mag(:) ) ) error( 'uss_filter.m: NaN of Inf in "a_block.mag"' ); end if any( isnan( a_block.fbank(:) ) | isinf( a_block.fbank(:) ) ) error( 'uss_filter.m: NaN of Inf in "a_block.fbank"' ); end if any( isnan( a_block.mfcc(:) ) | isinf( a_block.mfcc(:) ) ) error( 'uss_filter.m: NaN of Inf in "a_block.mfcc"' ); end %%%%%%%%%% % ( 4.7 ) Output magnitude spectrum if ~isempty( par.htkout_magspectrum_filename ) fwrite( fid_out_magspectrum, a_block.mag, 'float32' ); end %%%%%%%%%% % ( 4.8 ) Output MEL filterbank values if ~isempty( par.htkout_fbank_filename ) fwrite( fid_out_fbank, a_block.fbank, 'float32' ); end %%%%%%%%%% % ( 4.9 ) Output MFCC values if ~isempty( par.htkout_mfcc_filename ) fwrite( fid_out_mfcc, a_block.mfcc, 'float32' ); end if par.verbose chrono = chrono_check( chrono, a_block.stop_frame ); end end % Loop through blocks of time frames if par.verbose chrono_stop( chrono ); end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % ( end ) Close output files if ~isempty( par.htkout_magspectrum_filename ) fclose( fid_out_magspectrum ); if par.verbose disp( [ mfilename ' wrote out file "' par.htkout_magspectrum_filename '".' ] ); end end if ~isempty( par.htkout_fbank_filename ) fclose( fid_out_fbank ); if par.verbose disp( [ mfilename ' wrote out file "' par.htkout_fbank_filename '".' ] ); end end if ~isempty( par.htkout_mfcc_filename ) fclose( fid_out_mfcc ); if par.verbose disp( [ mfilename ' wrote out file "' par.htkout_mfcc_filename '".' ] ); end end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Define default secondary parameters function par_default = fill_second( par_default ) % Pre-emphasis par_default.preemp = 0.97; % Zero-mean activated or not par_default.do_zero_mean = 1; %%%%%%%%%% % Parameters for data representation par_default.past_nframes_max = []; par_default.future_nframes_max = []; %%%%%%%%%% % Parameters for channel estimation par_default.chn_prop = 0.20; % Use the 20% lowest magnitude values %%%%%%%%%% % Parameters for fitting the Rayleigh + Shifted Erlang model par_default.rse_band_hz = [ 300 3400 ]; % Quite useful on Aurora par_default.rse_em_par.erlang_t_factor = 1; par_default.rse_em_par.min_iter = 5; par_default.rse_em_par.max_iter = 200; par_default.rse_em_par.sigma_floor = 1e-10; par_default.rse_em_par.lambda_floor = 1e-10; % 0: moment method (faster and better) % 1: ML method (exact) par_default.rse_em_par.do_fminsearch = 0; opt = optimset( @fminsearch ); opt = optimset( opt, 'Display', 'none', 'MaxFunEvals', 50, 'MaxIter', 25, 'TolX', 1e-4, 'TolFun', 1e-4 ); par_default.rse_em_par.fminsearch_options = opt; % Activate default verbosity par_default.rse_em_par.verbose = []; %%%%%%%%%% % Parameters for Geometric Mean Normalization par_default.gmn_band_hz = [ 300 3400 ]; % Quite useful on Aurora %%%%%%%%%% % Parameter to deal with perverse cases (all-zero frames for example) par_default.fbank_mag_eps = 1e-50; %%%%%%%%%% % Parameters for MEL filter banks and MFCC extraction par_default.mfcc_band_hz = [ 300 3400 ]; par_default.mfcc_B = 24; % number of filterbanks par_default.mfcc_N = 12; % order of the CC par_default.mfcc_L = 22; % cepstral liftering parameter par_default.mfcc_usepower = 0; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Matrix to linearly transform magnitude values % into MEL filterbank values function fbank_mat = prepare_fbank_mat( par ) if nargin < 1 error( [ mfilename ' needs 1 input argument.' ] ); end % Check for mandatory parameters required_fields = { 'fs', 'framelength', 'mfcc_band_hz', 'mfcc_B' }; check_param( required_fields, fieldnames( par ) ); % Define the MEL scale, following HTK book 3.3 mel_from_f = inline( '2595 * log10( 1 + f / 700 )', 'f' ); f_from_mel = inline( '700 * ( 10 .^ ( mel / 2595 ) - 1 )', 'mel' ); % Sanity check f = 10000; if abs( f_from_mel( mel_from_f( f ) ) - f ) > 1e-8 error( [ mfilename '.prepare_fbank_mat() is insane.' ] ); end melband = mel_from_f( max( 0, min( par.fs / 2, par.mfcc_band_hz ) ) ); % Equal spacing in MEL space along the band n = par.mfcc_B + 1; mel_list = melband( 1 ) + diff( melband ) * ( 0:n ) / n; % Back to frequency scale f_list = max( par.mfcc_band_hz( 1 ), min( par.mfcc_band_hz( 2 ), f_from_mel( mel_list ) ) ); % Define the triangular filters fbank_mat = zeros( par.mfcc_B, 2 * par.framelength ); f_k = ( 0:( 2 * par.framelength-1 ) ) / ( 2 * par.framelength ) * par.fs; for a = 1:par.mfcc_B [ tmp, k_low ] = min( abs( f_k - f_list( a ) ) ); [ tmp, k_center ] = min( abs( f_k - f_list( a + 1 ) ) ); [ tmp, k_high ] = min( abs( f_k - f_list( a + 2 ) ) ); fbank_mat( a, k_low:k_high ) = interp1( [ k_low k_center k_high ], [ 0 1 0 ], k_low:k_high, 'linear', NaN ); end % Sanity check if any( isnan( fbank_mat(:) ) ) error( [ mfilename '.prepare_fbank_mat() is still insane.' ] ); end if par.debug figure; plot( fbank_mat.' ); title( 'filter banks' ); keyboard end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Matrix to linearly transform log filterbank values % into cepstral coefficients function dct_mat = prepare_dct_mat( par ) if nargin < 1 error( [ mfilename '.prepare_dct_mat() needs 1 input argument.' ] ); end % Check for mandatory parameters required_fields = { 'mfcc_B', 'mfcc_N', 'mfcc_L' }; check_param( required_fields, fieldnames( par ) ); % DCT: following formula in HTK book 3.3 r = 0:par.mfcc_N; % Includes the C_0 coefficient c = 1:par.mfcc_B; dct_mat = sqrt( 2 / par.mfcc_B ) * cos( pi / par.mfcc_B * r(:) * ( c(:).' - 0.5 ) ); % Optional liftering: following formula in HTK book 3.3 if ~isempty( par.mfcc_L ) dct_mat = full( diag( sparse( 1 + par.mfcc_L / 2 * sin( pi * r / par.mfcc_L ) ) ) * dct_mat ); end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Write HTK header of a file function write_htk_header( fid_out, n_frames, period_sec, dim ) if nargin < 4 error( [ mfilename '.write_htk_header() needs 4 input parameters.' ] ); end % Write 12-byte HTK header fwrite( fid_out, n_frames, 'int32' ); period = period_sec * 1e7; fwrite( fid_out, period, 'int32' ); size_of_sample = 4 * dim; fwrite( fid_out, size_of_sample, 'int16' ); HTK_Type = 9; % USER fwrite( fid_out, HTK_Type, 'int16' ); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Read header of a NIST SPHERE file % In case of success, a structure "sphere_header" is returned. % In case of failure, an error is fired. function sphere_header = read_sphere_header( filename ) if nargin < 1 error( [ mfilename '.read_sphere_header() needs 1 input argument.' ] ); end sphere_header.filename = filename; % Open fid = fopen( filename, 'rt' ); % We'll store the raw lines here line_list = {}; % Check first line a_line = fgetl( fid ); if ~ischar( a_line ) error( [ mfilename '.read_sphere_header() could not read first line.' ] ); end a_line = deblank( fliplr( deblank( fliplr( a_line ) ) ) ); if ~strcmp( a_line, 'NIST_1A' ) error( [ mfilename '.read_sphere_header(): first line is supposed to be "NIST_1A".' ] ); end line_list{ end+1 } = a_line; % Check second line (header size) a_line = fgetl( fid ); if ~ischar( a_line ) error( [ mfilename '.read_sphere_header() could not read second line.' ] ); end a_line = deblank( fliplr( deblank( fliplr( a_line ) ) ) ); [ sphere_header.header_size, is_ok ] = str2num( a_line ); if ~is_ok error( [ mfilename '.read_sphere_header() could not read the header size.' ] ); end line_list{ end+1 } = a_line; % Read and parse remaining lines while 1 a_line = fgetl( fid ); if ~ischar( a_line ) error( [ mfilename '.read_sphere_header() could not finish reading header.' ] ); end a_line = deblank( fliplr( deblank( fliplr( a_line ) ) ) ); line_list{ end+1 } = a_line; if ~isempty( a_line ) % Split the line in tokens separated by white spaces tok_list = {}; while ~isempty( a_line ) [ t, r ] = strtok( a_line ); tok_list{ end + 1 } = deblank( fliplr( deblank( fliplr( t ) ) ) ); a_line = deblank( fliplr( deblank( fliplr( r ) ) ) ); end % Special case: end of header if strcmp( tok_list{ 1 }, 'end_head' ) break end % Usual case: some parameter if numel( tok_list ) >= 3 if strcmp( tok_list{ 2 }, '-i' ) % In case of an integer, do a conversion and store the integer eval( [ 'sphere_header.' tok_list{ 1 } ' = ' tok_list{ 3 } ';' ] ); else % In the general case, store the parameter as a string eval( [ 'sphere_header.' tok_list{ 1 } ' = ''' tok_list{ 3 } ''';' ] ); end end end % Check whether we reached end of header if ftell( fid ) >= sphere_header.header_size break; end end % Loop reading header lines % Add filename and raw header sphere_header.filename = filename; sphere_header.line_list = line_list; fclose( fid );