function apply_uss_to_wav( wav_filename, time_span ) % function apply_uss_to_wav( wav_filename, time_span ) % % Implementation of the Unsupervised Spectral Subtraction scheme proposed in: % % "Unsupervised Spectral Subtraction", % by G. Lathoud, M. Magimai.-Doss, B. Mesot and H. Bourlard, % to appear in Proceedings of ASRU 2005. % % All matlab files used by this function must lie in the % "USS-MATLAB-FUNCTIONS" subdirectory. % The core function is "fit_raylsherl", which fits % a ( Rayleigh + Shifted Erlang ) model on some data. % % % INPUT ARGUMENTS: % % wav_filename: a string, name of a WAV file. % % time_span (optional): a vector of two numbers % that define where to start and stop in the waveform. % time_span( 1 ) is the start time in seconds, % time_span( 2 ) is the end time in seconds. % -Inf and +Inf values can be used to specify respectively % beginning and end of the waveform. % % % -- by G. Lathoud, 2005-08-26 dbstop error % Access to other matlab functions addpath( fullfile( pwd, 'USS-MATLAB-FUNCTIONS' ) ); % Parameters for spectrogram computation % - length of a time frame in seconds SP_PAR.FRAMELENGTH_SEC = 0.032; % - shift between two consecutive time frames in seconds SP_PAR.FRAMESHIFT_SEC = 0.016; % - pre-emphasis coefficient: y( t ) = x( t ) - PREEMP * x( t-1 ) SP_PAR.PREEMP = 0.97; % - binary flag: shall we zero-mean the signal? "1" means yes. SP_PAR.DO_ZERO_MEAN = 1; % Parameters for fitting the "Rayleigh + Shifted Erlang" model % We'll reduce the data to 100 points % - it is much faster (cost is directly proportional to number of points). % - in practice more points don't give better results. DATA_N_POINTS_MAX = 100; % - The Shifted Erlang starts at: erlang_t_factor * sigma_{rayleigh} FIT_PAR.erlang_t_factor = 1.0; % - To force a minimum number of iterations FIT_PAR.min_iter = 5; % - To impose a limit on the fitting process FIT_PAR.max_iter = 200; % - Floor on the Rayleigh parameter FIT_PAR.sigma_floor = 1e-10; % - Floor on the Shifted Erlang parameter FIT_PAR.lambda_floor = 1e-10; % - "moment" method if do_fminsearch is 0 (faster but inexact) % - "ml" method if do_fminsearch is 1 (slower but exact) FIT_PAR.do_fminsearch = 0; % - parameters for the "fminsearch" (simplex method) opt = optimset( @fminsearch ); opt = optimset( opt, 'Display', 'none', 'MaxFunEvals', 50, 'MaxIter', 25, 'TolX', 1e-4, 'TolFun', 1e-4 ); FIT_PAR.fminsearch_options = opt; % - Verbosity (show iterations during training) FIT_PAR.verbose = 0; % - Check for NaNs of Infs (this does not cost much) FIT_PAR.sanity_check = 1; % Check parameters given by the user if nargin < 1 error( [ mfilename ' needs at least 1 input argument.' ] ); end if ~exist( wav_filename, 'file' ) error( [ mfilename ' could not find file "' wav_filename '".' ] ); end if ~exist( 'time_span', 'var' ) time_span = [ -Inf +Inf ]; end % ( 1 ) Load the waveform disp( [ mfilename ' loading waveform "' wav_filename '".' ] ); [ x, fs, sample_span ] = load_waveform( wav_filename, time_span ); % Convert the time span (seconds) in sample indices time_span = ( sample_span - 1 ) / fs; disp( sprintf( [ mfilename ': fs: %.2f Hz, sample_span: [ %d %d ] samples, time_span: [ %.2f %.2f ] seconds' ], fs, sample_span, time_span ) ); % Show the waveform show_waveform( x, fs, time_span( 1 ), [ '"' wav_filename '" - original waveform' ] ); % ( 2 ) Compute the original spectrogram disp( [ mfilename ': computing the spectrogram...' ] ); [ X, framelength, frameshift ] = compute_spectrogram( x, fs, SP_PAR ); nframes = size( X, 2 ); disp( sprintf( [ mfilename ': %d frames, framelength:%d samples, frameshift:%d samples' ], nframes, framelength, frameshift ) ); % Keep strictly positive frequencies only X_pos = X( 2:framelength+1, : ); % Picture: show the original magnitude spectrogram show_magspec( X_pos, framelength, fs, SP_PAR, time_span(1), [ '"' wav_filename '" - original magnitude spectrogram' ] ); % ( 3 ) Fit the "Rayleigh + Shifted Erlang" model % on all non-zero magnitude data. disp( [ mfilename ': fitting the "Rayleigh + Shifted Erlang" model...' ] ); abs_X_pos = abs( X_pos ); % ( 3.1 ) Prepare the data: % flat representation independent of time or frequency % This is where pre-emphasis is necessary, % see the theoretical justifications in the paper. mag = abs( X_pos(:) ); % Keep only strictly positive data mag = mag( find( mag > 0 ) ); % First reduce the data representation if needed if numel( mag ) > DATA_N_POINTS_MAX mag = sort( mag ); % We pick samples regularly along the sorted data, % so that the picked samples have approximately the % same distribution as the original data. n = numel( mag ); ind = 1 + round( ( n-1 ) * ( 0.5:DATA_N_POINTS_MAX ) / DATA_N_POINTS_MAX ); mag = mag( ind ); end % ( 3.2 ) Fit the model raylsherl = fit_raylsherl( mag, FIT_PAR ); % Histogram of the original data in linear domain + fit of the model show_hist_fit( abs_X_pos, 1000, raylsherl, 'linear', [ wav_filename ' - fit in linear domain' ] ); % Histogram of the original data in log domain + fit of the model show_hist_fit( abs_X_pos, 1000, raylsherl, 'log', [ wav_filename ' - fit in log domain' ] ); % ( 4 ) Application: for speech recognition (MFCC and so on) % filter the original magnitude spectrogram, % by keeping only those values with P( activity | data ) > 0. % which is strictly equivalent to a threshold "raylsherl.erlang_t". % % Motivation: remove the noise, while keeping as much information % as possible % % We don't need to compute the posteriors P( activity | data ) % so the cost is small. disp( [ mfilename ': filtering the magnitude spectrogram...' ] ); filtered_magspec = max( raylsherl.erlang_t, abs_X_pos ); show_magspec( filtered_magspec, framelength, fs, SP_PAR, time_span( 1 ), [ '"' wav_filename '" - filtered magnitude spectrogram' ] ); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% function [ x, fs, sample_span ] = load_waveform( wav_filename, time_span ) % Load a waveform file with name "wav_filename", % from time "time_span( 1 )" to time "time_span( 2 )" % ( in seconds ). % % "time_span" is optional. It can contain "-Inf" % or "+Inf" values which mean respectively the % beginning and end of the waveform. % % Return the waveform "x" as well as the sampling % frequency "fs" (in Hz) and the time span in samples. if nargin < 1 error( [ mfilename '.load_waveform() needs at least 1 input arguments.' ] ); end if ~exist( 'time_span', 'var' ) % By default, load the whole file time_span = [ -Inf +Inf ]; end [ a_size, fs ] = wavread( wav_filename, 'size' ); if a_size( 2 ) ~= 1 warning( [ mfilename ': will use channel #1 only.' ] ); end nsamples = a_size( 1 ); sample_span = max( 1, min( nsamples, round( 1 + fs * time_span ) ) ); % Load the part of the waveform specified by "sample_span" x = wavread( wav_filename, sample_span ); % Single channel only x = x( :,1 ); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% function h_list = show_waveform( x, fs, start_time, title_string ) % Plot a waveform % Return the identifier of the figure if nargin < 2 error( [ mfilename '.show_magspec() needs at least 4 input arguments.' ] ); end if ~exist( 'start_time', 'var' ) start_time = 0; end if ~exist( 'title_string', 'var' ) title_string = [ 'time-domain waveform' ]; end h_list = []; nsamples = numel( x ); t = start_time + ( 0:nsamples-1 ) / fs; h_list( end+1 ) = figure; plot( t, x ); xlabel( 'time (seconds)' ); ylabel( 'amplitude' ); title( strrep( title_string, '_', '\_' ) ); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% function [ X, framelength, frameshift ] = compute_spectrogram( x, fs, par ) % Take a waveform "x" ( vector of sample values ), % recorded at sampling frequency "fs" (Hz), % % bufferize as required by the par.FRAMELENGTH_SEC % and par.FRAMESHIFT_SEC parameters, % % apply zero-mean if par.DO_ZERO_MEAN == 1, % apply the preemphasis defined by par.PREEMP, % apply Hamming window, % % and return the complex-domain, zero-padded FFT "X" % ( framelength by nframes matrix of complex values ). if nargin < 3 error( [ mfilename '.compute_spectrogram() needs 3 input arguments.' ] ); end % Frame length and shift in samples framelength = round( par.FRAMELENGTH_SEC * fs ); frameshift = round( par.FRAMESHIFT_SEC * fs ); % Define the frames % the additional sample in each frame ( "1 +" ) will be used for pre-emphasis x_buffer = buffer( x(:), 1 + framelength, 1 + framelength - frameshift, 'nodelay' ); % Zero-mean if par.DO_ZERO_MEAN x_buffer = x_buffer - repmat( mean( x_buffer, 1 ), size( x_buffer, 1 ), 1 ); end % Pre-emphasis x_buffer = x_buffer( 2:end, : ) - par.PREEMP * x_buffer( 1:end-1, : ); % Hamming window and zero-padded FFT ham_win = diag( sparse( hamming( framelength ) ) ); X = fft( ham_win * x_buffer, 2 * framelength, 1 ); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% function h_list = show_magspec( X, framelength, fs, sp_par, start_time, title_string ) % Show the magnitude spectrogram log( abs( X ) ) % in two windows (2-D and 3-D views). if nargin < 4 error( [ mfilename '.show_magspec() needs at least 4 input arguments.' ] ); end if ~exist( 'start_time', 'var' ) start_time = 0; end if ~exist( 'title_string', 'var' ) title_string = [ 'magnitude spectrogram' ]; end h_list = []; nframes = size( X, 2 ); f = ( 1:framelength ) / framelength * fs / 2; t = start_time + sp_par.FRAMELENGTH_SEC / 2 + ( 0:nframes-1 ) * sp_par.FRAMESHIFT_SEC; % Temporary hold warnings for log( 0 ) old_warning_state = warning; warning off; % Flat view h_list( end+1 ) = figure; imagesc( t, f, log( abs( X ) ) ); axis xy; xlabel( 'time (seconds)' ); ylabel( 'frequency (Hz)' ); title( strrep( title_string, '_', '\_' ) ); % 3-D view h_list( end+1 ) = figure; mesh( t, f, log( abs( X ) ) ); axis xy; xlabel( 'time (seconds)' ); ylabel( 'frequency (Hz)' ); title( strrep( title_string, '_', '\_' ) ); warning( old_warning_state ); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% function h_list = show_hist_fit( mag, nbins, raylsherl, type_string, title_string ) % Histogram of the original data + fit of the model if nargin < 4 error( [ mfilename '.show_hist_fit() needs at least 4 input arguments.' ] ); end type_string = lower( type_string ); if ~ismember( type_string, { 'linear', 'log' } ) error( [ mfilename '.show_hist_fit(): wrong "type_string".' ] ); end is_linear = strcmp( type_string, 'linear' ); % We'll return the identifier of the figure h_list = []; % Histogram of the original data if is_linear % Linear domain [ hw, hx ] = hist( mag( find( mag > 0 ) ), nbins ); else % Log domain [ hw, hx ] = hist( log( mag( find( mag > 0 ) ) ), nbins ); end hx = hx(:); hw = hw(:) / sum( hw ); h_list( end+1 ) = figure; set( bar( hx, hw ), 'edgecolor', [ 0.7 0.7 0.7 ], 'facecolor', [ 0.7 0.7 0.7 ] ); % Superimpose the probability density functions % (joint probabilities) if is_linear % Linear domain [ tmp, tmp, log_joint, log_lkld ] = compute_post_raylsherl( raylsherl, hx, 1 ); distrib_mat = exp( [ permute( log_joint, [ 1 3 2 ] ) log_lkld ] ); xlabel( 'magnitude' ); else % Log domain [ tmp, tmp, log_joint, log_lkld ] = compute_post_raylsherl( raylsherl, exp( hx ), 1 ); distrib_mat = diag( sparse( exp( hx ) ) ) * exp( [ permute( log_joint, [ 1 3 2 ] ) log_lkld ] ); xlabel( 'log magnitude' ); end distrib_mat = distrib_mat / sum( distrib_mat( :,end ) ); hold on; plot( hx, distrib_mat, 'linewidth', 2 ); title( strrep( title_string, '_', '\_' ) ); legend( 'silence', 'activity', 'mixture' );