%MATLAB SCRIPT FILE randnbug.m
%
%    Demonstrates Bug in MATLAB Command randn
%    by J. S. Marron
%    Department of Statistics
%    University of North Carolina
%    marron@stat.unc.edu
%
%    Main Idea:  When RANDN is used to generate a simple Gaussian
%    AR(1) process the lag correlation is not the theoretically 
%    predicted value (and is unacceptably far away).  The same 
%    problem exists when the underlying Gaussians are generated 
%    by RAND, plus the standard Box Muller transformation (a look
%    at the Forsythe, Malcolm, and Moler reference cited in the manual
%    suggests this may be the same as what RANDN does).

%    Background:  A finite Gaussian AutoRegressive sequence of 
%    order 1 can be generated as:
%                        X_1 ~ N(0,1)
%         X_i = rho*X_(i-1) + (1-rho^2)^(1/2) * eps_i     i=2,3,...,n
%    where eps_2,...,eps_n are I.I.D. N(0,1).
%        The parameter rho reflects the amount of "dependence on
%    the past".  A very standard quantification of this dependence is 
%    the vector of "lag correlations":    
%           corr(X_i,X_(i-j)) = rho^j     FOR  i=j+1,...,n, j=0,...,n-1
%    In this matlab script, a number of such sequences (think of these
%    as "data sets") are generated, and for each the empirical lag
%    correlations are calculated.  The samples of these are compared 
%    with the theoretical values, and it is seen that the means are 
%    "significantly" (in the statistical sense) different from the
%    theoretical values.

disp('Running MATLAB script file randnbug.m' ) ;

%  ------------------------------------------------------------
%  User-settable parameters.  Everything below this section
%  reads these values; nothing here draws random numbers.
%  ------------------------------------------------------------

irng = 1 ;     %  index for random number generator:
               %    1 - RANDN
               %    2 - RAND, plus Box Muller

istat = 2 ;    %  1 - Correlations
               %  2 - Covariances
               %  Note:  Correlations are the more standard 
               %      statistical tool, but they have somewhat
               %      less stable sampling properties, so the
               %      covariance (correlation without the
               %      denominator) is also considered. 

nset = 400 ;
        %  The number of AR(1) sequences, i.e. data sets.
nobs = 200 ;
        %  The number of observations in each data set.
rho = .5 ;
        %  The dependence parameter.
vlag = 0:1:4 ;
        %  Compare theoretical vs. empirical lag correlations
        %  at these lag values.

%  Reject an unknown generator index here, so that a typo is
%  reported by name instead of surfacing later as an undefined
%  variable.
if ~any(irng == [1 2]) ;
  error('irng must be 1 (RANDN) or 2 (RAND plus Box Muller)') ;
end ;

%  Label for the chosen statistic, used in the printed table and
%  in the plot titles.  An unknown istat is rejected for the same
%  reason as above.
if istat == 1 ;
  statstr = 'Correlations' ;
elseif istat == 2 ;
  statstr = 'Covariances' ;
else
  error('istat must be 1 (Correlations) or 2 (Covariances)') ;
end ;

%  Generate nobs x nset matrix mx, where each column is one AR(1)
%  sequence, i.e. data set.
%
%  Row 1 is a plain N(0,1) draw; each later row is
%      rho * (previous row) + sqrt(1 - rho^2) * (fresh N(0,1) draw).
%
%  Changes relative to the earlier version of this section:
%    - the innovation vector is no longer called "eps", which hid
%      MATLAB's built-in eps for the rest of the session;
%    - mx is allocated once instead of being grown by concatenation
%      on every pass;
%    - the Box Muller branch now works for odd nset (it draws
%      ceil(nset/2) pairs and discards the one surplus value).
%  The generator is still called exactly once per row, in the same
%  order, so for even nset the random stream is consumed as before.

mx = zeros(nobs,nset) ;
nhalf = ceil(nset / 2) ;
        %  Number of uniform pairs needed per row by Box Muller.

for irow = 1:nobs ;    %  FILL mx ONE ROW AT A TIME

  %  Draw a 1 x nset row vector of N(0,1) variates.
  if irng == 1 ;
        %  Generate with randn
    vnoise = randn(1,nset) ;
  elseif irng == 2 ;
        %  Generate with rand & Box Muller:  each uniform pair gives
        %  one cosine variate and one sine variate.
    munif = rand(2,nhalf) ;
    vcos = sqrt(-2 * log(munif(1,:))) .* cos( 2 * pi * munif(2,:)) ;
    vsin = sqrt(-2 * log(munif(1,:))) .* sin( 2 * pi * munif(2,:)) ;
    vnoise = [vcos vsin] ;
    vnoise = vnoise(1:nset) ;
        %  Only trims anything when nset is odd.
  else
    error('irng must be 1 (RANDN) or 2 (RAND plus Box Muller)') ;
  end ;

  if irow == 1 ;
        %  Start with first entry of each column, use N(0,1).
    mx(1,:) = vnoise ;
  else
        %  AR(1) step from the previous row.
    nextx = rho * mx(irow - 1,:) ;
    nextx = nextx + sqrt(1 - rho^2) * vnoise ;
    mx(irow,:) = nextx ;
  end ;

end ;



%  Now calculate matrix of lagged correlations (or covariances,
%  depending on istat).
%
%  Result:  mrhohat has one row per data set (column of mx) and one
%  column per entry of vlag.
%
%  All column-wise reductions name dimension 1 explicitly.  Without
%  it, mean and std switch to working along the row as soon as a
%  block has a single row (lag == nobs - 1), which silently gives
%  one number instead of one per data set.

nlag = length(vlag) ;
mrhohat = zeros(size(mx,2),nlag) ;
        %  Allocated once, filled column by column below.

for ilag = 1:nlag ;    %  Loop through lags,
  lag = vlag(ilag) ;
  if lag < 0 | lag >= nobs ;
    error('each entry of vlag must satisfy 0 <= lag < nobs') ;
  end ;

  mxt1 = mx(1:(nobs - lag),:) ;
        %  A block of data
  mxt2 = mx((1+lag):nobs,:) ;
        %  Lagged version of block

  ext1 = mean(mxt1,1) ;
  ext2 = mean(mxt2,1) ;
        %  Row vectors of sample means
  ext12 = mean(mxt1 .* mxt2,1) ;
        %  "Cross term" in covariance calculation
  vcov = ext12 - ext1 .* ext2 ;
        %  Row vector of covariances ("1/n" scale), this lag

  if istat == 1 ; 
        %  then do correlations
    sdxt1 = std(mxt1,1,1) ;
    sdxt2 = std(mxt2,1,1) ;
        %  Standard deviations on the "1/n" scale (second argument
        %  1), matching the scale of vcov
    rhohat = vcov ./ (sdxt1 .* sdxt2) ;
        %   row vector of correlation coefficients, this lag
  elseif istat == 2 ; 
        %  then do covariances
    rhohat = vcov ;
        %   row vector of covariances, this lag
  else
    error('istat must be 1 (Correlations) or 2 (Covariances)') ;
  end ;

  mrhohat(:,ilag) = rhohat' ;
        %  save into matrix, each column is one lag,
        %  i.e. each row is one data set
end ;


%  Summarize the per-data-set statistics in mrhohat across data
%  sets (one summary value per lag), and print a table comparing
%  the theoretical value with a confidence interval for the
%  empirical mean.

%  Column-wise summaries over the data sets.
eavgr = mean(mrhohat) ;
        %  Empirical average, one entry per lag
sdr = std(mrhohat) ;
        %  Empirical standard deviation, one entry per lag
tavgr = rho .^ vlag ;
        %  Theoretical value rho^lag

%  Usual normal theory confidence limits for the mean:
%  average -/+ 1.96 standard errors.
halfw = 1.96 * sdr / sqrt(nset) ;
cil = eavgr - halfw ;
cir = eavgr + halfw ;

%  One column per lag; sprintf walks this column by column, so each
%  printed line is:  lag, left limit, theory, right limit.
outm = [vlag; cil; tavgr; cir] ;
frmtstr = '  %7.3f   %7.3f   %7.3f   %7.3f\n' ;

%  Describe which generator produced the numbers.
switch irng
  case 1
    rngstr = ', using randn' ;
  case 2
    rngstr = ', using rand & Box-Muller' ;
end

%  Assemble title line, column headings and table body into one
%  character row; lf is the line-feed character.
lf = char(10) ;
result = ['Check ' statstr rngstr lf] ;
result = [result '     lag       cil      theory     cir' lf] ;
result = [result sprintf(frmtstr,outm)] ;

%  Deliberately no semicolon:  echoes the table to the command window.
result


%  Histograms of the per-data-set statistics, one panel per lag,
%  each overlaid with a Gaussian density fitted to the sample and
%  a vertical line at the theoretical value.
%
%  The first column of mrhohat (first entry of vlag) is skipped;
%  with the default vlag that is lag 0, whose correlations are all
%  ones.  At most four panels fit the 2 x 2 layout, and fewer are
%  drawn when vlag is short, so the column index can never run
%  past the end of mrhohat.  Titles show the actual lag from vlag
%  rather than the panel number.
disp('Hit any key to see histograms') ;
pause ;

hdel = .05 ; 
        %  Histogram bin width
nplot = min(4,length(vlag) - 1) ;

for iplot = 1:nplot ;
  icol = iplot + 1 ;
        %  Column of mrhohat (and entry of vlag) shown in this panel
  subplot(2,2,iplot) ;
    hist(mrhohat(:,icol),-.5:hdel:.95) ;
      title(['Lag ' num2str(vlag(icol)) ' ' statstr]) ;
    hold on ;
      mu = eavgr(icol) ;
      sig = sdr(icol) ;
      sig2 = sig^2 ;
      xplot = linspace(mu-3*sig,mu+3*sig,101) ;
        %  ODD NUMBER TO INCLUDE PEAK
      yplot = (xplot - mu) .^ 2 ./ (2 * sig2) ;
      yplot = (1 / sqrt(2 * pi * sig2)) * exp(-yplot) ;
        %  Gaussian density with the sample mean and variance
      yplot = nset * hdel * yplot ;
        %  SINCE HISTOGRAMS ON COUNTS SCALE
      tval = tavgr(icol) ;
      plot(xplot,yplot,'g',[tval; tval],[0; nset/3.5],'c') ;
        %  OVERLAY HISTO WITH GAUSSIAN FIT, plus a first vertical
        %  line at the theoretical value up to height nset/3.5
        axout = axis ;
        %  Axis limits after the overlay has been drawn
      plot([tval; tval],[axout(3); axout(4)],'c') ;
          %  Overlay vertical line at theoretical mean, spanning
          %  the full height of the axes
    hold off ;
end ;


%  Q-Q plots:  sorted per-data-set statistics against the quantiles
%  of a Gaussian with the same sample mean and standard deviation,
%  one panel per lag (the first entry of vlag is skipped, and at
%  most four panels are drawn, as in the 2 x 2 layout).
%
%  The Gaussian quantiles are computed from erfcinv, which is part
%  of base MATLAB, using
%      quantile(p) = mu - sig * sqrt(2) * erfcinv(2 * p)
%  so no Statistics Toolbox function is required.
disp('Hit any key to see Q-Q plots') ;
pause ;

nq = size(mrhohat,1) ;
        %  Number of points per panel (one per data set)
vprob = (1:nq)' / (nq + 1) ;
        %  Probability levels 1/(nq+1), ..., nq/(nq+1).  Built from
        %  an integer range so there are always exactly nq of them,
        %  matching the length of the sorted data.
vz = -sqrt(2) * erfcinv(2 * vprob) ;
        %  Standard Gaussian quantiles, same for every panel

nplot = min(4,length(vlag) - 1) ;
for iplot = 1:nplot ;
  icol = iplot + 1 ;
        %  Column of mrhohat (and entry of vlag) shown in this panel
  subplot(2,2,iplot) ;
    qdata = sort(mrhohat(:,icol)) ;
      mu = eavgr(icol) ;
      sig = sdr(icol) ;
    qtheory = mu + sig * vz ;
        %  Quantiles of the fitted N(mu,sig^2)
    plot(qtheory,qdata,'+') ;
    title(['Lag ' num2str(vlag(icol)) ' ' statstr]) ;
end ;






