%MATLAB SCRIPT FILE s322eg5.m
%
%    EXAMPLE 5 FOR STATISTICS 322
%    POSTED 9/3/96
%    EXPLORES INDEPENDENCE AND RANDOM NUMBER GENERATION 

%  SELECT WHICH PART OF THE EXAMPLE TO RUN:  1, 2, OR 3
ipar = 1 ;

format compact ;
%  ANNOUNCE THE SCRIPT AND THE SELECTED PART.  char(10) IS A NEWLINE.
%  NO TRAILING SEMICOLON, SO THE BANNER IS ECHOED TO THE COMMAND WINDOW.
running = [char(10), 'MATLAB script s322eg5.m, with ipar = ', ...
           num2str(ipar), char(10)]

%    IN THE FIELD OF RANDOM NUMBER GENERATION, THE HARDEST PART OF
%    FINDING SEQUENCES TO MIMIC I.I.D. RANDOM SEQUENCES IS THE
%    "INDEPENDENCE".  MOST SERIOUSLY PROPOSED EXAMPLES OF GENERATORS,
%    THAT WERE LATER SHOWN TO BE POOR, FAILED IN THIS RESPECT.  THIS
%    PROGRAM EXPLORES INDEPENDENCE AND DEPENDENCE, THROUGH SOME
%    SIMPLE EXAMPLES.

if ipar == 1 ;    %  THEN DO PART 1 OF "CHECKS FOR INDEPENDENCE"

  %    LET'S INVESTIGATE HOW WELL THE PSEUDO RANDOM SEQUENCE GENERATED
  %    BY MATLAB SEEMS TO APPROXIMATE INDEPENDENCE.  ONE WAY OF
  %    LOOKING AT THIS IS TO STUDY THE "LAG ONE JOINT DISTRIBUTIONS"
  %    I.E. STUDY THE JOINT DISTRIBUTION OF (X_i,X_i-1)  (WHERE "_"
  %    DENOTES "SUBSCRIPT").

  %    OF COURSE LAG ONE DEPENDENCE IS ONLY ONE ASPECT THAT NEEDS TO BE
  %    TAKEN INTO ACCOUNT TO DECIDE IF THE SEQUENCE IS REALLY "I.I.D"
  %    BUT WE FOCUS ON IT HERE BECAUSE IT IS EASY.

  %    FIRST CONSIDER THE JOINT DISTRIBUTION FROM BEFORE:
  %             /   1     w.p.  1/6
  %        X = <    2     w.p.  1/3
  %             \   4     w.p.  1/2
  %    SINCE X_i ONLY TAKES ON 3 VALUES, THE PAIR ONLY TAKES ON 9
  %    DIFFERENT VALUES, WHICH ARE CONVENIENTLY REPRESENTED BY A
  %    3x3 TABLE.  LET p_ij DENOTE JOINT PROBABILITIES, AND p_Tj,
  %    p_iT BE THE MARGINAL PROBABILITIES.  A MEANS OF
  %    INVESTIGATING INDEPENDENCE IS TO TEST THE HYPOTHESIS:
  %               H_0:  p_ij = p_iT p_Tj      i,j = 1,2,3

  %    RECALL THE CLASSICAL CHI-SQUARE TEST FOR HYPOTHESES OF THIS TYPE
  %    IS BASED ON THE FACT THAT 
  %                CHI  =  SUM((o_ij - e_ij)^2 / e_ij)
  %    (WHERE o_ij = n_ij = n phat_ij,  AND  e_ij = n phat_iT phat_Tj,
  %    WITH phat DENOTING THE OBSERVED SAMPLE PROPORTIONS)
  %    HAS AN ASYMPTOTIC CHI SQUARE DISTRIBUTION UNDER H_0.

          %  GENERATE SEQUENCE OF DATA, USING LINES COPIED FROM
          %  SCRIPT 322EG4.M
  nobs = 2000 ;
  rand('seed',23421745) ;
  values = [1 2 2 4 4 4] ;
          %  A UNIFORM PICK AMONG THESE 6 ENTRIES GIVES
          %  P(1) = 1/6,  P(2) = 1/3,  P(4) = 1/2
  vind = ceil(6 * rand(nobs,2)) ;
          %  nobs x 2 MATRIX OF INDICES INTO values
  xdata = values(vind) ;
  xdata = reshape(xdata,nobs,2) ;
          %  FORCE AN nobs x 2 MATRIX (ONE PAIR PER ROW), WHATEVER
          %  SHAPE THE INDEXING ABOVE RETURNED

  %  CALCULATE MATRIX OF JOINT PROBABILITIES
  values = [1 2 4] ;
  nval = length(values) ;
  mvalues = ones(nobs,1) * values ;
          %  NOBS x 3 VERSION
  xdatai = xdata(:,1) * ones(1,nval) ;
          %  NOBS x 3 VERSION
  indici = (xdatai == mvalues) ;
          %  1'S IN COLUMNS ACCORDING TO VALUES
  indicj = ((xdata(:,2) * ones(1,nval)) == mvalues) ;

          %  NON-LOOPING FORM:  ENTRY (i,j) OF indici' * indicj COUNTS
          %  THE ROWS WITH FIRST COORDINATE values(i) AND SECOND
          %  COORDINATE values(j).  DIVIDING BY nobs GIVES PROPORTIONS.
  phatij = (double(indici)' * double(indicj)) / nobs ;
  oij = nobs * phatij ;
    phatit = sum(phatij,2) ;
          %  ROW MARGINALS (COLUMN VECTOR)
    phattj = sum(phatij,1) ;
          %  COLUMN MARGINALS (ROW VECTOR)
  eij = nobs * (phatit * phattj) ;
          %  EXPECTED COUNTS UNDER INDEPENDENCE (OUTER PRODUCT)
  chisq = sum(sum(  (oij - eij).^2 ./ eij  )) ;
          %  USUAL CHI SQUARE STATISTIC

    df = (nval - 1)^2 ;
          %  DEGREES OF FREEDOM FOR CHI-SQUARE TEST:  (3-1)*(3-1) = 8 - 4
  pval = 1 - gammainc(chisq/2,df/2) ;
          %  UPPER TAIL OF THE CHI-SQUARE(df) DISTRIBUTION.  THE
          %  CHI-SQUARE CDF AT x IS THE REGULARIZED INCOMPLETE GAMMA
          %  FUNCTION gammainc(x/2,df/2), WHICH IS IN BASE MATLAB, SO
          %  THE ANSWER NO LONGER DEPENDS ON WHICH gamcdf IS FIRST ON
          %  THE MATLAB PATH.

          %  BUILD THE REPORT.  EACH PIECE MUST BE ASSIGNED BACK TO
          %  result, OTHERWISE THE CONCATENATION IS THROWN AWAY.
  result = 'The p-value for testing independence was: ' ;
  result = [result num2str(pval) 10] ;
  if pval <= .05 ;
    result = [result '     So null hypothesis of independence is rejected at' 10] ;
    result = [result '     level .05,  i.e. have detected strong dependence' 10] ;
  else ;
    result = [result '     So null hypothesis of independence is accepted at' 10] ;
    result = [result '     level .05,  i.e. could not detect strong dependence' 10] ;
  end ;
  result

  %    AN INTERESTING EXERCISE, WHICH STRENGTHENS YOUR INTUITION ABOUT
  %    HYPOTHESIS TESTING IS TO "COMMENT OUT" THE LINE OF THIS PROGRAM
  %    WHICH DEFINES THE seed, AND RERUN IT SEVERAL TIMES.  THIS REPEATS
  %    THE PROCESS FOR A NEW DATA SET EACH TIME ("INDEPENDENT" OF THE
  %    PREVIOUS ONES).  THE THEORY SAYS THAT UNDER H_0, YOU SHOULD 
  %    REJECT ABOUT 5% OF THE TIME.  TRY IT AND SEE WHAT YOU THINK.  
  %    NOTE IT IS NOT EASY TO GET A GOOD "SEAT OF THE PANTS" IMPRESSION
  %    FOR THIS.  IF YOU ARE REALLY ENERGETIC, YOU COULD PUT A LOOP 
  %    AROUND THIS PART OF THE PROGRAM, AND GET AN ACCURATE ESTIMATE.

elseif ipar == 2 ;    %  THEN DO PART 2 OF "CHECKS FOR INDEPENDENCE"

  %    THE NEXT GOAL IS TO LOOK HEURISTICALLY AT THE POWER OF THE TEST
  %    CONSIDERED ABOVE.  OF COURSE POWER CONSIDERATIONS ARE IN GENERAL
  %    QUITE COMPLICATED, BUT LET'S AGAIN JUST LOOK IN ONE SIMPLE WAY.

  %    A SIMPLE MEANS OF INTRODUCING AN EASILY CONTROLLED AMOUNT OF LAG
  %    ONE DEPENDENCE, WHILE KEEPING THE SAME ONE DIMENSIONAL MARGINAL
  %    DISTRIBUTIONS, IS TO TAKE A "MIXTURE" OF X_i WITH X_i-1, I.E.
  %    GIVEN A SEQUENCE OF I.I.D. (AND INDEPENDENT OF OTHER DATA)
  %    BERNOULLI(W) INDICATOR VARIABLES, I_1,...,I_n, DEFINE THE NEW
  %    VARIABLES  Y_i = I_i X_i + (1-I_i) X_i-1.
  %    FOR W = 0 OR 1, THE Y_i ARE EXACTLY INDEPENDENT, BUT FOR OTHER
  %    W, THE Y_i HAVE AN AMOUNT OF LAG ONE DEPENDENCE THAT INCREASES
  %    FOR W CLOSER TO 1/2.  LET'S INVESTIGATE THE EFFECT OF W ON THE
  %    ABOVE HYPOTHESIS TEST.

  nset = 100 ;
          %  NUMBER OF SIMULATED DATA SETS
  nobs = 200 ;
          %  NUMBER OF PAIRS IN EACH DATA SET

  rand('seed',23373455) ;

  vw = [0 .02 .1 .5] ;
          %  VECTOR OF W VALUES.  AS CODED BELOW, THE SECOND COORDINATE
          %  IS A COPY OF THE FIRST WITH PROBABILITY W, SO THE PAIR IS
          %  INDEPENDENT AT W = 0 AND, OF THESE VALUES, MOST DEPENDENT
          %  AT W = .5
  nw = length(vw) ;

  values = [1 2 2 4 4 4] ;
  svalues = [1 2 4] ;
  nval = length(svalues) ;
  mvalues = ones(nobs,1) * svalues ;
  df = (nval - 1)^2 ;
          %  DEGREES OF FREEDOM FOR CHI-SQUARE TEST:  (3-1)*(3-1) = 8 - 4

  mpval = zeros(nset,nw) ;
          %  ONE ROW OF P-VALUES PER DATA SET, ONE COLUMN PER W
  for iset = 1:nset ;
    if rem(iset,10) == 0 
      note = ['working on data set: ' num2str(iset)]
          %  NO SEMICOLON, SO THE PROGRESS MESSAGE IS ACTUALLY SHOWN
    end ;

    vind = ceil(6 * rand(nobs,2)) ;
    xdata = values(vind) ;
    xdata = reshape(xdata,nobs,2) ;
    for iw = 1:nw ;    %  LOOP THROUGH W'S
      w = vw(iw) ;
      indicy = (rand(nobs,1) <= w) ;    
          %  VECTOR OF BERNOULLI'S FOR MIXING
      ydatai = xdata(:,1) ;
      ydataj = indicy .* xdata(:,1) + (1 - indicy) .* xdata(:,2) ;
          %  MIXED DATA
      indici = ((ydatai * ones(1,nval)) == mvalues) ;
      indicj = ((ydataj * ones(1,nval)) == mvalues) ;

          %  JOINT PROPORTIONS:  ENTRY (i,j) COUNTS ROWS WITH
          %  ydatai = svalues(i) AND ydataj = svalues(j)
      phatij = (double(indici)' * double(indicj)) / nobs ;
      oij = nobs * phatij ;
        phatit = sum(phatij,2) ;
        phattj = sum(phatij,1) ;
      eij = nobs * (phatit * phattj) ;
      chisq = sum(sum(  (oij - eij).^2 ./ eij  )) ;
      pval = 1 - gammainc(chisq/2,df/2) ;
          %  UPPER TAIL OF CHI-SQUARE(df), VIA THE BASE-MATLAB
          %  INCOMPLETE GAMMA FUNCTION (NO DEPENDENCE ON WHICH gamcdf
          %  IS ON THE PATH)
  
      if pval <= .05 ;
        result = 'Reject'
      else ;
        result = 'Accept'
      end ;
            %  IT IS FUN TO WATCH THESE FLASHING BY 

      mpval(iset,iw) = pval ;

    end ;

  end ;

    frmtstr = [repmat('%6.4f ',1,nw-1) '%6.4f'] ;
          %  ONE FIELD PER W VALUE, SO THE TABLE STAYS ALIGNED IF vw
          %  IS CHANGED
    apv = mean(mpval) ;
    prj = mean(mpval <= .05) ;
  result = [       '                   For W :' sprintf(frmtstr,vw) 10] ;
  result = [result '        Average p-values :' sprintf(frmtstr,apv) 10] ;
  result = [result 'Proportion of rejections :' sprintf(frmtstr,prj) 10] ;
  result

  note = ['Notice this test of independence has: ' 10] ;
  note = [note '  a.  the right size (.05 when data are independent)' 10] ;
  note = [note '  b.  little power when dependence is mild' 10] ;
  note = [note '  c.  lots of power when dependence is strong' 10] ;
  note

elseif ipar == 3 ;    %  THEN DO PART 3 OF "CHECKS FOR INDEPENDENCE"

  %  A WIDELY USED MODEL FOR DEPENDENT DATA, IS THE "AUTOREGRESSIVE
  %  PROCESS OF ORDER 1".  A SIMPLE WAY TO GENERATE A FINITE SEQUENCE 
  %  OF VARIABLES WITH THIS STRUCTURE, FOR SIGMA = 1, IS
  %                        X_1 ~N(0,1)
  %         X_i = rho*X_i-1 + (1-rho^2)^(1/2) * eps_i     i=2,...,n
  %  WHERE eps_2,...,eps_n ARE I.I.D. N(0,1)
  %  IT CAN BE SHOWN THAT THE "LAG CORRELATIONS" ARE:
  %           corr(X_i,X_i-j) = rho^j     FOR  i=j+1,...,n, j=0,...,n-1
  %
  %  LET'S DEVELOP CODE FOR GENERATION OF SUCH DATA, AND THEN CHECK
  %  THAT WE HAVE IT RIGHT BY COMPARING THE THEORETICAL CORRELATIONS
  %  WITH THE EMPIRICAL AT LAGS 0,1,2,...,8

  nset = 100 ;
          %  NUMBER OF SIMULATED SERIES (ONE PER COLUMN OF mx)
  nobs = 200 ;
          %  LENGTH OF EACH SERIES (ROWS OF mx)
  rho = .5 ;
  vlag = 0:1:8 ;

  note = '    Generating Data Sets' 
  randn('seed',23986598) ;
          %  CAREFUL, THIS IS randn, NOT rand.  THIS IS FOR GENERATING
          %  N(0,1)'S, AND HAS ITS OWN SEED, SEPARATE FROM THE SEED
          %  OF rand.
  mx = zeros(nobs,nset) ;
          %  PREALLOCATED, RATHER THAN GROWN ONE ROW AT A TIME
  mx(1,:) = randn(1,nset) ;
          %  ROW VECTOR OF N(0,1)'S
  for iobs = 2:nobs ;    %  FILL IN ROWS 2,...,nobs OF mx
    innov = randn(1,nset) ;
          %  INNOVATIONS.  NOT CALLED eps, SINCE THAT WOULD HIDE
          %  MATLAB'S BUILT-IN eps FOR THE REST OF THE SESSION.
    mx(iobs,:) = rho * mx(iobs-1,:) + sqrt(1 - rho^2) * innov ;
  end ;
  %    PROBABLY SOME OF THE "POLYNOMIAL OPERATIONS" IN MATLAB CAN BE 
  %    USED TO DO THIS AS A MATRIX OPERATION (I.E. WITH NO LOOP).

  note = '    Working on Lagged Correlations'
  mrhohat = zeros(nset,length(vlag)) ;
          %  ONE ROW PER DATA SET, ONE COLUMN PER LAG


  for ilag = 1:length(vlag) ;
          %  COULD LOOP THROUGH DATA SETS, BUT HERE IS ANOTHER WAY:
          %  LOOP THROUGH LAGS, FOR MATRIX OF DATA SETS.
    lag = vlag(ilag) ;
    mxt1 = mx(1:(nobs - lag),:) ;
    mxt2 = mx((1+lag):nobs,:) ;
          %  ROW k OF mxt2 IS lag STEPS LATER THAN ROW k OF mxt1
    ext1 = mean(mxt1) ;
    ext2 = mean(mxt2) ;
    sdxt1 = std(mxt1) * sqrt((nobs - lag - 1) / (nobs - lag)) ;
          %  PUT ON "1/n" SCALE, NOT "1/(n-1)"
    sdxt2 = std(mxt2) * sqrt((nobs - lag - 1) / (nobs - lag)) ;
    ext12 = mean(mxt1 .* mxt2) ;
    rhohat = (ext12 - ext1 .* ext2) ./ (sdxt1 .* sdxt2) ;
          %  VECTOR OF EMPIRICAL CORRELATIONS
          %  MATLAB CAN DIRECTLY GIVE A CORRELATION MATRIX WITH THE
          %  COMMAND corrcoef.  BUT THIS CAN ONLY BE USED ON ONE SET
          %  OF DATA AT A TIME (NOT A BLOCK OF DATA SETS AS DONE HERE).
    mrhohat(:,ilag) = rhohat' ;
  end ;

  %  CALCULATE SUMMARY STATISTICS AND OUTPUT RESULTS
  eavgr = mean(mrhohat) ;
          %  EMPIRICAL AVERAGE RHOHAT
  tavgr = rho .^ vlag ;
          %  THEORETICAL RHOHAT
    sdr = std(mrhohat) ;
  cil = eavgr - 1.96 * sdr / sqrt(nset) ;
  cir = eavgr + 1.96 * sdr / sqrt(nset) ;
  outm = [vlag; cil; tavgr; cir] ;
          %  USUAL NORMAL THEORY CI'S
          %  ONE COLUMN PER LAG; sprintf READS DOWN THE COLUMNS, SO
          %  EACH OUTPUT LINE IS:  lag, cil, theory, cir

    frmtstr = '  %7.3f   %7.3f   %7.3f   %7.3f\n' ;
  result = ['Check Correlations:' 10] ;
  result = [result '     lag       cil      theory     cir' 10] ;
  result = [result sprintf(frmtstr,outm)] ;
  result

          %  NOTE:  THE NEXT MESSAGE IS PRINTED UNCONDITIONALLY, SO
          %  CHECK IT AGAINST THE TABLE ABOVE.
  note = 'A real headache is that the theoretical values are not ' ;
  note = [note 'inside the CIs' 10] 

  cont = 'Hit any key to continue'
  pause ;



  note = 'Repeat above calculations, with a different generator' 
          %    WILL DESCRIBE THIS GAUSSIAN GENERATOR LATER    

  note = '    Generating Data Sets' 
  rand('seed',23986598) ;
          %  CAREFUL, THIS IS rand, NOT randn.  EVERY DRAW IN THIS
          %  SECTION COMES FROM rand, SO THAT IS THE GENERATOR THAT
          %  MUST BE SEEDED FOR THE RUN TO BE REPRODUCIBLE.
          %  (SEEDING randn HERE, AS BEFORE, HAD NO EFFECT.)
  mx = zeros(nobs,nset) ;
          %  PREALLOCATED, RATHER THAN GROWN ONE ROW AT A TIME
%  mx = randn(1,nset) ;
          %  OLD LINE, NOW REPLACED BY A BOX-MULLER TRANSFORM OF
          %  nset/2 PAIRS OF UNIFORMS (nset IS ASSUMED EVEN):
        unif = rand(nset/2,2) ;
        mx1 = sqrt(-2 * log(unif(:,1))) .* cos( 2 * pi * unif(:,2)) ;
        mx2 = sqrt(-2 * log(unif(:,1))) .* sin( 2 * pi * unif(:,2)) ;
        mx(1,:) = reshape([mx1 mx2],1,nset) ;
          %  ROW VECTOR OF N(0,1)'S
  for iobs = 2:nobs ;    %  FILL IN ROWS 2,...,nobs OF mx
%    eps = randn(1,nset) ;
          %  OLD LINE, NOW REPLACED BY THE SAME TRANSFORM.  THE
          %  INNOVATIONS ARE NOT CALLED eps, SINCE THAT WOULD HIDE
          %  MATLAB'S BUILT-IN eps FOR THE REST OF THE SESSION.
        unif = rand(nset/2,2) ;
        innov1 = sqrt(-2 * log(unif(:,1))) .* cos( 2 * pi * unif(:,2)) ;
        innov2 = sqrt(-2 * log(unif(:,1))) .* sin( 2 * pi * unif(:,2)) ;
        innov = reshape([innov1 innov2],1,nset) ;
    mx(iobs,:) = rho * mx(iobs-1,:) + sqrt(1 - rho^2) * innov ;
  end ;
  %    PROBABLY SOME OF THE "POLYNOMIAL OPERATIONS" IN MATLAB CAN BE 
  %    USED TO DO THIS AS A MATRIX OPERATION (I.E. WITH NO LOOP).

  note = '    Working on Lagged Correlations'
  mrhohat = zeros(nset,length(vlag)) ;
          %  ONE ROW PER DATA SET, ONE COLUMN PER LAG


  for ilag = 1:length(vlag) ;
          %  COULD LOOP THROUGH DATA SETS, BUT HERE IS ANOTHER WAY:
          %  LOOP THROUGH LAGS, FOR MATRIX OF DATA SETS.
    lag = vlag(ilag) ;
    mxt1 = mx(1:(nobs - lag),:) ;
    mxt2 = mx((1+lag):nobs,:) ;
          %  ROW k OF mxt2 IS lag STEPS LATER THAN ROW k OF mxt1
    ext1 = mean(mxt1) ;
    ext2 = mean(mxt2) ;
    sdxt1 = std(mxt1) * sqrt((nobs - lag - 1) / (nobs - lag)) ;
          %  PUT ON "1/n" SCALE, NOT "1/(n-1)"
    sdxt2 = std(mxt2) * sqrt((nobs - lag - 1) / (nobs - lag)) ;
    ext12 = mean(mxt1 .* mxt2) ;
    rhohat = (ext12 - ext1 .* ext2) ./ (sdxt1 .* sdxt2) ;
          %  VECTOR OF EMPIRICAL CORRELATIONS
          %  MATLAB CAN DIRECTLY GIVE A CORRELATION MATRIX WITH THE
          %  COMMAND corrcoef.  BUT THIS CAN ONLY BE USED ON ONE SET
          %  OF DATA AT A TIME (NOT A BLOCK OF DATA SETS AS DONE HERE).
    mrhohat(:,ilag) = rhohat' ;
  end ;

  %  CALCULATE SUMMARY STATISTICS AND OUTPUT RESULTS
  eavgr = mean(mrhohat) ;
          %  EMPIRICAL AVERAGE RHOHAT
  tavgr = rho .^ vlag ;
          %  THEORETICAL RHOHAT
    sdr = std(mrhohat) ;
  cil = eavgr - 1.96 * sdr / sqrt(nset) ;
  cir = eavgr + 1.96 * sdr / sqrt(nset) ;
  outm = [vlag; cil; tavgr; cir] ;
          %  USUAL NORMAL THEORY CI'S
          %  ONE COLUMN PER LAG; sprintf READS DOWN THE COLUMNS, SO
          %  EACH OUTPUT LINE IS:  lag, cil, theory, cir

    frmtstr = '  %7.3f   %7.3f   %7.3f   %7.3f\n' ;
  result = ['Check Correlations:' 10] ;
  result = [result '     lag       cil      theory     cir' 10] ;
  result = [result sprintf(frmtstr,outm)] ;
  result

  %    NOW THE RESULTS ARE OK.  THIS SEEMS TO BE A PRETTY SERIOUS
  %    BUG IN MATLAB!!!!!!

    
end ;

