SAS Examples to illustrate Bootstrap

 
SAS Codes
Code 1 - Sampling means drawn from normal distributions
Code 2 - 1-sample two-tailed t-test (parametric and bootstrap)
Code 3 - 1-sample tests about central tendency (Proc Univariate)
Code 4 - Simulating binomial distributions using ranbin function
Code 5 - Paired Sample tests (t, Wilcoxon, Rank) (Proc Univariate)
Code 6 - Fisher's Exact Test for enumeration data
Code 7 - 1-sample binomial test for proportion
Code 8 - confidence intervals around proportions
Code 9 - correlation and regression
Code 10 - discriminant analysis
Code 11 - randomization and bootstrap tests for Manly's Example 1.1
Code 12 - IML code for diversity/evenness indices
Copyright Note - All codes posted here were written by M. Kowalewski
Code 1 - Simulations of sampling distributions of means derived from a normal distribution.
(back to the top)
* - simulations from normal distribution;
%let TEST=0.1;                        *--tested null hypothesis;
%let n=1000;                           *--number of samples;
%let incr=100;                           *--sample increment (=sample size);
%let var=nor;                          *--variable to be analyzed;
data normal;
        time=&incr*&n;
        do j=1 to time by &incr;
        do i=1 to &incr;
        nor=0.3*rannor(0);
        nor2=1.67*abs(rannor(0))-1.3;
        test=&var+&test;
        output;
        end;
        end;
keep &var j test;
proc chart;
        vbar test;
proc univariate data=normal noprint;
        var test;
        by j;
        output out=stats mean=mean std=std n=n probn=probn probt=ttest;
data new;
        set stats;
        if mean<=0 then p=1/&n;
        else p=0;
        if ttest<0.05 then reject=1/&n;
        else reject=0;
        keep mean std n probn ttest reject p;
proc chart data=new;
        vbar mean;
proc univariate data=new noprint;
        var mean std n;
        output out=final mean=mmean stdmean n skewness=skew range=mrange;
proc univariate data=new noprint;
        var p reject;
        output out=sign sum=prob reject;
proc univariate data=new noprint;
        var ttest;
        output out=sign2 mean=ttest;
data report;
        merge final sign sign2;
proc print;
        run;
QUIT;
(back to the top)

Code 2 - A t-test for a two-tailed H[0]:Mean=0, parametric and bootstrap approach.
(back to the top)
* - This example uses data from ZAR (Example 7.2, p. 96) ***;
* - Written by Michal Kowalewski, Virginia Tech, 2/3/00 ***;

* - PART 1. One sampled two-tailed t-test for null hypothesis mean ***;
DATA zar;
        infile cards;
        input rats;
CARDS;
1.7
0.7
-0.4
-1.8
0.2
0.9
-1.2
-0.9
-1.8
-1.4
-1.8
-2.0
;
PROC UNIVARIATE noprint;
  output out=param mean=mean n=n std=std stderr=stderr t=t probt=probt;
PROC PRINT;
run;

* PART 2: T-distribution for DF=11;
DATA tdist;
 DO i=0.0001 to 0.9999 by 0.0001;
  x=tinv(i,11);
  if abs(x)>=1.79807 then y=0.0001;
  else y=0;
  output;
 END;
 keep x y;
PROC chart;
 vbar x;
PROC UNIVARIATE noprint;
 var y;
 output out=prob sum=sum;
proc print;
run;

* PART 3: Bootstrap analysis for H[0]: mean=0 for data from example 7.2 above ***;
OPTIONS linesize=80 nodate;
**********(Enter these values)****************************************;
%let boot=1000;   *--number of bootstrap iterations;
%LET NULL=0;      *--true mean of the population;
***********************************************************************;
PROC iml symsize=1000 worksize=1000;  *--sets memory allocations;
USE ZAR;                              *--reads data;
READ all var{rats} into X;
START RANVEC(in,v_out);               *--creates a vector of random integers;
      k=nrow(in);
      v_index=in;
   do i=1 to k;
      rand=floor((k-i+1)*ranuni(0) + 1);
      v_ran=v_ran||v_index[rand];
      v_index=remove(v_index,rand);
   end;
      v_out=v_ran;
FINISH RANVEC;
START MIXUP(X,times,template);          *--creates a template of random values;
      n=nrow(X);
      template=t(1:n)*j(1,times,1);
   do i=1 to times;
      run ranvec(template[,i],out);
      template[,i]=t(out);
   end;
   do i=1 to n;
      run ranvec(t(template[i,]),out);
      template[i,]=out;
   end;
FINISH MIXUP;
START BOOT(in,times,out);                 *--bootstrap for 1-sample 2-tailed test for means;
      X=in;
      n=nrow(X);
      dm=sum(X)/n-&null;  *- Computes the difference between the sample mean and null value;
      var=(sum(X##2)-sum(X)**2/nrow(X))/(n-1); *- Computes sample variance;
      stderr=sqrt(var/n); *- Computes standard error;
      t=dm/stderr;              *- Computes student's t-statistic;
      PRINT T;
      run mixup(X,times,template);
      do i=1 to times;
        y=X[template[,i]];
        nr=nrow(Y);
        rm=sum(Y)/nr-&null;
        rvar=(sum(Y##2)-sum(Y)**2/nrow(Y))/(nr-1);
        rerr=sqrt(rvar/nr);
        trand=rm/rerr;
       if abs(t)>=abs(trand) then pp=1;
        else pp=0;
        ppp=ppp//pp;
        trr=trr//trand;
        VVV=vvv//rvar;
        ERR=ERR//rerr;
        MMM=MMM//RM;
        NNN=NNN//NR;
        end;
     p=sum(ppp)/times;
     print 'bootstrap probability' p;
     meant=sum(trr)/nrow(trr);
     print 'mean t-value' meant;
     meanv=sum(vvv)/nrow(vvv);
     print 'mean variance' meanv;
     meann=sum(nnn)/nrow(nnn);
     print 'sample size' meann;
     meanm=sum(mmm)/nrow(mmm);
     print 'mean mean' meanm;
     meanr=sum(err)/nrow(err);
     print 'mean standard error' meanr;
     out=trr;
FINISH BOOT;
***********************RUN BOOTSTRAP MODULE**********************;
RUN boot(X,&boot,dist);     *- Executes the analysis;
CREATE OUT from dist;       *- Creates an output datafile;
append from dist;
CLOSE OUT;
QUIT;
RUN;
******************************************************************;
PROC CHART data=OUT;
        vbar col1;
        run;
QUIT;
 
 

(back to the top)

Code 3 - One-sample tests about means and medians using PROC UNIVARIATE.
(back to the top)


* - example 7.1 from Zar, 1999;
%LET null=24.3;    * - mean postulated by our null hypothesis;
DATA sample;
    infile cards;
    input temp;
    var1=temp-&null;
    var2=&null-temp;
cards;
25.8
24.6
26.1
22.9
25.1
27.3
24
24.5
23.9
26.2
24.3
24.6
23.3
25.5
28.1
24.8
23.5
26.3
25.4
25.5
23.9
27
24.8
22.9
25.4
;
PROC univariate data=sample noprint;
    var var1 var2;
    output out=result mean=mean1 mean2 median=m1 m2 n=n stderr=sterr msign=sign1 sign2 probm=psign1 psign2
                      t=t1 t2 probt=pt1 pt2 signrank=wl1 wl2 probs=pwl1 pwl2;
proc print;
    run;
    quit;
 

(back to the top)

Code 4 - Simulating binomial distributions using ranbin function.
(back to the top)
%let n=20;       *-number of trials;
%let p=.01;      *-probability value;
data new;
        do i=1 to 10000;
        x=ranbin(0,&n,&p);
        output;
        end;
keep x;
proc chart;
        vbar x /midpoints=0 to &n by 1;
        run;
 
(back to the top)

Code 5 - Paired Sample Tests (t-test, Wilcoxon test, and Rank test) using PROC UNIVARIATE
(back to the top)
************************************************************************;
***  2-paired samples tests - Examples 9.1, 9.3, 24.11 from Zar, 1999 **;
***      Written by M. Kowalewski, Virginia Tech, 2/7/2000            **;
************************************************************************;
data pair;
        infile cards;
        input hind fore;
if hind>fore then sign=1;
else sign=0;
diff=hind-fore;
keep sign diff;
cards;
142  138
140  136
144  147
144  139
142  143
146  141
149  143
150  145
142  136
148  146
;
run;
proc univariate noprint;
        var diff;
        output out=new t=t probt=pt msign=m probm=ps signrank=wil probs=pwil;
proc print;
        run;
 
(back to the top)

Code 6 - Fisher's Exact Test for enumeration data
(back to the top)
********************************************************;
******* Example of Data Step with use of cards ********;
******* Example of Fisher's Exact test ****************;
********************************************************;

********************************************************;
* STEP 1: creates a data set for Fisher's exact test ***;
********************************************************;
data weight;
      do a=1 to 2;
      do b=1 to 2;
      input wt @@;
      output;
      end;
      end;
      cards;
386 1722 27 115
      ;
      run;
proc print data=weight;
      run;

********************************************************;
* STEP 2: Fisher's exact test using PROC FREQ **********;
********************************************************;
Title1'Fisher Exact Test for 2 by 2 table';
proc freq data=weight;
      weight wt;
      tables a*b /exact;
      run;
 

(back to the top)

Code 7 - 1-sample binomial test for proportion
(back to the top)
************************************************************************;
***          BINOMIAL TEST with Normal approximation                  **;
***    This program tests if a given proportion is different          **;
***    from a null proportion. Two values need to be enter: sample    **;
***    size (n) and number of observations with positive outcome (X). **;
***      Written by M. Kowalewski, Virginia Tech, 2/7/2000            **;
************************************************************************;

******** ENTER THE FOLLOWING MACRO VARIABLES ***************************;
%let X=18;       ** X - number of observations = "1" ("present");
%let n=30;    ** n - total number of observations;
%let p=.5;      ** proportion postulated by the null hypothesis;
************************************************************************;

%let p1=&X/&n;
%let n1=&X;
%let n2=&n-&X;
data bintest;
      theor1=1-probbnml(&p,&n,&X-1);
      theor2=probbnml(&p,&n,&n2)+theor1;
      n=&n1+&n2;
      z1=abs(&n1-((&n1+&n2)*0.5))-0.5;
      z2=sqrt(&n1*&p1*(1-&p1));
      z=z1/z2;
      d=(&n1+&n2)-1;
      prob=(1-probt(abs(z),d))*2;
      p=&p1;
      q=1-p;
      n=&n1+&n2;
      keep n z prob theor1 theor2 p q;
      run;
Title1'Binomial Test for proportions';
Title2'written by Michal Kowalewski, October 4, 1995';
proc print data=bintest noobs split='*';
      label n='number of observations';
      label prob='two-tailed probability with normal approximation';
      label theor1='one-tailed binomial probability';
      label theor2='two-tailed binomial probability';
      label p='proportion of observations with state=1';
      label q='proportion of observations with state=0';
      label z='z-value';
      run;
************************************************************************;
 
 

(back to the top)

Code 8 - confidence intervals around proportions
(back to the top)
***********************************************************************;
***    This program calculates confidence limits for  proportions    **;
***    Two values need to be enter: sample size (n) and number of    **;
***    observations with positive outcome (X).                       **;
***      Exact confidence intervals for proportion based on          **;
***    binomial and F distributions (eq.: 22.26-27, Zar 1984, p.378) **;
***      Written by M. Kowalewski, Virginia Tech                     **;
***********************************************************************;

******** ENTER THE FOLLOWING MACRO VARIABLES *************************;
%let X=187;         ** X - number of observations = "1" ("present");
%let n=1140;       ** n - total number of observations;
%let alpha=0.975; ** alpha - sets confidence interval;
***********************************************************************;
DATA binom;
      do i=0 to i=1;
      n=&n;
      v1=2*(&n-&X+1);
      v2=2*&X;
      f=&alpha;
      f1=FINV(f,v1,v2);
      f2=FINV(f,v2+2,v1-2);
      p=&X/&n;
      L=&X/(&X+(f1*(&n-&X+1)));
      U=((&X+1)*F2)/((F2*(&X+1))+&n-&X);
      q=1-(&X/&n);
      qL=1-U;
      qU=1-L;
      end;
      keep n p L U q qL qU;
Title1'Exact confidence intervals for proportions (Fisher & Yates 1963)';
Title2'written by Michal Kowalewski, October 4, 1995';
PROC PRINT data=binom noobs split='*';
      label n='number of observations';
      label p='proportion of observations with state=1';
      label L='95% lower confidence limit for p';
      label U='95% upper confidence limit for p';
      label q='proportion of observations with state=0';
      label qL='95% lower confidence limit for q';
      label qU='95% upper confidence limit for q';
      RUN;
      QUIT;
***********************************************************************;

(back to the top)

Code 9 - correlation and regression

* - Correlation and regression analysis using Zar's (1999) data from example 19.1;
data zarcorr;
        infile cards;
        input X Y;
cards;
10.4 7.4
10.8 7.6
11.1 7.9
10.2 7.2
10.3 7.4
10.2 7.1
10.7 7.4
10.5 7.2
10.8 7.8
11.2 7.7
10.6 7.8
11.4 8.3
;
proc corr pearson spearman kendall;
        var x;
        with y;
proc reg;
        model x=y;
        model y=x;
run;
quit;
 

(back to the top)


Code 10 - discriminant analysis

* - discriminant analysis and a posteriori classification of a multivariate dataset;

DATA class00;
        INFILE cards;
        INPUT name $ gender $ status $ age pound feet inch shoe teeth;
        height=round(30.5*(feet+inch/12));
       weight=round(pound*0.4536,.1);
        DROP pound feet inch;
cards;
Matt    M  MS   23  140  5  6      9    30
Monica  F  MS   26  130  5  6.125  5    32
Brooke  F  MS   22  135  5  3.5    6.5  31
Dave    M  PhD  25  130  5  8      10   28
Jason   M  PhD  24  145  5  8      9    28
Carrie  F  PhD  32  149  5  7      7    28
Alan    M  PhD  37  175  5  10     9.5  28
Mike    M  F    35  210  5  8      12   29
;
DATA input;
        SET class00;
        IF name='Mike' THEN DELETE;
        RUN;
PROC SORT;
        BY gender;
PROC PRINT;
DATA test;
        SET class00;
        IF name='Mike';
        RUN;
PROC SORT;
        BY gender;
PROC PRINT;
PROC DISCRIM DATA=input NCAN=1 CROSSVALIDATE CANONICAL DISTANCE
        TESTDATA=test OUTCROSS=scores TESTOUT=testscores TESTOUTD=testclass;
        VAR age shoe teeth height weight;
        CLASS gender;
PROC PRINT DATA=scores;
PROC PRINT DATA=testscores;
PROC PRINT DATA=testclass;
RUN;
QUIT;

(back to the top)


Code 11 -  Randomization and bootstrap tests for Manly's Example 1.1

*****************************************************;
*        TWO_SAMPLE RADNMIZATION AND BOOTSTRAP       ;
*         Example Data from Manly, 1997, p.4         ;
*            written by Michal Kowalewski            ;
*     Department of Geological Sciences              ;
* Virginia Polytechnic Institute & State University  ;
*             February 28, 2000                      ;
*****************************************************;

********** Macrovariables ****************************************;
%let TIMES=4999;             *--number of times to randomize;
%let DATASET=new;            *--name of sasdataset containg data;
%let VAR=mand;               *--name of variable to use;
%let n1=10;                  *--sample size for first sample;
%let n2=10;                  *--sample size for second sample;
%let type=2;                 *--choose "1" for randomization or "2" for bootstrap;
***********************************************************************;

data new;
        infile cards;
        input mand gend $;
cards;
120  m
107  m
110  m
116  m
114  m
111  m
113  m
117  m
114  m
112  m
110  f
111  f
107  f
108  f
110  f
105  f
107  f
106  f
111  f
111  f
;
RUN;

TITLE1'Parametric Ttest';
PROC TTEST;
     CLASS gend;
     VAR mand;
     RUN;

PROC IML;
USE &dataset;
READ all var{&var} into X;

START RANVEC(in,v_out);
      k=nrow(in);
      v_index=in;
   do i=1 to k;
      rand=floor((k-i+1)*ranuni(0) + 1);
      v_ran=v_ran||v_index[rand];
      v_index=remove(v_index,rand);
   end;
      v_out=v_ran;
finish ranvec;

START MIXUP(X,times,template);
      n=nrow(X);
      template=t(1:n)*j(1,times,1);
   do i=1 to times;
      run ranvec(template[,i],out);
      template[,i]=t(out);
   end;
   do i=1 to n;
      run ranvec(t(template[i,]),out);
      template[i,]=out;
   end;
finish mixup;

START DMEAN(X1,X2,D);
      m1=sum(X1)/nrow(X1);
      m2=sum(X2)/nrow(X2);
      D=m1-m2;
finish dmean;

START BOOT(Y,times,dist);
    X1=Y[1:&n1,];
    X2=Y[&n1+1:&n1+&n2,];
    run dmean(X1,X2,ad);
    print 'actual difference between the means is' ad;
    X=X1//X2;
    k=nrow(X1);  j=nrow(X2);
    run mixup(X,times,template);
  do i=1 to times;
    Y1=X[template[1:k,i]];
    Y2=X[template[(k+1):(k+j),i]];
    run dmean(Y1,Y2,D);
    Dq=Dq//D;
  end;
  dmean=Dq;
  act=shape(ad,nrow(dmean),1);
  dist=dmean||act;
finish boot;

start rand(Y,times,dist);
    X1=Y[1:&n1,];
    X2=Y[&n1+1:&n1+&n2,];
    run dmean(X1,X2,ad);
    print 'actual difference between the means is' ad;
    X=X1//X2;
     do i=1 to times;
       run ranvec(X,out);
       out1=t(out[,1:&n1]);
       out2=t(out[,&n1+1:&n1+&n2]);
       run dmean(out1,out2,rd);
       Dq=Dq//rd;
     end;
  dmean=Dq;
  act=shape(ad,nrow(dmean),1);
  dist=dmean||act;
finish rand;

if &type=1 then run rand(X,&times,dist);
if &type=2 then run boot(X,&times,dist);

create OUT from dist [colname={'Dmean' 'Act'}];
     append from dist;
close OUT;

QUIT;
RUN;

proc univariate normal data=out;
   var Dmean;
   run;
proc chart data=out;
   vbar Dmean;
   run;
data prob;
      set out;
      if dmean>=act then p1=1;
      else p1=0;
      if dmean<=act then p2=1;
      else p2=0;
      if abs(dmean)=>abs(act) then p3=1;
      else p3=0;
      run;
proc univariate data=prob noprint;
      var p1 p2 p3;
      output out=last sum=s1 s2 s3 N=n1 n2 n3;
      run;
data final;
      set last;
      p1t=(s1+1)/(n1+1);
      p2t=(s2+1)/(n2+1);
      ptt=(s3+1)/(n3+1);
      n=n1+1;
      keep p1t p2t ptt n;
      run;
Title1*Randomization(1) or Bootstrap(2) test (test type=&type) for difference between two means*;
PROC PRINT data=final noobs split='*';
        label n='number of iterations';
        label p1t='one-tailed probability H[0]: mean1<=mean2';
        label p2t='one-tailed probability H[0]: mean1>=mean2';
        label ptt='two-tailed probability H[0]: mean1 NE mean2';
        run;
 

(back to the top)

Code 12 - Diversity indices (IML code)

This code uses PROC IML to compute Shannon-Weaver [H] and Evenness [E] indices. The data are absolute specimen counts for one sample. The Dataset "tax" is a hypothetical example. The language syntax is capitalized, and arbitary name are typed in lower case. See if you can understand this relatively simple code.

 DATA tax;
        INFILE CARDS;
        INPUT taxa;
CARDS;
210
4
3
2
2
1
;
RUN;

PROC IML;
  USE tax;
  READ ALL VAR{taxa} INTO X;
  START index(Y,S,H);
    X=Y;
    S=l1||NROW(X);
    P=X/SUM(X);
    H=l2||-SUM(P#LOG(P));
  FINISH index;
  START final(X,out);
        Y=X;
        Ymax=SHAPE(SUM(Y)/NROW(Y),NROW(Y),1);
        RUN index(Y,S,H);
        RUN index(Ymax,Smax,Hmax);
        E=H/Hmax;
        out=S//H//Hmax//E;
        l1={'richness'};
        l2={'H index'};
        l3={'Hmax index'};
        l4={'E index'};
        label=l1//l2//l3//l4;
        PRINT label out;
  FINISH final;
  RUN final(X,out);
RUN;

QUIT;
 

(back to the top)