|
Copyright Note - All codes posted here were written by M. KowalewskiCode 1 - Sampling means drawn from normal distributions
Code 2 - 1-sample two-tailed t-test (parametric and bootstrap)
Code 3 - 1-sample tests about central tendency (Proc Univariate)
Code 4 - Simulating binomial distributions using ranbin function
Code 5 - Paired Sample tests (t, Wilcoxon, Rank) (Proc Univariate)
Code 6 - Fisher's Exact Test for enumeration data
Code 7 - 1-sample binomial test for proportion
Code 8 - confidence intervals around proportions
Code 9 - correlation and regression
Code 10 - discriminant analysis
Code 11 - randomization and bootstrap tests for Manly's Example 1.1
Code 12 - IML code for diversity/evenness indices
* - PART 1. One sampled two-tailed t-test
for null hypothesis mean ***;
DATA zar;
infile cards;
input rats;
CARDS;
1.7
0.7
-0.4
-1.8
0.2
0.9
-1.2
-0.9
-1.8
-1.4
-1.8
-2.0
;
PROC UNIVARIATE noprint;
output out=param mean=mean n=n
std=std stderr=stderr t=t probt=probt;
PROC PRINT;
run;
* PART 2: T-distribution for
DF=11;
DATA
tdist;
DO i=0.0001 to
0.9999 by 0.0001;
x=tinv(i,11);
if
abs(x)>=1.79807 then y=0.0001;
else y=0;
output;
END;
keep x y;
PROC
chart;
vbar
x;
PROC UNIVARIATE
noprint;
var
y;
output out=prob
sum=sum;
proc
print;
run;
* PART 3: Bootstrap analysis for H[0]:
mean=0 for data from example 7.2 above ***;
OPTIONS linesize=80 nodate;
**********(Enter these
values)****************************************;
%let boot=1000; *--number of bootstrap
iterations;
%LET
NULL=0; *--true mean of the
population;
***********************************************************************;
PROC iml symsize=1000 worksize=1000;
*--sets memory allocations;
USE
ZAR;
*--reads data;
READ all
var{rats} into X;
START
RANVEC(in,v_out);
*--creates a vector of random integers;
k=nrow(in);
v_index=in;
do i=1 to k;
rand=floor((k-i+1)*ranuni(0) + 1);
v_ran=v_ran||v_index[rand];
v_index=remove(v_index,rand);
end;
v_out=v_ran;
FINISH RANVEC;
START
MIXUP(X,times,template);
*--creates a template of random values;
n=nrow(X);
template=t(1:n)*j(1,times,1);
do i=1 to times;
run
ranvec(template[,i],out);
template[,i]=t(out);
end;
do i=1 to n;
run
ranvec(t(template[i,]),out);
template[i,]=out;
end;
FINISH MIXUP;
START
BOOT(in,times,out);
*--bootstrap for 1-sample 2-tailed test for means;
X=in;
n=nrow(X);
dm=sum(X)/n-&null; *- Computes
the difference between the sample mean and null value;
var=(sum(X##2)-sum(X)**2/nrow(X))/(n-1); *- Computes sample
variance;
stderr=sqrt(var/n); *- Computes standard
error;
t=dm/stderr;
*- Computes student's t-statistic;
PRINT T;
run
mixup(X,times,template);
do i=1 to times;
y=X[template[,i]];
nr=nrow(Y);
rm=sum(Y)/nr-&null;
rvar=(sum(Y##2)-sum(Y)**2/nrow(Y))/(nr-1);
rerr=sqrt(rvar/nr);
trand=rm/rerr;
if
abs(t)>=abs(trand) then pp=1;
else pp=0;
ppp=ppp//pp;
trr=trr//trand;
VVV=vvv//rvar;
ERR=ERR//rerr;
MMM=MMM//RM;
NNN=NNN//NR;
end;
p=sum(ppp)/times;
print 'bootstrap
probability' p;
meant=sum(trr)/nrow(trr);
print 'mean
t-value' meant;
meanv=sum(vvv)/nrow(vvv);
print 'mean
variance' meanv;
meann=sum(nnn)/nrow(nnn);
print 'sample
size' meann;
meanm=sum(mmm)/nrow(mmm);
print 'mean mean'
meanm;
meanr=sum(err)/nrow(err);
print 'mean
standard error' meanr;
out=trr;
FINISH BOOT;
***********************RUN BOOTSTRAP
MODULE**********************;
RUN boot(X,&boot,dist); *- Executes the
analysis;
CREATE OUT from
dist; *- Creates an output
datafile;
append from
dist;
CLOSE
OUT;
QUIT;
RUN;
******************************************************************;
PROC CHART data=OUT;
vbar col1;
run;
QUIT;
* - example 7.1 from Zar,
1999;
%LET
null=24.3; * - mean postulated by our null
hypothesis;
DATA
sample;
infile cards;
input temp;
var1=temp-&null;
var2=&null-temp;
cards;
25.8
24.6
26.1
22.9
25.1
27.3
24
24.5
23.9
26.2
24.3
24.6
23.3
25.5
28.1
24.8
23.5
26.3
25.4
25.5
23.9
27
24.8
22.9
25.4
;
PROC univariate data=sample
noprint;
var var1 var2;
output out=result mean=mean1 mean2 median=m1 m2 n=n
stderr=sterr msign=sign1 sign2 probm=psign1 psign2
t=t1 t2 probt=pt1 pt2 signrank=wl1 wl2 probs=pwl1 pwl2;
proc print;
run;
quit;
********************************************************;
* STEP 1: creates a data set for Fisher's
exact test ***;
********************************************************;
data weight;
do a=1 to
2;
do b=1 to 2;
input wt
@@;
output;
end;
end;
cards;
386 1722 27 115
;
run;
proc print
data=weight;
run;
********************************************************;
* STEP 2: Fisher's exact test using PROC
FREQ **********;
********************************************************;
Title1'Fisher Exact Test for 2 by 2
table';
proc freq
data=weight;
weight wt;
tables a*b
/exact;
run;
******** ENTER THE FOLLOWING MACRO
VARIABLES ***************************;
%let X=18; ** X
- number of observations = "1" ("present");
%let n=30; ** n - total number of
observations;
%let
p=.5; ** proportion postulated by the null
hypothesis;
************************************************************************;
%let p1=&X/&n;
%let n1=&X;
%let n2=&n-&X;
data bintest;
theor1=1-probbnml(&p,&n,&X-1);
theor2=probbnml(&p,&n,&n2)+theor1;
n=&n1+&n2;
z1=abs(&n1-((&n1+&n2)*0.5))-0.5;
z2=sqrt(&n1*&p1*(1-&p1));
z=z1/z2;
d=(&n1+&n2)-1;
prob=(1-probt(abs(z),d))*2;
p=&p1;
q=1-p;
n=&n1+&n2;
keep n z prob theor1 theor2 p
q;
run;
Title1'Binomial Test for proportions';
Title2'written by Michal Kowalewski,
October 4, 1995';
proc print
data=bintest noobs split='*';
label n='number of
observations';
label prob='two-tailed probability with
normal approximation';
label theor1='one-tailed binomial
probability';
label theor2='two-tailed binomial
probability';
label p='proportion of observations with
state=1';
label q='proportion of observations with
state=0';
label z='z-value';
run;
************************************************************************;
******** ENTER THE FOLLOWING MACRO
VARIABLES *************************;
%let X=187; ** X -
number of observations = "1" ("present");
%let n=1140; **
n - total number of observations;
%let alpha=0.975; ** alpha - sets confidence interval;
***********************************************************************;
DATA binom;
do i=0 to
i=1;
n=&n;
v1=2*(&n-&X+1);
v2=2*&X;
f=α
f1=FINV(f,v1,v2);
f2=FINV(f,v2+2,v1-2);
p=&X/&n;
L=&X/(&X+(f1*(&n-&X+1)));
U=((&X+1)*F2)/((F2*(&X+1))+&n-&X);
q=1-(&X/&n);
qL=1-U;
qU=1-L;
end;
keep n p L U q qL qU;
Title1'Exact confidence intervals for
proportions (Fisher & Yates 1963)';
Title2'written by Michal Kowalewski, October 4,
1995';
PROC PRINT data=binom
noobs split='*';
label n='number of
observations';
label p='proportion of observations with
state=1';
label L='95% lower confidence limit for
p';
label U='95% upper confidence limit for
p';
label q='proportion of observations with
state=0';
label qL='95% lower confidence limit for
q';
label qU='95% upper confidence limit for
q';
RUN;
QUIT;
***********************************************************************;
* - Correlation and regression analysis
using Zar's (1999) data from example 19.1;
data zarcorr;
infile
cards;
input X Y;
cards;
10.4 7.4
10.8 7.6
11.1
7.9
10.2 7.2
10.3 7.4
10.2 7.1
10.7 7.4
10.5
7.2
10.8 7.8
11.2 7.7
10.6 7.8
11.4 8.3
;
proc corr pearson
spearman kendall;
var x;
with y;
proc
reg;
model x=y;
model y=x;
run;
quit;
* - discriminant analysis and a posteriori classification of a multivariate dataset;
DATA class00;
INFILE
cards;
INPUT name $ gender $ status
$ age pound feet inch shoe teeth;
height=round(30.5*(feet+inch/12));
weight=round(pound*0.4536,.1);
DROP pound feet
inch;
cards;
Matt M
MS 23 140 5 6
9 30
Monica F MS 26 130 5
6.125 5 32
Brooke F MS 22 135 5
3.5 6.5 31
Dave M PhD 25 130 5
8 10 28
Jason M PhD 24
145 5 8 9
28
Carrie F
PhD 32 149 5 7
7 28
Alan M PhD 37 175 5
10 9.5 28
Mike M F
35 210 5 8 12
29
;
DATA input;
SET
class00;
IF name='Mike' THEN
DELETE;
RUN;
PROC SORT;
BY
gender;
PROC
PRINT;
DATA
test;
SET class00;
IF name='Mike';
RUN;
PROC SORT;
BY
gender;
PROC
PRINT;
PROC DISCRIM
DATA=input NCAN=1 CROSSVALIDATE CANONICAL DISTANCE
TESTDATA=test OUTCROSS=scores TESTOUT=testscores
TESTOUTD=testclass;
VAR age shoe teeth height
weight;
CLASS gender;
PROC PRINT DATA=scores;
PROC PRINT DATA=testscores;
PROC PRINT DATA=testclass;
RUN;
QUIT;
*****************************************************;
* TWO_SAMPLE RADNMIZATION AND
BOOTSTRAP ;
*
Example Data from Manly, 1997,
p.4 ;
*
written by Michal
Kowalewski
;
*
Department of Geological
Sciences
;
* Virginia Polytechnic
Institute & State University ;
*
February 28,
2000
;
*****************************************************;
********** Macrovariables
****************************************;
%let
TIMES=4999;
*--number of times to randomize;
%let
DATASET=new;
*--name of sasdataset containg data;
%let
VAR=mand;
*--name of variable to use;
%let
n1=10;
*--sample size for first sample;
%let
n2=10;
*--sample size for second sample;
%let
type=2;
*--choose "1" for randomization or "2" for bootstrap;
***********************************************************************;
data new;
infile
cards;
input mand gend
$;
cards;
120 m
107 m
110 m
116 m
114 m
111 m
113 m
117 m
114 m
112 m
110 f
111 f
107 f
108 f
110 f
105 f
107 f
106 f
111 f
111 f
;
RUN;
TITLE1'Parametric Ttest';
PROC TTEST;
CLASS gend;
VAR
mand;
RUN;
PROC IML;
USE &dataset;
READ all var{&var} into X;
START RANVEC(in,v_out);
k=nrow(in);
v_index=in;
do i=1 to k;
rand=floor((k-i+1)*ranuni(0) + 1);
v_ran=v_ran||v_index[rand];
v_index=remove(v_index,rand);
end;
v_out=v_ran;
finish ranvec;
START
MIXUP(X,times,template);
n=nrow(X);
template=t(1:n)*j(1,times,1);
do i=1 to times;
run
ranvec(template[,i],out);
template[,i]=t(out);
end;
do i=1 to n;
run
ranvec(t(template[i,]),out);
template[i,]=out;
end;
finish mixup;
START DMEAN(X1,X2,D);
m1=sum(X1)/nrow(X1);
m2=sum(X2)/nrow(X2);
D=m1-m2;
finish
dmean;
START BOOT(Y,times,dist);
X1=Y[1:&n1,];
X2=Y[&n1+1:&n1+&n2,];
run
dmean(X1,X2,ad);
print 'actual difference between the means is'
ad;
X=X1//X2;
k=nrow(X1); j=nrow(X2);
run mixup(X,times,template);
do i=1 to times;
Y1=X[template[1:k,i]];
Y2=X[template[(k+1):(k+j),i]];
run
dmean(Y1,Y2,D);
Dq=Dq//D;
end;
dmean=Dq;
act=shape(ad,nrow(dmean),1);
dist=dmean||act;
finish boot;
start rand(Y,times,dist);
X1=Y[1:&n1,];
X2=Y[&n1+1:&n1+&n2,];
run
dmean(X1,X2,ad);
print 'actual difference between the means is'
ad;
X=X1//X2;
do i=1 to times;
run
ranvec(X,out);
out1=t(out[,1:&n1]);
out2=t(out[,&n1+1:&n1+&n2]);
run
dmean(out1,out2,rd);
Dq=Dq//rd;
end;
dmean=Dq;
act=shape(ad,nrow(dmean),1);
dist=dmean||act;
finish rand;
if &type=1 then run
rand(X,×,dist);
if
&type=2 then run boot(X,×,dist);
create OUT from dist [colname={'Dmean'
'Act'}];
append from dist;
close OUT;
QUIT;
RUN;
proc univariate normal
data=out;
var
Dmean;
run;
proc chart
data=out;
vbar
Dmean;
run;
data
prob;
set out;
if dmean>=act then
p1=1;
else p1=0;
if dmean<=act then
p2=1;
else p2=0;
if
abs(dmean)=>abs(act) then p3=1;
else p3=0;
run;
proc univariate data=prob
noprint;
var p1 p2 p3;
output out=last
sum=s1 s2 s3 N=n1 n2 n3;
run;
data final;
set
last;
p1t=(s1+1)/(n1+1);
p2t=(s2+1)/(n2+1);
ptt=(s3+1)/(n3+1);
n=n1+1;
keep p1t p2t ptt n;
run;
Title1*Randomization(1)
or Bootstrap(2) test (test type=&type) for difference between two
means*;
PROC PRINT
data=final noobs split='*';
label n='number of
iterations';
label p1t='one-tailed
probability H[0]: mean1<=mean2';
label p2t='one-tailed
probability H[0]: mean1>=mean2';
label ptt='two-tailed
probability H[0]: mean1 NE mean2';
run;
This code uses PROC IML to compute Shannon-Weaver [H] and Evenness [E] indices. The data are absolute specimen counts for one sample. The Dataset "tax" is a hypothetical example. The language syntax is capitalized, and arbitary name are typed in lower case. See if you can understand this relatively simple code.
DATA tax;
INFILE
CARDS;
INPUT taxa;
CARDS;
210
4
3
2
2
1
;
RUN;
PROC IML;
USE tax;
READ ALL VAR{taxa} INTO X;
START index(Y,S,H);
X=Y;
S=l1||NROW(X);
P=X/SUM(X);
H=l2||-SUM(P#LOG(P));
FINISH index;
START
final(X,out);
Y=X;
Ymax=SHAPE(SUM(Y)/NROW(Y),NROW(Y),1);
RUN
index(Y,S,H);
RUN
index(Ymax,Smax,Hmax);
E=H/Hmax;
out=S//H//Hmax//E;
l1={'richness'};
l2={'H index'};
l3={'Hmax index'};
l4={'E index'};
label=l1//l2//l3//l4;
PRINT label
out;
FINISH
final;
RUN
final(X,out);
RUN;
QUIT;