

# Don't Do Significance Testing!


# doesn't answer the questions of interest

# highly dependent on sample size

# in large samples, can pounce on tiny differences, declaring them
# "significant"


> ml <- read.table('u.data',header=FALSE)
> userMeans <- tapply(ml[,3],ml[,1],mean)
> usr <- read.table('u.user',header=FALSE,sep='|')
> age <- usr$V2
> gen <- usr$V3
> summary(lm(userMeans ~ age+gen))
...
Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.4725821  0.0482655  71.947  < 2e-16 ***
age         0.0033891  0.0011860   2.858  0.00436 **
genM        0.0002862  0.0318670   0.009  0.99284
...
# age 40 - age 30: 10 * 0.0034 = 0.034, a tiny effect despite "significant"

# much better: form CI
> rad <- 1.96*0.0011860
> beta <- 0.0033891
> c(beta - rad, beta + rad)
[1] 0.00106454 0.00571366
# 0 not in CI but whole CI very near 0

# note: tests/CIs valid here, as data points (one mean rating per user)
# are independent; not true for the raw ratings in 'ml' above, where each
# user contributes many rows

# if small n:  e.g. CI of (0.48,0.2) for a proportion still informative;
# most of CI is > 0


> age2 <- age^2
> summary(lm(userMeans ~ age+age2+gen))
...
Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)  3.407e+00  1.179e-01  28.908   <2e-16 ***
age          7.274e-03  6.500e-03   1.119    0.263
age2        -5.148e-05  8.469e-05  -0.608    0.543
genM         8.645e-04  3.189e-02   0.027    0.978
...
# overfit? n = 943 but effects small
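# age 40 - age 30 = 10*0.007274 - (40^2 - 30^2)*0.00005148 = 0.037;
# still small but possible quad effect;
# CI requires vcov(), since the difference combines the age and age2
# coefficients and so needs their covariance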

