Bootstrap

An example: estimating the standard error of the mean and of the median, using a sample from a gamma distribution.

library(boot)
set.seed(52)
n = 100
y = rgamma(n, shape = 2, scale = 5)   # sample of size 100 from a Gamma(2, 5)
curve(dgamma(x, shape = 2, scale = 5), 0, 50, ylab = "", lwd = 2, col = "blue")
rug(y, col = "red")

True parameter values: the mean is \(\mu = \text{shape} \cdot \text{scale} = 10\); the gamma median has no simple closed form, so we compute it numerically.

med = qgamma(0.5, shape = 2, scale = 5)
med
[1] 8.392

Estimates

mean(y)
[1] 9.201
median(y)
[1] 8.284

Bootstrap estimate of the standard error.

boot(y,function(x, index){ mean(x[index])}, R=1000)

ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = y, statistic = function(x, index) {
    mean(x[index])
}, R = 1000)


Bootstrap Statistics :
    original   bias    std. error
t1*    9.201 0.009677      0.6166
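
As a quick check (our addition, not in the original code), the bootstrap standard error of the mean can be compared with the usual plug-in estimate \(s/\sqrt{n}\); for a Gamma(2, 5) the theoretical value is \(\sqrt{50}/\sqrt{100} \approx 0.707\).

# plug-in standard error of the mean, for comparison with the bootstrap value
sd(y)/sqrt(n)
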
boot(y,function(x, index){ median(x[index])}, R=1000)

ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = y, statistic = function(x, index) {
    median(x[index])
}, R = 1000)


Bootstrap Statistics :
    original  bias    std. error
t1*    8.284  -0.146      0.8699
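
To make the mechanism explicit, here is a minimal by-hand version of what a nonparametric bootstrap of the median does: resample the data with replacement, recompute the statistic, and summarize the replicates. This is an illustrative sketch; the names R.rep and meds are ours, not from the original.

# by-hand nonparametric bootstrap of the median (illustrative sketch)
R.rep = 1000
meds = replicate(R.rep, median(sample(y, replace = TRUE)))
sd(meds)                # bootstrap standard error
mean(meds) - median(y)  # bootstrap estimate of the bias
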
Another example, from the ISLR lab: for the Portfolio data, \(\alpha\) is the fraction of wealth invested in \(X\) that minimizes the variance of the portfolio \(\alpha X + (1 - \alpha) Y\), namely \(\alpha = (\sigma_Y^2 - \sigma_{XY})/(\sigma_X^2 + \sigma_Y^2 - 2\sigma_{XY})\).

library(ISLR)
# statistic for boot(): plug-in estimate of the minimum-variance weight alpha
alpha.fn = function(data, index){
     X = data$X[index]
     Y = data$Y[index]
     return((var(Y) - cov(X,Y))/(var(X) + var(Y) - 2*cov(X,Y)))
}

alpha.fn(Portfolio,1:100)
[1] 0.5758
set.seed(1)
alpha.fn(Portfolio,sample(100,100,replace=T))
[1] 0.5964
boot(Portfolio,alpha.fn,R=1000)

ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = Portfolio, statistic = alpha.fn, R = 1000)


Bootstrap Statistics :
    original     bias    std. error
t1*   0.5758 -7.315e-05     0.08862
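
A natural follow-up, not run above: save the boot object and ask boot.ci() for a percentile confidence interval for \(\alpha\).

# percentile bootstrap confidence interval for alpha (hypothetical follow-up)
b = boot(Portfolio, alpha.fn, R = 1000)
boot.ci(b, type = "perc")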

Prostate data

Description

The data for this example come from a study by Stamey et al. (1989). They examined the correlation between the level of prostate-specific antigen and a number of clinical measures in men who were about to receive a radical prostatectomy. The variables are log cancer volume (lcavol), log prostate weight (lweight), age, log of the amount of benign prostatic hyperplasia (lbph), seminal vesicle invasion (svi), log of capsular penetration (lcp), Gleason score (gleason), and percent of Gleason scores 4 or 5 (pgg45). The dataset also contains the response, log prostate-specific antigen (lpsa), and a logical train indicator marking the training observations.

We read the data

# dati  = read.table("~/Dropbox/Public/gmm/static/R/prostate.txt", header = TRUE)
dati = read.table("http://labdisia.disia.unifi.it/gmm/downloads/files/prostate.txt", header = TRUE)
head(dati)
   lcavol lweight age   lbph svi    lcp gleason pgg45    lpsa train
1 -0.5798   2.769  50 -1.386   0 -1.386       6     0 -0.4308  TRUE
2 -0.9943   3.320  58 -1.386   0 -1.386       6     0 -0.1625  TRUE
3 -0.5108   2.691  74 -1.386   0 -1.386       7    20 -0.1625  TRUE
4 -1.2040   3.283  58 -1.386   0 -1.386       6     0 -0.1625  TRUE
5  0.7514   3.432  62 -1.386   0 -1.386       6     0  0.3716  TRUE
6 -1.0498   3.229  50 -1.386   0 -1.386       6     0  0.7655  TRUE
summary(dati)
     lcavol          lweight          age            lbph            svi       
 Min.   :-1.347   Min.   :2.38   Min.   :41.0   Min.   :-1.39   Min.   :0.000  
 1st Qu.: 0.513   1st Qu.:3.38   1st Qu.:60.0   1st Qu.:-1.39   1st Qu.:0.000  
 Median : 1.447   Median :3.62   Median :65.0   Median : 0.30   Median :0.000  
 Mean   : 1.350   Mean   :3.63   Mean   :63.9   Mean   : 0.10   Mean   :0.216  
 3rd Qu.: 2.127   3rd Qu.:3.88   3rd Qu.:68.0   3rd Qu.: 1.56   3rd Qu.:0.000  
 Max.   : 3.821   Max.   :4.78   Max.   :79.0   Max.   : 2.33   Max.   :1.000  
      lcp            gleason         pgg45            lpsa       
 Min.   :-1.386   Min.   :6.00   Min.   :  0.0   Min.   :-0.431  
 1st Qu.:-1.386   1st Qu.:6.00   1st Qu.:  0.0   1st Qu.: 1.732  
 Median :-0.798   Median :7.00   Median : 15.0   Median : 2.592  
 Mean   :-0.179   Mean   :6.75   Mean   : 24.4   Mean   : 2.478  
 3rd Qu.: 1.179   3rd Qu.:7.00   3rd Qu.: 40.0   3rd Qu.: 3.056  
 Max.   : 2.904   Max.   :9.00   Max.   :100.0   Max.   : 5.583  
   train        
 Mode :logical  
 FALSE:30       
 TRUE :67       
 NA's :0        

We compute the correlation matrix

round(cor(dati),2)
        lcavol lweight  age  lbph   svi   lcp gleason pgg45  lpsa train
lcavol    1.00    0.28 0.22  0.03  0.54  0.68    0.43  0.43  0.73 -0.05
lweight   0.28    1.00 0.35  0.44  0.16  0.16    0.06  0.11  0.43 -0.01
age       0.22    0.35 1.00  0.35  0.12  0.13    0.27  0.28  0.17  0.18
lbph      0.03    0.44 0.35  1.00 -0.09 -0.01    0.08  0.08  0.18 -0.03
svi       0.54    0.16 0.12 -0.09  1.00  0.67    0.32  0.46  0.57  0.03
lcp       0.68    0.16 0.13 -0.01  0.67  1.00    0.51  0.63  0.55 -0.04
gleason   0.43    0.06 0.27  0.08  0.32  0.51    1.00  0.75  0.37 -0.04
pgg45     0.43    0.11 0.28  0.08  0.46  0.63    0.75  1.00  0.42  0.10
lpsa      0.73    0.43 0.17  0.18  0.57  0.55    0.37  0.42  1.00 -0.03
train    -0.05   -0.01 0.18 -0.03  0.03 -0.04   -0.04  0.10 -0.03  1.00

Scatterplot matrix

The blue points are the training data.

pairs(dati[,c(1,2,3,4,6,8,9)], col = c('red', 'RoyalBlue')[1 + dati$train], pch = 20)

Least squares fit

We fit a linear model to the log of prostate-specific antigen, lpsa, after first standardizing the predictors to have unit variance. The dataset comes already randomly split, via the train indicator, into a training set of size 67 and a test set of size 30.

Standardization (including the qualitative variables)

# alternative: standardize only the quantitative predictors
#for(i in c(1,2,3,4,6,8)) dati[,i] = (dati[,i] - mean(dati[,i]))/sd(dati[,i])
for(i in 1:8) dati[,i] = (dati[,i] - mean(dati[,i]))/sd(dati[,i])  # all eight predictors
tr = (1:97)[dati$train]  # indices of the 67 training observations
fit1 = lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45,subset = tr, data = dati)
summary(fit1)

Call:
lm(formula = lpsa ~ lcavol + lweight + age + lbph + svi + lcp + 
    gleason + pgg45, data = dati, subset = tr)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.6487 -0.3415 -0.0542  0.4494  1.4868 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   2.4649     0.0893   27.60  < 2e-16 ***
lcavol        0.6795     0.1266    5.37  1.5e-06 ***
lweight       0.2631     0.0956    2.75   0.0079 ** 
age          -0.1415     0.1013   -1.40   0.1681    
lbph          0.2101     0.1022    2.06   0.0443 *  
svi           0.3052     0.1236    2.47   0.0165 *  
lcp          -0.2885     0.1545   -1.87   0.0670 .  
gleason      -0.0213     0.1452   -0.15   0.8839    
pgg45         0.2670     0.1536    1.74   0.0875 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.712 on 58 degrees of freedom
Multiple R-squared:  0.694, Adjusted R-squared:  0.652 
F-statistic: 16.5 on 8 and 58 DF,  p-value: 2.04e-12

Mean prediction error on test data

mean((dati$lpsa[-tr] - predict(fit1, dati[-tr,]))^2)
[1] 0.5213

Mean prediction error on test data using the mean training value of lpsa (base error rate)

mean((dati$lpsa[-tr] - mean(dati$lpsa[tr]))^2)
[1] 1.057

Hence the linear model reduces the base error rate by about 50%.
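
The 50% figure follows directly from the two errors above (a one-line check, our addition):

# relative reduction in test error: about 0.51
1 - 0.5213/1.057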

Subset selection

library(leaps)
a = regsubsets(lpsa ~ ., data = dati)  # note: the train indicator also enters as a candidate predictor (trainTRUE)
regsum = summary(a)
regsum
Subset selection object
Call: regsubsets.formula(lpsa ~ ., data = dati)
9 Variables  (and intercept)
          Forced in Forced out
lcavol        FALSE      FALSE
lweight       FALSE      FALSE
age           FALSE      FALSE
lbph          FALSE      FALSE
svi           FALSE      FALSE
lcp           FALSE      FALSE
gleason       FALSE      FALSE
pgg45         FALSE      FALSE
trainTRUE     FALSE      FALSE
1 subsets of each size up to 8
Selection Algorithm: exhaustive
         lcavol lweight age lbph svi lcp gleason pgg45 trainTRUE
1  ( 1 ) "*"    " "     " " " "  " " " " " "     " "   " "      
2  ( 1 ) "*"    "*"     " " " "  " " " " " "     " "   " "      
3  ( 1 ) "*"    "*"     " " " "  "*" " " " "     " "   " "      
4  ( 1 ) "*"    "*"     " " "*"  "*" " " " "     " "   " "      
5  ( 1 ) "*"    "*"     "*" "*"  "*" " " " "     " "   " "      
6  ( 1 ) "*"    "*"     "*" "*"  "*" " " " "     "*"   " "      
7  ( 1 ) "*"    "*"     "*" "*"  "*" "*" " "     "*"   " "      
8  ( 1 ) "*"    "*"     "*" "*"  "*" "*" "*"     "*"   " "      
plot(regsum$adjr2, xlab = "Number of variables", ylab = "Adjusted R2", type = "b", col = "red")

which.max(regsum$adjr2)
[1] 7
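
Adjusted \(R^2\) selects the model with 7 variables. To inspect it, one could extract its coefficients with coef() (a follow-up sketch; output not shown in the original):

# coefficients of the best 7-variable subset
coef(a, 7)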

Ridge regression

library(glmnet)
x = model.matrix(lpsa ~ ., data = dati)[,-1]  # predictor matrix, intercept column dropped
y = dati$lpsa

set.seed(1)
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]

# grid=10^seq(3,-2,length=100) # you can override the default 
#ridge.mod=glmnet(x,y,alpha=0, standardize = FALSE)
ridge.mod=glmnet(x[train,],y[train],alpha=0, thresh=1e-12)
plot(ridge.mod, "lambda", label = TRUE)
legend("topright", legend = paste(1:9, colnames(x)),cex = .7 )

grid=10^seq(-3,3,length=50) 
cv.out=cv.glmnet(x[train ,],y[train],alpha=0, lambda = grid)
plot(cv.out)

lamin = cv.out$lambda.min
lamin
[1] 0.03907
log(lamin)
[1] -3.242

Prediction

ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid, thresh=1e-12)
ridge.pred=predict(ridge.mod,  s=lamin,  newx=x[test,])
mean((ridge.pred-y.test)^2)
[1] 0.8311
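
For reference (our addition, mirroring the base error rate computed earlier), predicting every test observation with the training mean gives:

# base error rate on this train/test split
mean((mean(y[train]) - y.test)^2)
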
out = glmnet(x,y,alpha=0)
predict(out, type="coefficients",s=lamin)
10 x 1 sparse Matrix of class "dgCMatrix"
                    1
(Intercept)  2.479914
lcavol       0.580079
lweight      0.258792
age         -0.124325
lbph         0.124427
svi          0.283827
lcp         -0.055632
gleason      0.045864
pgg45        0.096492
trainTRUE   -0.002211

Note that ridge shrinks the coefficients but sets none of them exactly to zero; compare with the lasso fits below.

Lasso

lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
plot(lasso.mod)

set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam
[1] 0.009874
log(bestlam)
[1] -4.618
lam1se = cv.out$lambda.1se  # largest lambda whose CV error is within one standard error of the minimum
lam1se
[1] 0.1938
log(lam1se)
[1] -1.641
e = cv.out$cvup[cv.out$lambda==bestlam]  # CV error at lambda.min plus one standard error
abline(h = e, lty = 2)                   # threshold used by the one-standard-error rule

coef(lasso.mod, s = bestlam)
10 x 1 sparse Matrix of class "dgCMatrix"
                   1
(Intercept)  2.18590
lcavol       0.64060
lweight      0.33355
age         -0.19103
lbph         0.19882
svi          0.08672
lcp         -0.12910
gleason      0.12527
pgg45        0.08932
trainTRUE    0.16642
coef(lasso.mod, s = lam1se)
10 x 1 sparse Matrix of class "dgCMatrix"
                   1
(Intercept) 2.313950
lcavol      0.434219
lweight     0.240813
age         .       
lbph        .       
svi         .       
lcp         .       
gleason     0.028357
pgg45       0.001893
trainTRUE   .       
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean((lasso.pred-y.test)^2)
[1] 0.8384

On this split the lasso test error (0.838) is essentially the same as the ridge error above (0.831).
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=lam1se)
lasso.coef
10 x 1 sparse Matrix of class "dgCMatrix"
                 1
(Intercept) 2.4784
lcavol      0.5352
lweight     0.1319
age         .     
lbph        .     
svi         0.1499
lcp         .     
gleason     .     
pgg45       .     
trainTRUE   .     

Exercises 10

  1. Writing the criterion of the ridge estimator as \[ RSS(\lambda) = (y - X\beta)^T (y - X\beta) + \lambda \beta^T\beta \] derive the explicit form of the ridge estimator \[ \hat \beta = (X^T X + \lambda I)^{-1} X^T y \]
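
A sketch of the derivation (a hint, not part of the original exercise text): setting the gradient of the criterion to zero, \[ \frac{\partial RSS(\lambda)}{\partial \beta} = -2 X^T (y - X\beta) + 2\lambda\beta = 0 \quad\Longrightarrow\quad (X^T X + \lambda I)\,\hat\beta = X^T y, \] and since \(X^T X + \lambda I\) is positive definite for \(\lambda > 0\), it is invertible, which yields the stated form.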