# ------------------------------------------------------------------------------
# Program:    publicVIFfncs.R
#
# Written by: D.J. Dupuis
#             26 July 2012
#
# Purpose:    Implementation of
#             (1) robust VIF regression of Dupuis and Victoria-Feser (2012, AOAS)
#             (2) VIF regression of Lin et al. (2011, JASA)
# ------------------------------------------------------------------------------

library(robustbase)
library(mvtnorm)
library(MASS)

options(digits = 3)

fastpickVIFrob <- function(y, matx, m, w0, dw, cc = 4.685, seed = 20) {
  # y    = response
  # matx = covariate matrix
  # m    = sub-sample size
  # w0   = initial wealth
  # dw   = pay-out
  # cc   = 3.8827 -> 0.90 eff;  cc = 4.685 -> 0.95 eff
  # seed = seed for sub-sampling
  set.seed(seed)                                  # to reproduce results
  n <- length(y)
  p <- dim(matx)[2]
  matx <- as.matrix(matx)
  matx <- t(t(matx) - apply(matx, 2, mean))       # Center variables
  y <- as.vector(y) - mean(y)

  ptm <- proc.time()                              # Start Model Selection
  model <- rep(NA, 0)                             # Initialize
  r <- y - mean(y)                                # r = residual, mean model
  shat <- mad(y)                                  # Robust scale
  w <- w0
  f <- 0

  # Calculate efficiency of the Tukey biweight M-estimator at the normal:
  # ec = (E[psi'])^2 / E[psi^2]
  mfn <- function(r, cc) { (5*(r/cc)^4 - 6*(r/cc)^2 + 1)*dnorm(r) }
  mfd <- function(r, cc) { r^2*((r/cc)^2 - 1)^4*dnorm(r) }
  ec <- integrate(mfn, -cc, cc, cc = cc)$value^2 /
        integrate(mfd, -cc, cc, cc = cc)$value

  # Compute all 1-var models
  one.w <- list()
  for (i in 1:p) {
    # M-est with Huber weights (default c = 1.345); l.out$coef are marginal betas
    l.out <- rlm(matx[, i], y, method = "M", psi = psi.huber, scale.est = "MAD")
    errs <- y - l.out$fitted.values
    m.sca <- mad(errs)                            # Use mad to estimate sigma
    resids <- errs/m.sca
    one.w[[i]] <- tukeyPsi1(resids, cc, deriv = 0)/resids   # Tukey's bi-weights
  }
  ws.m <- matrix(unlist(one.w), ncol = p)         # matrix of marginal weights

  # Intercept only in model
  matxw <- matxw2 <- cbind(rep(1, n))
  beta.0 <- solve(t(matxw) %*% matxw) %*% t(matxw2) %*% y
  errs.0 <- y - cbind(rep(1, n)) %*% beta.0
  scal.0 <- mad(errs.0)
  resi.0 <- errs.0/scal.0
  w.0 <- tukeyPsi1(resi.0, cc, deriv = 0)/resi.0

  yw <- sqrt(w.0)*y
  x.sel <- cbind(rep(1, n), matx[, model])        # model = intercept only
  Xsw <- cbind(x.sel[, 1])
  betaS <- solve(t(Xsw) %*% Xsw) %*% t(Xsw) %*% yw
  rSw <- yw - Xsw %*% betaS
  k2 <- numeric()

  for (i in 1:p) {                                # ALL x_i considered ONCE
    alpha <- w/(1 + i - f)
    # Fast Evaluation Procedure
    #
    # Step 3
    zw <- sqrt(ws.m[, i])*matx[, i] /
          drop(sqrt(t(matx[, i]) %*% matx[, i]))  # scaled regressor; drop() coerces the 1x1 cross-product to a scalar
    tzw.zw.inv <- try(solve(t(zw) %*% zw))
    if (is.character(tzw.zw.inv)) {
      next
    } else {
      gnew <- tzw.zw.inv %*% t(zw) %*% rSw        # reg no intercept
    }
    shat <- mad(rSw - zw %*% gnew)                # 19 August 2011
    # Step 4
    if (m == n) index <- 1:n else index <- sample(1:n, m)
    Xsw.sub <- Xsw[index, ]
    XswTXswinv <- try(solve(t(Xsw.sub) %*% Xsw.sub))
    if (is.character(XswTXswinv)) next
    Hsw.sub <- Xsw.sub %*% XswTXswinv %*% t(Xsw.sub)
    zw.sub <- zw[index]
    zwsubzwsubinv <- try(solve(t(zw.sub) %*% zw.sub))
    if (is.character(zwsubzwsubinv)) next
    R2w <- t(zw.sub) %*% Hsw.sub %*% zw.sub * zwsubzwsubinv
    rhow <- 1 - R2w
    tnew <- (rhow^(-1/2))*gnew /
            sqrt((shat^2)/n*(mean(zw^2))^(-1)*ec^(-1))
    if (is.na(tnew)) next
    if (2*(1 - pnorm(abs(tnew))) < alpha) {
      x.tmp <- cbind(x.sel, matx[, i])
      # Check that we can get betahat
      matxw.tmp <- cbind(matxw, matx[, i]*sqrt(ws.m[, i]))
      matxw2.tmp <- cbind(matxw2, matx[, i]*ws.m[, i])
      tmatxw.matxw.inv <- try(solve(t(matxw.tmp) %*% matxw.tmp))
      if (is.character(tmatxw.matxw.inv)) next    # can't get beta0
      beta.0 <- tmatxw.matxw.inv %*% t(matxw2.tmp) %*% y
      errs.0 <- y - x.tmp %*% beta.0
      scal.0 <- mad(errs.0)
      resi.0 <- errs.0/scal.0
      w.0 <- tukeyPsi1(resi.0, cc, deriv = 0)/resi.0

      yw <- sqrt(w.0)*y
      Xsw <- numeric()
      for (j in 1:(dim(x.tmp)[2] - 1)) Xsw <- cbind(Xsw, x.tmp[, j + 1]*sqrt(w.0))
      Xsw <- cbind(rep(1, n), Xsw)
      tXsw.Xsw.inv <- try(solve(t(Xsw) %*% Xsw))
      if (is.character(tXsw.Xsw.inv)) {
        next                                      # can't get beta
      } else {
        betaS <- tXsw.Xsw.inv %*% t(Xsw) %*% yw
      }
      if (!all(!is.na(betaS))) next               # all betas not NA
      model <- c(model, i)
      x.sel <- x.tmp
      matxw <- matxw.tmp
      matxw2 <- matxw2.tmp
      rSw <- yw - Xsw %*% betaS
      w <- w + dw                                 # variable accepted: wealth grows by the pay-out
      f <- i
    } else {
      w <- w - alpha/(1 - alpha)                  # variable rejected: wealth is spent
    }
    if (w <= 0) break
  }
  s.t <- (proc.time() - ptm)[1]                   # End model selection #################

  return(list(model = model,
              pent  = ifelse(is.na(model[1]), 0, length(model)),
              s.t   = s.t))
}

#
fastpickVIF <- function(y, matx, m, w0, dw, seed = 20) {
  # y    = response
  # matx = covariates
  # m    = sub-sample size
  # w0   = initial wealth
  # dw   = pay-out
  set.seed(seed)                                  # to reproduce results
  n <- length(y)
  p <- dim(matx)[2]                               # number of covariates
  # Center
  matx <- as.matrix(matx)
  matx <- t(t(matx) - apply(matx, 2, mean))
  y <- as.vector(y) - mean(y)

  ptm <- proc.time()                              # Start Model Selection
  model <- rep(NA, 0)                             # Initialize
  r <- y - mean(y)                                # r = residual, mean model
  shat <- sd(y)
  i <- 1
  w <- w0
  f <- 0

  for (i in 1:p) {                                # ALL regressors ONCE
    alpha <- w/(1 + i - f)
    # Fast Evaluation Procedure
    # Step 3
    gnew <- t(matx[, i]) %*% r/sqrt(t(matx[, i]) %*% matx[, i])
    # Step 4
    if (m == n) { index <- 1:n } else { index <- sample(1:n, m) }
    lm2 <- lm(matx[index, i] ~ cbind(rep(1, m), matx[index, model]) - 1)
    r.squared <- summary(lm2)$r.squared
    tnew <- gnew/(shat*sqrt(1 - r.squared))
    if (is.na(tnew)) next
    if (2*(1 - pnorm(abs(tnew))) < alpha) {
      model <- c(model, i)
      lm3 <- lm(y ~ matx[, model] - 1)
      if (!all(!is.na(lm3$coef))) next
      r <- lm3$residuals
      shat <- sqrt(sum(lm3$residuals^2)/(n - length(model) - 1))
      w <- w + dw
      f <- i
    } else {
      w <- w - alpha/(1 - alpha)
    }
    if (w <= 0) break
  }
  if (length(model) == 0) { model <- NA }
  s.t <- (proc.time() - ptm)[1]                   # End model selection #################

  return(list(model = model,
              pent  = ifelse(is.na(model[1]), 0, length(model)),
              s.t   = s.t))
}
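
# ------------------------------------------------------------------------------
# Example usage: a minimal sketch of how the two selectors might be called on
# simulated data. The data-generating model, sub-sample size m, initial wealth
# w0, and pay-out dw below are illustrative assumptions, not values from the
# original paper. Kept commented out so that sourcing this file only defines
# the functions.
#
# set.seed(1)
# n <- 200; p <- 50
# X <- matrix(rnorm(n*p), n, p)               # candidate covariates
# y <- X[,1] - X[,2] + 0.5*X[,3] + rnorm(n)   # only the first three are active
#
# out.rob <- fastpickVIFrob(y, X, m = n, w0 = 0.5, dw = 0.05, cc = 4.685)
# out.lin <- fastpickVIF(y, X, m = n, w0 = 0.5, dw = 0.05)
#
# out.rob$model   # indices retained by the robust VIF search
# out.lin$model   # indices retained by the VIF regression of Lin et al. (2011)
# ------------------------------------------------------------------------------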