I used two nested case-control and six case-cohort methods in the following paper:
Kim RS, A New Comparison of Nested Case-Control and Case-Cohort Designs and Methods (Under Review)
I originally intended to use existing packages but found inaccuracies in some functions. I have discussed the inaccuracies with R community (here) as well as with one of the authors privately. So I ended up writing my own codes for the methods and I am sharing them here with research community.
For each method, one can easily use ‘robust’ or ‘sandwich’ option in software to compute variance estimators that converge to approximate jackknife estimators (or ‘robust’ estimators by Lin & Wei, 1989) as sample size increases to the size of full cohorts. When someone has already reported its use, I indicated the names (e.g. Lin and Ying). Otherwise, I named them Type B variance estimators.
inclusion.prob <- inclusion.prob.ncc.f(data=dt,t.entry.nm=t.entry.nm, t.exit.nm=t.exit.nm, fail.nm=fail.nm,controls=m,risk.table=risk.table,match.nm=match.nm)
sub.dt.clogit <- nccdata(t.entry.nm = t.entry.nm, t.exit.nm = t.exit.nm, fail.nm=fail.nm,controls = m, match.nm =NULL, data=dt,risk.table=risk.table,inclusion.prob=inclusion.prob)
sub.dt.ncc <- sub.dt.clogit[!duplicated(sub.dt.clogit$Map),]
sub.dt.ncc <- cbind(sub.dt.ncc,cluster.id=1:nrow(sub.dt.ncc))
n.ncc<-nrow(sub.dt.ncc);n0.ncc<-sum(sub.dt.ncc[,fail.nm]==0);N0<-sum(dt[,fail.nm]==0)
cch.p =n0.ncc/N0 #To ensure expected sample size of ncc and cch are the same
CCH <-casecohortdata(dt,cch.p,fail.nm)
CCH <- cbind(CCH,cluster.id=1:nrow(CCH))
Nested Case-Control Methods
Thomas (1977)’s Hazard Ratio and Standard Error Estimation (“Conditional Logistic Method”)
First, make a ‘trick’ data set “by including multiple inputs for subjects who are selected multiple times, and recording all randomly selected failures as non-failures.” (Kim 2013)
clogit.fn <- function(formula,data){
fit <- coxph(formula=formula,data=data,method="exact")
res <- list(coef=fit$coef, se=sqrt(diag(fit$var)))
}
formula.clogit <- Surv(time, Fail) ~ X1+X2 + strata(Set)
clogit.fn(formula.clogit,data=sub.dt.clogit)
Samuelsen (1997)’s Hazard Ratio and Kim (2013)’s Standard Error Estimation (“IPW method”)
robust.coxph.fn <-function(formula,sub.dt,wt="wt"){
if(is.null(wt)) {wt<-rep(1,nrow(sub.dt))} else { wt<-sub.dt[,wt]}
fit<-coxph(formula=formula,data=sub.dt, weights=wt)
res<-list(coef=fit$coef,se=sqrt(diag(fit$var)))
}
robust.formula <- Surv(time, status) ~ X1+X2+cluster(cluster.id)
robust.coxph.fn(robust.formula,sub.dt.ncc,wt="wt")
Case-Cohort Methods
Barlow (1994)’s Hazard Ratio and Standard Error Estimation
Barlow (1994) originally proposed two approaches for weighting: 1) using time dependent subcohort proportion π(t) which is the number of subjects at risk among the subcohort at t divided by the number of subjects at risk in the cohort at t, or 2) using the baseline subcohort proportion π. Later Barlow et al. (1999) used only π. Here the program uses the baseline subcohort proportion π.
barlow.dt <- function(CCH,cch.p){
#assumes no ties in failure time & zero entry time
#http://lib.stat.cmu.edu/general/robphreg
eps<-1e-08
wt.temp=1/cch.p
ST <- CCH$status
SC <- CCH$in.subcohort
temp1 <- CCH[ST==1,]
temp1 <- cbind(temp1, T=temp1$time,WWTT=1,start=temp1$time-eps,CASE=1)
situation2<-ST==1 & SC==T
if(sum(situation2)>0){
temp2 <- CCH[ST==1 & SC==T,]
temp2 <- cbind(temp2,T=temp2$time-eps,WWTT=wt.temp,start=temp2$t.entry,CASE=0)
}
temp3 <- CCH[ST==0 & SC==T,]
temp3 <- cbind(temp3,T=temp3$time, WWTT=wt.temp,start=temp3$t.entry,CASE=0)
if(sum(situation2)>0){
res<-rbind(temp1,temp2,temp3)
} else res <-rbind(temp1,temp3)
return(cbind(res,logwt=log(res$WWTT)))
}
barlow.fn <-function(formula=formula.Barlow,data=CCH.barlow){
fit<-coxph(formula=formula,weights=WWTT, data=data)
res<-list(coef=fit$coef,se=sqrt(diag(fit$var)))
}
formula.Barlow<-Surv(start,T, CASE) ~ X1+X2+cluster(cluster.id)
CCH.barlow <- barlow.dt(cbind(CCH, t.entry=0), cch.p)
barlow.fn(formula.Barlow, data=CCH.barlow)
Prentice (1986)’s Hazard Ratio and Self & Prentice (1988)’s Standard Error Estimation
prentice.dt <-function(CCH){
eps <- 1e-08
TM <-CCH$time
start<-CCH$t.entry
SC <-CCH$in.subcohort
start[SC == FALSE] <- TM[SC == FALSE] - eps
return(cbind(CCH,start,TM))
}
CCH.prentice <- prentice.dt(cbind(CCH,t.entry=0))
prentice <-function(formula.prentice,data=CCH.prentice,cohort.size=nrow(dt)){
SC <- data$in.subcohort
fit <- coxph(formula.prentice,data=data,x=T)
dfbeta <- resid(fit,type="dfbeta")
d2 <- dfbeta[SC,]
fit$naive.var <- fit$naive.var + (1- sum(SC)/cohort.size)* t(d2)%*% d2
fit$SelfPrentice.var <- fit$naive.var
fit$AJK.var <- fit$var
fit<-fit[c("coefficients","SelfPrentice.var","AJK.var"]
return(fit)
}
prentice.fn <-function(formula=formula.prentice,data=CCH.prentice){
fit<-prentice(formula=formula,data=data)
res<-list(coef=fit$coef,se=sqrt(diag(fit$SelfPrentice.var)))
}
formula.prentice <- Surv(start, T, status) ~ X1+X2+cluster(cluster.id)
prentice.fn(formula.prentice,data =CCH.prentice)
Prentice (1986)’s Hazard Ratio and Type B (Approximate Jackknife) Standard Error Estimation
prentice.AJK.fn <-function(formula=formula.prentice,data=CCH.prentice){
fit<-prentice(formula=formula,data=data)
res<-list(coef=fit$coef,se=sqrt(diag(fit$AJK.var)))
}
formula.prentice<-Surv(start, T, status) ~ X1+X2+cluster(cluster.id)
prentice.AJK.fn(formula.prentice,data =CCH.prentice)
Self & Prentice (1998)’s Hazard Ratio and Standard Error Estimation
temp1 <- data.frame(CCH[CCH$status==1, ], group=1, dummy= -100)
temp2 <- data.frame(CCH[CCH$in.subcohort==TRUE, ],group=0, dummy= 0)
CCH.SP <- rbind(temp1, temp2)
selfprentice <-function(formula.SP,data=sub.dt,cohort.size=nrow(dt)){
SC <- data$in.subcohort
fit <- coxph(formula.SP,data=data,x=T)
dfbeta <- resid(fit,type="dfbeta")
d2 <- dfbeta[SC,]
fit$SelfPrentice.var <- fit$naive.var + (1- sum(SC)/cohort.size)* t(d2)%*% d2
fit$LinYing.var <- fit$var
fit<-fit[c("coefficients","SelfPrentice.var","LinYing.var")]
return(fit)
}
selfprentice.fn <-function(formula,data=sub.dt,cohort.size=nrow(dt)){
fit<-selfprentice(formula=formula,data=data,cohort.size=cohort.size)
res<-list(coef=fit$coef,se=sqrt(diag(fit$SelfPrentice.var)))
}
formula.SP <- Surv(time, group) ~ X1+X2+offset(dummy) + cluster(cluster.id)
selfprentice.fn(formula.SP, data=CCH.SP, cohort.size=nrow(dt))
Self & Prentice (1998)’s Hazard Ratio and Lin & Ying (1993)’s (Approximate Jackknife) Standard Error Estimation
linying.fn <-function(formula,data=sub.dt,cohort.size=nrow(dt)){
fit<-selfprentice(formula=formula,data=data,cohort.size=cohort.size)
res<-list(coef=fit$coef,se=sqrt(diag(fit$LinYing.var)))
}
formula.SP <- Surv(time, group) ~ X1+X2+offset(dummy) + cluster(cluster.id)
linying.fn(formula.SP, data =CCH.SP, cohort.size=nrow(dt))
Binder (1992)’s Hazard Ratio and Type B (Approximate Jackknife) Standard Error Estimation (“IPW method”)
robust.coxph.fn <-function(formula,sub.dt,nm="Robust.Cox",wt="wt"){
if(is.null(wt)) {wt<-rep(1,nrow(sub.dt))} else { wt<-sub.dt[,wt]}
fit<-coxph(formula=formula,data=sub.dt, weights=wt)
res<-list(coef=fit$coef,se=sqrt(diag(fit$var)))
}
robust.formula <- Surv(time, status) ~ X1+X2+cluster(cluster.id)
robust.coxph.fn(robust.formula,CCH,wt="wt")
Contact me for beta versions of the following:
Samuelsen (1992)’s Hazard Ratio Standard Error Estimation (“IPW method”)
Binder (1992)’s Hazard Ratio and Lin (2000)’s Standard Error Estimation (“IPW method”)
library(survey)
svycoxph.fn<-function(formula,design=design){
fit<-svycoxph(formula,design=design)
list(coef=fit$coef,se=sqrt(diag(fit$var)))
}
formula <- Surv(time, status) ~ X1+X2
svycoxph.fn(formula,design=svydesign(id=~1, weights=~wt, data=CCH))