# Read the raw file 'data.cgd' and derive three data frames from it:
#   cgd0 - one row per subject, as read, with rand.dt turned into a date
#   cgd2 - WLW style: every subject gets exactly 7 rows (one per event slot)
#   cgd1 - Andersen-Gill style: one row per (tstart, tstop] interval
# All other objects created along the way are temporaries and are removed
# at the end of the script.

cgd0 <- read.table(
  "data.cgd",
  na.strings = ".",
  col.names = c(
    "id", "center", "rand.dt", "rx", "sex", "age", "height", "weight",
    "inherit", "steroids", "propylac", "hos.cat", "futime",
    paste0("event", 1:7)
  )
)

# Turn the randomization date into a date object (as it should be).
# rand.dt is read as a single number; the arithmetic below treats it as
# mmddyy and splits it into month, day and year.  The pieces are then
# pasted back together with "/" marks, since the timeDate function isn't
# smart enough to do without them (or I'm missing something).
# NOTE(review): timeDate() is not part of base R, and whether `format`
#   describes the input string or the displayed output depends on which
#   timeDate implementation is in scope.  "%d%b%C" does not describe the
#   "m/d/yyyy" strings built here -- confirm the dates parse as intended.
temp <- cgd0$rand.dt
tempm <- floor(temp / 10000)
tempd <- floor(temp / 100) %% 100
tempy <- 1900 + temp %% 100 # two-digit years are all taken to be 19yy
cgd0$rand.dt <- timeDate(
  charvec = paste(tempm, tempd, tempy, sep = "/"),
  format = "%d%b%C"
)

# Find the max event time for each subject, setting it to 0 for
# those with no events (the leading column of zeros guarantees that).
n <- nrow(cgd0)
etemp <- as.matrix(cgd0[, 14:20]) # columns event1 ... event7
maxtime <- apply(cbind(0, etemp), 1, max, na.rm = TRUE)

#
# Create the WLW style data set
#   everyone gets 7 rows
#
# A missing event time becomes a censored row (status 0) at futime.
tstop <- ifelse(is.na(etemp), cgd0$futime, etemp)
tstat <- ifelse(is.na(etemp), 0, 1)
# c() unrolls the n x 7 matrices column by column, i.e. all subjects for
# event 1, then all subjects for event 2, ...; the row index and enum below
# are laid out in that same order.
cgd2 <- data.frame(
  cgd0[rep(seq_len(n), 7), 1:12],
  time = c(tstop),
  status = c(tstat),
  enum = rep(1:7, rep(n, 7))
)

# A key idea in the above is duplicated subscripts, for instance
#    cgd0[c(1,1,1,1,2,2,4,5,5),]
# is a data set with 4 copies of row 1 of cgd0, 2 copies of row 2, one of row
# 4 and 2 copies of row 5.

# Now, create the Andersen-Gill style data set
#   First, pretend that everyone had all 7 events + more follow-up
#   Then thin things out
# Each subject gets 8 candidate intervals:
#   (0, event1], (event1, event2], ..., (event7, futime]
# On the start side a missing event time is replaced by the subject's last
# observed event time, so the final interval starts at the last event.
# t() followed by c() unrolls subject by subject (8 values per subject).
tstart <- c(t(cbind(0, ifelse(is.na(etemp), maxtime, etemp))))
tstop <- c(t(cbind(etemp, cgd0$futime)))
tstat <- rep(c(1, 1, 1, 1, 1, 1, 1, 0), n) # only the last interval is censored

# Which rows to keep: the stop time must exist and the interval must have
# positive length.
keep <- !is.na(tstop) & (tstop > tstart)
keepmat <- matrix(keep, nrow = 8) # one column per subject
nkeep <- apply(keepmat, 2, sum) # how many rows remain for each subject
enum <- apply(keepmat, 2, cumsum) # running interval number within subject
indx <- rep(seq_len(n), nkeep)

cgd1 <- data.frame(
  cgd0[indx, 1:12],
  tstart = tstart[keep],
  tstop = tstop[keep],
  status = tstat[keep],
  enum = enum[keep],
  row.names = NULL
)

# remove temporary variables
rm(tstart, tstop, tstat, keep, keepmat, nkeep, enum, indx)
rm(etemp, maxtime, n, temp, tempm, tempd, tempy)