I wanted to show a couple of Cox proportional hazards regression models in R for a talk I am giving for the R users group.

This builds on the work in the previous two blog entries.

This is the documentation of the data file.

# Documentation found at http://lib.stat.cmu.edu/datasets/pbc
# The data set is a study of primary biliary cirrhosis
# Variables:
# V1  case number
# V2  number of days between registration and the earlier of death,
#     transplantion, or study analysis time in July, 1986
# V3  status
#     0=censored, 1=censored due to liver tx, 2=death
# V4  drug
#     1= D-penicillamine, 2=placebo
# V5  age in days
# V6  sex
#     0=male, 1=female
# V7  presence of asictes
#     0=no 1=yes
# V8  presence of hepatomegaly
#     0=no 1=yes
# V9  presence of spiders
#     0=no 1=yes
# V10 presence of edema          
#     0   = no edema and no diuretic therapy for edema;
#     0.5 = edema present without diuretics, or edema resolved by diuretics;
#     1   = edema despite diuretic therapy
# V11 serum bilirubin in mg/dl
# V12 serum cholesterol in mg/dl
# V13 albumin in gm/dl
# V14 urine copper in ug/day#
# V15 alkaline phosphatase in U/liter
# V16 SGOT in U/ml
# V17 triglicerides in mg/dl
# V18 platelets per cubic ml / 1000
# V19 prothrombin time in seconds
# V20 histologic stage of disease

The dataset is already part of the survival package.

library("survival")
## Loading required package: splines
head(pbc)
##   id time status trt      age sex ascites hepato spiders edema bili chol
## 1  1  400      2   1 58.76523   f       1      1       1   1.0 14.5  261
## 2  2 4500      0   1 56.44627   f       0      1       1   0.0  1.1  302
## 3  3 1012      2   1 70.07255   m       0      0       0   0.5  1.4  176
## 4  4 1925      2   1 54.74059   f       0      1       1   0.5  1.8  244
## 5  5 1504      1   2 38.10541   f       0      1       1   0.0  3.4  279
## 6  6 2503      2   2 66.25873   f       0      1       0   0.0  0.8  248
##   albumin copper alk.phos    ast trig platelet protime stage
## 1    2.60    156   1718.0 137.95  172      190    12.2     4
## 2    4.14     54   7394.8 113.52   88      221    10.6     3
## 3    3.48    210    516.0  96.10   55      151    12.0     4
## 4    2.54     64   6121.8  60.63   92      183    10.3     4
## 5    3.53    143    671.0 113.15   72      136    10.9     3
## 6    3.98     50    944.0  93.00   63       NA    11.0     3
tail(pbc)
##      id time status trt      age sex ascites hepato spiders edema bili
## 413 413  989      0  NA 35.00068   f      NA     NA      NA     0  0.7
## 414 414  681      2  NA 67.00068   f      NA     NA      NA     0  1.2
## 415 415 1103      0  NA 39.00068   f      NA     NA      NA     0  0.9
## 416 416 1055      0  NA 56.99932   f      NA     NA      NA     0  1.6
## 417 417  691      0  NA 58.00137   f      NA     NA      NA     0  0.8
## 418 418  976      0  NA 52.99932   f      NA     NA      NA     0  0.7
##     chol albumin copper alk.phos ast trig platelet protime stage
## 413   NA    3.23     NA       NA  NA   NA      312    10.8     3
## 414   NA    2.96     NA       NA  NA   NA      174    10.9     3
## 415   NA    3.83     NA       NA  NA   NA      180    11.2     4
## 416   NA    3.42     NA       NA  NA   NA      143     9.9     3
## 417   NA    3.75     NA       NA  NA   NA      269    10.4     3
## 418   NA    3.29     NA       NA  NA   NA      350    10.6     4
pmod <- pbc[1:312,]
tail(pmod)
##      id time status trt      age sex ascites hepato spiders edema bili
## 307 307 1149      0   2 30.57358   f       0      0       0     0  0.8
## 308 308 1153      0   1 61.18275   f       0      1       0     0  0.4
## 309 309  994      0   2 58.29979   f       0      0       0     0  0.4
## 310 310  939      0   1 62.33265   f       0      0       0     0  1.7
## 311 311  839      0   1 37.99863   f       0      0       0     0  2.0
## 312 312  788      0   2 33.15264   f       0      0       1     0  6.4
##     chol albumin copper alk.phos ast trig platelet protime stage
## 307  273    3.56     52     1282 130   59      344    10.5     2
## 308  246    3.58     24      797  91  113      288    10.4     2
## 309  260    2.75     41     1166  70   82      231    10.8     2
## 310  434    3.35     39     1713 171  100      234    10.2     2
## 311  247    3.16     69     1050 117   88      335    10.5     2
## 312  576    3.79    186     2115 136  149      200    10.8     2
summary(pmodtime)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      41    1191    1840    2006    2697    4556
table(pmodstatus)
## 
##   0   1   2 
## 168  19 125
p.surv <- Surv(pmodtime,pmod$status==2)
print(p.surv)
##   [1]  400  4500+ 1012  1925  1504+ 2503  1832+ 2466  2400    51  3762 
##  [12]  304  3577+ 1217  3584  3672+  769   131  4232+ 1356  3445+  673 
##  [23]  264  4079  4127+ 1444    77   549  4509+  321  3839  4523+ 3170 
##  [34] 3933+ 2847  3611+  223  3244  2297  4467+ 1350  4453+ 4556+ 3428 
##  [45] 4025+ 2256  2576+ 4427+  708  2598  3853  2386  1000  1434  1360 
##  [56] 1847  3282  4459+ 2224  4365+ 4256+ 3090   859  1487  3992+ 4191 
##  [67] 2769  4039+ 1170  3458+ 4196+ 4184+ 4190+ 1827  1191    71   326 
##  [78] 1690  3707+  890  2540  3574  4050+ 4032+ 3358  1657   198  2452+
##  [89] 1741  2689   460   388  3913+  750   130  3850+  611  3823+ 3820+
## [100]  552  3581+ 3099+  110  3086  3092+ 3222  3388+ 2583  2504+ 2105 
## [111] 2350+ 3445   980  3395  3422+ 3336+ 1083  2288   515  2033+  191 
## [122] 3297+  971  3069+ 2468+  824  3255+ 1037  3239+ 1413   850  2944+
## [133] 2796  3149+ 3150+ 3098+ 2990+ 1297  2106+ 3059+ 3050+ 2419   786 
## [144]  943  2976+ 2615+ 2995+ 1427   762  2891+ 2870+ 1152  2863+  140 
## [155] 2666+  853  2835+ 2475+ 1536  2772+ 2797+  186  2055   264  1077 
## [166] 2721+ 1682  2713+ 1212  2692+ 2574+ 2301+ 2657+ 2644+ 2624+ 1492 
## [177] 2609+ 2580+ 2573+ 2563+ 2556+ 2555+ 2241+  974  2527+ 1576   733 
## [188] 2332+ 2456+ 2504+  216  2443+  797  2449+ 2330+ 2363+ 2365+ 2357+
## [199] 1592+ 2318+ 2294+ 2272+ 2221+ 2090  2081  2255+ 2171+  904  2216+
## [210] 2224+ 2195+ 2176+ 2178+ 1786  1080  2168+  790  2170+ 2157+ 1235 
## [221] 2050+  597   334  1945+ 2022+ 1978+  999  1967+  348  1979+ 1165 
## [232] 1951+ 1932+ 1776+ 1882+ 1908+ 1882+ 1874+  694  1831+  837+ 1810+
## [243]  930  1690  1790+ 1435+  732+ 1785+ 1783+ 1769+ 1457+ 1770+ 1765+
## [254]  737+ 1735+ 1701+ 1614+ 1702+ 1615+ 1656+ 1677+ 1666+ 1301+ 1542+
## [265] 1084+ 1614+  179  1191  1363+ 1568+ 1569+ 1525+ 1558+ 1447+ 1349+
## [276] 1481+ 1434+ 1420+ 1433+ 1412+   41  1455+ 1030+ 1418+ 1401+ 1408+
## [287] 1234+ 1067+  799  1363+  901+ 1329+ 1320+ 1302+  877+ 1321+  533+
## [298] 1300+ 1293+  207  1295+ 1271+ 1250+ 1230+ 1216+ 1216+ 1149+ 1153+
## [309]  994+  939+  839+  788+

Run a simple Cox proportional hazards model

cox.mod1 <- coxph(p.surv~sex,data=pmod,na.action=na.omit)
print(cox.mod1)
## Call:
## coxph(formula = p.surv ~ sex, data = pmod, na.action = na.omit)
## 
## 
##        coef exp(coef) se(coef)     z     p
## sexf -0.484     0.616    0.236 -2.05 0.041
## 
## Likelihood ratio test=3.77  on 1 df, p=0.0521  n= 312, number of events= 125
summary(cox.mod1)
## Call:
## coxph(formula = p.surv ~ sex, data = pmod, na.action = na.omit)
## 
##   n= 312, number of events= 125 
## 
##         coef exp(coef) se(coef)      z Pr(>|z|)  
## sexf -0.4839    0.6164   0.2365 -2.046   0.0407 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      exp(coef) exp(-coef) lower .95 upper .95
## sexf    0.6164      1.622    0.3877    0.9798
## 
## Concordance= 0.532  (se = 0.015 )
## Rsquare= 0.012   (max possible= 0.983 )
## Likelihood ratio test= 3.77  on 1 df,   p=0.05214
## Wald test            = 4.19  on 1 df,   p=0.04075
## Score (logrank) test = 4.27  on 1 df,   p=0.03887

To plot the data, you need to create predictions at the specific values of interest. With multiple indpendent variables, any variable left out would be predicted at its average value.

pred.mod1 <- survfit(cox.mod1,newdata=data.frame(sex=c("f","m")))
plot(pred.mod1,xlim=c(0,6000))
title("sex")
end.pts <- dim(pred.mod1surv)[1]
end.x   <- rep(pred.mod1time[end.pts]+100,1)
end.y   <- pred.mod1surv[end.pts,]
end.nm  <- c("f","m")
text(end.x,end.y,end.nm,cex=1.5,col="darkred",adj=0)

The Cox model works fairly similarly for continuous independent variables.

cox.mod1 <- coxph(p.surv~sex,data=pmod,na.action=na.omit)
print(cox.mod1)
## Call:
## coxph(formula = p.surv ~ sex, data = pmod, na.action = na.omit)
## 
## 
##        coef exp(coef) se(coef)     z     p
## sexf -0.484     0.616    0.236 -2.05 0.041
## 
## Likelihood ratio test=3.77  on 1 df, p=0.0521  n= 312, number of events= 125
summary(cox.mod1)
## Call:
## coxph(formula = p.surv ~ sex, data = pmod, na.action = na.omit)
## 
##   n= 312, number of events= 125 
## 
##         coef exp(coef) se(coef)      z Pr(>|z|)  
## sexf -0.4839    0.6164   0.2365 -2.046   0.0407 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      exp(coef) exp(-coef) lower .95 upper .95
## sexf    0.6164      1.622    0.3877    0.9798
## 
## Concordance= 0.532  (se = 0.015 )
## Rsquare= 0.012   (max possible= 0.983 )
## Likelihood ratio test= 3.77  on 1 df,   p=0.05214
## Wald test            = 4.19  on 1 df,   p=0.04075
## Score (logrank) test = 4.27  on 1 df,   p=0.03887
cox.mod2 <- coxph(p.surv~age,data=pmod,na.action=na.omit)
print(cox.mod2)
## Call:
## coxph(formula = p.surv ~ age, data = pmod, na.action = na.omit)
## 
## 
##     coef exp(coef) se(coef)    z       p
## age 0.04      1.04  0.00881 4.54 5.7e-06
## 
## Likelihood ratio test=20.5  on 1 df, p=5.95e-06  n= 312, number of events= 125
summary(cox.mod2)
## Call:
## coxph(formula = p.surv ~ age, data = pmod, na.action = na.omit)
## 
##   n= 312, number of events= 125 
## 
##         coef exp(coef) se(coef)     z Pr(>|z|)    
## age 0.039995  1.040806 0.008811 4.539 5.65e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## age     1.041     0.9608     1.023     1.059
## 
## Concordance= 0.625  (se = 0.029 )
## Rsquare= 0.064   (max possible= 0.983 )
## Likelihood ratio test= 20.51  on 1 df,   p=5.947e-06
## Wald test            = 20.6  on 1 df,   p=5.651e-06
## Score (logrank) test = 20.86  on 1 df,   p=4.942e-06
pred.mod2 <- survfit(cox.mod2,newdata=data.frame(age=c(30,40,50,60)))
plot(pred.mod2,xlim=c(0,6000))
title("age")
end.pts <- dim(pred.mod2surv)[1]
end.x   <- pred.mod2time[end.pts]+100
end.y   <- pred.mod2surv[end.pts,]
end.nm  <- c(30,40,50,60)
text(end.x,end.y,end.nm,cex=1.5,col="darkred",adj=0)

This Blog post was added to the website on 2014-11-01 and was last modified on 2020-02-29. You can find similar pages at R software, Survival analysis.

An earlier version of this page appears here.