1 Introduction

In this document, we will visualise the cleaned cricket Test batting innings data using survival plots.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(survminer)
## Loading required package: ggpubr
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract

2 Loading data

# Load the cleaned innings CSV and derive the columns used by the survival
# fits further down:
#   runs_num - the numeric portion of `runs`, coerced to integer
#   not_out  - TRUE when `runs` contains a literal asterisk
#   dead     - logical negation of `not_out` (the event indicator)
clean_test_batting_inngings <- dplyr::mutate(
  readr::read_csv("./clean_test_batting_inngings.csv"),
  runs_num = as.integer(stringr::str_extract(runs, "\\-*\\d+\\.*\\d*")),
  not_out = stringr::str_detect(runs, "[*]"),
  dead = !not_out
)
## Parsed with column specification:
## cols(
##   country = col_character(),
##   player_name = col_character(),
##   runs = col_character(),
##   mins = col_double(),
##   balls_faced = col_double(),
##   fours = col_double(),
##   sixes = col_double(),
##   strike_rate = col_double(),
##   pos = col_double(),
##   dismissal = col_character(),
##   inns = col_double(),
##   opposition = col_character(),
##   ground = col_character(),
##   start_date = col_character(),
##   test_number = col_character()
## )
# Print a compact column-by-column overview of the loaded data.
dplyr::glimpse(clean_test_batting_inngings)
## Observations: 94,604
## Variables: 18
## $ country     <chr> "australia", "australia", "australia", "australia", …
## $ player_name <chr> "C Bannerman", "C Bannerman", "C Bannerman", "C Bann…
## $ runs        <chr> "165*", "4", "10", "30", "15", "15*", "17", "6", "5"…
## $ mins        <dbl> 285, 8, 55, 13, NA, 10, NA, 16, NA, NA, NA, NA, 1, 4…
## $ balls_faced <dbl> NA, 10, NA, NA, NA, NA, NA, 20, NA, NA, NA, NA, 2, 5…
## $ fours       <dbl> 18, 1, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, 0, 2, …
## $ sixes       <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, …
## $ strike_rate <dbl> NA, 40.00, NA, NA, NA, NA, NA, 30.00, NA, NA, NA, NA…
## $ pos         <dbl> 1, 1, 2, 3, 1, 1, 8, 8, 3, 10, 9, NA, 6, 7, 7, 1, 2,…
## $ dismissal   <chr> "retired notout", "bowled", "bowled", "caught", "bow…
## $ inns        <dbl> 1, 3, 1, 3, 2, 4, 1, 3, 1, 3, 2, 4, 2, 3, 2, 4, 2, 4…
## $ opposition  <chr> "v England", "v England", "v England", "v England", …
## $ ground      <chr> "Melbourne", "Melbourne", "Melbourne", "Melbourne", …
## $ start_date  <chr> "15 Mar 1877", "15 Mar 1877", "31 Mar 1877", "31 Mar…
## $ test_number <chr> "Test # 1", "Test # 1", "Test # 2", "Test # 2", "Tes…
## $ runs_num    <int> 165, 4, 10, 30, 15, 15, 17, 6, 5, 26, 6, NA, 0, 19, …
## $ not_out     <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE…
## $ dead        <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TR…

3 Data extraction

# Restrict the innings to the four named players being compared.
selected_players <- dplyr::filter(
  clean_test_batting_inngings,
  player_name %in% c("MJ Clarke", "SPD Smith", "RT Ponting", "SR Waugh")
)

# Keep only the three columns the survival fit needs and convert the result
# to a base data.frame.
selected_players_df <- as.data.frame(
  dplyr::transmute(selected_players, player_name, runs_num, dead)
)

library(survival)
# Survival curves of runs scored, one curve per player; `dead` marks the
# innings that ended in a dismissal.
fit <- survfit(
  Surv(runs_num, dead) ~ player_name,
  data = selected_players_df
)

# Draw the per-player curves with a p-value annotation.
survminer::ggsurvplot(
  fit = fit,
  data = selected_players_df,
  palette = "Set1",
  pval = TRUE
)

4 Survival plots for all players, to check the hypothesis that a batsman is most likely to get out within the first 30 balls faced

# Survival curve over every innings in the data set (no grouping term), with
# balls faced as the time axis and `dead` as the event indicator.
fit_bf <- survfit(Surv(balls_faced, dead) ~ 1, data = clean_test_batting_inngings)

# Keep only the ggplot component of the ggsurvplot result so that it can be
# customised with ordinary ggplot2 layers and combined with other plots.
plot_bf <- survminer::ggsurvplot(
  fit = fit_bf,
  data = clean_test_batting_inngings,
  palette = "Set1",
  surv.median.line = "hv")$plot +
  # ggplot2's labs() has no `main` argument (that is base-graphics naming);
  # the plot title must be passed as `title`, otherwise it is silently dropped.
  labs(x = "Balls faced before getting out",
       title = "Balls faced survival plot") +
  # This replaces the x scale ggsurvplot already added, which is why ggplot2
  # emits the "Scale for 'x' is already present" message.
  scale_x_continuous(breaks = c(30, 100, 200, 400, 600, 800))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
# Survival curve over every innings in the data set (no grouping term), with
# runs scored as the time axis and `dead` as the event indicator.
fit_runs <- survfit(Surv(runs_num, dead) ~ 1, data = clean_test_batting_inngings)

# Keep only the ggplot component of the ggsurvplot result so that it can be
# customised with ordinary ggplot2 layers and combined with other plots.
plot_runs <- survminer::ggsurvplot(
  fit = fit_runs,
  data = clean_test_batting_inngings,
  palette = "Set1",
  surv.median.line = "hv")$plot +
  # ggplot2's labs() has no `main` argument (that is base-graphics naming);
  # the plot title must be passed as `title`, otherwise it is silently dropped.
  labs(x = "Runs scored before getting out",
       title = "Runs scored survival plot") +
  # This replaces the x scale ggsurvplot already added, which is why ggplot2
  # emits the "Scale for 'x' is already present" message.
  scale_x_continuous(breaks = c(30, 100, 200, 400, 600, 800))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
# Show the balls-faced and runs-scored survival plots together in one figure.
ggpubr::ggarrange(plotlist = list(plot_bf, plot_runs))

5 Session Info

# Record the R version, platform and package versions used for this report.
utils::sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] survival_2.44-1.1 survminer_0.4.4   ggpubr_0.2.1     
##  [4] magrittr_1.5      forcats_0.4.0     stringr_1.4.0    
##  [7] dplyr_0.8.3       purrr_0.3.2       readr_1.3.1      
## [10] tidyr_0.8.3       tibble_2.1.3      ggplot2_3.2.0    
## [13] tidyverse_1.2.1  
## 
## loaded via a namespace (and not attached):
##  [1] zoo_1.8-6          tidyselect_0.2.5   xfun_0.8          
##  [4] splines_3.6.0      haven_2.1.1        lattice_0.20-38   
##  [7] colorspace_1.4-1   generics_0.0.2     vctrs_0.2.0       
## [10] htmltools_0.3.6    yaml_2.2.0         utf8_1.1.4        
## [13] survMisc_0.5.5     rlang_0.4.0        pillar_1.4.2      
## [16] glue_1.3.1         withr_2.1.2        RColorBrewer_1.1-2
## [19] modelr_0.1.4       readxl_1.3.1       munsell_0.5.0     
## [22] ggsignif_0.5.0     gtable_0.3.0       cellranger_1.1.0  
## [25] rvest_0.3.4        evaluate_0.14      labeling_0.3      
## [28] knitr_1.23         fansi_0.4.0        broom_0.5.2       
## [31] Rcpp_1.0.1         xtable_1.8-4       scales_1.0.0      
## [34] backports_1.1.4    cmprsk_2.2-8       jsonlite_1.6      
## [37] km.ci_0.5-2        gridExtra_2.3      hms_0.5.0         
## [40] digest_0.6.20      stringi_1.4.3      KMsurv_0.1-5      
## [43] cowplot_1.0.0      grid_3.6.0         cli_1.1.0         
## [46] tools_3.6.0        lazyeval_0.2.2     crayon_1.3.4      
## [49] pkgconfig_2.0.2    zeallot_0.1.0      Matrix_1.2-17     
## [52] data.table_1.12.2  xml2_1.2.0         lubridate_1.7.4   
## [55] assertthat_0.2.1   rmarkdown_1.13     httr_1.4.0        
## [58] rstudioapi_0.10    R6_2.4.0           nlme_3.1-140      
## [61] compiler_3.6.0