1 Introduction

In this document, we will clean the raw data downloaded from Cricinfo.

2 Loading packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Make every subsequent ggplot use the classic theme at base font size 18,
# with the legend placed underneath the plot.
theme_set(
  theme_classic(base_size = 18) +
    theme(legend.position = "bottom")
)

3 Loading raw data

# Raw career-level batting table. Column types are guessed by readr; as the
# printed specification shows, almost every column arrives as character.
test_batting_career <- read_csv(file = "all_test_batting_career.csv")
## Parsed with column specification:
## cols(
##   country = col_character(),
##   player_name = col_character(),
##   x = col_character(),
##   span = col_character(),
##   mat = col_double(),
##   inns = col_character(),
##   no = col_character(),
##   runs = col_character(),
##   hs = col_character(),
##   ave = col_character(),
##   x100 = col_character(),
##   x50 = col_character(),
##   x0 = col_character(),
##   x_2 = col_character(),
##   x4s = col_character(),
##   x6s = col_character(),
##   bf = col_character(),
##   sr = col_character()
## )
# Raw innings-level batting table. Column types are guessed by readr; see the
# printed specification for what each column was parsed as.
test_batting_inngings <- read_csv(file = "all_test_batting_innings.csv")
## Parsed with column specification:
## cols(
##   country = col_character(),
##   player_name = col_character(),
##   runs = col_character(),
##   mins = col_character(),
##   bf = col_character(),
##   x4s = col_character(),
##   x6s = col_character(),
##   sr = col_character(),
##   pos = col_character(),
##   dismissal = col_character(),
##   inns = col_character(),
##   x = col_logical(),
##   opposition = col_character(),
##   ground = col_character(),
##   start_date = col_character(),
##   x_2 = col_character()
## )

4 Cleaning data

# Build the cleaned career table.
#
# `span` is split on "-" into a start and an end year, and the statistics that
# were read in as character are coerced to integer / numeric. Entries that
# cannot be read as a number become NA, which is what triggers the
# "NAs introduced by coercion" warnings.
clean_test_batting_career <- dplyr::transmute(
  tidyr::separate(
    test_batting_career,
    span,
    into = c("career_start", "career_end"),
    sep = "-",
    remove = FALSE  # keep the original `span` string next to the split years
  ),
  country,
  player_name,
  span,
  career_start = as.integer(career_start),
  career_end = as.integer(career_end),
  mat = as.integer(mat),
  inns = as.integer(inns),
  not_out = as.integer(no),
  runs = as.integer(runs),
  hs,  # left as character (values such as "165*" are not plain numbers)
  ave = as.numeric(ave),
  # For the remaining columns only the first number-like token is kept before
  # conversion, so any surrounding non-numeric characters are discarded.
  century = as.integer(stringr::str_extract(x100, "\\-*\\d+\\.*\\d*")),
  half_century = as.integer(stringr::str_extract(x50, "\\-*\\d+\\.*\\d*")),
  ducks = as.integer(stringr::str_extract(x0, "\\-*\\d+\\.*\\d*")),
  fours = as.integer(stringr::str_extract(x4s, "\\-*\\d+\\.*\\d*")),
  sixes = as.integer(stringr::str_extract(x6s, "\\-*\\d+\\.*\\d*")),
  balls_faced = as.integer(stringr::str_extract(bf, "\\-*\\d+\\.*\\d*")),
  strike_rate = as.numeric(stringr::str_extract(sr, "\\-*\\d+\\.*\\d*"))
)
## Warning in function_list[[k]](value): NAs introduced by coercion

## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
# Quick structural check of the cleaned career table (column types and the
# first few values of each column).
glimpse(clean_test_batting_career)
## Observations: 2,993
## Variables: 18
## $ country      <chr> "australia", "australia", "australia", "australia",…
## $ player_name  <chr> "C Bannerman", "JM Blackham", "BB Cooper", "TW Garr…
## $ span         <chr> "1877-1879", "1877-1894", NA, "1877-1888", "1877-18…
## $ career_start <int> 1877, 1877, NA, 1877, 1877, NA, NA, 1877, NA, 1877,…
## $ career_end   <int> 1879, 1894, NA, 1888, 1879, NA, NA, 1885, NA, 1887,…
## $ mat          <int> 3, 35, 1, 19, 3, 1, 2, 15, 2, 12, 2, 2, 19, 18, 1, …
## $ inns         <int> 6, 62, 2, 33, 5, 2, 4, 27, 4, 21, 4, 3, 34, 29, 1, …
## $ not_out      <int> 2, 11, 0, 6, 2, 0, 1, 2, 1, 1, 0, 0, 5, 6, 0, 2, 4,…
## $ runs         <int> 239, 800, 18, 339, 60, 11, 10, 471, 39, 269, 67, 64…
## $ hs           <chr> "165*", "74", "15", "51*", "43", "11", "8", "124", …
## $ ave          <dbl> 59.75, 15.68, 9.00, 12.55, 20.00, 5.50, 3.33, 18.84…
## $ century      <int> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, …
## $ half_century <int> 0, 4, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 8, 0, …
## $ ducks        <int> 0, 6, 0, 5, 0, 1, 1, 3, 0, 1, 0, 0, 3, 6, 0, 3, 2, …
## $ fours        <int> NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, N…
## $ sixes        <int> NA, NA, NA, NA, NA, 0, 0, NA, NA, NA, NA, NA, NA, N…
## $ balls_faced  <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ strike_rate  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# Build the cleaned innings table.
#
# Columns are renamed to descriptive names and the character statistics are
# coerced to integer / numeric. Entries that cannot be read as a number become
# NA, which is what triggers the "NAs introduced by coercion" warnings.
clean_test_batting_inngings <- dplyr::transmute(
  test_batting_inngings,
  country,
  player_name,
  runs,  # left as character (values such as "165*" are not plain numbers)
  mins = as.integer(mins),
  # Only the first number-like token is kept before conversion, so any
  # surrounding non-numeric characters are discarded.
  balls_faced = as.integer(stringr::str_extract(bf, "\\-*\\d+\\.*\\d*")),
  fours = as.integer(stringr::str_extract(x4s, "\\-*\\d+\\.*\\d*")),
  sixes = as.integer(stringr::str_extract(x6s, "\\-*\\d+\\.*\\d*")),
  strike_rate = as.numeric(stringr::str_extract(sr, "\\-*\\d+\\.*\\d*")),
  pos = as.integer(pos),
  dismissal,
  inns = as.integer(inns),
  opposition,
  ground,
  start_date,
  test_number = x_2  # the unnamed raw column holds labels like "Test # 1"
)
## Warning in function_list[[k]](value): NAs introduced by coercion

## Warning in function_list[[k]](value): NAs introduced by coercion

## Warning in function_list[[k]](value): NAs introduced by coercion
# Quick structural check of the cleaned innings table (column types and the
# first few values of each column).
glimpse(clean_test_batting_inngings)
## Observations: 94,604
## Variables: 15
## $ country     <chr> "australia", "australia", "australia", "australia", …
## $ player_name <chr> "C Bannerman", "C Bannerman", "C Bannerman", "C Bann…
## $ runs        <chr> "165*", "4", "10", "30", "15", "15*", "17", "6", "5"…
## $ mins        <int> 285, 8, 55, 13, NA, 10, NA, 16, NA, NA, NA, NA, 1, 4…
## $ balls_faced <int> NA, 10, NA, NA, NA, NA, NA, 20, NA, NA, NA, NA, 2, 5…
## $ fours       <int> 18, 1, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, 0, 2, …
## $ sixes       <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, …
## $ strike_rate <dbl> NA, 40.00, NA, NA, NA, NA, NA, 30.00, NA, NA, NA, NA…
## $ pos         <int> 1, 1, 2, 3, 1, 1, 8, 8, 3, 10, 9, NA, 6, 7, 7, 1, 2,…
## $ dismissal   <chr> "retired notout", "bowled", "bowled", "caught", "bow…
## $ inns        <int> 1, 3, 1, 3, 2, 4, 1, 3, 1, 3, 2, 4, 2, 3, 2, 4, 2, 4…
## $ opposition  <chr> "v England", "v England", "v England", "v England", …
## $ ground      <chr> "Melbourne", "Melbourne", "Melbourne", "Melbourne", …
## $ start_date  <chr> "15 Mar 1877", "15 Mar 1877", "31 Mar 1877", "31 Mar…
## $ test_number <chr> "Test # 1", "Test # 1", "Test # 2", "Test # 2", "Tes…

5 Saving data

# Save both cleaned tables as CSV files in the current working directory.
#
# The output location is passed positionally rather than as `path =`: readr
# 1.4.0 renamed that argument to `file` and deprecated `path`, so a positional
# argument works without a deprecation warning on both older and newer readr.
# The file names (including the "inngings" spelling) are kept unchanged so
# anything that reads these files keeps working.
readr::write_csv(clean_test_batting_career,
                 "./clean_test_batting_career.csv")

readr::write_csv(clean_test_batting_inngings,
                 "./clean_test_batting_inngings.csv")

6 Session Info

# Record the R version, platform and package versions used for this run.
utils::sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] forcats_0.4.0   stringr_1.4.0   dplyr_0.8.3     purrr_0.3.2    
## [5] readr_1.3.1     tidyr_0.8.3     tibble_2.1.3    ggplot2_3.2.0  
## [9] tidyverse_1.2.1
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.1       cellranger_1.1.0 pillar_1.4.2     compiler_3.6.0  
##  [5] tools_3.6.0      zeallot_0.1.0    digest_0.6.20    lubridate_1.7.4 
##  [9] jsonlite_1.6     evaluate_0.14    nlme_3.1-140     gtable_0.3.0    
## [13] lattice_0.20-38  pkgconfig_2.0.2  rlang_0.4.0      cli_1.1.0       
## [17] rstudioapi_0.10  yaml_2.2.0       haven_2.1.1      xfun_0.8        
## [21] withr_2.1.2      xml2_1.2.0       httr_1.4.0       knitr_1.23      
## [25] vctrs_0.2.0      generics_0.0.2   hms_0.5.0        grid_3.6.0      
## [29] tidyselect_0.2.5 glue_1.3.1       R6_2.4.0         fansi_0.4.0     
## [33] readxl_1.3.1     rmarkdown_1.13   modelr_0.1.4     magrittr_1.5    
## [37] backports_1.1.4  scales_1.0.0     htmltools_0.3.6  rvest_0.3.4     
## [41] assertthat_0.2.1 colorspace_1.4-1 utf8_1.1.4       stringi_1.4.3   
## [45] lazyeval_0.2.2   munsell_0.5.0    broom_0.5.2      crayon_1.3.4