In this document, we will clean the raw downloaded data from Cricinfo.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Make ggplot2's classic theme (base size 18) the default for every plot in
# this session, with legends placed at the bottom.
theme_set(
  theme_classic(base_size = 18) +
    theme(legend.position = "bottom")
)
# Load the raw career batting file. No column types are supplied, so readr
# guesses them and reports the specification it used.
test_batting_career <- readr::read_csv(file = "all_test_batting_career.csv")
## Parsed with column specification:
## cols(
## country = col_character(),
## player_name = col_character(),
## x = col_character(),
## span = col_character(),
## mat = col_double(),
## inns = col_character(),
## no = col_character(),
## runs = col_character(),
## hs = col_character(),
## ave = col_character(),
## x100 = col_character(),
## x50 = col_character(),
## x0 = col_character(),
## x_2 = col_character(),
## x4s = col_character(),
## x6s = col_character(),
## bf = col_character(),
## sr = col_character()
## )
# Load the raw innings-by-innings batting file; column types are guessed by
# readr. The object name's spelling ("inngings") is kept as-is because the
# cleaning step later in this document refers to it by that name.
test_batting_inngings <- readr::read_csv(file = "all_test_batting_innings.csv")
## Parsed with column specification:
## cols(
## country = col_character(),
## player_name = col_character(),
## runs = col_character(),
## mins = col_character(),
## bf = col_character(),
## x4s = col_character(),
## x6s = col_character(),
## sr = col_character(),
## pos = col_character(),
## dismissal = col_character(),
## inns = col_character(),
## x = col_logical(),
## opposition = col_character(),
## ground = col_character(),
## start_date = col_character(),
## x_2 = col_character()
## )
# Build the cleaned career table: split `span` into start/end years and
# convert the text columns to integer/double columns under clearer names.
# Entries that cannot be converted become NA, and R warns about the coercion.
clean_test_batting_career <- test_batting_career %>%
  tidyr::separate(
    col = span,
    into = c("career_start", "career_end"),
    sep = "-",
    remove = FALSE # keep the original `span` text beside the split columns
  ) %>%
  dplyr::transmute(
    country,
    player_name,
    span,
    career_start = as.integer(career_start),
    career_end = as.integer(career_end),
    mat = as.integer(mat),
    inns = as.integer(inns),
    not_out = as.integer(no),
    runs = as.integer(runs),
    hs, # carried through unchanged as text
    ave = as.numeric(ave),
    # For the remaining columns, first pull out the leading number-like
    # substring, then convert it; anything around that substring is dropped.
    century = as.integer(
      stringr::str_extract(x100, "\\-*\\d+\\.*\\d*")
    ),
    half_century = as.integer(
      stringr::str_extract(x50, "\\-*\\d+\\.*\\d*")
    ),
    ducks = as.integer(
      stringr::str_extract(x0, "\\-*\\d+\\.*\\d*")
    ),
    fours = as.integer(
      stringr::str_extract(x4s, "\\-*\\d+\\.*\\d*")
    ),
    sixes = as.integer(
      stringr::str_extract(x6s, "\\-*\\d+\\.*\\d*")
    ),
    balls_faced = as.integer(
      stringr::str_extract(bf, "\\-*\\d+\\.*\\d*")
    ),
    strike_rate = as.numeric(
      stringr::str_extract(sr, "\\-*\\d+\\.*\\d*")
    )
  )
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
# Show a compact column-by-column preview of the cleaned career table.
dplyr::glimpse(clean_test_batting_career)
## Observations: 2,993
## Variables: 18
## $ country <chr> "australia", "australia", "australia", "australia",…
## $ player_name <chr> "C Bannerman", "JM Blackham", "BB Cooper", "TW Garr…
## $ span <chr> "1877-1879", "1877-1894", NA, "1877-1888", "1877-18…
## $ career_start <int> 1877, 1877, NA, 1877, 1877, NA, NA, 1877, NA, 1877,…
## $ career_end <int> 1879, 1894, NA, 1888, 1879, NA, NA, 1885, NA, 1887,…
## $ mat <int> 3, 35, 1, 19, 3, 1, 2, 15, 2, 12, 2, 2, 19, 18, 1, …
## $ inns <int> 6, 62, 2, 33, 5, 2, 4, 27, 4, 21, 4, 3, 34, 29, 1, …
## $ not_out <int> 2, 11, 0, 6, 2, 0, 1, 2, 1, 1, 0, 0, 5, 6, 0, 2, 4,…
## $ runs <int> 239, 800, 18, 339, 60, 11, 10, 471, 39, 269, 67, 64…
## $ hs <chr> "165*", "74", "15", "51*", "43", "11", "8", "124", …
## $ ave <dbl> 59.75, 15.68, 9.00, 12.55, 20.00, 5.50, 3.33, 18.84…
## $ century <int> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, …
## $ half_century <int> 0, 4, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 8, 0, …
## $ ducks <int> 0, 6, 0, 5, 0, 1, 1, 3, 0, 1, 0, 0, 3, 6, 0, 3, 2, …
## $ fours <int> NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, N…
## $ sixes <int> NA, NA, NA, NA, NA, 0, 0, NA, NA, NA, NA, NA, NA, N…
## $ balls_faced <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ strike_rate <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# Build the cleaned innings table: convert the numeric-looking text columns
# and give the abbreviated columns clearer names. Entries that cannot be
# converted become NA, and R warns about the coercion.
clean_test_batting_inngings <- dplyr::transmute(
  test_batting_inngings,
  country,
  player_name,
  runs, # carried through unchanged as text
  mins = as.integer(mins),
  # For these columns, first pull out the leading number-like substring,
  # then convert it; anything around that substring is dropped.
  balls_faced = as.integer(
    stringr::str_extract(bf, "\\-*\\d+\\.*\\d*")
  ),
  fours = as.integer(
    stringr::str_extract(x4s, "\\-*\\d+\\.*\\d*")
  ),
  sixes = as.integer(
    stringr::str_extract(x6s, "\\-*\\d+\\.*\\d*")
  ),
  strike_rate = as.numeric(
    stringr::str_extract(sr, "\\-*\\d+\\.*\\d*")
  ),
  pos = as.integer(pos),
  dismissal,
  inns = as.integer(inns),
  opposition,
  ground,
  start_date,
  test_number = x_2 # rename only; the text value is not parsed
)
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
# Show a compact column-by-column preview of the cleaned innings table.
dplyr::glimpse(clean_test_batting_inngings)
## Observations: 94,604
## Variables: 15
## $ country <chr> "australia", "australia", "australia", "australia", …
## $ player_name <chr> "C Bannerman", "C Bannerman", "C Bannerman", "C Bann…
## $ runs <chr> "165*", "4", "10", "30", "15", "15*", "17", "6", "5"…
## $ mins <int> 285, 8, 55, 13, NA, 10, NA, 16, NA, NA, NA, NA, 1, 4…
## $ balls_faced <int> NA, 10, NA, NA, NA, NA, NA, 20, NA, NA, NA, NA, 2, 5…
## $ fours <int> 18, 1, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, 0, 2, …
## $ sixes <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, …
## $ strike_rate <dbl> NA, 40.00, NA, NA, NA, NA, NA, 30.00, NA, NA, NA, NA…
## $ pos <int> 1, 1, 2, 3, 1, 1, 8, 8, 3, 10, 9, NA, 6, 7, 7, 1, 2,…
## $ dismissal <chr> "retired notout", "bowled", "bowled", "caught", "bow…
## $ inns <int> 1, 3, 1, 3, 2, 4, 1, 3, 1, 3, 2, 4, 2, 3, 2, 4, 2, 4…
## $ opposition <chr> "v England", "v England", "v England", "v England", …
## $ ground <chr> "Melbourne", "Melbourne", "Melbourne", "Melbourne", …
## $ start_date <chr> "15 Mar 1877", "15 Mar 1877", "31 Mar 1877", "31 Mar…
## $ test_number <chr> "Test # 1", "Test # 1", "Test # 2", "Test # 2", "Tes…
# Save both cleaned tables as CSV files, relative to the current working
# directory. The output path is passed by position rather than as `path =`:
# readr 1.4.0 renamed that argument to `file` and deprecated `path`, so a
# positional call behaves the same on older and newer readr versions.
readr::write_csv(
  clean_test_batting_career,
  "./clean_test_batting_career.csv"
)
readr::write_csv(
  clean_test_batting_inngings,
  "./clean_test_batting_inngings.csv"
)
# Print the R version, platform and package versions used for this run.
utils::sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2
## [5] readr_1.3.1 tidyr_0.8.3 tibble_2.1.3 ggplot2_3.2.0
## [9] tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 cellranger_1.1.0 pillar_1.4.2 compiler_3.6.0
## [5] tools_3.6.0 zeallot_0.1.0 digest_0.6.20 lubridate_1.7.4
## [9] jsonlite_1.6 evaluate_0.14 nlme_3.1-140 gtable_0.3.0
## [13] lattice_0.20-38 pkgconfig_2.0.2 rlang_0.4.0 cli_1.1.0
## [17] rstudioapi_0.10 yaml_2.2.0 haven_2.1.1 xfun_0.8
## [21] withr_2.1.2 xml2_1.2.0 httr_1.4.0 knitr_1.23
## [25] vctrs_0.2.0 generics_0.0.2 hms_0.5.0 grid_3.6.0
## [29] tidyselect_0.2.5 glue_1.3.1 R6_2.4.0 fansi_0.4.0
## [33] readxl_1.3.1 rmarkdown_1.13 modelr_0.1.4 magrittr_1.5
## [37] backports_1.1.4 scales_1.0.0 htmltools_0.3.6 rvest_0.3.4
## [41] assertthat_0.2.1 colorspace_1.4-1 utf8_1.1.4 stringi_1.4.3
## [45] lazyeval_0.2.2 munsell_0.5.0 broom_0.5.2 crayon_1.3.4