1 Introduction

Download career and innings-by-innings batting data for every Australian cricketer listed on ESPNcricinfo.

2 Loading packages

library(rvest)
## Loading required package: xml2
## Registered S3 method overwritten by 'rvest':
##   method            from
##   read_xml.response xml2
library(xml2)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.2
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::pluck()          masks rvest::pluck()
library(XML)
## 
## Attaching package: 'XML'
## The following object is masked from 'package:rvest':
## 
##     xml
library(furrr)
## Loading required package: future
# Run subsequent future/furrr calls in 20 background R sessions.
future::plan(future::multisession, workers = 20)

# Default ggplot2 theme for any plots produced after this point:
# classic theme at base size 18 with the legend placed underneath.
ggplot2::theme_set(
  ggplot2::theme_classic(base_size = 18) +
    ggplot2::theme(legend.position = "bottom")
)

3 Country list of players

# Fetch the ESPNcricinfo "caps" page that lists players
# (country=2 is presumably Australia and class=1 presumably Tests -- confirm).
html <- xml2::read_html("http://www.espncricinfo.com/ci/content/player/caps.html?country=2;class=1")

# Display names come from the .ciPlayername elements; profile links come
# from the anchors nested inside those same elements.
player_name <- rvest::html_text(rvest::html_nodes(html, ".ciPlayername"))
player_url <- rvest::html_attr(rvest::html_nodes(html, ".ciPlayername a"), "href")

# Absolute profile URL: swap the first "/ci" in the relative href for the
# site root plus "/australia".
player_fullurl <- stringr::str_replace(
  string = player_url,
  pattern = "/ci",
  replacement = "http://www.espncricinfo.com/australia"
)

# Whatever remains of the href once the common path prefix is stripped is
# used as the player's identifier in the stats engine URL below.
player_cid <- stringr::str_remove(
  string = player_url,
  pattern = "/ci/content/player/"
)

# Stats engine query requesting the innings-by-innings batting view.
player_batting_url <- paste0(
  "http://stats.espncricinfo.com/ci/engine/player/",
  player_cid,
  "?class=1;template=results;type=batting;view=innings"
)

# One row per player, holding every derived field side by side.
player_tibble <- tibble::tibble(
  player_name = player_name,
  player_url = player_url,
  player_fullurl = player_fullurl,
  player_cid = player_cid,
  player_batting_url = player_batting_url
)

4 Player individual batting innings list

#' Download one player's batting tables
#'
#' Reads every HTML table found at `url` and extracts the two named
#' "Career averages" and "Innings by innings list". Each table has its
#' column names cleaned with `janitor::clean_names()`, is converted to a
#' tibble, and has every column coerced to character so that results from
#' different players can be row-bound without type clashes.
#'
#' @param url Address of a page containing the batting tables; passed
#'   straight to `XML::readHTMLTable()`.
#' @return A list with two tibbles, `batting_career` and `batting_innings`.
#'   A table that is absent from the page is returned as an empty tibble
#'   rather than raising an error.
get_player_batting_innings <- function(url) {
  player_batting <- XML::readHTMLTable(url)

  # Both tables go through exactly the same clean-up. Previously only the
  # career table was passed through as.data.frame(), and a missing table
  # (NULL) reached janitor::clean_names() and aborted the whole download.
  tidy_table <- function(table_name) {
    raw_table <- player_batting[[table_name]]

    # `[[` on a list yields NULL for an unknown name, i.e. the page had no
    # such table; an empty tibble adds no rows when results are combined.
    if (is.null(raw_table)) {
      return(tibble::tibble())
    }

    raw_table %>%
      as.data.frame() %>%
      janitor::clean_names() %>%
      tibble::as_tibble() %>%
      dplyr::mutate_all(as.character)
  }

  list(
    batting_career = tidy_table("Career averages"),
    batting_innings = tidy_table("Innings by innings list")
  )
}



# batting_list = purrr::map(.x = player_batting_url[1:5], .f = get_player_batting_innings)

# Download the batting tables for every player URL through furrr, which
# runs on the active future plan, showing a progress bar while it works.
batting_list <- furrr::future_map(
  .x = player_tibble$player_batting_url,
  .f = get_player_batting_innings,
  .progress = TRUE
)
## 
 Progress: ───────────────────────────────────────────                                                            100%
 Progress: ─────────────────────────────────────────────────────────────                                          100%
 Progress: ──────────────────────────────────────────────────────────────────────────────                         100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────             100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────      100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────     100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────    100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────   100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
# Label each per-player result with that player's name so the name can be
# carried into the combined tables as an identifier column.
batting_list <- stats::setNames(batting_list, player_tibble$player_name)

# Turn the list "inside out": from one entry per player (each holding
# batting_career and batting_innings) to one entry per table type
# (each holding a list of per-player tibbles).
batting_list_trans <- purrr::transpose(batting_list)

# Stack the per-player tibbles; the list names become column "player_name".
batting_career <- dplyr::bind_rows(
  batting_list_trans[["batting_career"]],
  .id = "player_name"
)

batting_innings <- dplyr::bind_rows(
  batting_list_trans[["batting_innings"]],
  .id = "player_name"
)


# Save both combined tables as CSV files in the working directory.
# The destination is passed positionally on purpose: readr 1.4.0 renamed
# this argument from `path` to `file` and warns when `path` is used, so a
# positional argument works without warnings on old and new readr alike.
readr::write_csv(batting_career, "./aus_batting_career.csv")

readr::write_csv(batting_innings, "./aus_batting_innings.csv")

5 Session Info

# Record the R version, platform and package versions used for this run.
utils::sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 9 (stretch)
## 
## Matrix products: default
## BLAS:   /dora/nobackup/biostat/R-3.6.0/R-3.6.0/lib/libRblas.so
## LAPACK: /dora/nobackup/biostat/R-3.6.0/R-3.6.0/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] furrr_0.1.0     future_1.13.0   XML_3.98-1.19   forcats_0.4.0  
##  [5] stringr_1.4.0   dplyr_0.8.2     purrr_0.3.2     readr_1.3.1    
##  [9] tidyr_0.8.3     tibble_2.1.3    ggplot2_3.2.0   tidyverse_1.2.1
## [13] rvest_0.3.3     xml2_1.2.0     
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.5 xfun_0.6         listenv_0.7.0    haven_2.1.0     
##  [5] lattice_0.20-38  colorspace_1.4-1 generics_0.0.2   htmltools_0.3.6 
##  [9] yaml_2.2.0       rlang_0.4.0      pillar_1.4.2     glue_1.3.1      
## [13] withr_2.1.2      selectr_0.4-1    modelr_0.1.4     readxl_1.3.1    
## [17] munsell_0.5.0    gtable_0.3.0     cellranger_1.1.0 codetools_0.2-16
## [21] evaluate_0.13    knitr_1.22       curl_3.3         parallel_3.6.0  
## [25] broom_0.5.2      Rcpp_1.0.1       scales_1.0.0     backports_1.1.4 
## [29] jsonlite_1.6     hms_0.4.2        digest_0.6.19    stringi_1.4.3   
## [33] grid_3.6.0       cli_1.1.0        tools_3.6.0      magrittr_1.5    
## [37] lazyeval_0.2.2   crayon_1.3.4     pkgconfig_2.0.2  lubridate_1.7.4 
## [41] assertthat_0.2.1 rmarkdown_1.12   httr_1.4.0       rstudioapi_0.10 
## [45] R6_2.4.0         globals_0.12.4   nlme_3.1-140     compiler_3.6.0