Download all cricketer data
library(rvest)
## Loading required package: xml2
## Registered S3 method overwritten by 'rvest':
## method from
## read_xml.response xml2
library(xml2)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.2
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::pluck() masks rvest::pluck()
library(XML)
##
## Attaching package: 'XML'
## The following object is masked from 'package:rvest':
##
## xml
library(furrr)
## Loading required package: future
# Run futures in background R sessions; 20 workers are requested here.
# NOTE(review): 20 is hard-coded and also sets how many requests reach the
# site at once via furrr -- confirm it suits the machine and the server.
future::plan(strategy = future::multisession, workers = 20)

# Default ggplot2 theme for the session: classic theme at base size 18 with
# the legend placed underneath the plot.
ggplot2::theme_set(
  ggplot2::theme_classic(base_size = 18) +
    ggplot2::theme(legend.position = "bottom")
)
# Read the ESPNcricinfo player "caps" listing page.
# NOTE(review): country=2 / class=1 presumably select Australia / Test
# matches (the output files are named aus_*) -- confirm against the site.
html <- xml2::read_html("http://www.espncricinfo.com/ci/content/player/caps.html?country=2;class=1")

# Displayed player names, and the relative profile link attached to each.
player_name <- rvest::html_text(rvest::html_nodes(html, ".ciPlayername"))
player_url <- rvest::html_attr(rvest::html_nodes(html, ".ciPlayername a"), "href")

# Absolute profile URL: the first "/ci" of the relative link is swapped for
# the site prefix.
player_fullurl <- stringr::str_replace(
  string = player_url,
  pattern = "/ci",
  replacement = "http://www.espncricinfo.com/australia"
)

# Player identifier: whatever remains of the link once the directory part
# has been stripped off.
player_cid <- stringr::str_remove(
  string = player_url,
  pattern = "/ci/content/player/"
)

# Stats page that lists the player's batting innings one by one.
player_batting_url <- paste0(
  "http://stats.espncricinfo.com/ci/engine/player/",
  player_cid,
  "?class=1;template=results;type=batting;view=innings"
)

# One row per player, holding all of the vectors built above.
player_tibble <- tibble::tibble(
  player_name = player_name,
  player_url = player_url,
  player_fullurl = player_fullurl,
  player_cid = player_cid,
  player_batting_url = player_batting_url
)
#' Download one player's batting tables
#'
#' Reads every HTML table on the page at `url` with `XML::readHTMLTable()`
#' and extracts the two elements named "Career averages" and
#' "Innings by innings list". Both are cleaned identically: column names are
#' normalised with `janitor::clean_names()`, the table is converted to a
#' tibble, and every column is converted to character.
#'
#' @param url Character scalar; address of a player's batting innings page.
#' @return A named list with two tibbles, `batting_career` and
#'   `batting_innings`. If a table is not found on the page, the
#'   corresponding element is an empty tibble, so combining many players with
#'   `dplyr::bind_rows()` just contributes no rows for it.
get_player_batting_innings <- function(url) {
  page_tables <- XML::readHTMLTable(url)

  # Single clean-up routine shared by both tables so they are always treated
  # the same way. Every call is namespace-qualified, so the function does not
  # depend on which packages are attached in the session evaluating it
  # (relevant when it is run on parallel workers).
  tidy_table <- function(tbl) {
    if (is.null(tbl)) {
      # `[[` gives NULL when the page has no table with that name.
      return(tibble::tibble())
    }
    cleaned <- janitor::clean_names(as.data.frame(tbl))
    # mutate_all() is kept (instead of across()) because the recorded session
    # uses dplyr 0.8.2, which predates across().
    dplyr::mutate_all(tibble::as_tibble(cleaned), as.character)
  }

  list(
    batting_career = tidy_table(page_tables[["Career averages"]]),
    batting_innings = tidy_table(page_tables[["Innings by innings list"]])
  )
}
# Serial variant restricted to the first five URLs, kept for quick checks:
# batting_list = purrr::map(.x = player_batting_url[1:5], .f = get_player_batting_innings)

# Full run: apply the downloader to every player's batting URL through furrr
# (using the future plan set earlier), with a progress bar.
batting_list <- furrr::future_map(
  .x = player_tibble$player_batting_url,
  .f = get_player_batting_innings,
  .progress = TRUE
)
##
Progress: ─────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
# Name each per-player result after the player so the name can be carried
# into the combined tables.
batting_list <- stats::setNames(batting_list, player_tibble$player_name)

# Flip the list inside out: from one (career, innings) pair per player to
# one list of career tables and one list of innings tables.
batting_list_trans <- purrr::transpose(batting_list)

# Stack each set of tables; the list names end up in a `player_name` column.
batting_career <- dplyr::bind_rows(
  batting_list_trans[["batting_career"]],
  .id = "player_name"
)
batting_innings <- dplyr::bind_rows(
  batting_list_trans[["batting_innings"]],
  .id = "player_name"
)
# Save both combined tables as CSV files in the current working directory.
# The destination is passed by position rather than as `path =`: readr 1.4.0
# renamed that argument to `file` and deprecated `path`, whereas the second
# positional argument is the output location in both old and new releases.
readr::write_csv(batting_career, "./aus_batting_career.csv")
readr::write_csv(batting_innings, "./aus_batting_innings.csv")
# Print the R version, platform and package versions used for this run.
utils::sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 9 (stretch)
##
## Matrix products: default
## BLAS: /dora/nobackup/biostat/R-3.6.0/R-3.6.0/lib/libRblas.so
## LAPACK: /dora/nobackup/biostat/R-3.6.0/R-3.6.0/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] furrr_0.1.0 future_1.13.0 XML_3.98-1.19 forcats_0.4.0
## [5] stringr_1.4.0 dplyr_0.8.2 purrr_0.3.2 readr_1.3.1
## [9] tidyr_0.8.3 tibble_2.1.3 ggplot2_3.2.0 tidyverse_1.2.1
## [13] rvest_0.3.3 xml2_1.2.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 xfun_0.6 listenv_0.7.0 haven_2.1.0
## [5] lattice_0.20-38 colorspace_1.4-1 generics_0.0.2 htmltools_0.3.6
## [9] yaml_2.2.0 rlang_0.4.0 pillar_1.4.2 glue_1.3.1
## [13] withr_2.1.2 selectr_0.4-1 modelr_0.1.4 readxl_1.3.1
## [17] munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0 codetools_0.2-16
## [21] evaluate_0.13 knitr_1.22 curl_3.3 parallel_3.6.0
## [25] broom_0.5.2 Rcpp_1.0.1 scales_1.0.0 backports_1.1.4
## [29] jsonlite_1.6 hms_0.4.2 digest_0.6.19 stringi_1.4.3
## [33] grid_3.6.0 cli_1.1.0 tools_3.6.0 magrittr_1.5
## [37] lazyeval_0.2.2 crayon_1.3.4 pkgconfig_2.0.2 lubridate_1.7.4
## [41] assertthat_0.2.1 rmarkdown_1.12 httr_1.4.0 rstudioapi_0.10
## [45] R6_2.4.0 globals_0.12.4 nlme_3.1-140 compiler_3.6.0