library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr)
##
## Attaching package: 'skimr'
## The following object is masked from 'package:stats':
##
## filter
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(gganimate)
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
theme_set(theme_classic(18) +
theme(legend.position = "bottom"))
raw_data = read_csv("bird_impacts.csv") %>%
janitor::clean_names() %>%
dplyr::mutate(incident_month = factor(incident_month),
incident_year = factor(incident_year),
month = lubridate::month(incident_date, label = TRUE, abbr = TRUE))
## Parsed with column specification:
## cols(
## .default = col_character(),
## incident_date = col_datetime(format = ""),
## num_engs = col_double(),
## incident_month = col_double(),
## incident_year = col_double(),
## time = col_double(),
## height = col_double(),
## speed = col_double(),
## cost_repairs_infl_adj = col_double()
## )
## See spec(...) for full column specifications.
raw_data %>%
skimr::skim()
## Skim summary statistics
## n obs: 56978
## n variables: 22
##
## ── Variable type:character ─────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## airport 0 56978 56978 4 53 0 423
## airport_id 0 56978 56978 3 5 0 423
## atype 0 56978 56978 4 18 0 91
## damage 4324 52654 56978 1 2 0 4
## operator 0 56978 56978 15 18 0 4
## phase_of_flt 11469 45509 56978 4 12 0 24
## precip 21614 35364 56978 3 10 0 7
## sky 20485 36493 56978 8 10 0 3
## species 0 56978 56978 4 50 0 527
## species_id 0 56978 56978 1 6 0 528
## state 0 56978 56978 2 3 0 59
## time_of_day 16133 40845 56978 3 5 0 4
## type_eng 234 56744 56978 1 1 0 4
##
## ── Variable type:factor ────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n n_unique
## incident_month 0 56978 56978 12
## incident_year 0 56978 56978 29
## month 0 56978 56978 12
## top_counts ordered
## 9: 7980, 10: 7754, 8: 7104, 5: 6161 FALSE
## 201: 4452, 201: 4081, 201: 3764, 201: 3602 FALSE
## Sep: 7980, Oct: 7754, Aug: 7104, May: 6161 TRUE
##
## ── Variable type:numeric ───────────────────────────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25
## cost_repairs_infl_adj 56363 615 56978 242387.9 942601.48 11 5128
## height 18038 38940 56978 983.83 2026.76 0 0
## num_engs 233 56745 56978 2.06 0.26 1 2
## speed 30046 26932 56978 154.55 42.28 0 130
## time 26124 30854 56978 1427.55 581.73 -84 930
## p50 p75 p100 hist
## 26783 93123.5 1.6e+07 ▇▁▁▁▁▁▁▁
## 50 1000 25000 ▇▁▁▁▁▁▁▁
## 2 2 4 ▁▁▇▁▁▁▁▁
## 140 170 354 ▁▁▅▇▂▂▁▁
## 1426 1950 2359 ▁▁▆▇▅▆▇▇
##
## ── Variable type:POSIXct ───────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max median
## incident_date 0 56978 56978 1990-01-01 2018-12-31 2009-11-03
## n_unique
## 9880
This data has a large number of missing data in some fields. It seems like we should consider analysing individual variables
We put forward the hypothesis that some months has more impacts due to weather and bird behaviours.
We will also visualise the number of incidence per year per month to detect any report-time bias in the data.
month_summ = raw_data %>%
dplyr::group_by(incident_year, month) %>%
dplyr::count()
month_trace_plot = ggplot(month_summ,
aes(x = month,
y = n,
colour = incident_year,
group = incident_year)) +
geom_line() +
labs(title = "Number of birds impacts per year per month",
y = "Number of incidents") +
theme(legend.title = element_blank()) +
guides(colour = guide_legend(nrow = 3, byrow = TRUE))
month_trace_plot
These numbers should be scaled by the number of flights in the US every year. This analysis will be delayed.
It should be clear that the number of impacts are lower in the winter (Dec to Feb) and both spring and autumn has a higher number of impacts. It is interesting that there is a dip in June every year. This could be caused by some sampling bias or somthing about bird behaviours.
state_data = tibble::tibble(
state_abb = datasets::state.abb,
state_name = tolower(datasets::state.name),
state_division = datasets::state.division
)
#########################
state_summ = raw_data %>%
dplyr::filter(incident_year == 2018) %>%
dplyr::mutate(state = factor(state)) %>%
dplyr::group_by(month, state, .drop = FALSE) %>%
dplyr::tally() %>%
ungroup() %>%
left_join(state_data, by = c("state" = "state_abb"))
## Warning: Column `state`/`state_abb` joining factor and character vector,
## coercing into character vector
state_summ
## # A tibble: 648 x 5
## month state n state_name state_division
## <ord> <chr> <int> <chr> <fct>
## 1 Jan AL 1 alabama East South Central
## 2 Jan AR 0 arkansas West South Central
## 3 Jan AZ 1 arizona Mountain
## 4 Jan BC 0 <NA> <NA>
## 5 Jan CA 21 california Pacific
## 6 Jan CO 3 colorado Mountain
## 7 Jan CT 0 connecticut New England
## 8 Jan DC 3 <NA> <NA>
## 9 Jan FL 9 florida South Atlantic
## 10 Jan GA 3 georgia South Atlantic
## # … with 638 more rows
summary(state_summ$n)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 2.00 6.87 6.00 246.00
#################
us_states <- map_data("state") %>%
left_join(state_summ, by = c("region" = "state_name")) %>%
dplyr::filter(complete.cases(month))
ggplot(data = us_states,
mapping = aes(x = long, y = lat,
group = group)) +
geom_polygon(aes(fill = n), colour = "black") +
facet_wrap(~month) +
scale_fill_distiller(palette = "Reds", direction = 1) +
labs(title = "No. of bird impacts in 2018") +
theme_void(18) +
theme(legend.position = "bottom")
gif = ggplot(data = us_states,
mapping = aes(x = long, y = lat,
group = group)) +
geom_polygon(aes(fill = n), colour = "black") +
scale_fill_distiller(palette = "Reds", direction = 1) +
theme_void(18) +
theme(legend.position = "bottom") +
transition_states(month,
transition_length = 0.1,
state_length = 0.5) +
ease_aes('linear') +
labs(title = "No. of bird impacts in 2018",
subtitle = "Month: {closest_state}")
anim_save(filename = "incidents_2018.gif",
animation = gif, width = 800)
time_summ = raw_data %>%
dplyr::filter(complete.cases(time_of_day)) %>%
dplyr::group_by(incident_year, month, time_of_day) %>%
dplyr::count()
time_trace_plot = ggplot(time_summ,
aes(x = month,
y = n,
colour = incident_year,
group = interaction(incident_year, time_of_day))) +
geom_line() +
facet_grid(~ time_of_day) +
labs(title = "Number of birds incidents during the time of day",
y = "Number of incidents") +
theme(legend.title = element_blank(),
axis.text.x = element_text(angle = 90),
legend.position = "none")
time_trace_plot
Interestingly, the dipping effect in June was only visible at night times.
combine_trace_plot = cowplot::plot_grid(
month_trace_plot,
time_trace_plot,
nrow = 2,
rel_heights = c(2, 1))
combine_trace_plot
ggsave(plot = combine_trace_plot,
filename = "combine_trace_plot.png",
width = 12, height = 8)
complete_phase = raw_data %>%
dplyr::filter(complete.cases(phase_of_flt)) %>%
dplyr::mutate(
phase_of_flt = tolower(phase_of_flt),
phase_of_flt_cat = phase_of_flt %>%
forcats::fct_lump(n = 15) %>%
forcats::fct_lump_min(min = 5) %>%
forcats::fct_infreq(),
log_height = log10(height + 1)
)
table(complete_phase$phase_of_flt_cat)
##
## approach take-off run climb landing roll descent
## 20654 8271 7673 7282 661
## departure arrival local en route taxi
## 470 190 116 81 76
## unknown parked
## 23 12
complete_phase %>%
ggplot(aes(x = phase_of_flt_cat)) +
geom_bar()
complete_phase %>%
ggplot(aes(x = log_height)) +
geom_histogram() +
facet_wrap(~phase_of_flt_cat, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6743 rows containing non-finite values (stat_bin).
complete_phase %>%
dplyr::filter(log_height > 0) %>%
ggplot(aes(x = log_height)) +
geom_histogram(bins = 30) +
facet_wrap(~phase_of_flt_cat, scales = "free")
table(is.na(complete_phase$cost_repairs_infl_adj))
##
## FALSE TRUE
## 497 45012
summary(complete_phase$cost_repairs_infl_adj)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 11 5810 32100 274037 107700 16380000 45012
hist(complete_phase$cost_repairs_infl_adj)
complete_cost_data = complete_phase %>%
dplyr::filter(complete.cases(cost_repairs_infl_adj)) %>%
dplyr::rename(cost_repairs = cost_repairs_infl_adj) %>%
dplyr::mutate(
atype_cat = atype %>%
forcats::fct_lump(n = 15) %>%
forcats::fct_lump_min(min = 5) %>%
forcats::fct_reorder(cost_repairs, .fun = median),
species_cat = species %>%
forcats::fct_lump(n = 20) %>%
forcats::fct_lump_min(min = 5) %>%
forcats::fct_infreq(),
phase_of_flt = forcats::fct_infreq(phase_of_flt)
) %>%
tidyr::separate(atype,
into = c("acompany", "amodel", "avariant"),
sep = "-",
remove = FALSE)
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 148 rows
## [1, 3, 4, 5, 7, 10, 11, 12, 14, 15, 17, 18, 21, 22, 24, 30, 33, 34, 39,
## 41, ...].
complete_cost_data %>%
group_by(atype_cat) %>%
summarise(median(cost_repairs))
## # A tibble: 16 x 2
## atype_cat `median(cost_repairs)`
## <fct> <dbl>
## 1 B-737-700 4697
## 2 B-737-800 13347
## 3 B-737-900 24581
## 4 B-747-400 25426
## 5 MD-82 26540
## 6 A-319 32100
## 7 B-737-300 32358
## 8 A-320 33384
## 9 Other 36880
## 10 B-737-500 42109
## 11 MD-88 48347
## 12 MD-83 50904.
## 13 B-757-200 51349
## 14 MD-80 52272
## 15 B-737-200 92200
## 16 B-767-300 100000
complete_cost_data %>%
dplyr::filter(atype_cat != "Other") %>%
ggplot(aes(x = atype_cat,
y = cost_repairs,
fill = acompany)) +
# geom_violin(width = 1) +
geom_boxplot(width = 0.5) +
scale_y_log10(label = scales::comma) +
ggsci::scale_fill_nejm() +
coord_flip()
complete_cost_data %>%
dplyr::filter(species_cat != "Other") %>%
ggplot(aes(x = species_cat,
y = cost_repairs)) +
# geom_violin(width = 1) +
geom_boxplot(width = 0.5) +
scale_y_log10(label = scales::comma) +
ggsci::scale_fill_nejm() +
coord_flip()
complete_cost_data %>%
group_by(phase_of_flt, species_cat, .drop = FALSE) %>%
dplyr::count() %>%
ggplot(aes(x = phase_of_flt,
y = species_cat,
fill = n,
label = n)) +
geom_tile() +
geom_text() +
scale_fill_distiller(palette = "Spectral")
complete_cost_data %>%
ggplot(aes(x = month,
y = cost_repairs)) +
# geom_violin(width = 1) +
geom_boxplot(width = 0.5) +
scale_y_log10(breaks = c(1e4, 2e4, 5e4, 7e4,
1e5, 2e5, 5e5, 7e5,
1e6, 2e6, 5e6, 7e6),
label = scales::comma)
library(ggparty)
## Loading required package: partykit
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
complete_cost_data %>% skimr::skim()
## Skim summary statistics
## n obs: 497
## n variables: 29
##
## ── Variable type:character ─────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## acompany 0 497 497 1 11 0 9
## airport 0 497 497 7 46 0 104
## airport_id 0 497 497 4 4 0 104
## amodel 5 492 497 1 4 0 22
## atype 0 497 497 5 11 0 38
## avariant 148 349 497 2 5 0 9
## damage 2 495 497 1 2 0 4
## operator 0 497 497 15 18 0 4
## precip 108 389 497 3 9 0 5
## sky 99 398 497 8 10 0 3
## species 0 497 497 5 39 0 107
## species_id 0 497 497 1 6 0 107
## state 0 497 497 2 3 0 45
## time_of_day 59 438 497 3 5 0 4
## type_eng 1 496 497 1 1 0 2
##
## ── Variable type:factor ────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n n_unique
## atype_cat 0 497 497 16
## incident_month 0 497 497 12
## incident_year 0 497 497 29
## month 0 497 497 12
## phase_of_flt 0 497 497 6
## phase_of_flt_cat 0 497 497 6
## species_cat 0 497 497 23
## top_counts ordered
## B-7: 89, B-7: 82, B-7: 54, Oth: 53 FALSE
## 11: 74, 10: 67, 4: 57, 9: 53 FALSE
## 201: 41, 200: 35, 201: 32, 201: 30 FALSE
## Nov: 74, Oct: 67, Apr: 57, Sep: 53 TRUE
## app: 245, cli: 131, tak: 76, lan: 27 FALSE
## app: 245, cli: 131, tak: 76, lan: 27 FALSE
## Oth: 154, Unk: 75, Unk: 34, Gul: 30 FALSE
##
## ── Variable type:numeric ───────────────────────────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50
## cost_repairs 0 497 497 274036.81 1e+06 11 5810 32100
## height 70 427 497 1773.44 2857.88 0 5 500
## log_height 70 427 497 2.2 1.43 0 0.78 2.7
## num_engs 1 496 497 2.05 0.3 2 2 2
## speed 195 302 497 171.14 51.64 70 140 150
## time 118 379 497 1473.45 575 20 1015 1530
## p75 p100 hist
## 107700 1.6e+07 ▇▁▁▁▁▁▁▁
## 2350 19000 ▇▁▁▁▁▁▁▁
## 3.37 4.28 ▇▁▁▂▃▇▇▃
## 2 4 ▇▁▁▁▁▁▁▁
## 200 350 ▁▇▆▅▂▃▁▁
## 2005 2355 ▁▁▅▆▆▆▇▇
##
## ── Variable type:POSIXct ───────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max median
## incident_date 0 497 497 1990-10-29 2018-12-06 2009-05-12
## n_unique
## 479
cost_data_reg = complete_cost_data %>%
dplyr::filter(complete.cases(sky),
# complete.cases(phase_of_flt),
complete.cases(speed),
complete.cases(height),
complete.cases(damage),
complete.cases(time_of_day)) %>%
dplyr::transmute(
sky,
# phase_of_flt,
damage_three = forcats::fct_recode(damage, "M" = "M?") %>%
forcats::fct_relevel("N"),
speed,
log_height = log10(height + 1L),
time_of_day,
log_cost_repairs = log10(cost_repairs),
) %>% as.data.frame()
lm(log_cost_repairs ~ ., data = cost_data_reg) %>%
broom::tidy()
## # A tibble: 10 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.79 0.327 8.53 1.26e-15
## 2 skyOvercast -0.180 0.129 -1.39 1.64e- 1
## 3 skySome Cloud -0.0279 0.110 -0.255 7.99e- 1
## 4 damage_threeM 1.72 0.166 10.4 2.57e-21
## 5 damage_threeS 2.78 0.174 15.9 2.11e-40
## 6 speed -0.00293 0.00139 -2.10 3.65e- 2
## 7 log_height 0.108 0.0581 1.85 6.48e- 2
## 8 time_of_dayDay -0.218 0.248 -0.877 3.81e- 1
## 9 time_of_dayDusk 0.0153 0.357 0.0429 9.66e- 1
## 10 time_of_dayNight -0.318 0.263 -1.21 2.27e- 1
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
rf = randomForest::randomForest(
x = model.matrix(log_cost_repairs ~., data = cost_data_reg),
y = cost_data_reg$log_cost_repairs)
varImpPlot(rf)
cost_data_reg %>%
ggplot(aes(x = log_height,
y = log_cost_repairs,
colour = damage_three)) +
geom_point() +
geom_smooth(method = "loess")
cost_data_reg %>%
dplyr::filter(log_height > 0) %>%
ggplot(aes(x = log_height,
y = log_cost_repairs,
colour = damage_three)) +
geom_point() +
geom_smooth(method = "loess")
library(ggparty)
cost_data_reg = cost_data_reg %>%
dplyr::mutate_if(is.character, as.factor)
tree <- ctree(log_cost_repairs ~ ., data = cost_data_reg)
ggparty(tree, horizontal = TRUE, terminal_space = 0.6) +
geom_edge() +
geom_edge_label() +
geom_node_splitvar() +
geom_node_plot(gglist = list(
geom_density(aes(x = log_cost_repairs))),
shared_axis_labels = TRUE)
https://cran.r-project.org/web/packages/geofacet/index.html # Session Info
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] randomForest_4.6-14 ggparty_0.0.0.9000 partykit_1.2-4
## [4] mvtnorm_1.0-11 libcoin_1.0-4 ggpubr_0.2.1
## [7] magrittr_1.5 gganimate_1.0.3 maps_3.3.0
## [10] skimr_1.0.7 janitor_1.2.0 forcats_0.4.0
## [13] stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2
## [16] readr_1.3.1 tidyr_0.8.3 tibble_2.1.3
## [19] ggplot2_3.2.0 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-140 sf_0.7-6 lubridate_1.7.4
## [4] RColorBrewer_1.1-2 progress_1.2.2 httr_1.4.0
## [7] ggsci_2.9 tools_3.6.0 backports_1.1.4
## [10] utf8_1.1.4 R6_2.4.0 rpart_4.1-15
## [13] KernSmooth_2.23-15 DBI_1.0.0 lazyeval_0.2.2
## [16] colorspace_1.4-1 withr_2.1.2 tidyselect_0.2.5
## [19] prettyunits_1.0.2 compiler_3.6.0 cli_1.1.0
## [22] rvest_0.3.4 xml2_1.2.0 labeling_0.3
## [25] checkmate_1.9.4 scales_1.0.0 classInt_0.3-3
## [28] digest_0.6.20 rmarkdown_1.13 pkgconfig_2.0.2
## [31] htmltools_0.3.6 rlang_0.4.0 readxl_1.3.1
## [34] rstudioapi_0.10 farver_1.1.0 generics_0.0.2
## [37] jsonlite_1.6 Formula_1.2-3 Matrix_1.2-17
## [40] Rcpp_1.0.1 munsell_0.5.0 fansi_0.4.0
## [43] stringi_1.4.3 yaml_2.2.0 inum_1.0-1
## [46] snakecase_0.11.0 plyr_1.8.4 crayon_1.3.4
## [49] lattice_0.20-38 haven_2.1.1 cowplot_1.0.0
## [52] splines_3.6.0 hms_0.5.0 transformr_0.1.1
## [55] zeallot_0.1.0 knitr_1.23 pillar_1.4.2
## [58] ggsignif_0.5.0 reshape2_1.4.3 lpSolve_5.6.13.1
## [61] glue_1.3.1 evaluate_0.14 gifski_0.8.6
## [64] modelr_0.1.4 png_0.1-7 vctrs_0.2.0
## [67] tweenr_1.0.1 cellranger_1.1.0 gtable_0.3.0
## [70] assertthat_0.2.1 xfun_0.8 broom_0.5.2
## [73] e1071_1.7-2 class_7.3-15 survival_2.44-1.1
## [76] units_0.6-3 ellipsis_0.2.0.1