library(tidyverse)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
rawData = read_csv("winemag-data-130k-v2.csv") %>%
janitor::clean_names()
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## country = col_character(),
## description = col_character(),
## designation = col_character(),
## points = col_double(),
## price = col_double(),
## province = col_character(),
## region_1 = col_character(),
## region_2 = col_character(),
## taster_name = col_character(),
## taster_twitter_handle = col_character(),
## title = col_character(),
## variety = col_character(),
## winery = col_character()
## )
rawData
## # A tibble: 129,971 x 14
## x1 country description designation points price province region_1
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 0 Italy Aromas inc… Vulkà Bian… 87 NA Sicily … Etna
## 2 1 Portug… This is ri… Avidagos 87 15 Douro <NA>
## 3 2 US Tart and s… <NA> 87 14 Oregon Willame…
## 4 3 US Pineapple … Reserve La… 87 13 Michigan Lake Mi…
## 5 4 US Much like … Vintner's … 87 65 Oregon Willame…
## 6 5 Spain Blackberry… Ars In Vit… 87 15 Norther… Navarra
## 7 6 Italy Here's a b… Belsito 87 16 Sicily … Vittoria
## 8 7 France This dry a… <NA> 87 24 Alsace Alsace
## 9 8 Germany Savory dri… Shine 87 12 Rheinhe… <NA>
## 10 9 France This has g… Les Natures 87 27 Alsace Alsace
## # … with 129,961 more rows, and 6 more variables: region_2 <chr>,
## # taster_name <chr>, taster_twitter_handle <chr>, title <chr>,
## # variety <chr>, winery <chr>
We wish to look into the relationship between points and price of the wine. We will mostly use ggplot2
for visualisation and lm
for linear regression.
rawData %>%
ggplot(aes(x = points, y = price)) +
geom_point()
## Warning: Removed 8996 rows containing missing values (geom_point).
Note the discrete nature of the points rating and also the slightly increasing trend in the data. This implies that a highly rated wine will likely be highly priced as well. However, due to the price can be orders of magnitude different for different wine, we will need to use a logarithm scale on the y-axis.
rawData %>%
ggplot(aes(x = points, y = price)) +
geom_point() +
geom_smooth(method = "lm") +
scale_y_log10()
## Warning: Removed 8996 rows containing non-finite values (stat_smooth).
## Warning: Removed 8996 rows containing missing values (geom_point).
We will create a new dataset, clean_data
, with an extra column called log10price
before performing linear regression modelling.
clean_data = rawData %>%
dplyr::mutate(log10price = log10(price))
M = lm(log10price ~ points, data = clean_data)
plot(clean_data$points, clean_data$log10price)
abline(M, col = "red")
M
##
## Call:
## lm(formula = log10price ~ points, data = clean_data)
##
## Coefficients:
## (Intercept) points
## -3.60797 0.05708
summary(M)
##
## Call:
## lm(formula = log10price ~ points, data = clean_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74160 -0.16017 -0.01759 0.13798 2.10307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.6079666 0.0187718 -192.2 <2e-16 ***
## points 0.0570842 0.0002122 269.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2247 on 120973 degrees of freedom
## (8996 observations deleted due to missingness)
## Multiple R-squared: 0.3744, Adjusted R-squared: 0.3744
## F-statistic: 7.239e+04 on 1 and 120973 DF, p-value: < 2.2e-16
cor(clean_data$points,
clean_data$log10price, use = "pairwise.complete.obs")
## [1] 0.6118495
The linear model tells us as the rating point increase by 1, we will see an increase in the log10 price of 0.057. Taking the exponential of 10, this means every point increase of 1 translate to 10^0.057 = 1.14 fold increase in pricing (i.e. 14% increase).
This is a bad visualisation since we have way too many countries and also some countries has very small production.
clean_data %>%
ggplot(aes(x = country, y = log10price)) +
geom_boxplot() +
labs(title = "This is ugly")
## Warning: Removed 8996 rows containing non-finite values (stat_boxplot).
table(clean_data$country)
##
## Argentina Armenia Australia
## 3800 2 2329
## Austria Bosnia and Herzegovina Brazil
## 3345 2 52
## Bulgaria Canada Chile
## 141 257 4472
## China Croatia Cyprus
## 1 73 11
## Czech Republic Egypt England
## 12 1 74
## France Georgia Germany
## 22093 86 2165
## Greece Hungary India
## 466 146 9
## Israel Italy Lebanon
## 505 19540 35
## Luxembourg Macedonia Mexico
## 6 12 70
## Moldova Morocco New Zealand
## 59 28 1419
## Peru Portugal Romania
## 16 5691 120
## Serbia Slovakia Slovenia
## 12 1 87
## South Africa Spain Switzerland
## 1401 6645 7
## Turkey Ukraine Uruguay
## 90 14 109
## US
## 54504
In a new data subset_data
, we will only retain the top 8 countries with the highest production of wine by label. All other countries will be lumped into a category called “Others”. This is through the use of the function fct_lump
in the forcats
package.
Then, we will reorder the countries by their median log 10 price (as oppose to alphabetical ordering).
subset_data = clean_data %>%
dplyr::mutate(
country_lumped = forcats::fct_lump(f = country, n = 8) %>%
fct_reorder(log10price, .fun = median, na.rm = TRUE)
)
table(subset_data$country_lumped)
##
## Chile Portugal Argentina Spain Other Austria France
## 4472 5691 3800 6645 9818 3345 22093
## Italy US
## 19540 54504
subset_data %>%
ggplot(aes(x = country_lumped,
y = log10price)) +
geom_boxplot() +
labs(title = "Log10 price of wine for different countries")
## Warning: Removed 8996 rows containing non-finite values (stat_boxplot).
aus_data = subset_data %>%
dplyr::filter(country == "Australia") %>%
dplyr::mutate(variety = fct_lump(variety, n = 5))
aus_data %>%
ggplot(aes(x = fct_reorder(province, log10price, median, na.rm = TRUE),
y = log10price)) +
geom_boxplot() +
labs(title = "Log 10 price of Australian wine, by states")
## Warning: Removed 35 rows containing non-finite values (stat_boxplot).
aus_data %>%
ggplot(aes(x = points, y = log10price)) +
geom_point() +
geom_smooth(method = "lm") +
facet_wrap(~province) +
labs(title = "Log 10 price vs wine rating of Australian wine, by states")
## Warning: Removed 35 rows containing non-finite values (stat_smooth).
## Warning: Removed 35 rows containing missing values (geom_point).
aus_data_summ = aus_data %>%
group_by(province, variety) %>%
dplyr::tally() %>%
group_by(province) %>%
dplyr::mutate(prop = n/sum(n))
aus_data_summ
## # A tibble: 33 x 4
## # Groups: province [6]
## province variety n prop
## <chr> <fct> <int> <dbl>
## 1 Australia Other Cabernet Sauvignon 18 0.0735
## 2 Australia Other Chardonnay 87 0.355
## 3 Australia Other Pinot Noir 4 0.0163
## 4 Australia Other Riesling 2 0.00816
## 5 Australia Other Shiraz 47 0.192
## 6 Australia Other Other 87 0.355
## 7 New South Wales Cabernet Sauvignon 6 0.0706
## 8 New South Wales Chardonnay 30 0.353
## 9 New South Wales Pinot Noir 1 0.0118
## 10 New South Wales Shiraz 25 0.294
## # … with 23 more rows
aus_data_summ %>%
ggplot(aes(x = province, y = prop, fill = variety)) +
geom_col() +
labs("Proportion of wine variety across Australian states")
We will now focus on NSW and SA wine production. We theorise that these two states has very different varieties of wine. And we will use a Chi-square test. H0: The wine variety variable is independent for the states. H1: Not H0.
chi_data = aus_data %>%
dplyr::filter(province %in% c("New South Wales", "South Australia"))
chi_data
## # A tibble: 1,434 x 16
## x1 country description designation points price province region_1
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 77 Austra… This mediu… Made With … 86 18 South A… South A…
## 2 83 Austra… Pale coppe… Jester San… 86 20 South A… McLaren…
## 3 123 Austra… The blend … Parson's F… 92 40 South A… Padthaw…
## 4 191 Austra… From the l… The Trial … 87 30 South A… Padthaw…
## 5 232 Austra… Lifted ced… Red Belly … 85 12 South A… South A…
## 6 238 Austra… The sort o… The Dereli… 85 29 South A… McLaren…
## 7 246 Austra… Juicy and … Destinatio… 85 15 South A… Adelaid…
## 8 293 Austra… This wine … Green's Vi… 92 70 South A… Barossa…
## 9 349 Austra… RunRig is … RunRig 97 225 South A… Barossa
## 10 360 Austra… Bacon and … Descendant 95 125 South A… Barossa…
## # … with 1,424 more rows, and 8 more variables: region_2 <chr>,
## # taster_name <chr>, taster_twitter_handle <chr>, title <chr>,
## # variety <fct>, winery <chr>, log10price <dbl>, country_lumped <fct>
table(chi_data$province, chi_data$variety)
##
## Cabernet Sauvignon Chardonnay Pinot Noir Riesling Shiraz
## New South Wales 6 30 1 0 25
## South Australia 180 150 26 109 443
##
## Other
## New South Wales 23
## South Australia 441
table(chi_data$province, chi_data$variety) %>% chisq.test()
## Warning in chisq.test(.): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: .
## X-squared = 47.843, df = 5, p-value = 3.824e-09
Since we have p < 0.05, we will reject H0 in favour of H1.
We theorise that NSW and TAS has very different mean log10-price for their wine. We will perform a two-sample t-test. H0: mean of NSW is the same as mean of TAS. H0: mean of NSW is not the same as mean of TAS.
nsw_tas_data = aus_data %>%
dplyr::filter(province %in% c("New South Wales", "Tasmania"))
nsw_tas_data %>%
ggplot(aes(x = log10price,
y = ..density..)) +
geom_histogram(aes(fill = province)) +
geom_density() +
facet_wrap(~province, nrow = 2) +
labs(title = "We note the small right skewness in NSW")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
nsw_tas_data %>%
ggplot(aes(x = province, y = log10price)) +
geom_boxplot()
t.test(log10price ~ province, data = nsw_tas_data)
##
## Welch Two Sample t-test
##
## data: log10price by province
## t = -4.6291, df = 100.63, p-value = 1.099e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2729839 -0.1091977
## sample estimates:
## mean in group New South Wales mean in group Tasmania
## 1.321185 1.512276
As p < 0.05, we will reject H0 in favour of H1.
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.1 purrr_0.3.2
## [5] readr_1.3.1 tidyr_0.8.3 tibble_2.1.1 ggplot2_3.1.1
## [9] tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 xfun_0.7 janitor_1.2.0 haven_2.1.0
## [5] lattice_0.20-38 snakecase_0.10.0 colorspace_1.4-1 generics_0.0.2
## [9] vctrs_0.1.0 htmltools_0.3.6 yaml_2.2.0 utf8_1.1.4
## [13] rlang_0.3.4 pillar_1.4.0 glue_1.3.1 withr_2.1.2
## [17] modelr_0.1.4 readxl_1.3.1 plyr_1.8.4 munsell_0.5.0
## [21] gtable_0.3.0 cellranger_1.1.0 rvest_0.3.4 evaluate_0.14
## [25] labeling_0.3 knitr_1.23 fansi_0.4.0 broom_0.5.2
## [29] Rcpp_1.0.1 scales_1.0.0 backports_1.1.4 jsonlite_1.6
## [33] hms_0.4.2 digest_0.6.19 stringi_1.4.3 grid_3.6.0
## [37] cli_1.1.0 tools_3.6.0 magrittr_1.5 lazyeval_0.2.2
## [41] crayon_1.3.4 pkgconfig_2.0.2 zeallot_0.1.0 ellipsis_0.1.0
## [45] xml2_1.2.0 lubridate_1.7.4 assertthat_0.2.1 rmarkdown_1.13
## [49] httr_1.4.0 rstudioapi_0.10 R6_2.4.0 nlme_3.1-140
## [53] compiler_3.6.0