For this comparison, I will compare the scikit-learn
library in Python
and the newly developed tidymodels
meta-package in R
. The main reason that I have chosen these two is because they share a lot of similarities and imposed strict frameworks in data pre-processing, modelling and evaluations.
The data that I will use is the penguins
data from the R
package palmerpenguins
, which you can learn more about here. The response variable is a factor
variable, species
, indicating the species of a penguin. The other predictor variables are a mix of both numeric and factor variables. For convenience, I have reduced the number of species to two and extracted the data below in a CSV format so that Python can also use this data through pd.read_csv
.
library(palmerpenguins)
library(tidyverse)
penguins %>%
na.omit %>%
dplyr::filter(species %in% c("Adelie", "Chinstrap")) %>%
readr::write_csv(path = "data/penguins_complete.csv")
R
: tidyverse
library(tidyverse)
penguins = readr::read_csv(file = "data/penguins_complete.csv")
penguins %>% colnames()
## [1] "species" "island" "bill_length_mm"
## [4] "bill_depth_mm" "flipper_length_mm" "body_mass_g"
## [7] "sex" "year"
Python
: pandas
import pandas as pd
penguins = pd.read_csv("data/penguins_complete.csv")
penguins.columns
## Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
## 'flipper_length_mm', 'body_mass_g', 'sex', 'year'],
## dtype='object')
R
##
##
##
##
##
##
##
##
library(rpart)
library(rpart.plot)
feature_set = c('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g')
sub_penguins = penguins[,c('species', feature_set)]
dtc_model = rpart::rpart(species ~ ., data = sub_penguins,
control = rpart.control(maxdepth = 1))
table(sub_penguins$species,
predict(dtc_model, type = "class"))
##
## Adelie Chinstrap
## Adelie 143 3
## Chinstrap 6 62
rpart.plot(dtc_model)
Python
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib as plt
import matplotlib.pyplot as pltpyplot
from sklearn.metrics import confusion_matrix
feature_set = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
X = penguins[feature_set]
y = penguins.species
dtc_model = DecisionTreeClassifier(random_state = 1, max_depth = 1)
# Fit model
dtc_model.fit(X, y)
## DecisionTreeClassifier(max_depth=1, random_state=1)
confusion_matrix(y, dtc_model.predict(X))
## array([[143, 3],
## [ 6, 62]])
##
##
##
pltpyplot.figure()
tree.plot_tree(dtc_model, filled = True, feature_names = feature_set, class_names = list(set(list(penguins.species))))
pltpyplot.show()
R
benefits greatly from contributions from the community and there are mamy ways of performing the task albeit with improvements over the standard solution, often with a slightly more considerations for the user experience. treeheatr
is a good example of this. It improves on the tree diagram and uses an additional heatmap for visualisation while keeping the API simple.
library(treeheatr)
heat_tree(sub_penguins, target_lab = 'species')
While the previous code chunks use the entire data to fit a single tree model in each of the languages, this is obviously not the preferred practice for machine learning.
We will use tidymodels
for R
from this point forward.
R
library(tidymodels)
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──
## ✔ broom 0.7.6 ✔ rsample 0.1.0
## ✔ dials 0.0.9 ✔ tune 0.1.5
## ✔ infer 0.5.4 ✔ workflows 0.2.2
## ✔ modeldata 0.1.0 ✔ workflowsets 0.0.2
## ✔ parsnip 0.1.6 ✔ yardstick 0.0.8
## ✔ recipes 0.1.16
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dials::prune() masks rpart::prune()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
penguins = penguins %>% dplyr::mutate(species = as.factor(species))
splitting = rsample::initial_split(data = penguins, prop = 0.75)
dtc_train_model = decision_tree() %>%
set_engine("rpart") %>%
set_mode("classification") %>%
fit(species ~ ., data = training(splitting))
dtc_predictions_tbl = bind_cols(
pred_class = dtc_train_model %>%
predict(new_data = testing(splitting), type = "class"),
pred_prob = dtc_train_model %>%
predict(new_data = testing(splitting), type = "prob"))
dtc_predictions_tbl2 = bind_cols(testing(splitting), dtc_predictions_tbl)
cm = conf_mat(data = dtc_predictions_tbl2, truth = "species", estimate = ".pred_class")
cm
## Truth
## Prediction Adelie Chinstrap
## Adelie 41 3
## Chinstrap 1 9
summary(cm)
## # A tibble: 13 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.926
## 2 kap binary 0.772
## 3 sens binary 0.976
## 4 spec binary 0.75
## 5 ppv binary 0.932
## 6 npv binary 0.9
## 7 mcc binary 0.777
## 8 j_index binary 0.726
## 9 bal_accuracy binary 0.863
## 10 detection_prevalence binary 0.815
## 11 precision binary 0.932
## 12 recall binary 0.976
## 13 f_meas binary 0.953
Python
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state = 0, train_size = 0.75)
dtc_train_model = DecisionTreeClassifier(random_state = 1, max_depth = 2)
dtc_train_model = dtc_train_model.fit(train_X, train_y)
dtc_predictions = dtc_train_model.predict(test_X)
from sklearn.metrics import classification_report
cr = classification_report(
y_true = test_y,
y_pred = dtc_predictions)
print(cr)
## precision recall f1-score support
##
## Adelie 0.95 1.00 0.97 38
## Chinstrap 1.00 0.88 0.93 16
##
## accuracy 0.96 54
## macro avg 0.97 0.94 0.95 54
## weighted avg 0.96 0.96 0.96 54
R
roc_curve(dtc_predictions_tbl2, truth = "species", estimate = ".pred_Adelie") %>%
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_path() +
geom_abline() +
theme_bw()
Python
from sklearn.metrics import roc_curve
y_pred_prob = dtc_train_model.predict_proba(test_X)[:,1]
fpr, tpr, thresholds = roc_curve(test_y, y_pred_prob, pos_label = "Chinstrap")
# Plot ROC curve
pltpyplot.close()
pltpyplot.plot([0, 1], [0, 1], 'k--')
## [<matplotlib.lines.Line2D object at 0x7f8af13c1cf8>]
pltpyplot.plot(fpr, tpr)
## [<matplotlib.lines.Line2D object at 0x7f8af13c11d0>]
pltpyplot.xlabel('False Positive Rate')
## Text(0.5, 0, 'False Positive Rate')
pltpyplot.ylabel('True Positive Rate')
## Text(0, 0.5, 'True Positive Rate')
pltpyplot.title('ROC Curve')
## Text(0.5, 1.0, 'ROC Curve')
pltpyplot.show()
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
import numpy as np
cv = RepeatedKFold(n_splits = 5, n_repeats = 20, random_state = 1)
dtc_train_model = DecisionTreeClassifier(random_state = 1, max_depth = 2)
scorer = make_scorer(f1_score, pos_label = 'Adelie')
scores = cross_val_score(estimator = dtc_train_model, X = train_X, y = train_y, scoring = scorer, cv = cv) ## n_jobs = -1 can be used for parallelisation
print("F1 statistics, mean (sd): " + str(np.round(np.mean(scores), 4)) + "(" + str(np.round(np.std(scores), 4)) + ")")
## F1 statistics, mean (sd): 0.9474(0.0281)
R
missing_penguins = readr::read_csv("data/penguins.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## species = col_character(),
## island = col_character(),
## bill_length_mm = col_double(),
## bill_depth_mm = col_double(),
## flipper_length_mm = col_double(),
## body_mass_g = col_double(),
## sex = col_character(),
## year = col_double()
## )
feature_set = c('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g')
X = missing_penguins[,c('species', feature_set)]
X_dropped = na.omit(X)
Python
missing_penguins = pd.read_csv("data/penguins.csv")
missing_penguins.isnull().any()
## species False
## island False
## bill_length_mm True
## bill_depth_mm True
## flipper_length_mm True
## body_mass_g True
## sex True
## year False
## dtype: bool
missing_penguins.isnull().sum()
## species 0
## island 0
## bill_length_mm 2
## bill_depth_mm 2
## flipper_length_mm 2
## body_mass_g 2
## sex 11
## year 0
## dtype: int64
feature_set = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
X = missing_penguins[feature_set]
# rows_with_missing_values = [row for row in X.index if X.iloc[row,:].isnull().any()]
X_dropped = X.dropna(axis = "index")
R
library(imputeMissings)
##
## Attaching package: 'imputeMissings'
## The following object is masked from 'package:dplyr':
##
## compute
imputed_X = impute(data = X, method = "median/mode")
Python
from sklearn.impute import SimpleImputer
median_imputer = SimpleImputer(strategy = "median")
imputed_X = pd.DataFrame(median_imputer.fit_transform(X),columns=feature_set)
imputed_X.isnull().any()
## bill_length_mm False
## bill_depth_mm False
## flipper_length_mm False
## body_mass_g False
## dtype: bool