data = data.frame(
x = 1:5,
y = 2:6,
row.names = c("a", "b", "c", "d", "e"))
print(data)
## x y
## a 1 2
## b 2 3
## c 3 4
## d 4 5
## e 5 6
import pandas as pd
data = pd.DataFrame(
{"x": list(range(1, 6)),
"y": list(range(2, 7))},
index = ["a", "b", "c", "d", "e"])
print(data)
## x y
## a 1 2
## b 2 3
## c 3 4
## d 4 5
## e 5 6
library(readr)
data = readr::read_csv("data/penguins_complete.csv")
import pandas as pd
data = pd.read_csv("data/penguins_complete.csv")
readr::write_csv(x = data, file = "data/penguins_complete.csv")
data.to_csv(path_or_buf = "data/penguins_complete2.csv")
In R
, both index-based and label-based subsetting are performed through [,]
. In Python
, this is separated into iloc
and loc
respectively.
## First row
data[1,]
## # A tibble: 1 x 8
## species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Adelie Torge… 39.1 18.7 181 3750 male
## # … with 1 more variable: year <dbl>
data[c(1,3,5), c(1,3,5)]
## # A tibble: 3 x 3
## species bill_length_mm flipper_length_mm
## <chr> <dbl> <dbl>
## 1 Adelie 39.1 181
## 2 Adelie 40.3 195
## 3 Adelie 39.3 190
## First row
data.iloc[0,:]
## species Adelie
## island Torgersen
## bill_length_mm 39.1
## bill_depth_mm 18.7
## flipper_length_mm 181
## body_mass_g 3750
## sex male
## year 2007
## Name: 0, dtype: object
data.iloc[[0,2,4],[0,2,4]]
## species bill_length_mm flipper_length_mm
## 0 Adelie 39.1 181
## 2 Adelie 40.3 195
## 4 Adelie 39.3 190
In R
, both index-based and label-based subsetting are performed through [,]
. In Python
, this is separated into iloc
and loc
respectively.
## First five rows from the "species" column
data[1:5, "species"]
## # A tibble: 5 x 1
## species
## <chr>
## 1 Adelie
## 2 Adelie
## 3 Adelie
## 4 Adelie
## 5 Adelie
## First five rows from the "species" column, return as Series
data.loc[0:4,"species"]
## 0 Adelie
## 1 Adelie
## 2 Adelie
## 3 Adelie
## 4 Adelie
## Name: species, dtype: object
loc
and iloc
In Python, loc
’s label-querying capabilities means that it will subset inclusively on the indices. This means that even if we use the same indices in the code, but depending on which method is used, we can get different results.
data.loc[0:4,"species"]
## 0 Adelie
## 1 Adelie
## 2 Adelie
## 3 Adelie
## 4 Adelie
## Name: species, dtype: object
data.iloc[0:4,1]
## 0 Torgersen
## 1 Torgersen
## 2 Torgersen
## 3 Torgersen
## Name: island, dtype: object
## Base R solution
data[(data$species == "Adelie") & (data$bill_depth_mm > 21), ]
## # A tibble: 6 x 8
## species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Adelie Torge… 38.6 21.2 191 3800 male
## 2 Adelie Torge… 34.6 21.1 198 4400 male
## 3 Adelie Torge… 46 21.5 194 4200 male
## 4 Adelie Dream 39.2 21.1 196 4150 male
## 5 Adelie Dream 42.3 21.2 191 4150 male
## 6 Adelie Biscoe 41.3 21.1 195 4400 male
## # … with 1 more variable: year <dbl>
##
##
data[data$species %in% c("Adelie", "Chinstrap"), ]
## # A tibble: 214 x 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen 36.7 19.3 193 3450
## 5 Adelie Torgersen 39.3 20.6 190 3650
## 6 Adelie Torgersen 38.9 17.8 181 3625
## 7 Adelie Torgersen 39.2 19.6 195 4675
## 8 Adelie Torgersen 41.1 17.6 182 3200
## 9 Adelie Torgersen 38.6 21.2 191 3800
## 10 Adelie Torgersen 34.6 21.1 198 4400
## # … with 204 more rows, and 2 more variables: sex <chr>, year <dbl>
library(dplyr)
data %>% filter((species == "Adelie") & (bill_depth_mm > 21))
## # A tibble: 6 x 8
## species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Adelie Torge… 38.6 21.2 191 3800 male
## 2 Adelie Torge… 34.6 21.1 198 4400 male
## 3 Adelie Torge… 46 21.5 194 4200 male
## 4 Adelie Dream 39.2 21.1 196 4150 male
## 5 Adelie Dream 42.3 21.2 191 4150 male
## 6 Adelie Biscoe 41.3 21.1 195 4400 male
## # … with 1 more variable: year <dbl>
data %>% filter(species %in% c("Adelie", "Chinstrap"))
## # A tibble: 214 x 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen 36.7 19.3 193 3450
## 5 Adelie Torgersen 39.3 20.6 190 3650
## 6 Adelie Torgersen 38.9 17.8 181 3625
## 7 Adelie Torgersen 39.2 19.6 195 4675
## 8 Adelie Torgersen 41.1 17.6 182 3200
## 9 Adelie Torgersen 38.6 21.2 191 3800
## 10 Adelie Torgersen 34.6 21.1 198 4400
## # … with 204 more rows, and 2 more variables: sex <chr>, year <dbl>
## Using loc
data.loc[(data.species == "Adelie") & (data.bill_depth_mm > 21)]
##
##
##
## species island bill_length_mm ... body_mass_g sex year
## 8 Adelie Torgersen 38.6 ... 3800 male 2007
## 9 Adelie Torgersen 34.6 ... 4400 male 2007
## 14 Adelie Torgersen 46.0 ... 4200 male 2007
## 30 Adelie Dream 39.2 ... 4150 male 2007
## 43 Adelie Dream 42.3 ... 4150 male 2007
## 55 Adelie Biscoe 41.3 ... 4400 male 2008
##
## [6 rows x 8 columns]
substypes = ["Adelie", "Chinstrap"]
data.loc[(data.species.isin(substypes))]
## species island bill_length_mm ... body_mass_g sex year
## 0 Adelie Torgersen 39.1 ... 3750 male 2007
## 1 Adelie Torgersen 39.5 ... 3800 female 2007
## 2 Adelie Torgersen 40.3 ... 3250 female 2007
## 3 Adelie Torgersen 36.7 ... 3450 female 2007
## 4 Adelie Torgersen 39.3 ... 3650 male 2007
## .. ... ... ... ... ... ... ...
## 209 Chinstrap Dream 55.8 ... 4000 male 2009
## 210 Chinstrap Dream 43.5 ... 3400 female 2009
## 211 Chinstrap Dream 49.6 ... 3775 male 2009
## 212 Chinstrap Dream 50.8 ... 4100 male 2009
## 213 Chinstrap Dream 50.2 ... 3775 female 2009
##
## [214 rows x 8 columns]
##
##
## Using query()
data.query('(species == "Adelie") & (bill_depth_mm > 21)')
##
##
##
## species island bill_length_mm ... body_mass_g sex year
## 8 Adelie Torgersen 38.6 ... 3800 male 2007
## 9 Adelie Torgersen 34.6 ... 4400 male 2007
## 14 Adelie Torgersen 46.0 ... 4200 male 2007
## 30 Adelie Dream 39.2 ... 4150 male 2007
## 43 Adelie Dream 42.3 ... 4150 male 2007
## 55 Adelie Biscoe 41.3 ... 4400 male 2008
##
## [6 rows x 8 columns]
data.query('species == @substypes')
## species island bill_length_mm ... body_mass_g sex year
## 0 Adelie Torgersen 39.1 ... 3750 male 2007
## 1 Adelie Torgersen 39.5 ... 3800 female 2007
## 2 Adelie Torgersen 40.3 ... 3250 female 2007
## 3 Adelie Torgersen 36.7 ... 3450 female 2007
## 4 Adelie Torgersen 39.3 ... 3650 male 2007
## .. ... ... ... ... ... ... ...
## 209 Chinstrap Dream 55.8 ... 4000 male 2009
## 210 Chinstrap Dream 43.5 ... 3400 female 2009
## 211 Chinstrap Dream 49.6 ... 3775 male 2009
## 212 Chinstrap Dream 50.8 ... 4100 male 2009
## 213 Chinstrap Dream 50.2 ... 3775 female 2009
##
## [214 rows x 8 columns]
data["body_mass_kg"] = data$body_mass_g/1000
## dplyr solution
data %>% mutate(body_mass_kg = body_mass_g/1000)
## # A tibble: 214 x 9
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen 36.7 19.3 193 3450
## 5 Adelie Torgersen 39.3 20.6 190 3650
## 6 Adelie Torgersen 38.9 17.8 181 3625
## 7 Adelie Torgersen 39.2 19.6 195 4675
## 8 Adelie Torgersen 41.1 17.6 182 3200
## 9 Adelie Torgersen 38.6 21.2 191 3800
## 10 Adelie Torgersen 34.6 21.1 198 4400
## # … with 204 more rows, and 3 more variables: sex <chr>, year <dbl>,
## # body_mass_kg <dbl>
data["body_mass_kg"] = data.body_mass_g/1000
data.assign(body_mass_kg = lambda x: x.body_mass_g/1000)
## species island bill_length_mm ... sex year body_mass_kg
## 0 Adelie Torgersen 39.1 ... male 2007 3.750
## 1 Adelie Torgersen 39.5 ... female 2007 3.800
## 2 Adelie Torgersen 40.3 ... female 2007 3.250
## 3 Adelie Torgersen 36.7 ... female 2007 3.450
## 4 Adelie Torgersen 39.3 ... male 2007 3.650
## .. ... ... ... ... ... ... ...
## 209 Chinstrap Dream 55.8 ... male 2009 4.000
## 210 Chinstrap Dream 43.5 ... female 2009 3.400
## 211 Chinstrap Dream 49.6 ... male 2009 3.775
## 212 Chinstrap Dream 50.8 ... male 2009 4.100
## 213 Chinstrap Dream 50.2 ... female 2009 3.775
##
## [214 rows x 9 columns]
head(data)
## # A tibble: 6 x 9
## species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Adelie Torge… 39.1 18.7 181 3750 male
## 2 Adelie Torge… 39.5 17.4 186 3800 fema…
## 3 Adelie Torge… 40.3 18 195 3250 fema…
## 4 Adelie Torge… 36.7 19.3 193 3450 fema…
## 5 Adelie Torge… 39.3 20.6 190 3650 male
## 6 Adelie Torge… 38.9 17.8 181 3625 fema…
## # … with 2 more variables: year <dbl>, body_mass_kg <dbl>
summary(data)
## species island bill_length_mm bill_depth_mm
## Length:214 Length:214 Min. :32.1 Min. :15.50
## Class :character Class :character 1st Qu.:37.8 1st Qu.:17.50
## Mode :character Mode :character Median :40.6 Median :18.40
## Mean :42.0 Mean :18.37
## 3rd Qu.:46.0 3rd Qu.:19.10
## Max. :58.0 Max. :21.50
## flipper_length_mm body_mass_g sex year
## Min. :172.0 Min. :2700 Length:214 Min. :2007
## 1st Qu.:187.0 1st Qu.:3400 Class :character 1st Qu.:2007
## Median :191.0 Median :3700 Mode :character Median :2008
## Mean :191.9 Mean :3715 Mean :2008
## 3rd Qu.:196.0 3rd Qu.:3994 3rd Qu.:2009
## Max. :212.0 Max. :4800 Max. :2009
## body_mass_kg
## Min. :2.700
## 1st Qu.:3.400
## Median :3.700
## Mean :3.715
## 3rd Qu.:3.994
## Max. :4.800
glimpse(data)
## Rows: 214
## Columns: 9
## $ species <chr> "Adelie", "Adelie", "Adelie", "Adelie", "Adelie", "A…
## $ island <chr> "Torgersen", "Torgersen", "Torgersen", "Torgersen", …
## $ bill_length_mm <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6…
## $ bill_depth_mm <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2…
## $ flipper_length_mm <dbl> 181, 186, 195, 193, 190, 181, 195, 182, 191, 198, 18…
## $ body_mass_g <dbl> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800…
## $ sex <chr> "male", "female", "female", "female", "male", "femal…
## $ year <dbl> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
## $ body_mass_kg <dbl> 3.750, 3.800, 3.250, 3.450, 3.650, 3.625, 4.675, 3.2…
dim(data)
## [1] 214 9
unique(data$species)
## [1] "Adelie" "Chinstrap"
##
table(data$species)
##
## Adelie Chinstrap
## 146 68
data.head()
##
##
##
##
##
##
## species island bill_length_mm ... sex year body_mass_kg
## 0 Adelie Torgersen 39.1 ... male 2007 3.75
## 1 Adelie Torgersen 39.5 ... female 2007 3.80
## 2 Adelie Torgersen 40.3 ... female 2007 3.25
## 3 Adelie Torgersen 36.7 ... female 2007 3.45
## 4 Adelie Torgersen 39.3 ... male 2007 3.65
##
## [5 rows x 9 columns]
data.describe()
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
## bill_length_mm bill_depth_mm ... year body_mass_kg
## count 214.000000 214.000000 ... 214.000000 214.000000
## mean 42.004673 18.370561 ... 2008.028037 3.714720
## std 5.491545 1.191134 ... 0.827440 0.435667
## min 32.100000 15.500000 ... 2007.000000 2.700000
## 25% 37.800000 17.500000 ... 2007.000000 3.400000
## 50% 40.600000 18.400000 ... 2008.000000 3.700000
## 75% 46.000000 19.100000 ... 2009.000000 3.993750
## max 58.000000 21.500000 ... 2009.000000 4.800000
##
## [8 rows x 6 columns]
data.info()
##
##
##
##
##
##
##
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 214 entries, 0 to 213
## Data columns (total 9 columns):
## # Column Non-Null Count Dtype
## --- ------ -------------- -----
## 0 species 214 non-null object
## 1 island 214 non-null object
## 2 bill_length_mm 214 non-null float64
## 3 bill_depth_mm 214 non-null float64
## 4 flipper_length_mm 214 non-null int64
## 5 body_mass_g 214 non-null int64
## 6 sex 214 non-null object
## 7 year 214 non-null int64
## 8 body_mass_kg 214 non-null float64
## dtypes: float64(3), int64(3), object(3)
## memory usage: 15.2+ KB
data.shape
## (214, 9)
data.species.unique()
## array(['Adelie', 'Chinstrap'], dtype=object)
data.species.value_counts()
## Adelie 146
## Chinstrap 68
## Name: species, dtype: int64
The dplyr
package (a part of the tidyverse
) contains important functions that makes the manipulation of data.frame very easy. A very important part of the dplyr package is the use of the pipe operator, %>%
. Together with the use of spaces and line breaks, this often makes the R
code more readable, so that one can read the code from top to bottom and left to right, i.e. consistent with how we would structure an English sentence. This design framework to make readable codes is a very important reason of why R
has gained a renewed popularity in data science.
In comparison, Python
(pandas
) also adopts similar principle, but the structuring of the codes is not as consistent.
library(dplyr)
data %>%
group_by(species) %>%
tally()
## # A tibble: 2 x 2
## species n
## <chr> <int>
## 1 Adelie 146
## 2 Chinstrap 68
data %>%
group_by(species) %>%
summarise(mean_bill_length = mean(bill_length_mm))
## # A tibble: 2 x 2
## species mean_bill_length
## <chr> <dbl>
## 1 Adelie 38.8
## 2 Chinstrap 48.8
data %>%
group_by(species) %>%
summarise_if(.predicate = is.numeric, .funs = mean)
## # A tibble: 2 x 7
## species bill_length_mm bill_depth_mm flipper_length_mm body_mass_g year
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie 38.8 18.3 190. 3706. 2008.
## 2 Chinstrap 48.8 18.4 196. 3733. 2008.
## # … with 1 more variable: body_mass_kg <dbl>
data %>%
group_by(species) %>%
summarise(mean_bill_length_top1 = bill_length_mm[1])
## # A tibble: 2 x 2
## species mean_bill_length_top1
## <chr> <dbl>
## 1 Adelie 39.1
## 2 Chinstrap 46.5
data %>%
group_by(species, sex) %>%
summarise(longest = bill_length_mm[bill_length_mm == max(bill_length_mm)])
## # A tibble: 4 x 3
## # Groups: species [2]
## species sex longest
## <chr> <chr> <dbl>
## 1 Adelie female 42.2
## 2 Adelie male 46
## 3 Chinstrap female 58
## 4 Chinstrap male 55.8
data %>%
group_by(species) %>%
summarise_at(.vars = vars(bill_length_mm),
.funs = lst(min, max, mean, length))
## # A tibble: 2 x 5
## species min max mean length
## <chr> <dbl> <dbl> <dbl> <int>
## 1 Adelie 32.1 46 38.8 146
## 2 Chinstrap 40.9 58 48.8 68
data.groupby("species").species.count()
##
##
##
## species
## Adelie 146
## Chinstrap 68
## Name: species, dtype: int64
data.groupby("species")["bill_length_mm"].mean()
##
##
##
## species
## Adelie 38.823973
## Chinstrap 48.833824
## Name: bill_length_mm, dtype: float64
data.groupby("species").mean()
##
##
##
## bill_length_mm bill_depth_mm ... year body_mass_kg
## species ...
## Adelie 38.823973 18.347260 ... 2008.054795 3.706164
## Chinstrap 48.833824 18.420588 ... 2007.970588 3.733088
##
## [2 rows x 6 columns]
data.groupby("species").apply(lambda x: x.bill_length_mm.iloc[0])
##
##
##
## species
## Adelie 39.1
## Chinstrap 46.5
## dtype: float64
data.groupby(["species", "sex"]).apply(lambda x: x.loc[x.bill_length_mm.idxmax(), "bill_length_mm"])
##
##
##
## species sex
## Adelie female 42.2
## male 46.0
## Chinstrap female 58.0
## male 55.8
## dtype: float64
data.groupby("species").bill_length_mm.agg([min, max, "mean", "size"])
## min max mean size
## species
## Adelie 32.1 46.0 38.823973 146
## Chinstrap 40.9 58.0 48.833824 68
library(dplyr)
missing_data = readr::read_csv("data/penguins.csv")
missing_data %>%
filter(is.na(bill_length_mm))
## # A tibble: 2 x 8
## species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Adelie Torge… NA NA NA NA <NA>
## 2 Gentoo Biscoe NA NA NA NA <NA>
## # … with 1 more variable: year <dbl>
missing_data %>%
dplyr::mutate(bill_length_mm = coalesce(bill_length_mm, 0))
## # A tibble: 344 x 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen 0 NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # … with 334 more rows, and 2 more variables: sex <chr>, year <dbl>
missing_data = pd.read_csv("data/penguins.csv")
##
##
##
##
##
missing_data[pd.isnull(missing_data.bill_length_mm)]
## species island bill_length_mm ... body_mass_g sex year
## 3 Adelie Torgersen NaN ... NaN NaN 2007
## 271 Gentoo Biscoe NaN ... NaN NaN 2009
##
## [2 rows x 8 columns]
missing_data.bill_length_mm.fillna(0)
## 0 39.1
## 1 39.5
## 2 40.3
## 3 0.0
## 4 36.7
## ...
## 339 55.8
## 340 43.5
## 341 49.6
## 342 50.8
## 343 50.2
## Name: bill_length_mm, Length: 344, dtype: float64