Aggregation
library(magrittr)
set.seed(1)
ex_dat <- data.frame(group = rep(1:3, each = 2),
y = rnorm(6), x = rnorm(6))
ex_dat
## group y x
## 1 1 -0.6264538 0.4874291
## 2 1 0.1836433 0.7383247
## 3 2 -0.8356286 0.5757814
## 4 2 1.5952808 -0.3053884
## 5 3 0.3295078 1.5117812
## 6 3 -0.8204684 0.3898432
Summarise selected columns by group
dplyr::group_by(ex_dat, group) %>%
dplyr::summarise(total_y = sum(y))
## # A tibble: 3 x 2
## group total_y
## <int> <dbl>
## 1 1 -0.443
## 2 2 0.760
## 3 3 -0.491
Summarise all the columns by group
dplyr::group_by(ex_dat, group) %>%
dplyr::summarise_all(sum) # you can replace sum() by any function
## # A tibble: 3 x 3
## group y x
## <int> <dbl> <dbl>
## 1 1 -0.443 1.23
## 2 2 0.760 0.270
## 3 3 -0.491 1.90
Reshape
Reshaping data is often needed before ploting.
long_dat <- reshape2::melt(ex_dat, id.vars = "group")
long_dat
## group variable value
## 1 1 y -0.6264538
## 2 1 y 0.1836433
## 3 2 y -0.8356286
## 4 2 y 1.5952808
## 5 3 y 0.3295078
## 6 3 y -0.8204684
## 7 1 x 0.4874291
## 8 1 x 0.7383247
## 9 2 x 0.5757814
## 10 2 x -0.3053884
## 11 3 x 1.5117812
## 12 3 x 0.3898432
reshape2::dcast(long_dat, group ~ variable, mean)
## group y x
## 1 1 -0.2214052 0.6128769
## 2 2 0.3798261 0.1351965
## 3 3 -0.2454803 0.9508122
Complete a data frame
df <- tibble::tibble(
group = c(1:2, 1),
item_id = c(1:2, 2),
item_name = c("a", "b", "b"),
value1 = 1:3,
value2 = 4:6
)
df %>% tidyr::complete(group, tidyr::nesting(item_id, item_name))
## # A tibble: 4 x 5
## group item_id item_name value1 value2
## <dbl> <dbl> <chr> <int> <int>
## 1 1 1 a 1 4
## 2 1 2 b 3 6
## 3 2 1 a NA NA
## 4 2 2 b 2 5
df %>% tidyr::complete(group, item_id, item_name)
## # A tibble: 8 x 5
## group item_id item_name value1 value2
## <dbl> <dbl> <chr> <int> <int>
## 1 1 1 a 1 4
## 2 1 1 b NA NA
## 3 1 2 a NA NA
## 4 1 2 b 3 6
## 5 2 1 a NA NA
## 6 2 1 b NA NA
## 7 2 2 a NA NA
## 8 2 2 b 2 5
Extract subset of data
Extract variables
df %>% dplyr::select(group)
df %>% dplyr::select(tidyselect::starts_with("item_"))
df %>% dplyr::select(tidyselect::ends_with("_name"))
df %>% dplyr::select(tidyselect::num_range("value", 1:2))
df %>% dplyr::select(tidyselect::contains("item"))
df %>% dplyr::select(tidyselect::matches("value."))
Filter rows
df %>% dplyr::filter(group == 1)
df %>% dplyr::distinct(group, .keep_all = TRUE)
df %>% dplyr::slice(2:3) # select rows by position
df %>% dplyr::top_n(2, value1) # select top 2 entries in value1