Data manipulation in R

2019-08-07 data-manipulation

Aggregation

library(magrittr)
# Fix the RNG seed so the simulated values below are reproducible.
set.seed(1)

# Example data: three groups with two observations each and two
# standard-normal measurement columns (y is drawn before x).
ex_dat <- data.frame(
  group = rep(1:3, each = 2),
  y = rnorm(n = 6),
  x = rnorm(n = 6)
)
ex_dat

##   group          y          x
## 1     1 -0.6264538  0.4874291
## 2     1  0.1836433  0.7383247
## 3     2 -0.8356286  0.5757814
## 4     2  1.5952808 -0.3053884
## 5     3  0.3295078  1.5117812
## 6     3 -0.8204684  0.3898432

Summarise selected columns by group

# Sum y within each group; the result has one row per group.
ex_dat %>%
  dplyr::group_by(group) %>%
  dplyr::summarise(total_y = sum(y))

## # A tibble: 3 x 2
##   group total_y
##   <int>   <dbl>
## 1     1  -0.443
## 2     2   0.760
## 3     3  -0.491

Summarise all the columns by group

# Apply one summary function to every non-grouping column.
# across() replaces the superseded summarise_all(); it needs dplyr >= 1.0.0.
ex_dat %>%
  dplyr::group_by(group) %>%
  dplyr::summarise(
    # you can replace sum() by any function
    dplyr::across(dplyr::everything(), sum)
  )

## # A tibble: 3 x 3
##   group      y     x
##   <int>  <dbl> <dbl>
## 1     1 -0.443 1.23 
## 2     2  0.760 0.270
## 3     3 -0.491 1.90

Reshape

Reshaping data is often needed before plotting.

# Wide -> long: keep group as the identifier and stack the remaining
# columns into variable/value pairs (the default output column names).
long_dat <- ex_dat %>%
  reshape2::melt(
    id.vars = "group",
    variable.name = "variable",
    value.name = "value"
  )
long_dat

##    group variable      value
## 1      1        y -0.6264538
## 2      1        y  0.1836433
## 3      2        y -0.8356286
## 4      2        y  1.5952808
## 5      3        y  0.3295078
## 6      3        y -0.8204684
## 7      1        x  0.4874291
## 8      1        x  0.7383247
## 9      2        x  0.5757814
## 10     2        x -0.3053884
## 11     3        x  1.5117812
## 12     3        x  0.3898432

# Long -> wide: one row per group, one column per variable, averaging the
# values that fall into the same group/variable cell.
reshape2::dcast(
  long_dat,
  group ~ variable,
  fun.aggregate = mean,
  value.var = "value"
)

##   group          y         x
## 1     1 -0.2214052 0.6128769
## 2     2  0.3798261 0.1351965
## 3     3 -0.2454803 0.9508122

Complete a data frame

# Small example with a missing (group, item) combination: group 2 has no
# row for item 1.
df <- tibble::tibble(
  group     = c(1, 2, 1),
  item_id   = c(1, 2, 2),
  item_name = c("a", "b", "b"),
  value1    = 1:3,
  value2    = 4:6
)

# Cross group with the (item_id, item_name) pairs that occur in the data;
# nesting() stops item_id and item_name from being crossed with each other.
tidyr::complete(df, group, tidyr::nesting(item_id, item_name))

## # A tibble: 4 x 5
##   group item_id item_name value1 value2
##   <dbl>   <dbl> <chr>      <int>  <int>
## 1     1       1 a              1      4
## 2     1       2 b              3      6
## 3     2       1 a             NA     NA
## 4     2       2 b              2      5

# Without nesting(), every combination of group, item_id and item_name is
# generated, including item_id/item_name pairs that never occur together.
tidyr::complete(df, group, item_id, item_name)

## # A tibble: 8 x 5
##   group item_id item_name value1 value2
##   <dbl>   <dbl> <chr>      <int>  <int>
## 1     1       1 a              1      4
## 2     1       1 b             NA     NA
## 3     1       2 a             NA     NA
## 4     1       2 b              3      6
## 5     2       1 a             NA     NA
## 6     2       1 b             NA     NA
## 7     2       2 a             NA     NA
## 8     2       2 b              2      5

Extract a subset of the data

Extract variables

# A single column by name.
dplyr::select(df, group)
# Columns whose names begin with "item_".
dplyr::select(df, tidyselect::starts_with("item_"))
# Columns whose names end with "_name".
dplyr::select(df, tidyselect::ends_with("_name"))
# Columns named by a prefix plus a number from the given range.
dplyr::select(df, tidyselect::num_range("value", 1:2))
# Columns whose names contain the literal string "item".
dplyr::select(df, tidyselect::contains("item"))
# Columns whose names match a regular expression.
dplyr::select(df, tidyselect::matches("value."))

Filter rows

# Keep only the rows where group equals 1.
df %>% dplyr::filter(group == 1)
# Keep the first row for each distinct value of group, with all columns.
df %>% dplyr::distinct(group, .keep_all = TRUE)
df %>% dplyr::slice(2:3)  # select rows by position
# Select the 2 rows with the largest value1. slice_max() replaces the
# superseded top_n() (needs dplyr >= 1.0.0); ties are kept, and rows are
# returned in descending order of value1.
df %>% dplyr::slice_max(value1, n = 2)

data r

Tingting Yu

Developer, Data Scientist

My research interests include time-series analysis, longitudinal analysis, image analysis …