Merging Datasets - adding rows

This is not as common, but sometimes you want to “stack” datasets by adding rows - perhaps merging multiple survey datasets from different groups or over time. So, you are adding participants responses by adding rows to your dataset.

Typically you will want EVERY COLUMN to match - number of columns and column names must match. But there are ways to add new rows even if there are more/different columns - suppose you added a question in a new survey but still want to merge all the other question responses.

library(palmerpenguins)

allpenguins <- penguins
table(penguins$species)
Adelie Chinstrap Gentoo
152 68 124
adeliepenguins <- penguins %>%
  filter(species == "Adelie")

gentoopenguins <- penguins %>%
  filter(species == "Gentoo")

head(adeliepenguins)
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
Adelie Torgersen 39.1 18.7 181 3750 male 2007
Adelie Torgersen 39.5 17.4 186 3800 female 2007
Adelie Torgersen 40.3 18.0 195 3250 female 2007
Adelie Torgersen NA NA NA NA NA 2007
Adelie Torgersen 36.7 19.3 193 3450 female 2007
Adelie Torgersen 39.3 20.6 190 3650 male 2007
head(gentoopenguins)
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
Gentoo Biscoe 46.1 13.2 211 4500 female 2007
Gentoo Biscoe 50.0 16.3 230 5700 male 2007
Gentoo Biscoe 48.7 14.1 210 4450 female 2007
Gentoo Biscoe 50.0 15.2 218 5700 male 2007
Gentoo Biscoe 47.6 14.5 215 5400 male 2007
Gentoo Biscoe 46.5 13.5 210 4550 female 2007
# merge adelie and gentoo
# same number of columns
# same names of columns
# use rbind() from base R

adelie_gentoo <- rbind(adeliepenguins,
                       gentoopenguins)

adelie_rearrange <- adeliepenguins %>%
  relocate(year, .before = species)

adeliere_gentoo <- rbind(adelie_rearrange,
                         gentoopenguins)

# suppose we add a new column to adelie

adeliemod <- adeliepenguins %>%
  mutate(bill_length_cm = bill_length_mm/10)

# let's try this again with
# added column in adeliemod
adeliemod_gentoo <- rbind(adeliemod,
                          gentoopenguins)
## Error in rbind(deparse.level, ...): numbers of columns of arguments do not match
# we get an error - now what...
# use alternate approach with
# bind_rows() from dplyr

adelie_gentoo2 <- 
  dplyr::bind_rows(adeliepenguins,
                   gentoopenguins)

# now the extra column is added and NAs
# are automatically added for the 
# missing data in the gentoopenguins dataset
# for the bill_length_cm column in
# adeliemod modified dataset

adeliemod_gentoo2 <- 
  dplyr::bind_rows(adeliemod,
                   gentoopenguins)

# suppose some column names do not match
gentoomod <- gentoopenguins %>%
  rename(date = year)

names(adeliemod)
## [1] "species"           "island"            "bill_length_mm"   
## [4] "bill_depth_mm"     "flipper_length_mm" "body_mass_g"      
## [7] "sex"               "year"              "bill_length_cm"
names(gentoomod)
## [1] "species"           "island"            "bill_length_mm"   
## [4] "bill_depth_mm"     "flipper_length_mm" "body_mass_g"      
## [7] "sex"               "date"
adeliemod_gentoomod <- 
  dplyr::bind_rows(adeliemod,
                   gentoomod)

names(adeliemod_gentoomod)
##  [1] "species"           "island"            "bill_length_mm"   
##  [4] "bill_depth_mm"     "flipper_length_mm" "body_mass_g"      
##  [7] "sex"               "year"              "bill_length_cm"   
## [10] "date"
head(adeliemod_gentoomod)
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year bill_length_cm date
Adelie Torgersen 39.1 18.7 181 3750 male 2007 3.91 NA
Adelie Torgersen 39.5 17.4 186 3800 female 2007 3.95 NA
Adelie Torgersen 40.3 18.0 195 3250 female 2007 4.03 NA
Adelie Torgersen NA NA NA NA NA 2007 NA NA
Adelie Torgersen 36.7 19.3 193 3450 female 2007 3.67 NA
Adelie Torgersen 39.3 20.6 190 3650 male 2007 3.93 NA

So be sure you line up your column names FIRST.

Merging Datasets - adding columns

This is very common - you have two (or more) datasets for the subjects in your project. There could be survey data, lab data, EMR data, etc and you want to merge them together to build a complete patient record.

However, you will need to make decisions on who you want to keep in the final dataset - everyone? only people with all data elements? only people in the “main” dataset?

library(readr)
data1 <- read_csv("data1.csv")
## Rows: 10 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (3): id, age, bmi
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
data2 <- read_csv("data2.csv")
## Rows: 9 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): htn
## dbl (2): id, satis
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
data3 <- read_csv("data3.csv")
## Rows: 6 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): htn
## dbl (3): id, age, bmi
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
data1
id age bmi
1 34 25.4
2 45 NA
3 42 30.0
4 34 24.3
5 26 22.1
6 32 33.4
7 NA 28.9
8 50 26.5
9 44 28.9
10 35 31.2
data2
id satis htn
1 90 yes
3 60 no
4 75 no
6 88 yes
7 94 yes
8 45 NA
9 67 no
11 89 no
13 91 yes
data3
id age bmi htn
15 34 25.4 yes
16 45 NA no
18 42 30.0 no
19 34 24.3 yes
22 26 22.1 yes
23 32 33.4 NA

Keep only rows (people) in both datasets - inner join

both <- dplyr::inner_join(x = data1,
                          y = data2,
                          by = "id")
both
id age bmi satis htn
1 34 25.4 90 yes
3 42 30.0 60 no
4 34 24.3 75 no
6 32 33.4 88 yes
7 NA 28.9 94 yes
8 50 26.5 45 NA
9 44 28.9 67 no

Keep rows (people) in dataset 1 - left join

left <- dplyr::left_join(x = data1,
                         y = data2,
                         by = "id")
left
id age bmi satis htn
1 34 25.4 90 yes
2 45 NA NA NA
3 42 30.0 60 no
4 34 24.3 75 no
5 26 22.1 NA NA
6 32 33.4 88 yes
7 NA 28.9 94 yes
8 50 26.5 45 NA
9 44 28.9 67 no
10 35 31.2 NA NA

Keep rows (people) in dataset 2 - right join - or just reverse left join

right <- dplyr::right_join(x = data1,
                           y = data2,
                           by = "id")
right
id age bmi satis htn
1 34 25.4 90 yes
3 42 30.0 60 no
4 34 24.3 75 no
6 32 33.4 88 yes
7 NA 28.9 94 yes
8 50 26.5 45 NA
9 44 28.9 67 no
11 NA NA 89 no
13 NA NA 91 yes

Keep ALL rows (people) in either dataset - full join

full <- dplyr::full_join(x = data1,
                         y = data2,
                         by = "id")
full
id age bmi satis htn
1 34 25.4 90 yes
2 45 NA NA NA
3 42 30.0 60 no
4 34 24.3 75 no
5 26 22.1 NA NA
6 32 33.4 88 yes
7 NA 28.9 94 yes
8 50 26.5 45 NA
9 44 28.9 67 no
10 35 31.2 NA NA
11 NA NA 89 no
13 NA NA 91 yes

Merge datasets by columns - adding rows

# look at full and data3 datasets
# these have different IDs but some
# same variables in common
full
id age bmi satis htn
1 34 25.4 90 yes
2 45 NA NA NA
3 42 30.0 60 no
4 34 24.3 75 no
5 26 22.1 NA NA
6 32 33.4 88 yes
7 NA 28.9 94 yes
8 50 26.5 45 NA
9 44 28.9 67 no
10 35 31.2 NA NA
11 NA NA 89 no
13 NA NA 91 yes
data3
id age bmi htn
15 34 25.4 yes
16 45 NA no
18 42 30.0 no
19 34 24.3 yes
22 26 22.1 yes
23 32 33.4 NA
# let's "bind" rows - stack the datasets
# this will merge all columns that have matching names
# any columns that do not match are retained
# and NAs used to fill in spots that do not match
bind_rows(list(full, data3))
id age bmi satis htn
1 34 25.4 90 yes
2 45 NA NA NA
3 42 30.0 60 no
4 34 24.3 75 no
5 26 22.1 NA NA
6 32 33.4 88 yes
7 NA 28.9 94 yes
8 50 26.5 45 NA
9 44 28.9 67 no
10 35 31.2 NA NA
11 NA NA 89 no
13 NA NA 91 yes
15 34 25.4 NA yes
16 45 NA NA no
18 42 30.0 NA no
19 34 24.3 NA yes
22 26 22.1 NA yes
23 32 33.4 NA NA