Merging Datasets - adding rows
This is not as common, but sometimes you want to “stack” datasets by adding rows - perhaps merging multiple survey datasets from different groups or over time. So, you are adding participants responses by adding rows to your dataset.
Typically you will want EVERY COLUMN to match - number of columns and column names must match. But there are ways to add new rows even if there are more/different columns - suppose you added a question in a new survey but still want to merge all the other question responses.
library(palmerpenguins)
allpenguins <- penguins
table(penguins$species)
adeliepenguins <- penguins %>%
filter(species == "Adelie")
gentoopenguins <- penguins %>%
filter(species == "Gentoo")
head(adeliepenguins)
Adelie |
Torgersen |
39.1 |
18.7 |
181 |
3750 |
male |
2007 |
Adelie |
Torgersen |
39.5 |
17.4 |
186 |
3800 |
female |
2007 |
Adelie |
Torgersen |
40.3 |
18.0 |
195 |
3250 |
female |
2007 |
Adelie |
Torgersen |
NA |
NA |
NA |
NA |
NA |
2007 |
Adelie |
Torgersen |
36.7 |
19.3 |
193 |
3450 |
female |
2007 |
Adelie |
Torgersen |
39.3 |
20.6 |
190 |
3650 |
male |
2007 |
head(gentoopenguins)
Gentoo |
Biscoe |
46.1 |
13.2 |
211 |
4500 |
female |
2007 |
Gentoo |
Biscoe |
50.0 |
16.3 |
230 |
5700 |
male |
2007 |
Gentoo |
Biscoe |
48.7 |
14.1 |
210 |
4450 |
female |
2007 |
Gentoo |
Biscoe |
50.0 |
15.2 |
218 |
5700 |
male |
2007 |
Gentoo |
Biscoe |
47.6 |
14.5 |
215 |
5400 |
male |
2007 |
Gentoo |
Biscoe |
46.5 |
13.5 |
210 |
4550 |
female |
2007 |
# merge adelie and gentoo
# same number of columns
# same names of columns
# use rbind() from base R
adelie_gentoo <- rbind(adeliepenguins,
gentoopenguins)
adelie_rearrange <- adeliepenguins %>%
relocate(year, .before = species)
adeliere_gentoo <- rbind(adelie_rearrange,
gentoopenguins)
# suppose we add a new column to adelie
adeliemod <- adeliepenguins %>%
mutate(bill_length_cm = bill_length_mm/10)
# let's try this again with
# added column in adeliemod
adeliemod_gentoo <- rbind(adeliemod,
gentoopenguins)
## Error in rbind(deparse.level, ...): numbers of columns of arguments do not match
# we get an error - now what...
# use alternate approach with
# bind_rows() from dplyr
adelie_gentoo2 <-
dplyr::bind_rows(adeliepenguins,
gentoopenguins)
# now the extra column is added and NAs
# are automatically added for the
# missing data in the gentoopenguins dataset
# for the bill_length_cm column in
# adeliemod modified dataset
adeliemod_gentoo2 <-
dplyr::bind_rows(adeliemod,
gentoopenguins)
# suppose some column names do not match
gentoomod <- gentoopenguins %>%
rename(date = year)
names(adeliemod)
## [1] "species" "island" "bill_length_mm"
## [4] "bill_depth_mm" "flipper_length_mm" "body_mass_g"
## [7] "sex" "year" "bill_length_cm"
names(gentoomod)
## [1] "species" "island" "bill_length_mm"
## [4] "bill_depth_mm" "flipper_length_mm" "body_mass_g"
## [7] "sex" "date"
adeliemod_gentoomod <-
dplyr::bind_rows(adeliemod,
gentoomod)
names(adeliemod_gentoomod)
## [1] "species" "island" "bill_length_mm"
## [4] "bill_depth_mm" "flipper_length_mm" "body_mass_g"
## [7] "sex" "year" "bill_length_cm"
## [10] "date"
head(adeliemod_gentoomod)
Adelie |
Torgersen |
39.1 |
18.7 |
181 |
3750 |
male |
2007 |
3.91 |
NA |
Adelie |
Torgersen |
39.5 |
17.4 |
186 |
3800 |
female |
2007 |
3.95 |
NA |
Adelie |
Torgersen |
40.3 |
18.0 |
195 |
3250 |
female |
2007 |
4.03 |
NA |
Adelie |
Torgersen |
NA |
NA |
NA |
NA |
NA |
2007 |
NA |
NA |
Adelie |
Torgersen |
36.7 |
19.3 |
193 |
3450 |
female |
2007 |
3.67 |
NA |
Adelie |
Torgersen |
39.3 |
20.6 |
190 |
3650 |
male |
2007 |
3.93 |
NA |
So be sure you line up your column names FIRST.
Merging Datasets - adding columns
This is very common - you have two (or more) datasets for the subjects in your project. There could be survey data, lab data, EMR data, etc and you want to merge them together to build a complete patient record.
However, you will need to make decisions on who you want to keep in the final dataset - everyone? only people with all data elements? only people in the “main” dataset?
library(readr)
data1 <- read_csv("data1.csv")
## Rows: 10 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (3): id, age, bmi
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
data2 <- read_csv("data2.csv")
## Rows: 9 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): htn
## dbl (2): id, satis
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
data3 <- read_csv("data3.csv")
## Rows: 6 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): htn
## dbl (3): id, age, bmi
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
data1
1 |
34 |
25.4 |
2 |
45 |
NA |
3 |
42 |
30.0 |
4 |
34 |
24.3 |
5 |
26 |
22.1 |
6 |
32 |
33.4 |
7 |
NA |
28.9 |
8 |
50 |
26.5 |
9 |
44 |
28.9 |
10 |
35 |
31.2 |
data2
1 |
90 |
yes |
3 |
60 |
no |
4 |
75 |
no |
6 |
88 |
yes |
7 |
94 |
yes |
8 |
45 |
NA |
9 |
67 |
no |
11 |
89 |
no |
13 |
91 |
yes |
data3
15 |
34 |
25.4 |
yes |
16 |
45 |
NA |
no |
18 |
42 |
30.0 |
no |
19 |
34 |
24.3 |
yes |
22 |
26 |
22.1 |
yes |
23 |
32 |
33.4 |
NA |
Keep only rows (people) in both datasets - inner join
both <- dplyr::inner_join(x = data1,
y = data2,
by = "id")
both
1 |
34 |
25.4 |
90 |
yes |
3 |
42 |
30.0 |
60 |
no |
4 |
34 |
24.3 |
75 |
no |
6 |
32 |
33.4 |
88 |
yes |
7 |
NA |
28.9 |
94 |
yes |
8 |
50 |
26.5 |
45 |
NA |
9 |
44 |
28.9 |
67 |
no |
Keep rows (people) in dataset 1 - left join
left <- dplyr::left_join(x = data1,
y = data2,
by = "id")
left
1 |
34 |
25.4 |
90 |
yes |
2 |
45 |
NA |
NA |
NA |
3 |
42 |
30.0 |
60 |
no |
4 |
34 |
24.3 |
75 |
no |
5 |
26 |
22.1 |
NA |
NA |
6 |
32 |
33.4 |
88 |
yes |
7 |
NA |
28.9 |
94 |
yes |
8 |
50 |
26.5 |
45 |
NA |
9 |
44 |
28.9 |
67 |
no |
10 |
35 |
31.2 |
NA |
NA |
Keep rows (people) in dataset 2 - right join - or just reverse left join
right <- dplyr::right_join(x = data1,
y = data2,
by = "id")
right
1 |
34 |
25.4 |
90 |
yes |
3 |
42 |
30.0 |
60 |
no |
4 |
34 |
24.3 |
75 |
no |
6 |
32 |
33.4 |
88 |
yes |
7 |
NA |
28.9 |
94 |
yes |
8 |
50 |
26.5 |
45 |
NA |
9 |
44 |
28.9 |
67 |
no |
11 |
NA |
NA |
89 |
no |
13 |
NA |
NA |
91 |
yes |
Keep ALL rows (people) in either dataset - full join
full <- dplyr::full_join(x = data1,
y = data2,
by = "id")
full
1 |
34 |
25.4 |
90 |
yes |
2 |
45 |
NA |
NA |
NA |
3 |
42 |
30.0 |
60 |
no |
4 |
34 |
24.3 |
75 |
no |
5 |
26 |
22.1 |
NA |
NA |
6 |
32 |
33.4 |
88 |
yes |
7 |
NA |
28.9 |
94 |
yes |
8 |
50 |
26.5 |
45 |
NA |
9 |
44 |
28.9 |
67 |
no |
10 |
35 |
31.2 |
NA |
NA |
11 |
NA |
NA |
89 |
no |
13 |
NA |
NA |
91 |
yes |
Merge datasets by columns - adding rows
# look at full and data3 datasets
# these have different IDs but some
# same variables in common
full
1 |
34 |
25.4 |
90 |
yes |
2 |
45 |
NA |
NA |
NA |
3 |
42 |
30.0 |
60 |
no |
4 |
34 |
24.3 |
75 |
no |
5 |
26 |
22.1 |
NA |
NA |
6 |
32 |
33.4 |
88 |
yes |
7 |
NA |
28.9 |
94 |
yes |
8 |
50 |
26.5 |
45 |
NA |
9 |
44 |
28.9 |
67 |
no |
10 |
35 |
31.2 |
NA |
NA |
11 |
NA |
NA |
89 |
no |
13 |
NA |
NA |
91 |
yes |
data3
15 |
34 |
25.4 |
yes |
16 |
45 |
NA |
no |
18 |
42 |
30.0 |
no |
19 |
34 |
24.3 |
yes |
22 |
26 |
22.1 |
yes |
23 |
32 |
33.4 |
NA |
# let's "bind" rows - stack the datasets
# this will merge all columns that have matching names
# any columns that do not match are retained
# and NAs used to fill in spots that do not match
bind_rows(list(full, data3))
1 |
34 |
25.4 |
90 |
yes |
2 |
45 |
NA |
NA |
NA |
3 |
42 |
30.0 |
60 |
no |
4 |
34 |
24.3 |
75 |
no |
5 |
26 |
22.1 |
NA |
NA |
6 |
32 |
33.4 |
88 |
yes |
7 |
NA |
28.9 |
94 |
yes |
8 |
50 |
26.5 |
45 |
NA |
9 |
44 |
28.9 |
67 |
no |
10 |
35 |
31.2 |
NA |
NA |
11 |
NA |
NA |
89 |
no |
13 |
NA |
NA |
91 |
yes |
15 |
34 |
25.4 |
NA |
yes |
16 |
45 |
NA |
NA |
no |
18 |
42 |
30.0 |
NA |
no |
19 |
34 |
24.3 |
NA |
yes |
22 |
26 |
22.1 |
NA |
yes |
23 |
32 |
33.4 |
NA |
NA |