I have two dataframes (x & y) where the IDs are student_name, father_name and mother_name. Because of typographical errors (\"n\"
This takes a list of common column names, matches based on agrep of all those columns combined, and then if all.x or all.y equals TRUE it appends non-matching records filling in missing columns with NA. Unlike merge, the column names to match on need to be the same in each data frame. The challenge would seem to be setting the agrep options correctly to avoid spurious matches.
agrepMerge <- function(df1, df2, by, all.x = FALSE, all.y = FALSE,
ignore.case = FALSE, value = FALSE, max.distance = 0.1, useBytes = FALSE) {
df1$index <- apply(df1[,by, drop = FALSE], 1, paste, sep = "", collapse = "")
df2$index <- apply(df2[,by, drop = FALSE], 1, paste, sep = "", collapse = "")
matches <- lapply(seq_along(df1$index), function(i, ...) {
agrep(df1$index[i], df2$index, ignore.case = ignore.case, value = value,
max.distance = max.distance, useBytes = useBytes)
})
df1_match <- rep(1:nrow(df1), sapply(matches, length))
df2_match <- unlist(matches)
df1_hits <- df1[df1_match,]
df2_hits <- df2[df2_match,]
df1_miss <- df1[setdiff(seq_along(df1$index), df1_match),]
df2_miss <- df2[setdiff(seq_along(df2$index), df2_match),]
remove_cols <- colnames(df2_hits) %in% colnames(df1_hits)
df_out <- cbind(df1_hits, df2_hits[,!remove_cols])
if(all.x) {
missing_cols <- setdiff(colnames(df_out), colnames(df1_miss))
df1_miss[missing_cols] <- NA
df_out <- rbind(df_out, df1_miss)
}
if(all.x) {
missing_cols <- setdiff(colnames(df_out), colnames(df2_miss))
df2_miss[missing_cols] <- NA
df_out <- rbind(df_out, df2_miss)
}
df_out[,setdiff(colnames(df_out), "index")]
}