问题
Let's say you have two data frames, both of which contain some, but not all of the same records. Where they are the same records, the id variable in both data frames matches. There is a particular variable in each data frame that needs to be checked for consistency across the data frames, and any discrepancies need to be printed:
d1 <- ## first dataframe
d2 <- ## second dataframe
colnames(d1) #column headings for dataframe 1
[1] "id" "variable1" "variable2" "variable3"
colnames(d2) #column headings for dataframe 2 are identical
[1] "id" "variable1" "variable2" "variable3"
length(d1$id) #there are 200 records in dataframe 1
[1] 200
length(d2$id) #there are not the same number in dataframe 2
[1] 150
##Some function that takes d1$id, matches with d2$id, then compares the values of the matched, returning any discrepancies
I constructed an elaborate loop for this, but feel as though this is not the right way of going about it. Surely there is some better way than this for-if-for-if-if statement.
for (i in seq(d1$id)){ ##Sets up counter for loop
if (d1$id[i] %in% d2$id){ ## Search, compares and saves a common id and variable
index <- d1$id[i];
variable_d1 <- d1$variable1[i];
for (p in seq(d2$id)){ set
if (d2$id[p] == index){ ## saves the corresponding value in the second dataframe
variable_d2 <- d2$variable1[p];
if (variable_d2 != variable_d1) { ## prints if they are not equal
print(index);
}
}
}
}
}
回答1:
Here's a solution, using random input data with a 50% chance that a given cell will be discrepant between d1
and d2
:
set.seed(1);
d1 <- data.frame(id=sample(300,200),variable1=sample(2,200,replace=T),variable2=sample(2,200,replace=T),variable3=sample(2,200,replace=T));
d2 <- data.frame(id=sample(300,150),variable1=sample(2,150,replace=T),variable2=sample(2,150,replace=T),variable3=sample(2,150,replace=T));
head(d1);
## id variable1 variable2 variable3
## 1 80 1 2 2
## 2 112 1 1 2
## 3 171 2 2 1
## 4 270 1 2 2
## 5 60 1 2 2
## 6 266 2 2 2
head(d2);
## id variable1 variable2 variable3
## 1 258 1 2 1
## 2 11 1 1 1
## 3 290 2 1 2
## 4 222 2 1 2
## 5 81 2 1 1
## 6 200 1 2 1
com <- intersect(d1$id,d2$id); ## derive common id values
d1com <- match(com,d1$id); ## find indexes of d1 that correspond to common id values, in order of com
d2com <- match(com,d2$id); ## find indexes of d2 that correspond to common id values, in order of com
v1diff <- com[d1$variable1[d1com]!=d2$variable1[d2com]]; ## get ids of variable1 discrepancies
v1diff;
## [1] 60 278 18 219 290 35 107 4 237 131 50 210 29 168 6 174 61 127 99 220 247 244 157 51 84 122 196 125 265 115 186 139 3 132 223 211 268 102 155 207 238 41 199 200 231 236 172 275 250 176 248 255 222 59 100 33 124
v2diff <- com[d1$variable2[d1com]!=d2$variable2[d2com]]; ## get ids of variable2 discrepancies
v2diff;
## [1] 112 60 18 198 219 290 131 50 210 29 168 258 215 291 127 161 99 220 110 293 87 164 84 122 196 125 186 139 81 132 82 89 223 268 98 14 155 241 207 231 172 62 275 176 248 255 59 298 100 12 156
v3diff <- com[d1$variable3[d1com]!=d2$variable3[d2com]]; ## get ids of variable3 discrepancies
v3diff;
## [1] 278 219 290 35 4 237 131 168 202 174 215 220 247 244 261 293 164 13 294 84 196 125 265 115 186 81 3 89 223 211 268 98 14 155 241 207 38 191 200 276 250 45 269 255 298 100 12 156 124
Here's a proof that all variable1
values for ids in v1diff
are really discrepant between d1
and d2
:
d1$variable1[match(v1diff,d1$id)]; d2$variable1[match(v1diff,d2$id)];
## [1] 1 2 2 1 1 2 2 1 1 1 2 2 2 2 1 2 2 1 2 2 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 2 2 2 1 2 2 1 1 2 1 1 2 1 2 1 2 2 1 2 2 1 1
## [1] 2 1 1 2 2 1 1 2 2 2 1 1 1 1 2 1 1 2 1 1 2 2 1 2 2 1 2 2 2 2 2 2 2 2 2 1 1 1 2 1 1 2 2 1 2 2 1 2 1 2 1 1 2 1 1 2 2
Here's a proof that all variable1
values for ids not in v1diff
are not discrepant between d1
and d2
:
with(subset(d1,id%in%com&!id%in%v1diff),variable1[order(id)]); with(subset(d2,id%in%com&!id%in%v1diff),variable1[order(id)]);
## [1] 1 1 2 1 1 1 2 2 1 2 2 1 2 2 1 1 2 1 2 1 2 1 1 1 1 1 1 2 2 2 2 1 1 1 2 2 2 1 1 1 1
## [1] 1 1 2 1 1 1 2 2 1 2 2 1 2 2 1 1 2 1 2 1 2 1 1 1 1 1 1 2 2 2 2 1 1 1 2 2 2 1 1 1 1
Here, I wrapped this solution in a function which returns the vectors of discrepant id values in a list, with each component named for the variable it represents:
compare <- function(d1,d2,cols=setdiff(intersect(colnames(d1),colnames(d2)),'id')) {
com <- intersect(d1$id,d2$id);
d1com <- match(com,d1$id);
d2com <- match(com,d2$id);
setNames(lapply(cols,function(col) com[d1[[col]][d1com]!=d2[[col]][d2com]]),cols);
};
compare(d1,d2);
## $variable1
## [1] 60 278 18 219 290 35 107 4 237 131 50 210 29 168 6 174 61 127 99 220 247 244 157 51 84 122 196 125 265 115 186 139 3 132 223 211 268 102 155 207 238 41 199 200 231 236 172 275 250 176 248 255 222 59 100 33 124
##
## $variable2
## [1] 112 60 18 198 219 290 131 50 210 29 168 258 215 291 127 161 99 220 110 293 87 164 84 122 196 125 186 139 81 132 82 89 223 268 98 14 155 241 207 231 172 62 275 176 248 255 59 298 100 12 156
##
## $variable3
## [1] 278 219 290 35 4 237 131 168 202 174 215 220 247 244 261 293 164 13 294 84 196 125 265 115 186 81 3 89 223 211 268 98 14 155 241 207 38 191 200 276 250 45 269 255 298 100 12 156 124
回答2:
Here is an approach using merge
.
First, merge the dataframes, keeping all columns.
x <- merge(d1, d1, by="id")
Then, find all rows which do not match:
x[x$variable1.x != x$variable1.y | x$variable2.x != x$variable2.y |
x$variable3.x != x$variable3.y, ]
来源:https://stackoverflow.com/questions/30607019/function-or-other-basic-script-that-compares-values-on-two-variables-in-a-datafr