Rebecca C. Steorts
August 30, 2016
Let's investigate what record linkage data sets look like in practice.
## load packages
library(RecordLinkage)
data("RLdata500")
## drop columns 2 and 4 of RLdata500 and keep the remaining fields
myDat <- RLdata500[, c(-2, -4)]
dim(myDat)
[1] 500 5
## peek at the first six records
head(myDat, n = 6L)
fname_c1 lname_c1 by bm bd
1 CARSTEN MEIER 1949 7 22
2 GERD BAUER 1968 7 27
3 ROBERT HARTMANN 1930 4 30
4 STEFAN WOLFF 1957 9 2
5 RALF KRUEGER 1966 1 13
6 JUERGEN FRANKE 1929 7 4
## unique ids: one id per record (identity.RLdata500 is loaded alongside
## RLdata500); records sharing an id are treated as the same cluster below
uniqueID <- identity.RLdata500
head(uniqueID, n = 6L)
[1] 34 51 115 189 72 142
## inner table: records per id; outer table: how many ids have each count
clusterSize <- table(table(uniqueID))
clusterSize
1 2
400 50
## clusterSize counts how many clusters have each size, so the bar heights
## are numbers of clusters (the cluster size itself is on the x axis)
barplot(clusterSize, xlab = "cluster size", ylab = "number of clusters",
        col = "blue", width = c(2, 2))
## read the two SHIW files; the first line of each file gives the column names
fDat08 <- read.table(file = "../../datasets/SHIW/v1_08_r6.txt",
                     header = TRUE)
fDat10 <- read.table(file = "../../datasets/SHIW/v2_10_r6.txt",
                     header = TRUE)
dim(fDat08)
[1] 434 6
## rows and columns of the second file
c(nrow(fDat10), ncol(fDat10))
[1] 355 6
## Stack the two files row-wise into a single data set
fDat <- rbind(fDat08, fDat10)
head(fDat, n = 6L)
id SEX ANASC STUDIO QUAL SETT
1 2160221 1 1941 5 6 5
2 222511 1 1928 3 6 5
3 222621 1 1941 5 6 5
4 222631 2 1931 3 6 5
5 222632 1 1960 4 1 2
6 222661 1 1926 3 6 5
## Split the id (first column) off from the features: funiqueID keeps the ids,
## fDat keeps the remaining columns only.
funiqueID <- fDat[["id"]]
fDat <- fDat[, -1]
## how many ids occur once, twice, ...
fclusterSize <- table(table(funiqueID))
fclusterSize
1 2
385 202
## fclusterSize counts how many clusters have each size, so the bar heights
## are numbers of clusters (the cluster size itself is on the x axis)
barplot(fclusterSize, xlab = "cluster size", ylab = "number of clusters",
        col = "red", width = c(2, 2))
# Let's just work with the first 10 records
ital <- fDat[seq_len(10), ]
# fDat no longer contains the id column (it was moved into funiqueID), so the
# ids of these records have to come from funiqueID, not from ital[, 1].
ital.id <- funiqueID[seq_len(10)]
# Let's initialize the variables we need: one entry per unordered pair (i, j)
n.rec <- nrow(ital)
true.link <- exact.match <- near.twin <- rep(FALSE, choose(n.rec, 2))
# k is the running index of the pair (i, j), i < j; indexing the result
# vectors by k (rather than by i) gives every pair its own slot.
k <- 0
for (i in seq_len(n.rec - 1)) {
  for (j in (i + 1):n.rec) {
    k <- k + 1
    # number of feature columns on which the two records disagree
    n.diff <- sum(ital[i, ] != ital[j, ])
    true.link[k] <- ital.id[i] == ital.id[j]  # same id => same entity
    exact.match[k] <- n.diff == 0             # agree on every feature
    near.twin[k] <- n.diff <= 1               # disagree on at most one
  }
}
# NOTE(review): the error rates printed after this loop in the rendered output
# were computed with per-record (not per-pair) indexing and with column 1 of
# ital used as the id; they need to be regenerated by re-running the script.
# false negative rate of exact matching: share of true links it misses
exact.match.fnr <- sum(true.link & !exact.match) / sum(true.link)
exact.match.fnr
[1] 0.875
# false discovery rate of exact matching: share of its matches that are not
# true links
exact.match.fdr <- sum(exact.match & !true.link) / sum(exact.match)
exact.match.fdr
[1] 0
# false negative rate of near-twin matching: share of true links it misses
near.twin.fnr <- sum(true.link & !near.twin) / sum(true.link)
near.twin.fnr
[1] 0.25
# false discovery rate of near-twin matching: share of its matches that are
# not true links
near.twin.fdr <- sum(near.twin & !true.link) / sum(near.twin)
near.twin.fdr
[1] 0.1428571