Vectorization and Joins


Vectorization

Sampling Example

Lets consider a function to generate samples from a bivariate normal.


We want to sample x and y such that


$$ \left( \begin{array}{c} x\\y \end{array} \right) \sim N \left( \left( \begin{array}{c} \mu_x\\\mu_y \end{array} \right) , \left( \begin{array}{cc} \sigma_x^2 & \rho \sigma_x\sigma_y\\ \rho \sigma_x\sigma_y & \sigma_y^2 \end{array} \right) \right) $$


we can accomplish this by sampling x and y separately from



$$ \begin{aligned} x &\;\sim N(\mu_x, \sigma_x^2) \\ y\,|\,x &\;\sim N\left(\mu_y + \frac{\sigma_y}{\sigma_x}\rho(x-\mu_x), (1-\rho^2)\sigma_y^2\right) \end{aligned} $$

respectively.

Nonvectorized rbinorm

rbinorm = function(n, mu=c(0,0), sigma=c(1,1), rho=0)
{
  res = matrix(NA, ncol=2, nrow=n)
  colnames(res) = c("x","y")
  for(i in 1:n)
  {
    x = rnorm(1, mu[1], sigma[1])
    y = rnorm(1, mu[2] + (sigma[2]/sigma[1]) * rho * (x - mu[1]), sqrt(1-rho^2) * sigma[2])
    
    res[i,] = c(x,y)
  }

  return(res)
}

Vectorized rbinorm

rbinorm_vec = function(n, mu=c(0,0), sigma=c(1,1), rho=0)
{
  x = rnorm(n, mu[1], sigma[1])
  y = rnorm(n, mu[2] + (sigma[2]/sigma[1]) * rho * (x - mu[1]), sqrt(1-rho^2) * sigma[2])
    
  return(cbind(x,y))
}

Output

par(mfrow=c(1,2), mar=c(4,4,1,1))
plot(rbinorm(1e5,rho=0.7),pch=16,cex=0.2,col=adjustcolor("black",alpha=0.1))
plot(rbinorm_vec(1e5,rho=0.7),pch=16,cex=0.2,col=adjustcolor("black",alpha=0.1))

plot of chunk unnamed-chunk-4

Microbenchmark

library(microbenchmark)
print(microbenchmark(rbinorm(1e5,rho=0.7),rbinorm_vec(1e5,rho=0.7), times=10))
## Unit: milliseconds
##                           expr     min      lq    mean  median      uq     max neval
##      rbinorm(1e+05, rho = 0.7) 1582.28 1586.79 1622.61 1620.34 1641.42 1728.11    10
##  rbinorm_vec(1e+05, rho = 0.7)   26.52   26.97   27.66   27.93   28.06   28.69    10

Distance Vectorization

gcd = function(loc1, loc2)
{
  deg2rad = function(deg) return(deg*pi/180)

  y1 = deg2rad( loc1[2] )
  y2 = deg2rad( loc2[2] )
  x1 = deg2rad( loc1[1] )
  x2 = deg2rad( loc2[1] )

  R = 6371 # Earth mean radius in km
  
  d = sin(y1)*sin(y2) + cos(y1)*cos(y2) * cos(x2-x1)
  if(d > 1 | d < -1) d = 0
  else d = acos(d) * R
  
  return(d) # distance in km
}

GCD - Non-vectorized

gcds = function(l1, l2=l1)
{
  d = matrix(NA, nrow=nrow(l1), ncol=nrow(l2))
  
  for(i in 1:nrow(l1))
  {
    for(j in 1:nrow(l2))
    {
      d[i,j] = gcd(l1[i,],l2[j,])
    }
  }

  return(d)
}

GCD - Vectorized

gcd_vec = function(loc1, loc2)
{
  loc1 = matrix(loc1*pi/180,ncol=2)
  loc2 = matrix(loc2*pi/180,ncol=2)

  R = 6371 # Earth mean radius in km

  d = sin(loc1[,2])*sin(loc2[,2]) + cos(loc1[,2])*cos(loc2[,2]) * cos(loc2[,1]-loc1[,1])  
  s = d > 1 | d < -1
  d[s] = 0
  d[!s] = acos(d[!s]) * R
  
  return(d) # distance in km
}

gcds_vec = function(l1, l2=l1)
{
  return( apply(l2, 1, function(x) gcd_vec(l1,x)) )
}

Performance

n = 200
locs = cbind(x = runif(n,-180,180), y = runif(n,-90,90))

microbenchmark(gcds(locs),
               gcds_vec(locs),
               times=10)
## Unit: milliseconds
##            expr  min      lq    mean  median      uq     max neval
##      gcds(locs) 1010 1020.81 1028.48 1021.19 1024.54 1064.78    10
##  gcds_vec(locs)   27   27.09   27.48   27.41   27.53   28.81    10
sum(abs(gcds(locs)-gcds_vec(locs)))
## [1] 0

Joins

Data

Name Email
Alice alice@company.com
Bob bob@company.com
Carol carol@company.com
Dave dave@company.com
Eve eve@company.com
Name Phone
Bob 919 555-1111
Carol 919 555-2222
Eve 919 555-3333
Eve 310 555-3333
Frank 919 555-4444

Outer Join

Outer join of email with phone by name

name email phone
Alice alice@company.com NULL
Bob bob@company.com 919 555-1111
Carol carol@company.com 919 555-2222
Dave dave@company.com NULL
Eve eve@company.com 919 555-3333
Eve eve@company.com 310 555-3333
Frank NULL 919 555-4444

Inner Join

Inner join of email with phone by name

Name Email Phone
Bob bob@company.com 919 555-1111
Carol carol@company.com 919 555-2222
Eve eve@company.com 919 555-3333
Eve eve@company.com 310 555-3333

Left Join

Left join of email with phone by name

name email phone
Alice alice@company.com NULL
Bob bob@company.com 919 555-1111
Carol carol@company.com 919 555-2222
Dave dave@company.com NULL
Eve eve@company.com 919 555-3333
Eve eve@company.com 310 555-3333

Right Join

Right join of email with phone by name

name email phone
Bob bob@company.com 919 555-1111
Carol carol@company.com 919 555-2222
Eve eve@company.com 919 555-3333
Eve eve@company.com 310 555-3333
Frank NULL 919 555-4444

Data - R

## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
addr
##    name             email
## 1 Alice alice@company.com
## 2   Bob   bob@company.com
## 3 Carol carol@company.com
## 4  Dave  dave@company.com
## 5   Eve   eve@company.com
phone
##    name        phone
## 1   Bob 919 555-1111
## 2 Carol 919 555-2222
## 3   Eve 919 555-3333
## 4   Eve 310 555-3333
## 5 Frank 919 555-4444

Outer Join - R

Base R:

merge(addr, phone, all=TRUE)
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  Dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333
## 7 Frank              <NA> 919 555-4444

dplyr:

Not implemented, due in 0.3.1.







Inner Join - R

Base R:

merge(addr, phone, all=FALSE)
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333


dplyr:

inner_join(addr,phone)
## Joining by: "name"
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333

Left Join - R

Base R:

merge(addr, phone, all.x=TRUE)
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  Dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333


dplyr:

left_join(addr,phone)
## Joining by: "name"
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  Dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333

Right Join - R

Base R:

merge(addr, phone, all.y=TRUE)
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333
## 5 Frank              <NA> 919 555-4444

dplyr:

Not implemented, due in 0.3.1.







Semi and Anti Joins - R

semi_join(addr, phone)
## Joining by: "name"
##    name             email
## 1   Bob   bob@company.com
## 2 Carol carol@company.com
## 3   Eve   eve@company.com
anti_join(addr, phone)
## Joining by: "name"
##    name             email
## 1  Dave  dave@company.com
## 2 Alice alice@company.com

Acknowledgments

Acknowledgments

Above materials are derived in part from the following sources: