Vectorization

Sampling Example

Lets consider a function to generate samples from a bivariate normal.


We want to sample x and y such that

\[ \left( \begin{array}{c} x\\y \end{array} \right) \sim N \left( \left( \begin{array}{c} \mu_x\\\mu_y \end{array} \right) , \left( \begin{array}{cc} \sigma_x^2 & \rho \sigma_x\sigma_y\\ \rho \sigma_x\sigma_y & \sigma_y^2 \end{array} \right) \right) \]


we can accomplish this by sampling x and y separately from


\[ \begin{aligned} x &\;\sim N(\mu_x, \sigma_x^2) \\ y\,|\,x &\;\sim N\left(\mu_y + \frac{\sigma_y}{\sigma_x}\rho(x-\mu_x), (1-\rho^2)\sigma_y^2\right) \end{aligned} \]

respectively.

Nonvectorized rbinorm

rbinorm = function(n, mu=c(0,0), sigma=c(1,1), rho=0)
{
  res = matrix(NA, ncol=2, nrow=n)
  colnames(res) = c("x","y")
  for(i in 1:n)
  {
    x = rnorm(1, mu[1], sigma[1])
    y = rnorm(1, mu[2] + (sigma[2]/sigma[1]) * rho * (x - mu[1]), sqrt(1-rho^2) * sigma[2])
    
    res[i,] = c(x,y)
  }

  return(res)
}

Vectorized rbinorm

rbinorm_vec = function(n, mu=c(0,0), sigma=c(1,1), rho=0)
{
  x = rnorm(n, mu[1], sigma[1])
  y = rnorm(n, mu[2] + (sigma[2]/sigma[1]) * rho * (x - mu[1]), sqrt(1-rho^2) * sigma[2])
    
  return(cbind(x,y))
}

Output

par(mfrow=c(1,2), mar=c(4,4,1,1))
plot(rbinorm(1e5,rho=0.7),pch=16,cex=0.2,col=adjustcolor("black",alpha=0.1))
plot(rbinorm_vec(1e5,rho=0.7),pch=16,cex=0.2,col=adjustcolor("black",alpha=0.1))

plot of chunk unnamed-chunk-4

Microbenchmark

library(microbenchmark)
print(microbenchmark(rbinorm(1e5,rho=0.7),rbinorm_vec(1e5,rho=0.7), times=10))
## Unit: milliseconds
##                           expr     min     lq    mean  median      uq     max neval
##      rbinorm(1e+05, rho = 0.7) 1626.22 1629.9 1658.19 1658.77 1684.33 1690.40    10
##  rbinorm_vec(1e+05, rho = 0.7)   26.38   26.5   27.19   26.97   27.92   28.35    10

Distance Vectorization

gcd = function(loc1, loc2)
{
  deg2rad = function(deg) return(deg*pi/180)

  y1 = deg2rad( loc1[2] )
  y2 = deg2rad( loc2[2] )
  x1 = deg2rad( loc1[1] )
  x2 = deg2rad( loc2[1] )

  R = 6371 # Earth mean radius in km
  
  d = sin(y1)*sin(y2) + cos(y1)*cos(y2) * cos(x2-x1)
  if(d > 1 | d < -1) d = 0
  else d = acos(d) * R
  
  return(d) # distance in km
}

GCD - Non-vectorized

gcds = function(l1, l2=l1)
{
  d = matrix(NA, nrow=nrow(l1), ncol=nrow(l2))
  
  for(i in 1:nrow(l1))
  {
    for(j in 1:nrow(l2))
    {
      d[i,j] = gcd(l1[i,],l2[j,])
    }
  }

  return(d)
}

GCD - Vectorized

gcd_vec = function(loc1, loc2)
{
  loc1 = matrix(loc1*pi/180,ncol=2)
  loc2 = matrix(loc2*pi/180,ncol=2)

  R = 6371 # Earth mean radius in km

  d = sin(loc1[,2])*sin(loc2[,2]) + cos(loc1[,2])*cos(loc2[,2]) * cos(loc2[,1]-loc1[,1])  
  s = d > 1 | d < -1
  d[s] = 0
  d[!s] = acos(d[!s]) * R
  
  return(d) # distance in km
}

gcds_vec = function(l1, l2=l1)
{
  return( apply(l2, 1, function(x) gcd_vec(l1,x)) )
}

Performance

n = 200
locs = cbind(x = runif(n,-180,180), y = runif(n,-90,90))

microbenchmark(gcds(locs),
               gcds_vec(locs),
               times=10)
## Unit: milliseconds
##            expr     min      lq    mean  median      uq     max neval
##      gcds(locs) 1007.14 1046.46 1106.32 1104.08 1175.90 1215.68    10
##  gcds_vec(locs)   28.91   30.39   31.19   30.63   32.02   35.22    10
sum(abs(gcds(locs)-gcds_vec(locs)))
## [1] 0

Joins

Data

Outer Join

Inner Join

Left Join

Right Join

Data - R

addr
##    name             email
## 1 Alice alice@company.com
## 2   Bob   bob@company.com
## 3 Carol carol@company.com
## 4  Dave  dave@company.com
## 5   Eve   eve@company.com
phone
##    name        phone
## 1   Bob 919 555-1111
## 2 Carol 919 555-2222
## 3   Eve 919 555-3333
## 4   Eve 310 555-3333
## 5 Frank 919 555-4444

Outer Join - R

Base R:

merge(addr, phone, all=TRUE)
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  Dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333
## 7 Frank              <NA> 919 555-4444

dplyr:

Not implemented, due in 0.3.1.







Inner Join - R

Base R:

merge(addr, phone, all=FALSE)
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333


dplyr:

inner_join(addr,phone)
## Joining by: "name"
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333

Left Join - R

Base R:

merge(addr, phone, all.x=TRUE)
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  Dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333


dplyr:

left_join(addr,phone)
## Joining by: "name"
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  Dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333

Right Join - R

Base R:

merge(addr, phone, all.y=TRUE)
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333
## 5 Frank              <NA> 919 555-4444

dplyr:

Not implemented, due in 0.3.1.







Semi and Anti Joins - R

semi_join(addr, phone)
## Joining by: "name"
##    name             email
## 1   Bob   bob@company.com
## 2 Carol carol@company.com
## 3   Eve   eve@company.com
anti_join(addr, phone)
## Joining by: "name"
##    name             email
## 1  Dave  dave@company.com
## 2 Alice alice@company.com

Acknowledgments

Acknowledgments