# Vectorization

## Sampling Example

Lets consider a function to generate samples from a bivariate normal.

We want to sample x and y such that

$$\left( \begin{array}{c} x\\y \end{array} \right) \sim N \left( \left( \begin{array}{c} \mu_x\\\mu_y \end{array} \right) , \left( \begin{array}{cc} \sigma_x^2 & \rho \sigma_x\sigma_y\\ \rho \sigma_x\sigma_y & \sigma_y^2 \end{array} \right) \right)$$

we can accomplish this by sampling x and y separately from

\begin{aligned} x &\;\sim N(\mu_x, \sigma_x^2) \\ y\,|\,x &\;\sim N\left(\mu_y + \frac{\sigma_y}{\sigma_x}\rho(x-\mu_x), (1-\rho^2)\sigma_y^2\right) \end{aligned}

respectively.

## Nonvectorized rbinorm

rbinorm = function(n, mu=c(0,0), sigma=c(1,1), rho=0)
{
res = matrix(NA, ncol=2, nrow=n)
colnames(res) = c("x","y")
for(i in 1:n)
{
x = rnorm(1, mu[1], sigma[1])
y = rnorm(1, mu[2] + (sigma[2]/sigma[1]) * rho * (x - mu[1]), sqrt(1-rho^2) * sigma[2])

res[i,] = c(x,y)
}

return(res)
}

## Vectorized rbinorm

rbinorm_vec = function(n, mu=c(0,0), sigma=c(1,1), rho=0)
{
x = rnorm(n, mu[1], sigma[1])
y = rnorm(n, mu[2] + (sigma[2]/sigma[1]) * rho * (x - mu[1]), sqrt(1-rho^2) * sigma[2])

return(cbind(x,y))
}

## Output

par(mfrow=c(1,2), mar=c(4,4,1,1))
plot(rbinorm_vec(1e5,rho=0.7),pch=16,cex=0.2,col=adjustcolor("black",alpha=0.1))

## Microbenchmark

library(microbenchmark)
print(microbenchmark(rbinorm(1e5,rho=0.7),rbinorm_vec(1e5,rho=0.7), times=10))
## Unit: milliseconds
##                           expr     min      lq    mean  median      uq     max neval
##      rbinorm(1e+05, rho = 0.7) 1582.28 1586.79 1622.61 1620.34 1641.42 1728.11    10
##  rbinorm_vec(1e+05, rho = 0.7)   26.52   26.97   27.66   27.93   28.06   28.69    10

## Distance Vectorization

gcd = function(loc1, loc2)
{

R = 6371 # Earth mean radius in km

d = sin(y1)*sin(y2) + cos(y1)*cos(y2) * cos(x2-x1)
if(d > 1 | d < -1) d = 0
else d = acos(d) * R

return(d) # distance in km
}

## GCD - Non-vectorized

gcds = function(l1, l2=l1)
{
d = matrix(NA, nrow=nrow(l1), ncol=nrow(l2))

for(i in 1:nrow(l1))
{
for(j in 1:nrow(l2))
{
d[i,j] = gcd(l1[i,],l2[j,])
}
}

return(d)
}

## GCD - Vectorized

gcd_vec = function(loc1, loc2)
{
loc1 = matrix(loc1*pi/180,ncol=2)
loc2 = matrix(loc2*pi/180,ncol=2)

R = 6371 # Earth mean radius in km

d = sin(loc1[,2])*sin(loc2[,2]) + cos(loc1[,2])*cos(loc2[,2]) * cos(loc2[,1]-loc1[,1])
s = d > 1 | d < -1
d[s] = 0
d[!s] = acos(d[!s]) * R

return(d) # distance in km
}

gcds_vec = function(l1, l2=l1)
{
return( apply(l2, 1, function(x) gcd_vec(l1,x)) )
}

## Performance

n = 200
locs = cbind(x = runif(n,-180,180), y = runif(n,-90,90))

microbenchmark(gcds(locs),
gcds_vec(locs),
times=10)
## Unit: milliseconds
##            expr  min      lq    mean  median      uq     max neval
##      gcds(locs) 1010 1020.81 1028.48 1021.19 1024.54 1064.78    10
##  gcds_vec(locs)   27   27.09   27.48   27.41   27.53   28.81    10
sum(abs(gcds(locs)-gcds_vec(locs)))
## [1] 0

# Joins

## Data

Name Email
Alice alice@company.com
Bob bob@company.com
Carol carol@company.com
Dave dave@company.com
Eve eve@company.com
Name Phone
Bob 919 555-1111
Carol 919 555-2222
Eve 919 555-3333
Eve 310 555-3333
Frank 919 555-4444

## Outer Join

Outer join of email with phone by name

name email phone
Alice alice@company.com NULL
Bob bob@company.com 919 555-1111
Carol carol@company.com 919 555-2222
Dave dave@company.com NULL
Eve eve@company.com 919 555-3333
Eve eve@company.com 310 555-3333
Frank NULL 919 555-4444

## Inner Join

Inner join of email with phone by name

Name Email Phone
Bob bob@company.com 919 555-1111
Carol carol@company.com 919 555-2222
Eve eve@company.com 919 555-3333
Eve eve@company.com 310 555-3333

## Left Join

Left join of email with phone by name

name email phone
Alice alice@company.com NULL
Bob bob@company.com 919 555-1111
Carol carol@company.com 919 555-2222
Dave dave@company.com NULL
Eve eve@company.com 919 555-3333
Eve eve@company.com 310 555-3333

## Right Join

Right join of email with phone by name

name email phone
Bob bob@company.com 919 555-1111
Carol carol@company.com 919 555-2222
Eve eve@company.com 919 555-3333
Eve eve@company.com 310 555-3333
Frank NULL 919 555-4444

## Data - R

##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
##     filter
##
## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union
addr
##    name             email
## 1 Alice alice@company.com
## 2   Bob   bob@company.com
## 3 Carol carol@company.com
## 4  Dave  dave@company.com
## 5   Eve   eve@company.com
phone
##    name        phone
## 1   Bob 919 555-1111
## 2 Carol 919 555-2222
## 3   Eve 919 555-3333
## 4   Eve 310 555-3333
## 5 Frank 919 555-4444

## Outer Join - R

Base R:

merge(addr, phone, all=TRUE)
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  Dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333
## 7 Frank              <NA> 919 555-4444

dplyr:

Not implemented, due in 0.3.1.

## Inner Join - R

Base R:

merge(addr, phone, all=FALSE)
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333

dplyr:

inner_join(addr,phone)
## Joining by: "name"
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333

## Left Join - R

Base R:

merge(addr, phone, all.x=TRUE)
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  Dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333

dplyr:

left_join(addr,phone)
## Joining by: "name"
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  Dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333

## Right Join - R

Base R:

merge(addr, phone, all.y=TRUE)
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333
## 5 Frank              <NA> 919 555-4444

dplyr:

Not implemented, due in 0.3.1.

## Semi and Anti Joins - R

semi_join(addr, phone)
## Joining by: "name"
##    name             email
## 1   Bob   bob@company.com
## 2 Carol carol@company.com
## 3   Eve   eve@company.com
anti_join(addr, phone)
## Joining by: "name"
##    name             email
## 1  Dave  dave@company.com
## 2 Alice alice@company.com

# Acknowledgments

## Acknowledgments

Above materials are derived in part from the following sources: