Rcpp + Thrust

class: center, middle, inverse, title-slide

# Rcpp + Thrust
### Colin Rundel
### 2019-03-26

---

exclude: true

```r
knitr::opts_chunk$set(
  cache = FALSE, 
  fig.align = "center", 
  fig.width = 10, 
  out.width = "90%",
  dpi = 150
)
```

```r
eng_nvcc = function(options) {
  file = paste0(options$label,".cu")
  exec = sub("\\.cu$", "", file)
  writeLines(options$code, file)
  
  on.exit(unlink(c(file, exec)), add = TRUE)
  
  
  bin = options$engine.opts$bin
  flags = options$engine.opts$flags
  
  if (is.null(flags))
    flags = ""
  
  if (is.null(bin))
    bin = "/usr/local/cuda/bin"
  
  nvcc = file.path(bin,"nvcc")
  
  nvcc_cmd = paste(nvcc, flags, file, "-o", exec)
  exec_cmd = paste0("./", exec)
  
  if (options$eval) {
    nvcc_out = system(paste(nvcc_cmd, "2>&1"), intern = TRUE)
    exec_out = system(paste(exec_cmd, "2>&1"), intern = TRUE)
    
    out = rmarkdown:::one_string(c(
      paste(">", nvcc_cmd),
      nvcc_out,
      "",
      paste(">", exec_cmd),
      exec_out
    ))
  } else {
    out = ''
  }
  
  options$engine = "cpp"
  
  knitr::engine_output(options, options$code, out)
}

knitr::knit_engines$set(nvcc = eng_nvcc)
```

---
class: middle

# C++ Classes / Structs

---

## A Simple Example

.small[
```cpp
template<typename T>
struct rect {
T x1, x2, y1, y2;

rect() 
: x1(0), x2(1), y1(0), y2(1)
{ }

rect(T _x1, T _x2, T _y1, T _y2)
: x1(_x1), x2(_x2), y1(_y1), y2(_y2)
{ }

rect(rect<T> const& r) {
x1 = r.x1;
x2 = r.x2;
y1 = r.y1;
y2 = r.y2;
}

T get_width() {
return x2 - x1;
}

T get_height() {
return y2 - y1;
}

T get_area() {
return get_height() * get_width();
}
};
```
]

---

## Functors

In the STL sometimes we have to use a function which can only take a single argument (e.g. when using `std::transform`) but we still want to write generic code that has additional parameters. We can accomplish this using a functor, which is just a struct that acts like a function by overloading the `()` operator.

.smaller[ .columns[
.col[

```cpp
#include <Rcpp.h>

double scale_3_5(double val) {
  return 3*val + 5;
}

// [[Rcpp::export]]
Rcpp::NumericVector scale_fixed(Rcpp::NumericVector v) {
  std::transform(
    v.begin(), v.end(), 
    v.begin(), 
    scale_3_5
  );
  
  return v;
}
```

```r
scale_fixed(1:5)
```

```
## [1]  8 11 14 17 20
```
]

.col[

```cpp
#include <Rcpp.h>

struct scale {
  double a,b;
  scale(double _a, double _b) : a(_a), b(_b) { }
  
  double operator()(double& x) const {
    return a*x + b;
  }
};

// [[Rcpp::export]]
Rcpp::NumericVector scale_functor(Rcpp::NumericVector v, 
                                  double a, double b) {
  std::transform(
    v.begin(), v.end(), 
    v.begin(), 
    scale(a,b)
  );
  
  return v;
}
```

```r
scale_functor(1:5, 3, 5)
```

```
## [1]  8 11 14 17 20
```

```r
scale_functor(1:5, 5, 3)
```

```
## [1]  8 13 18 23 28
```
]
] ]

---
class: middle

# CUDA & Thrust

---

## Checking CUDA - deviceQuery

.smaller[

```bash
/usr/local/cuda/samples/1_Utilities/deviceQuery/deviceQuery
```

```
## /usr/local/cuda/samples/1_Utilities/deviceQuery/deviceQuery Starting...
## 
##  CUDA Device Query (Runtime API) version (CUDART static linking)
## 
## Detected 2 CUDA Capable device(s)
## 
## Device 0: "Tesla P100-PCIE-16GB"
##   CUDA Driver Version / Runtime Version          10.0 / 10.0
##   CUDA Capability Major/Minor version number:    6.0
##   Total amount of global memory:                 16281 MBytes (17071734784 bytes)
##   (56) Multiprocessors, ( 64) CUDA Cores/MP:     3584 CUDA Cores
##   GPU Max Clock rate:                            1329 MHz (1.33 GHz)
##   Memory Clock rate:                             715 Mhz
##   Memory Bus Width:                              4096-bit
##   L2 Cache Size:                                 4194304 bytes
##   Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
##   Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
##   Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
##   Total amount of constant memory:               65536 bytes
##   Total amount of shared memory per block:       49152 bytes
##   Total number of registers available per block: 65536
##   Warp size:                                     32
##   Maximum number of threads per multiprocessor:  2048
##   Maximum number of threads per block:           1024
##   Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
##   Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
##   Maximum memory pitch:                          2147483647 bytes
##   Texture alignment:                             512 bytes
##   Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
##   Run time limit on kernels:                     No
##   Integrated GPU sharing Host Memory:            No
##   Support host page-locked memory mapping:       Yes
##   Alignment requirement for Surfaces:            Yes
##   Device has ECC support:                        Enabled
##   Device supports Unified Addressing (UVA):      Yes
##   Device supports Compute Preemption:            Yes
##   Supports Cooperative Kernel Launch:            Yes
##   Supports MultiDevice Co-op Kernel Launch:      Yes
##   Device PCI Domain ID / Bus ID / location ID:   0 / 2 / 0
##   Compute Mode:
##      < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
## 
## Device 1: "Tesla P100-PCIE-16GB"
##   CUDA Driver Version / Runtime Version          10.0 / 10.0
##   CUDA Capability Major/Minor version number:    6.0
##   Total amount of global memory:                 16281 MBytes (17071734784 bytes)
##   (56) Multiprocessors, ( 64) CUDA Cores/MP:     3584 CUDA Cores
##   GPU Max Clock rate:                            1329 MHz (1.33 GHz)
##   Memory Clock rate:                             715 Mhz
##   Memory Bus Width:                              4096-bit
##   L2 Cache Size:                                 4194304 bytes
##   Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
##   Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
##   Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
##   Total amount of constant memory:               65536 bytes
##   Total amount of shared memory per block:       49152 bytes
##   Total number of registers available per block: 65536
##   Warp size:                                     32
##   Maximum number of threads per multiprocessor:  2048
##   Maximum number of threads per block:           1024
##   Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
##   Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
##   Maximum memory pitch:                          2147483647 bytes
##   Texture alignment:                             512 bytes
##   Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
##   Run time limit on kernels:                     No
##   Integrated GPU sharing Host Memory:            No
##   Support host page-locked memory mapping:       Yes
##   Alignment requirement for Surfaces:            Yes
##   Device has ECC support:                        Enabled
##   Device supports Unified Addressing (UVA):      Yes
##   Device supports Compute Preemption:            Yes
##   Supports Cooperative Kernel Launch:            Yes
##   Supports MultiDevice Co-op Kernel Launch:      Yes
##   Device PCI Domain ID / Bus ID / location ID:   0 / 3 / 0
##   Compute Mode:
##      < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
## > Peer access from Tesla P100-PCIE-16GB (GPU0) -> Tesla P100-PCIE-16GB (GPU1) : Yes
## > Peer access from Tesla P100-PCIE-16GB (GPU1) -> Tesla P100-PCIE-16GB (GPU0) : Yes
## 
## deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 10.0, CUDA Runtime Version = 10.0, NumDevs = 2
## Result = PASS
```
]

---

## Checking CUDA - nvidia-smi

.small[

```bash
nvidia-smi
```

```
## Tue Mar 26 10:06:12 2019       
## +-----------------------------------------------------------------------------+
## | NVIDIA-SMI 410.104      Driver Version: 410.104      CUDA Version: 10.0     |
## |-------------------------------+----------------------+----------------------+
## | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
## | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
## |===============================+======================+======================|
## |   0  Tesla P100-PCIE...  Off  | 00000000:02:00.0 Off |                    0 |
## | N/A   37C    P0    31W / 250W |    751MiB / 16280MiB |      0%      Default |
## +-------------------------------+----------------------+----------------------+
## |   1  Tesla P100-PCIE...  Off  | 00000000:03:00.0 Off |                    0 |
## | N/A   38C    P0    30W / 250W |     10MiB / 16280MiB |      0%      Default |
## +-------------------------------+----------------------+----------------------+
##                                                                                
## +-----------------------------------------------------------------------------+
## | Processes:                                                       GPU Memory |
## |  GPU       PID   Type   Process name                             Usage      |
## |=============================================================================|
## |    0    218630      C   /usr/lib/rstudio-server/bin/rsession         741MiB |
## +-----------------------------------------------------------------------------+
```
]

---

## Rcpp Plugins

.small[

```r
Rcpp:::.plugins
```

```
## <environment: 0x55f696af7be8>
```

```r
ls(envir=Rcpp:::.plugins)
```

```
##  [1] "cpp0x"         "cpp11"         "cpp14"         "cpp17"         "cpp1y"         "cpp1z"        
##  [7] "cpp2a"         "cpp98"         "openmp"        "unwindProtect"
```

```r
Rcpp:::.plugins$openmp
```

```
## function () 
## {
##     list(env = list(PKG_CXXFLAGS = "-fopenmp", PKG_LIBS = "-fopenmp"))
## }
## <bytecode: 0x55f696af90f8>
## <environment: namespace:Rcpp>
```

```r
Rcpp:::.plugins$cpp11
```

```
## function () 
## {
##     if (getRversion() >= "3.4") 
##         list(env = list(USE_CXX11 = "yes"))
##     else if (getRversion() >= "3.1") 
##         list(env = list(USE_CXX1X = "yes"))
##     else if (.Platform$OS.type == "windows") 
##         list(env = list(PKG_CXXFLAGS = "-std=c++0x"))
##     else list(env = list(PKG_CXXFLAGS = "-std=c++11"))
## }
## <bytecode: 0x55f696af72f0>
## <environment: namespace:Rcpp>
```
]

---

## Rcpp Thrust Plugin

```r
thrust = function() {
  list(
    env = list(
      MAKEFLAGS = paste(
        "CXX=/usr/local/cuda/bin/nvcc",
        "CXXFLAGS=-x\\ cu\\ -g\\ -G\\ -O3 --std=c++11",
        "CXXPICFLAGS=-Xcompiler\\ -fpic\\ -Xcudafe\\ --diag_suppress=code_is_unreachable",
        "LDFLAGS="
      ),
      PKG_CXXFLAGS = paste0("-I", here::here())
    )
  )
}
Rcpp::registerPlugin("thrust", thrust)
```

---

## Host Vector

.small[

```cpp
// [[Rcpp::plugins(thrust)]]
#include <Rcpp.h>
#include <thrust/device_vector.h>

// [[Rcpp::export]]
void host_vec(Rcpp::NumericVector v) {
  thrust::host_vector<double> H(v.begin(), v.end());
  
  // H.size() returns the size of vector H
  Rcpp::Rcout << "H has size " << H.size() << std::endl;
  
  // print contents of H
  for(int i = 0; i < H.size(); i++)
    Rcpp::Rcout << "H[" << i << "] = " << H[i] << std::endl;
}
```

```r
host_vec(1:5)
```

```
## H has size 5
## H[0] = 1
## H[1] = 2
## H[2] = 3
## H[3] = 4
## H[4] = 5
```
]

---

## Device Vector

.small[

```cpp
// [[Rcpp::plugins(thrust)]]
#include <Rcpp.h>
#include <thrust/device_vector.h>

// [[Rcpp::export]]
void dev_vec(Rcpp::NumericVector v) {
  thrust::device_vector<double> D(v.begin(), v.end());
  
  // H.size() returns the size of vector H
  Rcpp::Rcout << "D has size " << D.size() << std::endl;
  
  // print contents of H
  for(int i = 0; i < D.size(); i++)
    Rcpp::Rcout << "D[" << i << "] = " << D[i] << std::endl;
}
```

```r
dev_vec(1:5)
```

```
## D has size 5
## D[0] = 1
## D[1] = 2
## D[2] = 3
## D[3] = 4
## D[4] = 5
```
]

---

## GPU Sort

.small[

```cpp
// [[Rcpp::plugins(thrust)]]
#include <Rcpp.h>
#include <thrust/device_vector.h>

// [[Rcpp::export]]
void std_sort(int n) {
  Rcpp::NumericVector r = Rcpp::runif(n);
  Rcpp::NumericVector v(r.begin(), r.end());
  std::sort(v.begin(), v.end());
}

// [[Rcpp::export]]
void thrust_host_sort(int n) {
  Rcpp::NumericVector r = Rcpp::runif(n);
  thrust::host_vector<double> H(r.begin(), r.end());
  thrust::sort(H.begin(), H.end());
}

// [[Rcpp::export]]
void thrust_gpu_sort(int n) {
  Rcpp::NumericVector r = Rcpp::runif(n);
  thrust::device_vector<double> D(r.begin(), r.end());
  thrust::sort(D.begin(), D.end());
}
```
]

---

```
## Warning: Some expressions had a GC in every iteration; so filtering is disabled.
```

---

## GPU Sort - sort timing

.small[

<img src="Lec13_files/figure-html/unnamed-chunk-17-1.png" width="90%" style="display: block; margin: auto;" />
]

---

## GPU Vectorized exp

.small[

```cpp
// [[Rcpp::plugins(thrust)]]
#include <Rcpp.h>
#include <thrust/device_vector.h>

template<class T>
struct exp_functor{
  __host__ __device__ T operator()(T &x) const{
    return exp(x);
  }
};

// [[Rcpp::export]]
void std_exp(int n) {
  std::vector<double> v(n, 1.0);
  std::transform(v.begin(), v.end(), v.begin(), exp_functor<double>());
}

// [[Rcpp::export]]
void thrust_host_exp(int n) {
  thrust::host_vector<double> H(n, 1.0);
  thrust::transform(H.begin(), H.end(), H.begin(), exp_functor<double>());
}

// [[Rcpp::export]]
void thrust_dev_exp(int n) {
  thrust::device_vector<double> D(n, 1.0);
  thrust::transform(D.begin(), D.end(), D.begin(), exp_functor<double>());
}
```
]

---

---

## GPU Exp - Timings

---

## Covariance Matrix Calculation

```r
r_sq_exp_cov = function(d, sigma2=1, range=1) {
  sigma2*exp(-d*d*range*range)
}
```

```cpp
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>

// [[Rcpp::export]]
arma::mat arma_sq_exp_cov(arma::mat const& d, double sigma2 = 1, double range=1) {
  
  return sigma2 * arma::exp(-(arma::square(d * range)));
}
```

---

```cpp
// [[Rcpp::plugins(thrust)]]
#include <Rcpp.h>
#include <thrust/device_vector.h>

struct sq_exp_cov_functor{
  double sigma2;
  double range;
  
  sq_exp_cov_functor(double s, double r) : sigma2(s), range(r) { }
  
  __host__ __device__ double operator()(double &d) const{
    return sigma2 * exp(-d*d * range*range);
  }
};

// [[Rcpp::export]]
Rcpp::NumericMatrix gpu_sq_exp_cov(Rcpp::NumericMatrix const& d, 
                                   double sigma2 = 1, double range=1) {
  thrust::device_vector<double> D(d.begin(), d.end());
  thrust::transform(D.begin(), D.end(), D.begin(), sq_exp_cov_functor(sigma2, range));
  
  Rcpp::NumericMatrix cov(d.nrow(), d.ncol());
  thrust::copy(D.begin(), D.end(), cov.begin());
  
  return cov;
}
```

---

---

---

## Thrust Reductions

```cpp
// [[Rcpp::plugins(thrust)]]
#include <Rcpp.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>

// [[Rcpp::export]]
double thrust_sum(Rcpp::NumericVector v) {
  thrust::device_vector<double> d(v.begin(), v.end());
  double sum = thrust::reduce(
    d.begin(), d.end(), (double) 0, thrust::plus<double>()
  );
  
  return sum;
}
```

```r
thrust_sum(1:5)
```

```
## [1] 15
```

---

.small[

```cpp
// [[Rcpp::plugins(thrust)]]
#include <Rcpp.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>

template<class T>
struct prod {
  __host__ __device__ T operator()(T& x, T& y) const{
    return x * y;
  }
};

// [[Rcpp::export]]
double thrust_prod(Rcpp::NumericVector v) {
  thrust::device_vector<double> d(v.begin(), v.end());
  
  return thrust::reduce(
    d.begin(), d.end(), (double) 1, prod<double>()
  );
}
```

```r
thrust_prod(1:5)
```

```
## [1] 120
```
]

---

.small[

```cpp
// [[Rcpp::plugins(thrust)]]
#include <Rcpp.h>
#include <thrust/device_vector.h>\

template<class T>
struct min_functor {
  __host__ __device__ T operator()(T& x, T& y) const{
    return min(x, y);
  }
};

// [[Rcpp::export]]
double thrust_min(Rcpp::NumericVector v) {
  thrust::device_vector<double> d(v.begin(), v.end());
  
  return thrust::reduce(
    d.begin(), d.end(), (double) 1, min_functor<double>()
  );
}
```

```r
thrust_min(c(3,6,2,1,7,9))
```

```
## [1] 1
```
]

---

## Transform + Reduce

.small[

```cpp
// [[Rcpp::plugins(thrust)]]
#include <Rcpp.h>
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>

template<class T>
struct square {
  __host__ __device__ T operator()(T const& x) const{
    return x * x;
  }
};

// [[Rcpp::export]]
double thrust_rmse(Rcpp::NumericVector v) {
  thrust::device_vector<double> d(v.begin(), v.end());
  
  double sum_of_squares = thrust::transform_reduce(
    d.begin(), d.end(), square<double>(), (double) 0, thrust::plus<double>()
  );
  
  return sqrt(sum_of_squares / v.size());
}
```

```r
resid = rnorm(1000)

thrust_rmse(resid)
```

```
## [1] 0.9960206
```

```r
resid^2 %>% mean() %>% sqrt()
```

```
## [1] 0.9960206
```
]