Bigish data


Reading

Reading NYParkingViolations.csv

If we use the basic approach of read.csv, we end up waiting a really long time,

system.time(park<-read.csv("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv"))

##    user  system elapsed 
## 544.698   7.411 555.405 

Nearly 10 minutes to read in a 1.7 gigabyte CSV file.

Improvement - stringsAsFactors

system.time(read.csv("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv", 
                     stringsAsFactors=FALSE)
           )

##    user  system elapsed 
## 480.018   6.643 486.517 

Improvement - comment.char

system.time(read.csv("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv", 
                     stringsAsFactors=FALSE, 
                     comment.char="")
           )

##    user  system elapsed 
## 348.993   4.488 353.381 

Improvement - colClasses

system.time(read.csv("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv", 
                     stringsAsFactors=FALSE, 
                     comment.char="", 
                     colClasses=c("numeric",  "character", "character", "character", "character", 
                                 "integer",   "character", "character", "character", "integer", 
                                 "integer",   "integer",   "integer",   "integer",   "integer",  
                                 "integer",   "integer",   "character", "character", "character", 
                                 "character", "character", "character", "character", "character", 
                                 "character", "integer",   "integer",   "character", "character", 
                                 "character", "character", "character", "character", "integer",
                                 "integer",   "character", "integer",   "character", "character", 
                                 "logical",   "logical",   "logical")
                    )
           )

##    user  system elapsed 
## 239.846   2.583 242.328 

Improvement - data.table & fread

library(data.table)
park = fread("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv",sep=",")

## Read 9100278 rows and 43 (of 43) columns from 1.673 GB file in 00:00:44

## Warning messages:
## 1: In fread("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv",  :
##   Bumped column 26 to type character on data row 15, field contains 'GOUVERNUR ST EAST RI'. 
##   Coercing previously read values in this column from logical, integer or numeric back to 
##   character which may not be lossless; e.g., if '00' and '000' occurred before they will now 
##   be just '0', and there may be inconsistencies with treatment of ',,' and ',NA,' too (if they 
##   occurred in this column before the bump). If this matters please rerun and set 'colClasses' 
##   to 'character' for this column. Please note that column type detection uses the first 5 rows, 
##   the middle 5 rows and the last 5 rows, so hopefully this message should be very rare. If 
##   reporting to datatable-help, please rerun and include the output from verbose=TRUE.

Improvement - data.table & fread (cont.)

## 2: In fread("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv",  :
##   Bumped column 30 to type character on data row 2647, field contains 'T'. Coercing previously 
##   read values in this column from logical, integer or numeric back to character which may not be 
##   lossless; e.g., if '00' and '000' occurred before they will now be just '0', and there may be
##   inconsistencies with treatment of ',,' and ',NA,' too (if they occurred in this column before the 
##   bump). If this matters please rerun and set 'colClasses' to 'character' for this column. Please 
##   note that column type detection uses the first 5 rows, the middle 5 rows and the last 5 rows, so 
##   hopefully this message should be very rare. If reporting to datatable-help, please rerun and 
##   include the output from verbose=TRUE.

## 3: In fread("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv",  :
##   Some columns have been read as type 'integer64' but package bit64 isn't loaded. Those columns will 
##   display as strange looking floating point data. There is no need to reload the data. Just require(bit64) 
##   to obtain the integer64 print method and print the data again.

Data status

class(park)

## "data.table" "data.frame"

park = as.data.frame(park)
class(park)

## [1] "data.frame"

Contents

head(park)
##   Summons Number Plate ID Registration State Plate Type Issue Date Violation Code
## 1     1361929741  FCJ5493                 NY        PAS 12/18/1970             20
## 2     1366962000  63540MC                 NY        COM 02/02/1971             46
## 3     1356906515  GFM1421                 NY        PAS 09/18/1971             40
## 4     1342296217  FYM5117                 NY        SRF 09/18/1971             21
## 5     1342296199  95V6675                 TX        PAS 09/18/1971             21
## 6     1342296187  GCY4187                 NY        SRF 09/18/1971             21
##
##   Vehicle Body Type Vehicle Make Issuing Agency Street Code1 Street Code2 Street Code3
## 1              SUBN          GMC              S        35030        89180        32600
## 2              DELV        FRUEH              P        58830        11430        11450
## 3               SDN        MAZDA              X        13610        37270        37290
## 4              SUBN        NISSA              S        28190        55690        11590
## 5                            GMC              S        11790        35780        57890
## 6               VAN         FORD              S        11790        35780        57890
##
##   Vehicle Expiration Date Violation Location Violation Precinct Issuer Precinct Issuer Code
## 1                20140728                 70                 70               0      829379
## 2                       0                 68                 68              68      928157
## 3                20150301                 33                 33              33      938472
## 4                20140427                104                104               0      515350
## 5                       0                104                104               0      515350
## 6                20150430                104                104               0      515350
##
## ...

Processing

Big vs Bigish data

  • We are working with data that is large, but will still fit in memory.

  • R loves to make extra copies of objects, so we need to be careful - even a handful of copies with exhaust the memory on most systems.

    • Less of an issue on saxon (256 GB of Ram), but this is a shared resource - use to much and your session will be killed.
  • In general you should prefer:


subsetting >>>> apply > loops


built-in/base >> user defined functions

Some Interactive EDA

library(lubridate)
names(park)

head(park[["Issue Date"]])

t = table(park[["Issue Date"]])

d = data.frame(date = mdy(names(t)), count = c(t))

head(d)
plot(d$date, d$count)

d = d[order(d$date),]

plot(d$date, d$count,type='l')


start = mdy("8/1/2013")
end = mdy("6/28/2014")


with(d[d$date >= start & d$date <= end,], plot(date, count,type='l'))
with(d[d$date >= start-months(1) & d$date <= end+months(1),], plot(date, count,type='l'))

d$wday = wday(d$date,label = TRUE)
head(d)


d_sub = d[d$date >= start & d$date <= end,]

t = tapply(d_sub$count, d_sub$wday, sum)
plot(factor(names(t),levels=names(t),ordered=TRUE),t)

(cont.)

# Vehicle year distirbution
t = table(park[["Vehicle Year"]])[-1]

plot(as.numeric(names(t)),t,type='l')
t = t[as.numeric(names(t))<=2014]
plot(as.numeric(names(t)),t,type='l')

(cont.)

# Street words
t = c(park[["Street Name"]],park[["Intersecting Street"]])

t = str_trim(t)
t = t[t != ""]

t = unlist(str_split(t," "))
t = t[t != ""]

t = tolower(t)

head(table(t))

t = str_replace(t,"^`","")
t = t[t != ""]
t = t[t != "-"]

head(table(t))
head(table(t), n=200)

tt = sort(table(t),decreasing = TRUE)
head(tt)

head(tt, n=50)