If we use the basic approach of read.csv
, we end up waiting a really long time,
system.time(park<-read.csv("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv"))
## user system elapsed
## 544.698 7.411 555.405
Nearly 10 minutes to read in a 1.7 gigabyte CSV file.
system.time(read.csv("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv",
stringsAsFactors=FALSE)
)
## user system elapsed
## 480.018 6.643 486.517
system.time(read.csv("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv",
stringsAsFactors=FALSE,
comment.char="")
)
## user system elapsed
## 348.993 4.488 353.381
system.time(read.csv("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv",
stringsAsFactors=FALSE,
comment.char="",
colClasses=c("numeric", "character", "character", "character", "character",
"integer", "character", "character", "character", "integer",
"integer", "integer", "integer", "integer", "integer",
"integer", "integer", "character", "character", "character",
"character", "character", "character", "character", "character",
"character", "integer", "integer", "character", "character",
"character", "character", "character", "character", "integer",
"integer", "character", "integer", "character", "character",
"logical", "logical", "logical")
)
)
## user system elapsed
## 239.846 2.583 242.328
library(data.table)
park = fread("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv",sep=",")
## Read 9100278 rows and 43 (of 43) columns from 1.673 GB file in 00:00:44
## Warning messages:
## 1: In fread("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv", :
## Bumped column 26 to type character on data row 15, field contains 'GOUVERNUR ST EAST RI'.
## Coercing previously read values in this column from logical, integer or numeric back to
## character which may not be lossless; e.g., if '00' and '000' occurred before they will now
## be just '0', and there may be inconsistencies with treatment of ',,' and ',NA,' too (if they
## occurred in this column before the bump). If this matters please rerun and set 'colClasses'
## to 'character' for this column. Please note that column type detection uses the first 5 rows,
## the middle 5 rows and the last 5 rows, so hopefully this message should be very rare. If
## reporting to datatable-help, please rerun and include the output from verbose=TRUE.
## 2: In fread("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv", :
## Bumped column 30 to type character on data row 2647, field contains 'T'. Coercing previously
## read values in this column from logical, integer or numeric back to character which may not be
## lossless; e.g., if '00' and '000' occurred before they will now be just '0', and there may be
## inconsistencies with treatment of ',,' and ',NA,' too (if they occurred in this column before the
## bump). If this matters please rerun and set 'colClasses' to 'character' for this column. Please
## note that column type detection uses the first 5 rows, the middle 5 rows and the last 5 rows, so
## hopefully this message should be very rare. If reporting to datatable-help, please rerun and
## include the output from verbose=TRUE.
## 3: In fread("/home/vis/cr173/Sta523/data/parking/NYParkingViolations.csv", :
## Some columns have been read as type 'integer64' but package bit64 isn't loaded. Those columns will
## display as strange looking floating point data. There is no need to reload the data. Just require(bit64)
## to obtain the integer64 print method and print the data again.
class(park)
## "data.table" "data.frame"
park = as.data.frame(park)
class(park)
## [1] "data.frame"
head(park)
## Summons Number Plate ID Registration State Plate Type Issue Date Violation Code
## 1 1361929741 FCJ5493 NY PAS 12/18/1970 20
## 2 1366962000 63540MC NY COM 02/02/1971 46
## 3 1356906515 GFM1421 NY PAS 09/18/1971 40
## 4 1342296217 FYM5117 NY SRF 09/18/1971 21
## 5 1342296199 95V6675 TX PAS 09/18/1971 21
## 6 1342296187 GCY4187 NY SRF 09/18/1971 21
##
## Vehicle Body Type Vehicle Make Issuing Agency Street Code1 Street Code2 Street Code3
## 1 SUBN GMC S 35030 89180 32600
## 2 DELV FRUEH P 58830 11430 11450
## 3 SDN MAZDA X 13610 37270 37290
## 4 SUBN NISSA S 28190 55690 11590
## 5 GMC S 11790 35780 57890
## 6 VAN FORD S 11790 35780 57890
##
## Vehicle Expiration Date Violation Location Violation Precinct Issuer Precinct Issuer Code
## 1 20140728 70 70 0 829379
## 2 0 68 68 68 928157
## 3 20150301 33 33 33 938472
## 4 20140427 104 104 0 515350
## 5 0 104 104 0 515350
## 6 20150430 104 104 0 515350
##
## ...
We are working with data that is large, but will still fit in memory.
R loves to make extra copies of objects, so we need to be careful - even a handful of copies with exhaust the memory on most systems.
In general you should prefer:
subsetting >>>> apply > loops
library(lubridate)
names(park)
head(park[["Issue Date"]])
t = table(park[["Issue Date"]])
d = data.frame(date = mdy(names(t)), count = c(t))
head(d)
plot(d$date, d$count)
d = d[order(d$date),]
plot(d$date, d$count,type='l')
start = mdy("8/1/2013")
end = mdy("6/28/2014")
with(d[d$date >= start & d$date <= end,], plot(date, count,type='l'))
with(d[d$date >= start-months(1) & d$date <= end+months(1),], plot(date, count,type='l'))
d$wday = wday(d$date,label = TRUE)
head(d)
d_sub = d[d$date >= start & d$date <= end,]
t = tapply(d_sub$count, d_sub$wday, sum)
plot(factor(names(t),levels=names(t),ordered=TRUE),t)
# Vehicle year distirbution
t = table(park[["Vehicle Year"]])[-1]
plot(as.numeric(names(t)),t,type='l')
t = t[as.numeric(names(t))<=2014]
plot(as.numeric(names(t)),t,type='l')
# Street words
t = c(park[["Street Name"]],park[["Intersecting Street"]])
t = str_trim(t)
t = t[t != ""]
t = unlist(str_split(t," "))
t = t[t != ""]
t = tolower(t)
head(table(t))
t = str_replace(t,"^`","")
t = t[t != ""]
t = t[t != "-"]
head(table(t))
head(table(t), n=200)
tt = sort(table(t),decreasing = TRUE)
head(tt)
head(tt, n=50)