## Build sparse design matrices from the CS2006 browsing, transaction and
## demographic files.
##
## Inputs (read from the working directory):
##   CS2006domains.csv      - columns: id (machine), site (id), visits (count)
##   CS2006sites.txt        - site names, in site-id order
##   CS2006transactions.csv - columns used here: id, product, dollars
##   CS2006demographics.csv - one row per machine, including an id column
## Outputs:
##   xweb   - machines x sites sparse matrix of % of visits spent on each site
##   xspend - machines x products sparse matrix of dollars spent
##   yspend - two-column matrix (id, total spend), also written to
##            CS2006totalspend.csv
##   Xdemo  - sparse model matrix of the demographic covariates
##
## sparseMatrix() and sparse.model.matrix() come from the Matrix package,
## which is presumably attached as a dependency of gamlr -- TODO confirm.
library(gamlr)

## Browsing history.
## The table has three columns: [machine] id, site [id], [# of] visits
web <- read.csv("CS2006domains.csv")

## Tell R that 'id' is a factor; we know there are 10000 machines
n <- 1e4
web$id <- factor(web$id, levels = seq_len(n))

## Read in the actual website names, and use these to create a site factor.
## We know that there are 1000, and these are their names (in correct order)
d <- 1e3
sitenames <- scan("CS2006sites.txt", what = "character")
web$site <- factor(web$site, levels = seq_len(d), labels = sitenames)

## Get total visits per machine and % of time on each site.
## tapply(a, b, c) does c(a) for every level of factor b.
## It returns a 1-d array; we flatten it to a plain vector so it can be
## indexed by the integer codes of web$id below.
machinetotals <- as.vector(tapply(web$visits, web$id, sum))
visitpercent <- 100 * web$visits / machinetotals[web$id]

## Use this info in a sparse matrix.
## This is something you'll be doing a lot; familiarize yourself.
## Note: the column is `site` (singular). `web$sites` does not exist and
## evaluates to NULL, which would silently leave the columns unnamed.
xweb <- sparseMatrix(
  i = as.numeric(web$id),
  j = as.numeric(web$site),
  x = visitpercent,
  dims = c(nlevels(web$id), nlevels(web$site)),
  dimnames = list(id = levels(web$id), site = levels(web$site))
)

## Transactions
spend <- read.csv("CS2006transactions.csv")
spend$product <- factor(spend$product, levels = seq_len(d))
spend$id <- factor(spend$id, levels = seq_len(n))

xspend <- sparseMatrix(
  i = as.numeric(spend$id),
  j = as.numeric(spend$product),
  x = spend$dollars,
  dims = c(nlevels(spend$id), nlevels(spend$product)),
  dimnames = list(id = levels(spend$id), product = levels(spend$product))
)

## Sanity check (prints TRUE/FALSE): both matrices are indexed by the same
## machine ids in the same order.
all(rownames(xspend) == rownames(xweb))

## Total spend per machine, saved alongside the machine id
yspend <- cbind(id = as.numeric(rownames(xspend)), spend = rowSums(xspend))
write.table(yspend, row.names = FALSE,
            file = "CS2006totalspend.csv", sep = ",", quote = FALSE)

## Demographics
demo <- read.csv("CS2006demographics.csv")
demo$id <- factor(demo$id)
## id is on the left-hand side only, so it is excluded from the covariates;
## -1 drops the intercept column.
Xdemo <- sparse.model.matrix(id ~ . - 1, data = demo)