## ----echo = FALSE, results = "hide", message = FALSE-------------------------- knitr::opts_chunk$set(collapse = TRUE, comment = "#>") require(bit) require(microbenchmark) # rmarkdown::render("vignettes/bit-performance.Rmd") # these are the real settings for the performance vignette times <- 5 Domain <- c(small=1e3, big=1e6) Sample <- c(small=1e3, big=1e6) # these are the settings to keep the cost of CRAN low #times <- 5 #Domain <- c(small=1e1, big=1e3) #Sample <- c(small=1e1, big=1e3) pagebreak <- function() { if(knitr::is_latex_output()) return("\\newpage") else return('
') } ## ----echo=TRUE, results='asis'------------------------------------------------ a <- 1L b <- 1e7L i <- sample(a:b,1e3) x <- c( R = median(microbenchmark((a:b)[-i], times=times)$time) , bit = median(microbenchmark(bit_rangediff(c(a,b), i), times=times)$time) , merge = median(microbenchmark(merge_rangediff(c(a,b), bit_sort(i)), times=times)$time) ) knitr::kable(as.data.frame(as.list(x/x["R"]*100)), caption="% of time relative to R", digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- knitr::kable( data.frame(coin="random 50%", often="random 99%", rare="random 1%", chunk="contiguous chunk of 5%") , caption="selection characteristic") ## ----echo=FALSE, results='asis'----------------------------------------------- B <- booltypes[c("logical","bit","bitwhich","which","ri")] M <- c("size", "[]", "[which]", "[which]<-TRUE", "[]<-logical", "!", "&", "|", "==", "!=", "summary") G <- list( coin = function(n)sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.5,0.5)) , often = function(n)sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.01,0.99)) , rare = function(n)sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.99,0.01)) , chunk = function(n)ri(n%/%20,2L*n%/%20,n) ) X <- vector("list", length(B)*length(G)) dim(X) <- c(booltype=length(B), data=length(G)) dimnames(X) <- list(booltype=names(B), data=names(G)) tim <- array(NA , dim=c(booltype=length(B), metric=length(M), data=length(G)) , dimnames=list(booltype=names(B), metric=M, data=names(G)) ) for (g in names(G)){ x <- G[[g]](Sample[["big"]]) if (g %in% c("coin","often","rare")) w <- as.which(as.logical(x)) for (b in B){ if (booltypes[[b]] < 'ri' || (b == 'ri' && g=='chunk')){ X[[b,g]] <- as.booltype(x, b) if (g %in% c("coin","often","rare") && b %in% c("logical","bit","bitwhich")){ l <- as.booltype(logical(Sample[["big"]]), b) tim[b,"[which]",g] <- median(microbenchmark(l[w], times=times)$time) tim[b,"[which]<-TRUE",g] <- median(microbenchmark(l[w]<-TRUE, times=times)$time) tim[b,"[]",g] <- median(microbenchmark(l[], times=times)$time) tim[b,"[]<-logical",g] <- median(microbenchmark(l[]<-x, times=times)$time) } tim[b,"size",g] <- object.size(X[[b,g]]) } } } for (g in names(G)){ for (b in c("logical","bit","bitwhich")){ x <- X[[b,g]] if (!is.null(x)){ tim[b,"!",g] <- median(microbenchmark(!x, times=times)$time) tim[b,"&",g] <- median(microbenchmark(x & x, times=times)$time) tim[b,"|",g] <- median(microbenchmark(x | x, times=times)$time) tim[b,"==",g] <- median(microbenchmark(x == x, times=times)$time) tim[b,"!=",g] <- median(microbenchmark(x != x, times=times)$time) tim[b,"summary",g] <- median(microbenchmark(summary.booltype(x), times=times)$time) } } } i <- match("size", M) for(b in rev(names(B))) # logical was in first position, so we do this last! { tim[b,i,] <- 100 * tim[b,i,] / tim["logical",i,] tim[b,-i,] <- 100 * tim[b,-i,] / max(tim["logical",-i,], na.rm=TRUE) } #rm(X) ## ----echo=FALSE, fig.cap = "% size and execution time for bit (b) and bitwhich (w) relative to logical (R) in the 'rare' scenario"---- x <- tim[1:3,,"rare"] m <- rep("", ncol(x)) m <- as.vector(rbind(m, colnames(x), m)) dotchart(x, xlim=c(0,max(100, max(x))), labels=m, pch=c("R","b","w"), col=c("black","blue","red"), main="% size and timings in 'rare' scenario", sub="l='logical' b='bit' w='bitwhich' % of max(R) in all scenarios") ## ----echo=FALSE, fig.cap = "% size and execution time for bit (b) and bitwhich (w) relative to logical (R) in the 'often' scenario"---- x <- tim[1:3,,"often"] dotchart(x, xlim=c(0,max(100, max(x))), labels=m, pch=c("R","b","w"), col=c("black","blue","red"), main="% size and timings in 'often' scenario", sub="l='logical' b='bit' w='bitwhich' % of max(R) in all scenarios") ## ----echo=FALSE, fig.cap = "% size and execution time for bit (b) and bitwhich (w) relative to logical (R) in the 'coin' scenario"---- x <- tim[1:3,,"coin"] dotchart(x, xlim=c(0,max(100, max(x))), labels=m, pch=c("R","b","w"), col=c("black","blue","red"), main="% size and timings in 'coin' scenario", sub="l='logical' b='bit' w='bitwhich' % of max(R) in all scenarios") ## ----echo=FALSE, results='asis'----------------------------------------------- knitr::kable(round(tim[,"size",], 1), caption="% bytes of logical") ## ----echo=FALSE, results='asis'----------------------------------------------- knitr::kable(round(tim[,"[]",], 1), caption="% time of logical") ## ----echo=FALSE, results='asis'----------------------------------------------- knitr::kable(round(tim[,"[]<-logical",], 1), caption="% time of logical") ## ----echo=FALSE, results='asis'----------------------------------------------- knitr::kable(round(tim[,"[which]",], 1), caption="% time of logical") ## ----echo=FALSE, results='asis'----------------------------------------------- knitr::kable(round(tim[,"[which]<-TRUE",], 1), caption="% time of logical") ## ----echo=FALSE, results='asis'----------------------------------------------- knitr::kable(round(tim[,"!",], 1), caption="% time for Boolean NOT") ## ----echo=FALSE, results='asis'----------------------------------------------- knitr::kable(round(tim[,"&",], 1), caption="% time for Boolean &") ## ----echo=FALSE, results='asis'----------------------------------------------- knitr::kable(round(tim[,"|",], 1), caption="% time for Boolean |") ## ----echo=FALSE, results='asis'----------------------------------------------- knitr::kable(round(tim[,"==",], 1), caption="% time for Boolean ==") ## ----echo=FALSE, results='asis'----------------------------------------------- knitr::kable(round(tim[,"!=",], 1), caption="% time for Boolean !=") ## ----echo=FALSE, results='asis'----------------------------------------------- knitr::kable(round(tim[,"summary",][1:2,1:2], 1), caption="% time for Boolean summary") ## ----echo=FALSE, results='asis'----------------------------------------------- binaryDomain <- list( smallsmall = rep(Domain["small"], 2) , smallbig=Domain , bigsmall=rev(Domain) , bigbig=rep(Domain["big"], 2) ) binarySample <- list( smallsmall = rep(Sample["small"], 2) , smallbig=Sample , bigsmall=rev(Sample) , bigbig=rep(Sample["big"], 2) ) M <- c("R","bit","merge") G <- c("sort","sortunique") D <- c("unsorted","sorted") sortM <- vector("list", length(M)*length(G)) dim(sortM) <- c(method=length(M), goal=length(G)) dimnames(sortM) <- list(method=M, goal=G) sortM[["R","sort"]] <- sort sortM[["R", "sortunique"]] <- function(x)sort(unique(x)) sortM[["bit","sort"]] <- bit_sort sortM[["bit","sortunique"]] <- bit_sort_unique timsort <- array(NA_integer_ , dim=c(M=2, G=length(G), D=length(D), N=length(Domain)) , dimnames=list(M=M[1:2], G=G, D=D, N=names(Domain)) ) for(n in names(Domain)){ x <- sample(Domain[[n]], Sample[[n]], replace = TRUE) d <- "unsorted" for (m in c("R","bit")){ for (g in G){ timsort[m,g,d,n] <- median(microbenchmark(sortM[[m,g]](x), times=times)$time) } } x <- bit_sort(x) d <- "sorted" for (m in 1:2){ for (g in G){ timsort[m,g,d,n] <- median(microbenchmark(sortM[[m,g]](x), times=times)$time) } } } binaryU <- c("match","in","notin","union","intersect","setdiff","symdiff","setequal","setearly") binaryM <- vector("list", length(M)*length(binaryU)) dim(binaryM) <- c(method=length(M), task=length(binaryU)) dimnames(binaryM) <- list(method=M, task=binaryU) binaryM[["R","match"]] <- match binaryM[["merge","match"]] <- merge_match binaryM[["R","in"]] <- get("%in%") binaryM[["bit","in"]] <- bit_in binaryM[["merge","in"]] <- merge_in binaryM[["R","notin"]] <- function(x, y)!(x %in% y) binaryM[["bit","notin"]] <- function(x, y)!bit_in(x,y) binaryM[["merge","notin"]] <- merge_notin binaryM[["R","union"]] <- union binaryM[["bit","union"]] <- bit_union binaryM[["merge","union"]] <- merge_union binaryM[["R","intersect"]] <- intersect binaryM[["bit","intersect"]] <- bit_intersect binaryM[["merge","intersect"]] <- merge_intersect binaryM[["R","setdiff"]] <- setdiff binaryM[["bit","setdiff"]] <- bit_setdiff binaryM[["merge","setdiff"]] <- merge_setdiff binaryM[["R","symdiff"]] <- function(x,y)union(setdiff(x,y), setdiff(y,x)) binaryM[["bit","symdiff"]] <- bit_symdiff binaryM[["merge","symdiff"]] <- merge_symdiff binaryM[["R","setequal"]] <- function(x,y)setequal(x,x) # we compare x to x which avoids early termination and hence binaryM[["bit","setequal"]] <- function(x,y)bit_setequal(x,x) binaryM[["merge","setequal"]] <- function(x,y)merge_setequal(x,x) binaryM[["R","setearly"]] <- function(x,y)setequal(x,y) # we compare x to x which avoids early termination and hence binaryM[["bit","setearly"]] <- function(x,y)bit_setequal(x,y) binaryM[["merge","setearly"]] <- function(x,y)merge_setequal(x,y) unaryU <- c("unique","duplicated","anyDuplicated","sumDuplicated") unaryM <- vector("list", length(M)*length(unaryU)) dim(unaryM) <- c(method=length(M), task=length(unaryU)) dimnames(unaryM) <- list(method=M, task=unaryU) unaryM[["R","unique"]] <- unique unaryM[["bit","unique"]] <- bit_unique unaryM[["merge","unique"]] <- merge_unique unaryM[["R","duplicated"]] <- duplicated unaryM[["bit","duplicated"]] <- bit_duplicated unaryM[["merge","duplicated"]] <- merge_duplicated unaryM[["R","anyDuplicated"]] <- anyDuplicated unaryM[["bit","anyDuplicated"]] <- bit_anyDuplicated unaryM[["merge","anyDuplicated"]] <- merge_anyDuplicated unaryM[["R","sumDuplicated"]] <- function(x)sum(duplicated(x)) unaryM[["bit","sumDuplicated"]] <- bit_sumDuplicated unaryM[["merge","sumDuplicated"]] <- merge_sumDuplicated tim <- array(NA_integer_ , dim=c(M=length(M), U=length(unaryU)+length(binaryU), N=length(binaryDomain), D=length(D)) , dimnames=list(M=M, U=c(unaryU,binaryU), N=names(binaryDomain), D=D) ) for(n in names(binaryDomain)){ xnam <- names(binaryDomain[[n]])[1] ynam <- names(binaryDomain[[n]])[2] x <- sample(binaryDomain[[n]][1], binarySample[[n]][1], replace = FALSE) y <- sample(binaryDomain[[n]][2], binarySample[[n]][2], replace = FALSE) d <- "unsorted" if (length(x)==length(y)) for (u in unaryU){ for (m in setdiff(M,"merge")){ f <- unaryM[[m,u]] if (!is.null(f)) tim[m,u,n,d] <- median(microbenchmark(f(x), times=times)$time) } } for (u in binaryU){ for (m in setdiff(M,"merge")){ f <- binaryM[[m,u]] if (!is.null(f)) tim[m,u,n,d] <- median(microbenchmark(f(x,y), times=times)$time) } } x <- bit_sort(x) y <- bit_sort(y) d <- "sorted" if (length(x)==length(y)) for (u in unaryU){ for (m in M){ f <- unaryM[[m,u]] if (!is.null(f)){ tim[m,u,n,d] <- median(microbenchmark(f(x), times=times)$time) # now plug-in measures for unsorted merge if (m == "merge") tim["merge",u,n,"unsorted"] <- timsort["bit","sort","unsorted",xnam] + tim["merge",u,n,"sorted"] } } } for (u in binaryU){ for (m in M){ f <- binaryM[[m,u]] if (!is.null(f)){ tim[m,u,n,d] <- median(microbenchmark(f(x,y), times=times)$time) # now plug-in measures for unsorted merge if (m == "merge") tim["merge",u,n,"unsorted"] <- timsort["bit","sort","unsorted",xnam] + timsort["bit","sort","unsorted",ynam] + tim["merge",u,n,"sorted"] } } } } ## ----echo=FALSE, fig.cap = "Execution time for R (R) and bit (b)"------------- y <- timsort[,,,"big"] y <- 100 * y / max(y["R",,], na.rm=TRUE) oldpar <- par(mfrow=c(2,1), mar=c(5,8,2,1)) x <- y[,,"unsorted"] dotchart(x, xlim=c(0, max(100, max(y))), labels="", pch=c("R","b"), xlab="execution time", main="unsorted", col=c("red","blue")) x <- y[,,"sorted"] dotchart(x, xlim=c(0, max(100, max(y))), labels="", pch=c("R","b"), xlab="execution time", main="sorted", col=c("red","blue")) par(oldpar) ## ----echo=FALSE, results='hide'----------------------------------------------- tim2 <- tim for (n in names(binaryDomain)) for (d in D) tim2[,,n,d] <- 100*tim[,,n,d]/max(tim["R",,n,d], na.rm=TRUE) ## ----echo=FALSE, fig.cap = "Execution time for R, bit and merge relative to most expensive R in 'unsorted bigbig' scenario"---- y <- tim2[,,"bigbig",] y <- 100 * y / max(y["R",,], na.rm=TRUE) x <- y[,,"unsorted"] m <- rep("", ncol(x)) m <- as.vector(rbind(m, colnames(x), m)) dotchart(x, xlim=c(0, max(100,max(y, na.rm=TRUE))), labels=m, pch=c("R","b","m"), col=c("red","blue","black"), main="Timings in 'unsorted bigbig' scenario", sub="R='hash' b='bit' m='merge'") ## ----echo=FALSE, fig.cap = "Execution time for R, bit and merge in 'sorted bigbig' scenario"---- x <- y[,,"sorted"] dotchart(x, xlim=c(0, max(y, na.rm=TRUE)), labels=m, pch=c("R","b","m"), col=c("red","blue","black"), main="Timings in 'sorted bigbig' scenario", sub="R='hash' b='bit' m='merge'") ## ----echo=FALSE, results='asis'----------------------------------------------- x <- 100*timsort["bit",,,]/timsort["R",,,] s <- "sorted" knitr::kable(x[,s,], caption=paste(s,"data relative to R's sort"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,s,], caption=paste(s,"data relative to R's sort"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- f <- function(u){ n <- c("smallsmall","bigbig") x <- tim[c("bit","merge","merge"),u,n,] dimnames(x)$M[3] <- "sort" dimnames(x)$N <- c("small","big") x["sort",,"unsorted"] <- timsort["bit","sort","unsorted",] x["sort",,"sorted"] <- 0 for (m in dimnames(x)$M) x[m,,] <- x[m,,] / tim["R",u,n,] * 100 x } x <- f("unique") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- x <- f("duplicated") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- x <- f("anyDuplicated") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- x <- f("sumDuplicated") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- f <- function(u){ x <- tim[c("bit","merge","merge"),u,,] dimnames(x)$M[3] <- "sort" s <- timsort["bit","sort","unsorted",] x["sort",,"unsorted"] <- rep(s, c(2,2)) + c(s,s) x["sort",,"sorted"] <- 0 for (m in dimnames(x)$M) x[m,,] <- x[m,,] / tim["R",u,,] * 100 x } x <- f("match") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- x <- f("in") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- x <- f("notin") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- x <- f("union") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- x <- f("intersect") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- x <- f("setdiff") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- x <- f("symdiff") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- x <- f("setequal") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- x <- f("setearly") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1) ## ----echo=FALSE, results='asis'----------------------------------------------- s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)