I want to cross-join two data tables without evaluating the full cross join, applying a ranging criterion in the process. In essence, I would like CJ with a filtering/ranging expression. Can someone suggest a high-performing approach that avoids the full cross join? See the test example below, which does the job with the evil full cross join.
library(data.table)

# Test data.
dt1 <- data.table(id1=1:10, D=2*(1:10), key="id1")
dt2 <- data.table(id2=21:23, D1=c(5, 7, 10), D2=c(9, 12, 16), key="id2")

# Desired filtered cross-join data table by hand: D1 <= D & D <= D2.
dtfDesired <- data.table(
  id1=c(3, 4, 4, 5, 6, 5, 6, 7, 8)
  , id2=c(rep(21, 2), rep(22, 3), rep(23, 4))
  , D1=c(rep(5, 2), rep(7, 3), rep(10, 4))
  , D=c(6, 8, 8, 10, 12, 10, 12, 14, 16)
  , D2=c(rep(9, 2), rep(12, 3), rep(16, 4))
)
setkey(dtfDesired, id1, id2)
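
(As an aside: newer versions of data.table support non-equi joins that express this range filter directly in the join itself. I believe this needs data.table >= 1.9.8, which may not be available to everyone, so take the following as an untested sketch rather than a definitive answer.)

# Hedged sketch: non-equi join (assumes data.table >= 1.9.8 and the dt1/dt2 above).
# dt2[dt1, ...] pairs each dt1 row with the dt2 rows whose [D1, D2] range
# contains its D; nomatch=0L drops dt1 rows without a match. The explicit j
# with x./i. prefixes is there because the result columns of a non-equi join
# can otherwise carry surprising names.
dtfNE <- dt2[dt1, on=.(D1 <= D, D2 >= D), nomatch=0L,
             .(id1, id2, D1=x.D1, D=i.D, D2=x.D2)]
setkey(dtfNE, id1, id2)  # should now equal dtfDesired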
# My "inefficient" programmatic attempt with full cross join.
fullCJ <- function(dt1, dt2) {
  # Full cross-product: NOT acceptable with real data!
  dtCrossAll <- CJ(dt1$id1, dt2$id2)
  setnames(dtCrossAll, c("id1", "id2"))
  # Merge all columns.
  dtf <- merge(dtCrossAll, dt1, by="id1")
  dtf <- merge(dtf, dt2, by="id2")
  setkey(dtf, id1, id2)
  # Reorder columns for convenience.
  setcolorder(dtf, c("id1", "id2", "D1", "D", "D2"))
  # Finally, filter the cases I want.
  dtf[D1 <= D & D <= D2, ]
}

dtf <- fullCJ(dt1, dt2)
# Print results.
print(dt1)
print(dt2)
print(dtfDesired)
all.equal(dtf, dtfDesired)
Test data output:

> # Print results.
> print(dt1)
    id1  D
 1:   1  2
 2:   2  4
 3:   3  6
 4:   4  8
 5:   5 10
 6:   6 12
 7:   7 14
 8:   8 16
 9:   9 18
10:  10 20
> print(dt2)
   id2 D1 D2
1:  21  5  9
2:  22  7 12
3:  23 10 16
> print(dtfDesired)
   id1 id2 D1  D D2
1:   3  21  5  6  9
2:   4  21  5  8  9
3:   4  22  7  8 12
4:   5  22  7 10 12
5:   5  23 10 10 16
6:   6  22  7 12 12
7:   6  23 10 12 16
8:   7  23 10 14 16
9:   8  23 10 16 16
> all.equal(dtf, dtfDesired)
[1] TRUE
So now the challenge is to write the filtered cross join in a way that scales to millions of rows! Below is a collection of alternative implementations, including those suggested in answers and comments.
# My "inefficient" programmatic attempt looping manually.
manualIter <- function(dt1, dt2) {
  id1Match <- NULL; id2Match <- NULL; dtf <- NULL
  for (i1 in seq_len(nrow(dt1))) {
    # Find matches in dt2 of this dt1 row.
    row1 <- dt1[i1, ]
    id1 <- row1$id1
    D <- row1$D
    dt2Match <- dt2[D1 <= D & D <= D2, ]
    nMatches <- nrow(dt2Match)
    if (0 < nMatches) {
      id1Match <- c(id1Match, rep(id1, nMatches))
      id2Match <- c(id2Match, dt2Match$id2)
    }
  }
  # Build the return data.table for the matching ids.
  dtf <- data.table(id1=id1Match, id2=id2Match)
  dtf <- merge(dtf, dt1, by="id1")
  dtf <- merge(dtf, dt2, by="id2")
  setkey(dtf, id1, id2)
  # Reorder columns for convenience & consistency.
  setcolorder(dtf, c("id1", "id2", "D1", "D", "D2"))
  return(dtf)
}
dtJoin1 <- function(dt1, dt2) {
  dtf <- dt1[, dt2[D1 <= D & D <= D2, list(id2=id2)], by=id1]
  dtf <- merge(dtf, dt1, by="id1")
  dtf <- merge(dtf, dt2, by="id2")
  setkey(dtf, id1, id2)
  # Reorder columns for convenience & consistency.
  setcolorder(dtf, c("id1", "id2", "D1", "D", "D2"))
  return(dtf)
}
dtJoin2 <- function(dt1, dt2) {
  dtf <- dt2[, dt1[D1 <= D & D <= D2, list(id1=id1, D1=D1, D=D, D2=D2)], by=id2]
  setkey(dtf, id1, id2)
  # Reorder columns for convenience & consistency.
  setcolorder(dtf, c("id1", "id2", "D1", "D", "D2"))
  return(dtf)
}
# Install Bioconductor IRanges (see bioTreeRange below).
source("http://bioconductor.org/biocLite.R")
biocLite("IRanges")

# Solution using Bioconductor IRanges.
bioTreeRange <- function(dt1, dt2) {
  require(IRanges)
  ir1 <- IRanges(dt1$D, width=1L)
  ir2 <- IRanges(dt2$D1, dt2$D2)
  olaps <- findOverlaps(ir1, ir2, type="within")
  dtf <- cbind(dt1[queryHits(olaps)], dt2[subjectHits(olaps)])
  setkey(dtf, id1, id2)
  # Reorder columns for convenience.
  setcolorder(dtf, c("id1", "id2", "D1", "D", "D2"))
  return(dtf)
}
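
In the same spirit, data.table's own foverlaps() (an interval overlap join, available since data.table 1.9.4) should also avoid materializing the cross product. A minimal sketch, assuming the same dt1/dt2 structure as above; the wrapper name dtFOverlaps and the temporary Dlow/Dhigh columns are mine:

# Sketch using foverlaps(): treat each D as the zero-width interval [D, D]
# and ask for the x intervals falling within y's [D1, D2].
dtFOverlaps <- function(dt1, dt2) {
  dt1w <- copy(dt1)[, `:=`(Dlow=D, Dhigh=D)]  # point -> degenerate interval
  # foverlaps() wants y keyed on its interval columns (re-keys dt2 by reference).
  setkey(dt2, D1, D2)
  dtf <- foverlaps(dt1w, dt2, by.x=c("Dlow", "Dhigh"), type="within", nomatch=0L)
  dtf[, c("Dlow", "Dhigh") := NULL]  # drop the helper columns
  setkey(dtf, id1, id2)
  setcolorder(dtf, c("id1", "id2", "D1", "D", "D2"))
  return(dtf)
}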
And now below is a small benchmark on a bigger data set, still 2-3 orders of magnitude smaller than my real underlying scenario. The real scenario fails on the huge memory allocation of the full cross join.
set.seed(1)
n1 <- 10000
n2 <- 1000
dtbig1 <- data.table(id1=1:n1, D=1:n1, key="id1")
dtbig2 <- data.table(id2=1:n2, D1=sort(sample(1:n1, n2)), key="id2")
dtbig2$D2 <- with(dtbig2, D1 + 100)

library("microbenchmark")
mbenchmarkRes <- microbenchmark(
  fullCJRes <- fullCJ(dtbig1, dtbig2)
  , manualIterRes <- manualIter(dtbig1, dtbig2)
  , dtJoin1Res <- dtJoin1(dtbig1, dtbig2)
  , dtJoin2Res <- dtJoin2(dtbig1, dtbig2)
  , bioTreeRangeRes <- bioTreeRange(dtbig1, dtbig2)
  , times=3, unit="s", control=list(order="inorder", warmup=1)
)
# Shorten names for better display.
mbenchmarkRes$expr <- c("fullCJ", "manualIter", "dtJoin1", "dtJoin2", "bioTreeRangeRes")

# Print microbenchmark results.
print(mbenchmarkRes, order="median")
And now the current benchmark results I got on my machine:

> print(mbenchmarkRes, order="median")
Unit: seconds
            expr        min         lq     median         uq        max neval
 bioTreeRangeRes 0.05833279 0.05843753 0.05854227 0.06099377 0.06344527     3
         dtJoin2 1.20519664 1.21583650 1.22647637 1.23606216 1.24564796     3
          fullCJ 4.00370434 4.03572702 4.06774969 4.17001658 4.27228347     3
         dtJoin1 8.02416333 8.03504136 8.04591938 8.20015977 8.35440016     3
      manualIter 8.69061759 8.69716448 8.70371137 8.76859060 8.83346982     3
- The Bioconductor tree/IRanges solution from Arun (bioTreeRangeRes) is two orders of magnitude faster than the alternatives. But the install seems to have updated other CRAN libraries (my fault, I accepted it when the installer asked); some of them can no longer be found when loading them -- e.g., gtools and gplots.
- The fastest pure data.table option from BrodieG (dtJoin2) is probably not as efficient as I need it to be, but at least it is reasonable in terms of memory consumption (I will let it run overnight on my real scenario of ~1 million rows).
- I tried changing the data table keys (using the dates instead of ids); it did not have any impact.
- As expected, explicitly writing the loop in R (manualIter) crawls.