{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}

module DataFrame.Operations.Join where

import Control.Applicative (asum)
import qualified Data.HashMap.Strict as HM
import qualified Data.Map.Strict as M
import Data.Maybe (fromMaybe)
import qualified Data.Text as T
import Data.Type.Equality (TestEquality (..))
import qualified Data.Vector as VB
import qualified Data.Vector.Unboxed as VU
import DataFrame.Internal.Column as D
import DataFrame.Internal.DataFrame as D
import DataFrame.Operations.Aggregation as D
import DataFrame.Operations.Core as D
import Type.Reflection

-- | Equivalent to SQL join types.
data JoinType
    = INNER
    | LEFT
    | RIGHT
    | FULL_OUTER

{- | Join two dataframes using SQL join semantics.

Only inner join is implemented for now.
-}
join ::
    JoinType ->
    [T.Text] ->
    DataFrame -> -- Right hand side
    DataFrame -> -- Left hand side
    DataFrame
join :: JoinType -> [Text] -> DataFrame -> DataFrame -> DataFrame
join JoinType
INNER [Text]
xs DataFrame
right = [Text] -> DataFrame -> DataFrame -> DataFrame
innerJoin [Text]
xs DataFrame
right
join JoinType
LEFT [Text]
xs DataFrame
right = [Text] -> DataFrame -> DataFrame -> DataFrame
leftJoin [Text]
xs DataFrame
right
join JoinType
RIGHT [Text]
xs DataFrame
right = [Text] -> DataFrame -> DataFrame -> DataFrame
rightJoin [Text]
xs DataFrame
right
join JoinType
FULL_OUTER [Text]
xs DataFrame
right = [Text] -> DataFrame -> DataFrame -> DataFrame
fullOuterJoin [Text]
xs DataFrame
right

{- | Performs an inner join on two dataframes using the specified key columns.
Returns only rows where the key values exist in both dataframes.

==== __Example__
@
ghci> df = D.fromNamedColumns [("key", D.fromList ["K0", "K1", "K2", "K3"]), ("A", D.fromList ["A0", "A1", "A2", "A3"])]
ghci> other = D.fromNamedColumns [("key", D.fromList ["K0", "K1", "K2"]), ("B", D.fromList ["B0", "B1", "B2"])]
ghci> D.innerJoin ["key"] df other

-----------------
 key  |  A  |  B
------|-----|----
 Text | Text| Text
------|-----|----
 K0   | A0  | B0
 K1   | A1  | B1
 K2   | A2  | B2

@
-}
innerJoin :: [T.Text] -> DataFrame -> DataFrame -> DataFrame
innerJoin :: [Text] -> DataFrame -> DataFrame -> DataFrame
innerJoin [Text]
cs DataFrame
right DataFrame
left =
    let
        -- Prepare Keys for the Right DataFrame
        rightIndicesToGroup :: [Int]
rightIndicesToGroup =
            [Int
c | (Text
k, Int
c) <- Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toList (DataFrame -> Map Text Int
D.columnIndices DataFrame
right), Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs]

        rightRowRepresentations :: VU.Vector Int
        rightRowRepresentations :: Vector Int
rightRowRepresentations =
            Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
right)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
rightIndicesToGroup DataFrame
right)

        -- Build the Hash Map: Int -> Vector of Indices
        -- We use ifoldr to efficiently insert (index, key) without intermediate allocations.
        rightKeyMap :: HM.HashMap Int (VU.Vector Int)
        rightKeyMap :: HashMap Int (Vector Int)
rightKeyMap =
            let accumulator :: HashMap Int [Int]
accumulator =
                    (Int -> Int -> HashMap Int [Int] -> HashMap Int [Int])
-> HashMap Int [Int] -> Vector Int -> HashMap Int [Int]
forall a b. Unbox a => (Int -> a -> b -> b) -> b -> Vector a -> b
VU.ifoldr
                        (\Int
i Int
key HashMap Int [Int]
acc -> ([Int] -> [Int] -> [Int])
-> Int -> [Int] -> HashMap Int [Int] -> HashMap Int [Int]
forall k v.
(Eq k, Hashable k) =>
(v -> v -> v) -> k -> v -> HashMap k v -> HashMap k v
HM.insertWith [Int] -> [Int] -> [Int]
forall a. [a] -> [a] -> [a]
(++) Int
key [Int
i] HashMap Int [Int]
acc)
                        HashMap Int [Int]
forall k v. HashMap k v
HM.empty
                        Vector Int
rightRowRepresentations
             in ([Int] -> Vector Int)
-> HashMap Int [Int] -> HashMap Int (Vector Int)
forall v1 v2 k. (v1 -> v2) -> HashMap k v1 -> HashMap k v2
HM.map ([Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList ([Int] -> Vector Int) -> ([Int] -> [Int]) -> [Int] -> Vector Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Int] -> [Int]
forall a. [a] -> [a]
reverse) HashMap Int [Int]
accumulator

        -- Prepare Keys for Left DataFrame
        leftIndicesToGroup :: [Int]
leftIndicesToGroup =
            [Int
c | (Text
k, Int
c) <- Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toList (DataFrame -> Map Text Int
D.columnIndices DataFrame
left), Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs]

        leftRowRepresentations :: VU.Vector Int
        leftRowRepresentations :: Vector Int
leftRowRepresentations =
            Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
left)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
leftIndicesToGroup DataFrame
left)

        -- Perform the Join
        ([Vector Int]
leftIndexChunks, [Vector Int]
rightIndexChunks) =
            (Int
 -> Int
 -> ([Vector Int], [Vector Int])
 -> ([Vector Int], [Vector Int]))
-> ([Vector Int], [Vector Int])
-> Vector Int
-> ([Vector Int], [Vector Int])
forall a b. Unbox a => (Int -> a -> b -> b) -> b -> Vector a -> b
VU.ifoldr
                ( \Int
lIdx Int
key ([Vector Int]
lAcc, [Vector Int]
rAcc) ->
                    case Int -> HashMap Int (Vector Int) -> Maybe (Vector Int)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HM.lookup Int
key HashMap Int (Vector Int)
rightKeyMap of
                        Maybe (Vector Int)
Nothing -> ([Vector Int]
lAcc, [Vector Int]
rAcc)
                        Just Vector Int
rIndices ->
                            let len :: Int
len = Vector Int -> Int
forall a. Unbox a => Vector a -> Int
VU.length Vector Int
rIndices
                                -- Replicate the Left Index to match the number of Right matches
                                lChunk :: Vector Int
lChunk = Int -> Int -> Vector Int
forall a. Unbox a => Int -> a -> Vector a
VU.replicate Int
len Int
lIdx
                             in (Vector Int
lChunk Vector Int -> [Vector Int] -> [Vector Int]
forall a. a -> [a] -> [a]
: [Vector Int]
lAcc, Vector Int
rIndices Vector Int -> [Vector Int] -> [Vector Int]
forall a. a -> [a] -> [a]
: [Vector Int]
rAcc)
                )
                ([], [])
                Vector Int
leftRowRepresentations

        -- Flatten chunks
        expandedLeftIndicies :: Vector Int
expandedLeftIndicies = [Vector Int] -> Vector Int
forall a. Unbox a => [Vector a] -> Vector a
VU.concat [Vector Int]
leftIndexChunks
        expandedRightIndicies :: Vector Int
expandedRightIndicies = [Vector Int] -> Vector Int
forall a. Unbox a => [Vector a] -> Vector a
VU.concat [Vector Int]
rightIndexChunks

        resultLen :: Int
resultLen = Vector Int -> Int
forall a. Unbox a => Vector a -> Int
VU.length Vector Int
expandedLeftIndicies

        -- Construct Result DataFrames
        expandedLeft :: DataFrame
expandedLeft =
            DataFrame
left
                { columns = VB.map (D.atIndicesStable expandedLeftIndicies) (D.columns left)
                , dataframeDimensions = (resultLen, snd (D.dataframeDimensions left))
                }

        expandedRight :: DataFrame
expandedRight =
            DataFrame
right
                { columns = VB.map (D.atIndicesStable expandedRightIndicies) (D.columns right)
                , dataframeDimensions = (resultLen, snd (D.dataframeDimensions right))
                }

        leftColumns :: [Text]
leftColumns = DataFrame -> [Text]
D.columnNames DataFrame
left
        rightColumns :: [Text]
rightColumns = DataFrame -> [Text]
D.columnNames DataFrame
right

        insertIfPresent :: Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
_ Maybe Column
Nothing DataFrame
df = DataFrame
df
        insertIfPresent Text
name (Just Column
c) DataFrame
df = Text -> Column -> DataFrame -> DataFrame
D.insertColumn Text
name Column
c DataFrame
df
     in
        (Text -> DataFrame -> DataFrame)
-> [Text] -> DataFrame -> DataFrame
forall a.
(a -> DataFrame -> DataFrame) -> [a] -> DataFrame -> DataFrame
D.fold
            ( \Text
name DataFrame
df ->
                if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs
                    then DataFrame
df
                    else
                        ( if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
leftColumns
                            then Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent (Text
"Right_" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
name) (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
                            else Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
name (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
                        )
            )
            [Text]
rightColumns
            DataFrame
expandedLeft

{- | Performs a left join on two dataframes using the specified key columns.
Returns all rows from the left dataframe, with matching rows from the right dataframe.
Non-matching rows will have Nothing/null values for columns from the right dataframe.

==== __Example__
@
ghci> df = D.fromNamedColumns [("key", D.fromList ["K0", "K1", "K2", "K3"]), ("A", D.fromList ["A0", "A1", "A2", "A3"])]
ghci> other = D.fromNamedColumns [("key", D.fromList ["K0", "K1", "K2"]), ("B", D.fromList ["B0", "B1", "B2"])]
ghci> D.leftJoin ["key"] df other

------------------------
 key  |  A  |     B
------|-----|----------
 Text | Text| Maybe Text
------|-----|----------
 K0   | A0  | Just "B0"
 K1   | A1  | Just "B1"
 K2   | A2  | Just "B2"
 K3   | A3  | Nothing

@
-}
leftJoin ::
    [T.Text] -> DataFrame -> DataFrame -> DataFrame
leftJoin :: [Text] -> DataFrame -> DataFrame -> DataFrame
leftJoin [Text]
cs DataFrame
right DataFrame
left =
    let
        leftIndicesToGroup :: [Int]
leftIndicesToGroup = Map Text Int -> [Int]
forall k a. Map k a -> [a]
M.elems (Map Text Int -> [Int]) -> Map Text Int -> [Int]
forall a b. (a -> b) -> a -> b
$ (Text -> Int -> Bool) -> Map Text Int -> Map Text Int
forall k a. (k -> a -> Bool) -> Map k a -> Map k a
M.filterWithKey (\Text
k Int
_ -> Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs) (DataFrame -> Map Text Int
D.columnIndices DataFrame
left)
        leftRowRepresentations :: Vector Int
leftRowRepresentations = Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
left)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
leftIndicesToGroup DataFrame
left)
        rightIndicesToGroup :: [Int]
rightIndicesToGroup = Map Text Int -> [Int]
forall k a. Map k a -> [a]
M.elems (Map Text Int -> [Int]) -> Map Text Int -> [Int]
forall a b. (a -> b) -> a -> b
$ (Text -> Int -> Bool) -> Map Text Int -> Map Text Int
forall k a. (k -> a -> Bool) -> Map k a -> Map k a
M.filterWithKey (\Text
k Int
_ -> Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs) (DataFrame -> Map Text Int
D.columnIndices DataFrame
right)
        rightRowRepresentations :: Vector Int
rightRowRepresentations = Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
right)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
rightIndicesToGroup DataFrame
right)
        rightKeyCountsAndIndices :: Map Int [Int]
rightKeyCountsAndIndices =
            ((Int, Int) -> Map Int [Int] -> Map Int [Int])
-> Map Int [Int] -> Vector (Int, Int) -> Map Int [Int]
forall a b. Unbox a => (a -> b -> b) -> b -> Vector a -> b
VU.foldr
                (\(Int
i, Int
v) Map Int [Int]
acc -> ([Int] -> [Int] -> [Int])
-> Int -> [Int] -> Map Int [Int] -> Map Int [Int]
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
M.insertWith [Int] -> [Int] -> [Int]
forall a. [a] -> [a] -> [a]
(++) Int
v [Int
i] Map Int [Int]
acc)
                Map Int [Int]
forall k a. Map k a
M.empty
                (Vector Int -> Vector (Int, Int)
forall a. Unbox a => Vector a -> Vector (Int, a)
VU.indexed Vector Int
rightRowRepresentations)
        rightKeyCountsAndIndicesVec :: Map Int (Vector Int)
rightKeyCountsAndIndicesVec = ([Int] -> Vector Int) -> Map Int [Int] -> Map Int (Vector Int)
forall a b k. (a -> b) -> Map k a -> Map k b
M.map [Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList Map Int [Int]
rightKeyCountsAndIndices
        leftRowCount :: Int
leftRowCount = (Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
left)
        pairs :: [(Int, Maybe Int)]
pairs =
            [ (Int
i, Maybe Int
maybeRight)
            | Int
i <- [Int
0 .. Int
leftRowCount Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1]
            , Maybe Int
maybeRight <-
                case Int -> Map Int (Vector Int) -> Maybe (Vector Int)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup (Vector Int
leftRowRepresentations Vector Int -> Int -> Int
forall a. Unbox a => Vector a -> Int -> a
VU.! Int
i) Map Int (Vector Int)
rightKeyCountsAndIndicesVec of
                    Maybe (Vector Int)
Nothing -> [Maybe Int
forall a. Maybe a
Nothing]
                    Just Vector Int
rVec -> (Int -> Maybe Int) -> [Int] -> [Maybe Int]
forall a b. (a -> b) -> [a] -> [b]
map Int -> Maybe Int
forall a. a -> Maybe a
Just (Vector Int -> [Int]
forall a. Unbox a => Vector a -> [a]
VU.toList Vector Int
rVec)
            ]
        expandedLeftIndicies :: Vector Int
expandedLeftIndicies = [Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList (((Int, Maybe Int) -> Int) -> [(Int, Maybe Int)] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map (Int, Maybe Int) -> Int
forall a b. (a, b) -> a
fst [(Int, Maybe Int)]
pairs)
        expandedRightIndicies :: Vector (Maybe Int)
expandedRightIndicies = [Maybe Int] -> Vector (Maybe Int)
forall a. [a] -> Vector a
VB.fromList (((Int, Maybe Int) -> Maybe Int)
-> [(Int, Maybe Int)] -> [Maybe Int]
forall a b. (a -> b) -> [a] -> [b]
map (Int, Maybe Int) -> Maybe Int
forall a b. (a, b) -> b
snd [(Int, Maybe Int)]
pairs)
        expandedLeft :: DataFrame
expandedLeft =
            DataFrame
left
                { columns = VB.map (D.atIndicesStable expandedLeftIndicies) (D.columns left)
                , dataframeDimensions =
                    (VU.length expandedLeftIndicies, snd (D.dataframeDimensions left))
                }
        expandedRight :: DataFrame
expandedRight =
            DataFrame
right
                { columns = VB.map (D.atIndicesWithNulls expandedRightIndicies) (D.columns right)
                , dataframeDimensions =
                    (VB.length expandedRightIndicies, snd (D.dataframeDimensions right))
                }
        leftColumns :: [Text]
leftColumns = DataFrame -> [Text]
D.columnNames DataFrame
left
        rightColumns :: [Text]
rightColumns = DataFrame -> [Text]
D.columnNames DataFrame
right
        initDf :: DataFrame
initDf = DataFrame
expandedLeft
        insertIfPresent :: Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
_ Maybe Column
Nothing DataFrame
df = DataFrame
df
        insertIfPresent Text
name (Just Column
c) DataFrame
df = Text -> Column -> DataFrame -> DataFrame
D.insertColumn Text
name Column
c DataFrame
df
     in
        (Text -> DataFrame -> DataFrame)
-> [Text] -> DataFrame -> DataFrame
forall a.
(a -> DataFrame -> DataFrame) -> [a] -> DataFrame -> DataFrame
D.fold
            ( \Text
name DataFrame
df ->
                if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs
                    then DataFrame
df
                    else
                        ( if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
leftColumns
                            then Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent (Text
"Right_" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
name) (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
                            else Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
name (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
                        )
            )
            [Text]
rightColumns
            DataFrame
initDf

{- | Performs a right join on two dataframes using the specified key columns.
Returns all rows from the right dataframe, with matching rows from the left dataframe.
Non-matching rows will have Nothing/null values for columns from the left dataframe.

==== __Example__
@
ghci> df = D.fromNamedColumns [("key", D.fromList ["K0", "K1", "K2", "K3"]), ("A", D.fromList ["A0", "A1", "A2", "A3"])]
ghci> other = D.fromNamedColumns [("key", D.fromList ["K0", "K1"]), ("B", D.fromList ["B0", "B1"])]
ghci> D.rightJoin ["key"] df other

-----------------
 key  |  A  |  B
------|-----|----
 Text | Text| Text
------|-----|----
 K0   | A0  | B0
 K1   | A1  | B1

@
-}
rightJoin ::
    [T.Text] -> DataFrame -> DataFrame -> DataFrame
rightJoin :: [Text] -> DataFrame -> DataFrame -> DataFrame
rightJoin [Text]
cs DataFrame
right DataFrame
left =
    let
        leftIndicesToGroup :: [Int]
leftIndicesToGroup = Map Text Int -> [Int]
forall k a. Map k a -> [a]
M.elems (Map Text Int -> [Int]) -> Map Text Int -> [Int]
forall a b. (a -> b) -> a -> b
$ (Text -> Int -> Bool) -> Map Text Int -> Map Text Int
forall k a. (k -> a -> Bool) -> Map k a -> Map k a
M.filterWithKey (\Text
k Int
_ -> Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs) (DataFrame -> Map Text Int
D.columnIndices DataFrame
left)
        leftRowRepresentations :: Vector Int
leftRowRepresentations = Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
left)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
leftIndicesToGroup DataFrame
left)
        leftKeyCountsAndIndicesVec :: Map Int (Vector Int)
leftKeyCountsAndIndicesVec =
            ([Int] -> Vector Int) -> Map Int [Int] -> Map Int (Vector Int)
forall a b k. (a -> b) -> Map k a -> Map k b
M.map [Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList (Map Int [Int] -> Map Int (Vector Int))
-> Map Int [Int] -> Map Int (Vector Int)
forall a b. (a -> b) -> a -> b
$
                ((Int, Int) -> Map Int [Int] -> Map Int [Int])
-> Map Int [Int] -> Vector (Int, Int) -> Map Int [Int]
forall a b. Unbox a => (a -> b -> b) -> b -> Vector a -> b
VU.foldr
                    (\(Int
i, Int
v) Map Int [Int]
acc -> ([Int] -> [Int] -> [Int])
-> Int -> [Int] -> Map Int [Int] -> Map Int [Int]
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
M.insertWith [Int] -> [Int] -> [Int]
forall a. [a] -> [a] -> [a]
(++) Int
v [Int
i] Map Int [Int]
acc)
                    Map Int [Int]
forall k a. Map k a
M.empty
                    (Vector Int -> Vector (Int, Int)
forall a. Unbox a => Vector a -> Vector (Int, a)
VU.indexed Vector Int
leftRowRepresentations)
        rightIndicesToGroup :: [Int]
rightIndicesToGroup = Map Text Int -> [Int]
forall k a. Map k a -> [a]
M.elems (Map Text Int -> [Int]) -> Map Text Int -> [Int]
forall a b. (a -> b) -> a -> b
$ (Text -> Int -> Bool) -> Map Text Int -> Map Text Int
forall k a. (k -> a -> Bool) -> Map k a -> Map k a
M.filterWithKey (\Text
k Int
_ -> Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs) (DataFrame -> Map Text Int
D.columnIndices DataFrame
right)
        rightRowRepresentations :: Vector Int
rightRowRepresentations = Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
right)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
rightIndicesToGroup DataFrame
right)
        rightRowCount :: Int
rightRowCount = (Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
right)
        pairs :: [(Maybe Int, Int)]
pairs =
            [ (Maybe Int
maybeLeft, Int
j)
            | Int
j <- [Int
0 .. Int
rightRowCount Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1]
            , Maybe Int
maybeLeft <-
                case Int -> Map Int (Vector Int) -> Maybe (Vector Int)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup (Vector Int
rightRowRepresentations Vector Int -> Int -> Int
forall a. Unbox a => Vector a -> Int -> a
VU.! Int
j) Map Int (Vector Int)
leftKeyCountsAndIndicesVec of
                    Maybe (Vector Int)
Nothing -> [Maybe Int
forall a. Maybe a
Nothing]
                    Just Vector Int
lVec -> (Int -> Maybe Int) -> [Int] -> [Maybe Int]
forall a b. (a -> b) -> [a] -> [b]
map Int -> Maybe Int
forall a. a -> Maybe a
Just (Vector Int -> [Int]
forall a. Unbox a => Vector a -> [a]
VU.toList Vector Int
lVec)
            ]
        expandedLeftIndicies :: Vector (Maybe Int)
expandedLeftIndicies = [Maybe Int] -> Vector (Maybe Int)
forall a. [a] -> Vector a
VB.fromList (((Maybe Int, Int) -> Maybe Int)
-> [(Maybe Int, Int)] -> [Maybe Int]
forall a b. (a -> b) -> [a] -> [b]
map (Maybe Int, Int) -> Maybe Int
forall a b. (a, b) -> a
fst [(Maybe Int, Int)]
pairs)
        expandedRightIndicies :: Vector Int
expandedRightIndicies = [Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList (((Maybe Int, Int) -> Int) -> [(Maybe Int, Int)] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map (Maybe Int, Int) -> Int
forall a b. (a, b) -> b
snd [(Maybe Int, Int)]
pairs)
        expandedLeft :: DataFrame
expandedLeft =
            DataFrame
left
                { columns = VB.map (D.atIndicesWithNulls expandedLeftIndicies) (D.columns left)
                , dataframeDimensions =
                    (VB.length expandedLeftIndicies, snd (D.dataframeDimensions left))
                }
        expandedRight :: DataFrame
expandedRight =
            DataFrame
right
                { columns = VB.map (D.atIndicesStable expandedRightIndicies) (D.columns right)
                , dataframeDimensions =
                    (VU.length expandedRightIndicies, snd (D.dataframeDimensions right))
                }
        leftColumns :: [Text]
leftColumns = DataFrame -> [Text]
D.columnNames DataFrame
left
        rightColumns :: [Text]
rightColumns = DataFrame -> [Text]
D.columnNames DataFrame
right
        initDf :: DataFrame
initDf = DataFrame
expandedLeft
        insertIfPresent :: Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
_ Maybe Column
Nothing DataFrame
df = DataFrame
df
        insertIfPresent Text
name (Just Column
c) DataFrame
df = Text -> Column -> DataFrame -> DataFrame
D.insertColumn Text
name Column
c DataFrame
df
     in
        (Text -> DataFrame -> DataFrame)
-> [Text] -> DataFrame -> DataFrame
forall a.
(a -> DataFrame -> DataFrame) -> [a] -> DataFrame -> DataFrame
D.fold
            ( \Text
name DataFrame
df ->
                if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs
                    then DataFrame
df
                    else
                        ( if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
leftColumns
                            then Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent (Text
"Right_" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
name) (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
                            else Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
name (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
                        )
            )
            [Text]
rightColumns
            DataFrame
initDf

fullOuterJoin ::
    [T.Text] -> DataFrame -> DataFrame -> DataFrame
fullOuterJoin :: [Text] -> DataFrame -> DataFrame -> DataFrame
fullOuterJoin [Text]
cs DataFrame
right DataFrame
left =
    let
        leftIndicesToGroup :: [Int]
leftIndicesToGroup = Map Text Int -> [Int]
forall k a. Map k a -> [a]
M.elems (Map Text Int -> [Int]) -> Map Text Int -> [Int]
forall a b. (a -> b) -> a -> b
$ (Text -> Int -> Bool) -> Map Text Int -> Map Text Int
forall k a. (k -> a -> Bool) -> Map k a -> Map k a
M.filterWithKey (\Text
k Int
_ -> Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs) (DataFrame -> Map Text Int
D.columnIndices DataFrame
left)
        leftRowRepresentations :: Vector Int
leftRowRepresentations = Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
left)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
leftIndicesToGroup DataFrame
left)
        leftKeyCountsAndIndices :: Map Int [Int]
leftKeyCountsAndIndices =
            ((Int, Int) -> Map Int [Int] -> Map Int [Int])
-> Map Int [Int] -> Vector (Int, Int) -> Map Int [Int]
forall a b. Unbox a => (a -> b -> b) -> b -> Vector a -> b
VU.foldr
                (\(Int
i, Int
v) Map Int [Int]
acc -> ([Int] -> [Int] -> [Int])
-> Int -> [Int] -> Map Int [Int] -> Map Int [Int]
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
M.insertWith [Int] -> [Int] -> [Int]
forall a. [a] -> [a] -> [a]
(++) Int
v [Int
i] Map Int [Int]
acc)
                Map Int [Int]
forall k a. Map k a
M.empty
                (Vector Int -> Vector (Int, Int)
forall a. Unbox a => Vector a -> Vector (Int, a)
VU.indexed Vector Int
leftRowRepresentations)
        leftKeyCountsAndIndicesVec :: Map Int (Vector Int)
leftKeyCountsAndIndicesVec = ([Int] -> Vector Int) -> Map Int [Int] -> Map Int (Vector Int)
forall a b k. (a -> b) -> Map k a -> Map k b
M.map [Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList Map Int [Int]
leftKeyCountsAndIndices
        rightIndicesToGroup :: [Int]
rightIndicesToGroup = Map Text Int -> [Int]
forall k a. Map k a -> [a]
M.elems (Map Text Int -> [Int]) -> Map Text Int -> [Int]
forall a b. (a -> b) -> a -> b
$ (Text -> Int -> Bool) -> Map Text Int -> Map Text Int
forall k a. (k -> a -> Bool) -> Map k a -> Map k a
M.filterWithKey (\Text
k Int
_ -> Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs) (DataFrame -> Map Text Int
D.columnIndices DataFrame
right)
        rightRowRepresentations :: Vector Int
rightRowRepresentations = Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
right)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
rightIndicesToGroup DataFrame
right)
        rightKeyCountsAndIndices :: Map Int [Int]
rightKeyCountsAndIndices =
            ((Int, Int) -> Map Int [Int] -> Map Int [Int])
-> Map Int [Int] -> Vector (Int, Int) -> Map Int [Int]
forall a b. Unbox a => (a -> b -> b) -> b -> Vector a -> b
VU.foldr
                (\(Int
i, Int
v) Map Int [Int]
acc -> ([Int] -> [Int] -> [Int])
-> Int -> [Int] -> Map Int [Int] -> Map Int [Int]
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
M.insertWith [Int] -> [Int] -> [Int]
forall a. [a] -> [a] -> [a]
(++) Int
v [Int
i] Map Int [Int]
acc)
                Map Int [Int]
forall k a. Map k a
M.empty
                (Vector Int -> Vector (Int, Int)
forall a. Unbox a => Vector a -> Vector (Int, a)
VU.indexed Vector Int
rightRowRepresentations)
        rightKeyCountsAndIndicesVec :: Map Int (Vector Int)
rightKeyCountsAndIndicesVec = ([Int] -> Vector Int) -> Map Int [Int] -> Map Int (Vector Int)
forall a b k. (a -> b) -> Map k a -> Map k b
M.map [Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList Map Int [Int]
rightKeyCountsAndIndices
        matchedPairs :: [(Maybe Int, Maybe Int)]
matchedPairs =
            ((Vector Int, Vector Int) -> [(Maybe Int, Maybe Int)])
-> [(Vector Int, Vector Int)] -> [(Maybe Int, Maybe Int)]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap
                ( \(Vector Int
lVec, Vector Int
rVec) ->
                    [ (Int -> Maybe Int
forall a. a -> Maybe a
Just Int
lIdx, Int -> Maybe Int
forall a. a -> Maybe a
Just Int
rIdx)
                    | Int
lIdx <- Vector Int -> [Int]
forall a. Unbox a => Vector a -> [a]
VU.toList Vector Int
lVec
                    , Int
rIdx <- Vector Int -> [Int]
forall a. Unbox a => Vector a -> [a]
VU.toList Vector Int
rVec
                    ]
                )
                ( Map Int (Vector Int, Vector Int) -> [(Vector Int, Vector Int)]
forall k a. Map k a -> [a]
M.elems
                    ((Vector Int -> Vector Int -> (Vector Int, Vector Int))
-> Map Int (Vector Int)
-> Map Int (Vector Int)
-> Map Int (Vector Int, Vector Int)
forall k a b c.
Ord k =>
(a -> b -> c) -> Map k a -> Map k b -> Map k c
M.intersectionWith (,) Map Int (Vector Int)
leftKeyCountsAndIndicesVec Map Int (Vector Int)
rightKeyCountsAndIndicesVec)
                )
        leftOnlyPairs :: [(Maybe Int, Maybe Int)]
leftOnlyPairs =
            (Vector Int -> [(Maybe Int, Maybe Int)])
-> [Vector Int] -> [(Maybe Int, Maybe Int)]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap
                ((Int -> (Maybe Int, Maybe Int))
-> [Int] -> [(Maybe Int, Maybe Int)]
forall a b. (a -> b) -> [a] -> [b]
map (\Int
lIdx -> (Int -> Maybe Int
forall a. a -> Maybe a
Just Int
lIdx, Maybe Int
forall a. Maybe a
Nothing)) ([Int] -> [(Maybe Int, Maybe Int)])
-> (Vector Int -> [Int]) -> Vector Int -> [(Maybe Int, Maybe Int)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Vector Int -> [Int]
forall a. Unbox a => Vector a -> [a]
VU.toList)
                (Map Int (Vector Int) -> [Vector Int]
forall k a. Map k a -> [a]
M.elems (Map Int (Vector Int)
leftKeyCountsAndIndicesVec Map Int (Vector Int)
-> Map Int (Vector Int) -> Map Int (Vector Int)
forall k a b. Ord k => Map k a -> Map k b -> Map k a
`M.difference` Map Int (Vector Int)
rightKeyCountsAndIndicesVec))
        rightOnlyPairs :: [(Maybe Int, Maybe Int)]
rightOnlyPairs =
            (Vector Int -> [(Maybe Int, Maybe Int)])
-> [Vector Int] -> [(Maybe Int, Maybe Int)]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap
                ((Int -> (Maybe Int, Maybe Int))
-> [Int] -> [(Maybe Int, Maybe Int)]
forall a b. (a -> b) -> [a] -> [b]
map (\Int
rIdx -> (Maybe Int
forall a. Maybe a
Nothing, Int -> Maybe Int
forall a. a -> Maybe a
Just Int
rIdx)) ([Int] -> [(Maybe Int, Maybe Int)])
-> (Vector Int -> [Int]) -> Vector Int -> [(Maybe Int, Maybe Int)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Vector Int -> [Int]
forall a. Unbox a => Vector a -> [a]
VU.toList)
                (Map Int (Vector Int) -> [Vector Int]
forall k a. Map k a -> [a]
M.elems (Map Int (Vector Int)
rightKeyCountsAndIndicesVec Map Int (Vector Int)
-> Map Int (Vector Int) -> Map Int (Vector Int)
forall k a b. Ord k => Map k a -> Map k b -> Map k a
`M.difference` Map Int (Vector Int)
leftKeyCountsAndIndicesVec))
        pairs :: [(Maybe Int, Maybe Int)]
pairs = [(Maybe Int, Maybe Int)]
matchedPairs [(Maybe Int, Maybe Int)]
-> [(Maybe Int, Maybe Int)] -> [(Maybe Int, Maybe Int)]
forall a. [a] -> [a] -> [a]
++ [(Maybe Int, Maybe Int)]
leftOnlyPairs [(Maybe Int, Maybe Int)]
-> [(Maybe Int, Maybe Int)] -> [(Maybe Int, Maybe Int)]
forall a. [a] -> [a] -> [a]
++ [(Maybe Int, Maybe Int)]
rightOnlyPairs
        expandedLeftIndicies :: Vector (Maybe Int)
expandedLeftIndicies = [Maybe Int] -> Vector (Maybe Int)
forall a. [a] -> Vector a
VB.fromList (((Maybe Int, Maybe Int) -> Maybe Int)
-> [(Maybe Int, Maybe Int)] -> [Maybe Int]
forall a b. (a -> b) -> [a] -> [b]
map (Maybe Int, Maybe Int) -> Maybe Int
forall a b. (a, b) -> a
fst [(Maybe Int, Maybe Int)]
pairs)
        expandedRightIndicies :: Vector (Maybe Int)
expandedRightIndicies = [Maybe Int] -> Vector (Maybe Int)
forall a. [a] -> Vector a
VB.fromList (((Maybe Int, Maybe Int) -> Maybe Int)
-> [(Maybe Int, Maybe Int)] -> [Maybe Int]
forall a b. (a -> b) -> [a] -> [b]
map (Maybe Int, Maybe Int) -> Maybe Int
forall a b. (a, b) -> b
snd [(Maybe Int, Maybe Int)]
pairs)
        expandedLeft :: DataFrame
expandedLeft =
            DataFrame
left
                { columns = VB.map (D.atIndicesWithNulls expandedLeftIndicies) (D.columns left)
                , dataframeDimensions =
                    (VB.length expandedLeftIndicies, snd (D.dataframeDimensions left))
                }
        expandedRight :: DataFrame
expandedRight =
            DataFrame
right
                { columns = VB.map (D.atIndicesWithNulls expandedRightIndicies) (D.columns right)
                , dataframeDimensions =
                    (VB.length expandedRightIndicies, snd (D.dataframeDimensions right))
                }
        leftColumns :: [Text]
leftColumns = DataFrame -> [Text]
D.columnNames DataFrame
left
        rightColumns :: [Text]
rightColumns = DataFrame -> [Text]
D.columnNames DataFrame
right
        initDf :: DataFrame
initDf = DataFrame
expandedLeft
        insertIfPresent :: Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
_ Maybe Column
Nothing DataFrame
df = DataFrame
df
        insertIfPresent Text
name (Just Column
c) DataFrame
df = Text -> Column -> DataFrame -> DataFrame
D.insertColumn Text
name Column
c DataFrame
df
     in
        (Text -> DataFrame -> DataFrame)
-> [Text] -> DataFrame -> DataFrame
forall a.
(a -> DataFrame -> DataFrame) -> [a] -> DataFrame -> DataFrame
D.fold
            ( \Text
name DataFrame
df ->
                if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs
                    then case (Text -> DataFrame -> Column
D.unsafeGetColumn Text
name DataFrame
expandedRight, Text -> DataFrame -> Column
D.unsafeGetColumn Text
name DataFrame
expandedLeft) of
                        ( OptionalColumn (Vector (Maybe a)
left :: VB.Vector (Maybe a))
                            , OptionalColumn (Vector (Maybe a)
right :: VB.Vector (Maybe b))
                            ) -> case TypeRep a -> TypeRep a -> Maybe (a :~: a)
forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @b) of
                                Maybe (a :~: a)
Nothing -> [Char] -> DataFrame
forall a. HasCallStack => [Char] -> a
error [Char]
"Cannot join columns of different types"
                                Just a :~: a
Refl ->
                                    Text -> Vector a -> DataFrame -> DataFrame
forall a (t :: * -> *).
(Columnable a, Foldable t) =>
Text -> t a -> DataFrame -> DataFrame
D.insert
                                        Text
name
                                        ((Maybe a -> a) -> Vector (Maybe a) -> Vector a
forall a b. (a -> b) -> Vector a -> Vector b
VB.map (a -> Maybe a -> a
forall a. a -> Maybe a -> a
fromMaybe a
forall a. HasCallStack => a
undefined) ((Maybe a -> Maybe a -> Maybe a)
-> Vector (Maybe a) -> Vector (Maybe a) -> Vector (Maybe a)
forall a b c. (a -> b -> c) -> Vector a -> Vector b -> Vector c
VB.zipWith (\Maybe a
l Maybe a
r -> [Maybe a] -> Maybe a
forall (t :: * -> *) (f :: * -> *) a.
(Foldable t, Alternative f) =>
t (f a) -> f a
asum [Maybe a
l, Maybe a
r]) Vector (Maybe a)
left Vector (Maybe a)
Vector (Maybe a)
right))
                                        DataFrame
df
                        (Column, Column)
_ -> [Char] -> DataFrame
forall a. HasCallStack => [Char] -> a
error [Char]
"Join should have optional keys."
                    else
                        ( if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
leftColumns
                            then Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent (Text
"Right_" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
name) (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
                            else Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
name (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
                        )
            ) -- ???
            [Text]
rightColumns
            DataFrame
initDf