{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
module DataFrame.Operations.Join where
import Control.Applicative (asum)
import qualified Data.HashMap.Strict as HM
import qualified Data.Map.Strict as M
import Data.Maybe (fromMaybe)
import qualified Data.Text as T
import Data.Type.Equality (TestEquality (..))
import qualified Data.Vector as VB
import qualified Data.Vector.Unboxed as VU
import DataFrame.Internal.Column as D
import DataFrame.Internal.DataFrame as D
import DataFrame.Operations.Aggregation as D
import DataFrame.Operations.Core as D
import Type.Reflection
data JoinType
= INNER
| LEFT
| RIGHT
| FULL_OUTER
join ::
JoinType ->
[T.Text] ->
DataFrame ->
DataFrame ->
DataFrame
join :: JoinType -> [Text] -> DataFrame -> DataFrame -> DataFrame
join JoinType
INNER [Text]
xs DataFrame
right = [Text] -> DataFrame -> DataFrame -> DataFrame
innerJoin [Text]
xs DataFrame
right
join JoinType
LEFT [Text]
xs DataFrame
right = [Text] -> DataFrame -> DataFrame -> DataFrame
leftJoin [Text]
xs DataFrame
right
join JoinType
RIGHT [Text]
xs DataFrame
right = [Text] -> DataFrame -> DataFrame -> DataFrame
rightJoin [Text]
xs DataFrame
right
join JoinType
FULL_OUTER [Text]
xs DataFrame
right = [Text] -> DataFrame -> DataFrame -> DataFrame
fullOuterJoin [Text]
xs DataFrame
right
innerJoin :: [T.Text] -> DataFrame -> DataFrame -> DataFrame
innerJoin :: [Text] -> DataFrame -> DataFrame -> DataFrame
innerJoin [Text]
cs DataFrame
right DataFrame
left =
let
rightIndicesToGroup :: [Int]
rightIndicesToGroup =
[Int
c | (Text
k, Int
c) <- Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toList (DataFrame -> Map Text Int
D.columnIndices DataFrame
right), Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs]
rightRowRepresentations :: VU.Vector Int
rightRowRepresentations :: Vector Int
rightRowRepresentations =
Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
right)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
rightIndicesToGroup DataFrame
right)
rightKeyMap :: HM.HashMap Int (VU.Vector Int)
rightKeyMap :: HashMap Int (Vector Int)
rightKeyMap =
let accumulator :: HashMap Int [Int]
accumulator =
(Int -> Int -> HashMap Int [Int] -> HashMap Int [Int])
-> HashMap Int [Int] -> Vector Int -> HashMap Int [Int]
forall a b. Unbox a => (Int -> a -> b -> b) -> b -> Vector a -> b
VU.ifoldr
(\Int
i Int
key HashMap Int [Int]
acc -> ([Int] -> [Int] -> [Int])
-> Int -> [Int] -> HashMap Int [Int] -> HashMap Int [Int]
forall k v.
(Eq k, Hashable k) =>
(v -> v -> v) -> k -> v -> HashMap k v -> HashMap k v
HM.insertWith [Int] -> [Int] -> [Int]
forall a. [a] -> [a] -> [a]
(++) Int
key [Int
i] HashMap Int [Int]
acc)
HashMap Int [Int]
forall k v. HashMap k v
HM.empty
Vector Int
rightRowRepresentations
in ([Int] -> Vector Int)
-> HashMap Int [Int] -> HashMap Int (Vector Int)
forall v1 v2 k. (v1 -> v2) -> HashMap k v1 -> HashMap k v2
HM.map ([Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList ([Int] -> Vector Int) -> ([Int] -> [Int]) -> [Int] -> Vector Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Int] -> [Int]
forall a. [a] -> [a]
reverse) HashMap Int [Int]
accumulator
leftIndicesToGroup :: [Int]
leftIndicesToGroup =
[Int
c | (Text
k, Int
c) <- Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toList (DataFrame -> Map Text Int
D.columnIndices DataFrame
left), Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs]
leftRowRepresentations :: VU.Vector Int
leftRowRepresentations :: Vector Int
leftRowRepresentations =
Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
left)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
leftIndicesToGroup DataFrame
left)
([Vector Int]
leftIndexChunks, [Vector Int]
rightIndexChunks) =
(Int
-> Int
-> ([Vector Int], [Vector Int])
-> ([Vector Int], [Vector Int]))
-> ([Vector Int], [Vector Int])
-> Vector Int
-> ([Vector Int], [Vector Int])
forall a b. Unbox a => (Int -> a -> b -> b) -> b -> Vector a -> b
VU.ifoldr
( \Int
lIdx Int
key ([Vector Int]
lAcc, [Vector Int]
rAcc) ->
case Int -> HashMap Int (Vector Int) -> Maybe (Vector Int)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HM.lookup Int
key HashMap Int (Vector Int)
rightKeyMap of
Maybe (Vector Int)
Nothing -> ([Vector Int]
lAcc, [Vector Int]
rAcc)
Just Vector Int
rIndices ->
let len :: Int
len = Vector Int -> Int
forall a. Unbox a => Vector a -> Int
VU.length Vector Int
rIndices
lChunk :: Vector Int
lChunk = Int -> Int -> Vector Int
forall a. Unbox a => Int -> a -> Vector a
VU.replicate Int
len Int
lIdx
in (Vector Int
lChunk Vector Int -> [Vector Int] -> [Vector Int]
forall a. a -> [a] -> [a]
: [Vector Int]
lAcc, Vector Int
rIndices Vector Int -> [Vector Int] -> [Vector Int]
forall a. a -> [a] -> [a]
: [Vector Int]
rAcc)
)
([], [])
Vector Int
leftRowRepresentations
expandedLeftIndicies :: Vector Int
expandedLeftIndicies = [Vector Int] -> Vector Int
forall a. Unbox a => [Vector a] -> Vector a
VU.concat [Vector Int]
leftIndexChunks
expandedRightIndicies :: Vector Int
expandedRightIndicies = [Vector Int] -> Vector Int
forall a. Unbox a => [Vector a] -> Vector a
VU.concat [Vector Int]
rightIndexChunks
resultLen :: Int
resultLen = Vector Int -> Int
forall a. Unbox a => Vector a -> Int
VU.length Vector Int
expandedLeftIndicies
expandedLeft :: DataFrame
expandedLeft =
DataFrame
left
{ columns = VB.map (D.atIndicesStable expandedLeftIndicies) (D.columns left)
, dataframeDimensions = (resultLen, snd (D.dataframeDimensions left))
}
expandedRight :: DataFrame
expandedRight =
DataFrame
right
{ columns = VB.map (D.atIndicesStable expandedRightIndicies) (D.columns right)
, dataframeDimensions = (resultLen, snd (D.dataframeDimensions right))
}
leftColumns :: [Text]
leftColumns = DataFrame -> [Text]
D.columnNames DataFrame
left
rightColumns :: [Text]
rightColumns = DataFrame -> [Text]
D.columnNames DataFrame
right
insertIfPresent :: Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
_ Maybe Column
Nothing DataFrame
df = DataFrame
df
insertIfPresent Text
name (Just Column
c) DataFrame
df = Text -> Column -> DataFrame -> DataFrame
D.insertColumn Text
name Column
c DataFrame
df
in
(Text -> DataFrame -> DataFrame)
-> [Text] -> DataFrame -> DataFrame
forall a.
(a -> DataFrame -> DataFrame) -> [a] -> DataFrame -> DataFrame
D.fold
( \Text
name DataFrame
df ->
if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs
then DataFrame
df
else
( if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
leftColumns
then Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent (Text
"Right_" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
name) (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
else Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
name (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
)
)
[Text]
rightColumns
DataFrame
expandedLeft
leftJoin ::
[T.Text] -> DataFrame -> DataFrame -> DataFrame
leftJoin :: [Text] -> DataFrame -> DataFrame -> DataFrame
leftJoin [Text]
cs DataFrame
right DataFrame
left =
let
leftIndicesToGroup :: [Int]
leftIndicesToGroup = Map Text Int -> [Int]
forall k a. Map k a -> [a]
M.elems (Map Text Int -> [Int]) -> Map Text Int -> [Int]
forall a b. (a -> b) -> a -> b
$ (Text -> Int -> Bool) -> Map Text Int -> Map Text Int
forall k a. (k -> a -> Bool) -> Map k a -> Map k a
M.filterWithKey (\Text
k Int
_ -> Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs) (DataFrame -> Map Text Int
D.columnIndices DataFrame
left)
leftRowRepresentations :: Vector Int
leftRowRepresentations = Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
left)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
leftIndicesToGroup DataFrame
left)
rightIndicesToGroup :: [Int]
rightIndicesToGroup = Map Text Int -> [Int]
forall k a. Map k a -> [a]
M.elems (Map Text Int -> [Int]) -> Map Text Int -> [Int]
forall a b. (a -> b) -> a -> b
$ (Text -> Int -> Bool) -> Map Text Int -> Map Text Int
forall k a. (k -> a -> Bool) -> Map k a -> Map k a
M.filterWithKey (\Text
k Int
_ -> Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs) (DataFrame -> Map Text Int
D.columnIndices DataFrame
right)
rightRowRepresentations :: Vector Int
rightRowRepresentations = Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
right)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
rightIndicesToGroup DataFrame
right)
rightKeyCountsAndIndices :: Map Int [Int]
rightKeyCountsAndIndices =
((Int, Int) -> Map Int [Int] -> Map Int [Int])
-> Map Int [Int] -> Vector (Int, Int) -> Map Int [Int]
forall a b. Unbox a => (a -> b -> b) -> b -> Vector a -> b
VU.foldr
(\(Int
i, Int
v) Map Int [Int]
acc -> ([Int] -> [Int] -> [Int])
-> Int -> [Int] -> Map Int [Int] -> Map Int [Int]
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
M.insertWith [Int] -> [Int] -> [Int]
forall a. [a] -> [a] -> [a]
(++) Int
v [Int
i] Map Int [Int]
acc)
Map Int [Int]
forall k a. Map k a
M.empty
(Vector Int -> Vector (Int, Int)
forall a. Unbox a => Vector a -> Vector (Int, a)
VU.indexed Vector Int
rightRowRepresentations)
rightKeyCountsAndIndicesVec :: Map Int (Vector Int)
rightKeyCountsAndIndicesVec = ([Int] -> Vector Int) -> Map Int [Int] -> Map Int (Vector Int)
forall a b k. (a -> b) -> Map k a -> Map k b
M.map [Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList Map Int [Int]
rightKeyCountsAndIndices
leftRowCount :: Int
leftRowCount = (Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
left)
pairs :: [(Int, Maybe Int)]
pairs =
[ (Int
i, Maybe Int
maybeRight)
| Int
i <- [Int
0 .. Int
leftRowCount Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1]
, Maybe Int
maybeRight <-
case Int -> Map Int (Vector Int) -> Maybe (Vector Int)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup (Vector Int
leftRowRepresentations Vector Int -> Int -> Int
forall a. Unbox a => Vector a -> Int -> a
VU.! Int
i) Map Int (Vector Int)
rightKeyCountsAndIndicesVec of
Maybe (Vector Int)
Nothing -> [Maybe Int
forall a. Maybe a
Nothing]
Just Vector Int
rVec -> (Int -> Maybe Int) -> [Int] -> [Maybe Int]
forall a b. (a -> b) -> [a] -> [b]
map Int -> Maybe Int
forall a. a -> Maybe a
Just (Vector Int -> [Int]
forall a. Unbox a => Vector a -> [a]
VU.toList Vector Int
rVec)
]
expandedLeftIndicies :: Vector Int
expandedLeftIndicies = [Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList (((Int, Maybe Int) -> Int) -> [(Int, Maybe Int)] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map (Int, Maybe Int) -> Int
forall a b. (a, b) -> a
fst [(Int, Maybe Int)]
pairs)
expandedRightIndicies :: Vector (Maybe Int)
expandedRightIndicies = [Maybe Int] -> Vector (Maybe Int)
forall a. [a] -> Vector a
VB.fromList (((Int, Maybe Int) -> Maybe Int)
-> [(Int, Maybe Int)] -> [Maybe Int]
forall a b. (a -> b) -> [a] -> [b]
map (Int, Maybe Int) -> Maybe Int
forall a b. (a, b) -> b
snd [(Int, Maybe Int)]
pairs)
expandedLeft :: DataFrame
expandedLeft =
DataFrame
left
{ columns = VB.map (D.atIndicesStable expandedLeftIndicies) (D.columns left)
, dataframeDimensions =
(VU.length expandedLeftIndicies, snd (D.dataframeDimensions left))
}
expandedRight :: DataFrame
expandedRight =
DataFrame
right
{ columns = VB.map (D.atIndicesWithNulls expandedRightIndicies) (D.columns right)
, dataframeDimensions =
(VB.length expandedRightIndicies, snd (D.dataframeDimensions right))
}
leftColumns :: [Text]
leftColumns = DataFrame -> [Text]
D.columnNames DataFrame
left
rightColumns :: [Text]
rightColumns = DataFrame -> [Text]
D.columnNames DataFrame
right
initDf :: DataFrame
initDf = DataFrame
expandedLeft
insertIfPresent :: Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
_ Maybe Column
Nothing DataFrame
df = DataFrame
df
insertIfPresent Text
name (Just Column
c) DataFrame
df = Text -> Column -> DataFrame -> DataFrame
D.insertColumn Text
name Column
c DataFrame
df
in
(Text -> DataFrame -> DataFrame)
-> [Text] -> DataFrame -> DataFrame
forall a.
(a -> DataFrame -> DataFrame) -> [a] -> DataFrame -> DataFrame
D.fold
( \Text
name DataFrame
df ->
if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs
then DataFrame
df
else
( if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
leftColumns
then Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent (Text
"Right_" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
name) (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
else Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
name (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
)
)
[Text]
rightColumns
DataFrame
initDf
rightJoin ::
[T.Text] -> DataFrame -> DataFrame -> DataFrame
rightJoin :: [Text] -> DataFrame -> DataFrame -> DataFrame
rightJoin [Text]
cs DataFrame
right DataFrame
left =
let
leftIndicesToGroup :: [Int]
leftIndicesToGroup = Map Text Int -> [Int]
forall k a. Map k a -> [a]
M.elems (Map Text Int -> [Int]) -> Map Text Int -> [Int]
forall a b. (a -> b) -> a -> b
$ (Text -> Int -> Bool) -> Map Text Int -> Map Text Int
forall k a. (k -> a -> Bool) -> Map k a -> Map k a
M.filterWithKey (\Text
k Int
_ -> Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs) (DataFrame -> Map Text Int
D.columnIndices DataFrame
left)
leftRowRepresentations :: Vector Int
leftRowRepresentations = Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
left)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
leftIndicesToGroup DataFrame
left)
leftKeyCountsAndIndicesVec :: Map Int (Vector Int)
leftKeyCountsAndIndicesVec =
([Int] -> Vector Int) -> Map Int [Int] -> Map Int (Vector Int)
forall a b k. (a -> b) -> Map k a -> Map k b
M.map [Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList (Map Int [Int] -> Map Int (Vector Int))
-> Map Int [Int] -> Map Int (Vector Int)
forall a b. (a -> b) -> a -> b
$
((Int, Int) -> Map Int [Int] -> Map Int [Int])
-> Map Int [Int] -> Vector (Int, Int) -> Map Int [Int]
forall a b. Unbox a => (a -> b -> b) -> b -> Vector a -> b
VU.foldr
(\(Int
i, Int
v) Map Int [Int]
acc -> ([Int] -> [Int] -> [Int])
-> Int -> [Int] -> Map Int [Int] -> Map Int [Int]
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
M.insertWith [Int] -> [Int] -> [Int]
forall a. [a] -> [a] -> [a]
(++) Int
v [Int
i] Map Int [Int]
acc)
Map Int [Int]
forall k a. Map k a
M.empty
(Vector Int -> Vector (Int, Int)
forall a. Unbox a => Vector a -> Vector (Int, a)
VU.indexed Vector Int
leftRowRepresentations)
rightIndicesToGroup :: [Int]
rightIndicesToGroup = Map Text Int -> [Int]
forall k a. Map k a -> [a]
M.elems (Map Text Int -> [Int]) -> Map Text Int -> [Int]
forall a b. (a -> b) -> a -> b
$ (Text -> Int -> Bool) -> Map Text Int -> Map Text Int
forall k a. (k -> a -> Bool) -> Map k a -> Map k a
M.filterWithKey (\Text
k Int
_ -> Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs) (DataFrame -> Map Text Int
D.columnIndices DataFrame
right)
rightRowRepresentations :: Vector Int
rightRowRepresentations = Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
right)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
rightIndicesToGroup DataFrame
right)
rightRowCount :: Int
rightRowCount = (Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
right)
pairs :: [(Maybe Int, Int)]
pairs =
[ (Maybe Int
maybeLeft, Int
j)
| Int
j <- [Int
0 .. Int
rightRowCount Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1]
, Maybe Int
maybeLeft <-
case Int -> Map Int (Vector Int) -> Maybe (Vector Int)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup (Vector Int
rightRowRepresentations Vector Int -> Int -> Int
forall a. Unbox a => Vector a -> Int -> a
VU.! Int
j) Map Int (Vector Int)
leftKeyCountsAndIndicesVec of
Maybe (Vector Int)
Nothing -> [Maybe Int
forall a. Maybe a
Nothing]
Just Vector Int
lVec -> (Int -> Maybe Int) -> [Int] -> [Maybe Int]
forall a b. (a -> b) -> [a] -> [b]
map Int -> Maybe Int
forall a. a -> Maybe a
Just (Vector Int -> [Int]
forall a. Unbox a => Vector a -> [a]
VU.toList Vector Int
lVec)
]
expandedLeftIndicies :: Vector (Maybe Int)
expandedLeftIndicies = [Maybe Int] -> Vector (Maybe Int)
forall a. [a] -> Vector a
VB.fromList (((Maybe Int, Int) -> Maybe Int)
-> [(Maybe Int, Int)] -> [Maybe Int]
forall a b. (a -> b) -> [a] -> [b]
map (Maybe Int, Int) -> Maybe Int
forall a b. (a, b) -> a
fst [(Maybe Int, Int)]
pairs)
expandedRightIndicies :: Vector Int
expandedRightIndicies = [Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList (((Maybe Int, Int) -> Int) -> [(Maybe Int, Int)] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map (Maybe Int, Int) -> Int
forall a b. (a, b) -> b
snd [(Maybe Int, Int)]
pairs)
expandedLeft :: DataFrame
expandedLeft =
DataFrame
left
{ columns = VB.map (D.atIndicesWithNulls expandedLeftIndicies) (D.columns left)
, dataframeDimensions =
(VB.length expandedLeftIndicies, snd (D.dataframeDimensions left))
}
expandedRight :: DataFrame
expandedRight =
DataFrame
right
{ columns = VB.map (D.atIndicesStable expandedRightIndicies) (D.columns right)
, dataframeDimensions =
(VU.length expandedRightIndicies, snd (D.dataframeDimensions right))
}
leftColumns :: [Text]
leftColumns = DataFrame -> [Text]
D.columnNames DataFrame
left
rightColumns :: [Text]
rightColumns = DataFrame -> [Text]
D.columnNames DataFrame
right
initDf :: DataFrame
initDf = DataFrame
expandedLeft
insertIfPresent :: Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
_ Maybe Column
Nothing DataFrame
df = DataFrame
df
insertIfPresent Text
name (Just Column
c) DataFrame
df = Text -> Column -> DataFrame -> DataFrame
D.insertColumn Text
name Column
c DataFrame
df
in
(Text -> DataFrame -> DataFrame)
-> [Text] -> DataFrame -> DataFrame
forall a.
(a -> DataFrame -> DataFrame) -> [a] -> DataFrame -> DataFrame
D.fold
( \Text
name DataFrame
df ->
if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs
then DataFrame
df
else
( if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
leftColumns
then Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent (Text
"Right_" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
name) (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
else Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
name (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
)
)
[Text]
rightColumns
DataFrame
initDf
fullOuterJoin ::
[T.Text] -> DataFrame -> DataFrame -> DataFrame
fullOuterJoin :: [Text] -> DataFrame -> DataFrame -> DataFrame
fullOuterJoin [Text]
cs DataFrame
right DataFrame
left =
let
leftIndicesToGroup :: [Int]
leftIndicesToGroup = Map Text Int -> [Int]
forall k a. Map k a -> [a]
M.elems (Map Text Int -> [Int]) -> Map Text Int -> [Int]
forall a b. (a -> b) -> a -> b
$ (Text -> Int -> Bool) -> Map Text Int -> Map Text Int
forall k a. (k -> a -> Bool) -> Map k a -> Map k a
M.filterWithKey (\Text
k Int
_ -> Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs) (DataFrame -> Map Text Int
D.columnIndices DataFrame
left)
leftRowRepresentations :: Vector Int
leftRowRepresentations = Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
left)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
leftIndicesToGroup DataFrame
left)
leftKeyCountsAndIndices :: Map Int [Int]
leftKeyCountsAndIndices =
((Int, Int) -> Map Int [Int] -> Map Int [Int])
-> Map Int [Int] -> Vector (Int, Int) -> Map Int [Int]
forall a b. Unbox a => (a -> b -> b) -> b -> Vector a -> b
VU.foldr
(\(Int
i, Int
v) Map Int [Int]
acc -> ([Int] -> [Int] -> [Int])
-> Int -> [Int] -> Map Int [Int] -> Map Int [Int]
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
M.insertWith [Int] -> [Int] -> [Int]
forall a. [a] -> [a] -> [a]
(++) Int
v [Int
i] Map Int [Int]
acc)
Map Int [Int]
forall k a. Map k a
M.empty
(Vector Int -> Vector (Int, Int)
forall a. Unbox a => Vector a -> Vector (Int, a)
VU.indexed Vector Int
leftRowRepresentations)
leftKeyCountsAndIndicesVec :: Map Int (Vector Int)
leftKeyCountsAndIndicesVec = ([Int] -> Vector Int) -> Map Int [Int] -> Map Int (Vector Int)
forall a b k. (a -> b) -> Map k a -> Map k b
M.map [Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList Map Int [Int]
leftKeyCountsAndIndices
rightIndicesToGroup :: [Int]
rightIndicesToGroup = Map Text Int -> [Int]
forall k a. Map k a -> [a]
M.elems (Map Text Int -> [Int]) -> Map Text Int -> [Int]
forall a b. (a -> b) -> a -> b
$ (Text -> Int -> Bool) -> Map Text Int -> Map Text Int
forall k a. (k -> a -> Bool) -> Map k a -> Map k a
M.filterWithKey (\Text
k Int
_ -> Text
k Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs) (DataFrame -> Map Text Int
D.columnIndices DataFrame
right)
rightRowRepresentations :: Vector Int
rightRowRepresentations = Int -> (Int -> Int) -> Vector Int
forall a. Unbox a => Int -> (Int -> a) -> Vector a
VU.generate ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
D.dimensions DataFrame
right)) ([Int] -> DataFrame -> Int -> Int
D.mkRowRep [Int]
rightIndicesToGroup DataFrame
right)
rightKeyCountsAndIndices :: Map Int [Int]
rightKeyCountsAndIndices =
((Int, Int) -> Map Int [Int] -> Map Int [Int])
-> Map Int [Int] -> Vector (Int, Int) -> Map Int [Int]
forall a b. Unbox a => (a -> b -> b) -> b -> Vector a -> b
VU.foldr
(\(Int
i, Int
v) Map Int [Int]
acc -> ([Int] -> [Int] -> [Int])
-> Int -> [Int] -> Map Int [Int] -> Map Int [Int]
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
M.insertWith [Int] -> [Int] -> [Int]
forall a. [a] -> [a] -> [a]
(++) Int
v [Int
i] Map Int [Int]
acc)
Map Int [Int]
forall k a. Map k a
M.empty
(Vector Int -> Vector (Int, Int)
forall a. Unbox a => Vector a -> Vector (Int, a)
VU.indexed Vector Int
rightRowRepresentations)
rightKeyCountsAndIndicesVec :: Map Int (Vector Int)
rightKeyCountsAndIndicesVec = ([Int] -> Vector Int) -> Map Int [Int] -> Map Int (Vector Int)
forall a b k. (a -> b) -> Map k a -> Map k b
M.map [Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
VU.fromList Map Int [Int]
rightKeyCountsAndIndices
matchedPairs :: [(Maybe Int, Maybe Int)]
matchedPairs =
((Vector Int, Vector Int) -> [(Maybe Int, Maybe Int)])
-> [(Vector Int, Vector Int)] -> [(Maybe Int, Maybe Int)]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap
( \(Vector Int
lVec, Vector Int
rVec) ->
[ (Int -> Maybe Int
forall a. a -> Maybe a
Just Int
lIdx, Int -> Maybe Int
forall a. a -> Maybe a
Just Int
rIdx)
| Int
lIdx <- Vector Int -> [Int]
forall a. Unbox a => Vector a -> [a]
VU.toList Vector Int
lVec
, Int
rIdx <- Vector Int -> [Int]
forall a. Unbox a => Vector a -> [a]
VU.toList Vector Int
rVec
]
)
( Map Int (Vector Int, Vector Int) -> [(Vector Int, Vector Int)]
forall k a. Map k a -> [a]
M.elems
((Vector Int -> Vector Int -> (Vector Int, Vector Int))
-> Map Int (Vector Int)
-> Map Int (Vector Int)
-> Map Int (Vector Int, Vector Int)
forall k a b c.
Ord k =>
(a -> b -> c) -> Map k a -> Map k b -> Map k c
M.intersectionWith (,) Map Int (Vector Int)
leftKeyCountsAndIndicesVec Map Int (Vector Int)
rightKeyCountsAndIndicesVec)
)
leftOnlyPairs :: [(Maybe Int, Maybe Int)]
leftOnlyPairs =
(Vector Int -> [(Maybe Int, Maybe Int)])
-> [Vector Int] -> [(Maybe Int, Maybe Int)]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap
((Int -> (Maybe Int, Maybe Int))
-> [Int] -> [(Maybe Int, Maybe Int)]
forall a b. (a -> b) -> [a] -> [b]
map (\Int
lIdx -> (Int -> Maybe Int
forall a. a -> Maybe a
Just Int
lIdx, Maybe Int
forall a. Maybe a
Nothing)) ([Int] -> [(Maybe Int, Maybe Int)])
-> (Vector Int -> [Int]) -> Vector Int -> [(Maybe Int, Maybe Int)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Vector Int -> [Int]
forall a. Unbox a => Vector a -> [a]
VU.toList)
(Map Int (Vector Int) -> [Vector Int]
forall k a. Map k a -> [a]
M.elems (Map Int (Vector Int)
leftKeyCountsAndIndicesVec Map Int (Vector Int)
-> Map Int (Vector Int) -> Map Int (Vector Int)
forall k a b. Ord k => Map k a -> Map k b -> Map k a
`M.difference` Map Int (Vector Int)
rightKeyCountsAndIndicesVec))
rightOnlyPairs :: [(Maybe Int, Maybe Int)]
rightOnlyPairs =
(Vector Int -> [(Maybe Int, Maybe Int)])
-> [Vector Int] -> [(Maybe Int, Maybe Int)]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap
((Int -> (Maybe Int, Maybe Int))
-> [Int] -> [(Maybe Int, Maybe Int)]
forall a b. (a -> b) -> [a] -> [b]
map (\Int
rIdx -> (Maybe Int
forall a. Maybe a
Nothing, Int -> Maybe Int
forall a. a -> Maybe a
Just Int
rIdx)) ([Int] -> [(Maybe Int, Maybe Int)])
-> (Vector Int -> [Int]) -> Vector Int -> [(Maybe Int, Maybe Int)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Vector Int -> [Int]
forall a. Unbox a => Vector a -> [a]
VU.toList)
(Map Int (Vector Int) -> [Vector Int]
forall k a. Map k a -> [a]
M.elems (Map Int (Vector Int)
rightKeyCountsAndIndicesVec Map Int (Vector Int)
-> Map Int (Vector Int) -> Map Int (Vector Int)
forall k a b. Ord k => Map k a -> Map k b -> Map k a
`M.difference` Map Int (Vector Int)
leftKeyCountsAndIndicesVec))
pairs :: [(Maybe Int, Maybe Int)]
pairs = [(Maybe Int, Maybe Int)]
matchedPairs [(Maybe Int, Maybe Int)]
-> [(Maybe Int, Maybe Int)] -> [(Maybe Int, Maybe Int)]
forall a. [a] -> [a] -> [a]
++ [(Maybe Int, Maybe Int)]
leftOnlyPairs [(Maybe Int, Maybe Int)]
-> [(Maybe Int, Maybe Int)] -> [(Maybe Int, Maybe Int)]
forall a. [a] -> [a] -> [a]
++ [(Maybe Int, Maybe Int)]
rightOnlyPairs
expandedLeftIndicies :: Vector (Maybe Int)
expandedLeftIndicies = [Maybe Int] -> Vector (Maybe Int)
forall a. [a] -> Vector a
VB.fromList (((Maybe Int, Maybe Int) -> Maybe Int)
-> [(Maybe Int, Maybe Int)] -> [Maybe Int]
forall a b. (a -> b) -> [a] -> [b]
map (Maybe Int, Maybe Int) -> Maybe Int
forall a b. (a, b) -> a
fst [(Maybe Int, Maybe Int)]
pairs)
expandedRightIndicies :: Vector (Maybe Int)
expandedRightIndicies = [Maybe Int] -> Vector (Maybe Int)
forall a. [a] -> Vector a
VB.fromList (((Maybe Int, Maybe Int) -> Maybe Int)
-> [(Maybe Int, Maybe Int)] -> [Maybe Int]
forall a b. (a -> b) -> [a] -> [b]
map (Maybe Int, Maybe Int) -> Maybe Int
forall a b. (a, b) -> b
snd [(Maybe Int, Maybe Int)]
pairs)
expandedLeft :: DataFrame
expandedLeft =
DataFrame
left
{ columns = VB.map (D.atIndicesWithNulls expandedLeftIndicies) (D.columns left)
, dataframeDimensions =
(VB.length expandedLeftIndicies, snd (D.dataframeDimensions left))
}
expandedRight :: DataFrame
expandedRight =
DataFrame
right
{ columns = VB.map (D.atIndicesWithNulls expandedRightIndicies) (D.columns right)
, dataframeDimensions =
(VB.length expandedRightIndicies, snd (D.dataframeDimensions right))
}
leftColumns :: [Text]
leftColumns = DataFrame -> [Text]
D.columnNames DataFrame
left
rightColumns :: [Text]
rightColumns = DataFrame -> [Text]
D.columnNames DataFrame
right
initDf :: DataFrame
initDf = DataFrame
expandedLeft
insertIfPresent :: Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
_ Maybe Column
Nothing DataFrame
df = DataFrame
df
insertIfPresent Text
name (Just Column
c) DataFrame
df = Text -> Column -> DataFrame -> DataFrame
D.insertColumn Text
name Column
c DataFrame
df
in
(Text -> DataFrame -> DataFrame)
-> [Text] -> DataFrame -> DataFrame
forall a.
(a -> DataFrame -> DataFrame) -> [a] -> DataFrame -> DataFrame
D.fold
( \Text
name DataFrame
df ->
if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs
then case (Text -> DataFrame -> Column
D.unsafeGetColumn Text
name DataFrame
expandedRight, Text -> DataFrame -> Column
D.unsafeGetColumn Text
name DataFrame
expandedLeft) of
( OptionalColumn (Vector (Maybe a)
left :: VB.Vector (Maybe a))
, OptionalColumn (Vector (Maybe a)
right :: VB.Vector (Maybe b))
) -> case TypeRep a -> TypeRep a -> Maybe (a :~: a)
forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @b) of
Maybe (a :~: a)
Nothing -> [Char] -> DataFrame
forall a. HasCallStack => [Char] -> a
error [Char]
"Cannot join columns of different types"
Just a :~: a
Refl ->
Text -> Vector a -> DataFrame -> DataFrame
forall a (t :: * -> *).
(Columnable a, Foldable t) =>
Text -> t a -> DataFrame -> DataFrame
D.insert
Text
name
((Maybe a -> a) -> Vector (Maybe a) -> Vector a
forall a b. (a -> b) -> Vector a -> Vector b
VB.map (a -> Maybe a -> a
forall a. a -> Maybe a -> a
fromMaybe a
forall a. HasCallStack => a
undefined) ((Maybe a -> Maybe a -> Maybe a)
-> Vector (Maybe a) -> Vector (Maybe a) -> Vector (Maybe a)
forall a b c. (a -> b -> c) -> Vector a -> Vector b -> Vector c
VB.zipWith (\Maybe a
l Maybe a
r -> [Maybe a] -> Maybe a
forall (t :: * -> *) (f :: * -> *) a.
(Foldable t, Alternative f) =>
t (f a) -> f a
asum [Maybe a
l, Maybe a
r]) Vector (Maybe a)
left Vector (Maybe a)
Vector (Maybe a)
right))
DataFrame
df
(Column, Column)
_ -> [Char] -> DataFrame
forall a. HasCallStack => [Char] -> a
error [Char]
"Join should have optional keys."
else
( if Text
name Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
leftColumns
then Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent (Text
"Right_" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
name) (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
else Text -> Maybe Column -> DataFrame -> DataFrame
insertIfPresent Text
name (Text -> DataFrame -> Maybe Column
D.getColumn Text
name DataFrame
expandedRight) DataFrame
df
)
)
[Text]
rightColumns
DataFrame
initDf