{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}

module DataFrame.Internal.DataFrame where

import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Unboxed as VU

import Control.Exception (throw)
import Data.Function (on)
import Data.List (sortBy, transpose, (\\))
import Data.Type.Equality (TestEquality (testEquality), type (:~:) (Refl))
import DataFrame.Display.Terminal.PrettyPrint
import DataFrame.Errors
import DataFrame.Internal.Column
import Text.Printf
import Type.Reflection (typeRep)

data DataFrame = DataFrame
    { DataFrame -> Vector Column
columns :: V.Vector Column
    {- ^ Our main data structure stores a dataframe as
    a vector of columns. This improv
    -}
    , DataFrame -> Map Text Int
columnIndices :: M.Map T.Text Int
    -- ^ Keeps the column names in the order they were inserted in.
    , DataFrame -> (Int, Int)
dataframeDimensions :: (Int, Int)
    -- ^ (rows, columns)
    }

{- | A record that contains information about how and what
rows are grouped in the dataframe. This can only be used with
`aggregate`.
-}
data GroupedDataFrame = Grouped
    { GroupedDataFrame -> DataFrame
fullDataframe :: DataFrame
    , GroupedDataFrame -> [Text]
groupedColumns :: [T.Text]
    , GroupedDataFrame -> Vector Int
valueIndices :: VU.Vector Int
    , GroupedDataFrame -> Vector Int
offsets :: VU.Vector Int
    }

instance Show GroupedDataFrame where
    show :: GroupedDataFrame -> String
show (Grouped DataFrame
df [Text]
cols Vector Int
indices Vector Int
os) =
        String -> String -> ShowS
forall r. PrintfType r => String -> r
printf
            String
"{ keyColumns: %s groupedColumns: %s }"
            ([Text] -> String
forall a. Show a => a -> String
show [Text]
cols)
            ([Text] -> String
forall a. Show a => a -> String
show (Map Text Int -> [Text]
forall k a. Map k a -> [k]
M.keys (DataFrame -> Map Text Int
columnIndices DataFrame
df) [Text] -> [Text] -> [Text]
forall a. Eq a => [a] -> [a] -> [a]
\\ [Text]
cols))

instance Eq GroupedDataFrame where
    == :: GroupedDataFrame -> GroupedDataFrame -> Bool
(==) (Grouped DataFrame
df [Text]
cols Vector Int
indices Vector Int
os) (Grouped DataFrame
df' [Text]
cols' Vector Int
indices' Vector Int
os') = (DataFrame
df DataFrame -> DataFrame -> Bool
forall a. Eq a => a -> a -> Bool
== DataFrame
df') Bool -> Bool -> Bool
&& ([Text]
cols [Text] -> [Text] -> Bool
forall a. Eq a => a -> a -> Bool
== [Text]
cols')

instance Eq DataFrame where
    (==) :: DataFrame -> DataFrame -> Bool
    DataFrame
a == :: DataFrame -> DataFrame -> Bool
== DataFrame
b =
        Map Text Int -> [Text]
forall k a. Map k a -> [k]
M.keys (DataFrame -> Map Text Int
columnIndices DataFrame
a) [Text] -> [Text] -> Bool
forall a. Eq a => a -> a -> Bool
== Map Text Int -> [Text]
forall k a. Map k a -> [k]
M.keys (DataFrame -> Map Text Int
columnIndices DataFrame
b)
            Bool -> Bool -> Bool
&& ((Text, Int) -> Bool -> Bool) -> Bool -> [(Text, Int)] -> Bool
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr
                ( \(Text
name, Int
index) Bool
acc -> Bool
acc Bool -> Bool -> Bool
&& (DataFrame -> Vector Column
columns DataFrame
a Vector Column -> Int -> Maybe Column
forall a. Vector a -> Int -> Maybe a
V.!? Int
index Maybe Column -> Maybe Column -> Bool
forall a. Eq a => a -> a -> Bool
== (DataFrame -> Vector Column
columns DataFrame
b Vector Column -> Int -> Maybe Column
forall a. Vector a -> Int -> Maybe a
V.!? (DataFrame -> Map Text Int
columnIndices DataFrame
b Map Text Int -> Text -> Int
forall k a. Ord k => Map k a -> k -> a
M.! Text
name)))
                )
                Bool
True
                (Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toList (Map Text Int -> [(Text, Int)]) -> Map Text Int -> [(Text, Int)]
forall a b. (a -> b) -> a -> b
$ DataFrame -> Map Text Int
columnIndices DataFrame
a)

instance Show DataFrame where
    show :: DataFrame -> String
    show :: DataFrame -> String
show DataFrame
d = Text -> String
T.unpack (DataFrame -> Bool -> Text
asText DataFrame
d Bool
False)

-- | For showing the dataframe as markdown in notebooks.
toMarkdownTable :: DataFrame -> T.Text
toMarkdownTable :: DataFrame -> Text
toMarkdownTable DataFrame
df = DataFrame -> Bool -> Text
asText DataFrame
df Bool
True

asText :: DataFrame -> Bool -> T.Text
asText :: DataFrame -> Bool -> Text
asText DataFrame
d Bool
properMarkdown =
    let header :: [Text]
header = Text
"index" Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: ((Text, Int) -> Text) -> [(Text, Int)] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map (Text, Int) -> Text
forall a b. (a, b) -> a
fst (((Text, Int) -> (Text, Int) -> Ordering)
-> [(Text, Int)] -> [(Text, Int)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (Int -> Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Int -> Int -> Ordering)
-> ((Text, Int) -> Int) -> (Text, Int) -> (Text, Int) -> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` (Text, Int) -> Int
forall a b. (a, b) -> b
snd) ([(Text, Int)] -> [(Text, Int)]) -> [(Text, Int)] -> [(Text, Int)]
forall a b. (a -> b) -> a -> b
$ Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toList (DataFrame -> Map Text Int
columnIndices DataFrame
d))
        types :: [Text]
types = Vector Text -> [Text]
forall a. Vector a -> [a]
V.toList (Vector Text -> [Text]) -> Vector Text -> [Text]
forall a b. (a -> b) -> a -> b
$ (Text -> Bool) -> Vector Text -> Vector Text
forall a. (a -> Bool) -> Vector a -> Vector a
V.filter (Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
/= Text
"") (Vector Text -> Vector Text) -> Vector Text -> Vector Text
forall a b. (a -> b) -> a -> b
$ (Column -> Text) -> Vector Column -> Vector Text
forall a b. (a -> b) -> Vector a -> Vector b
V.map Column -> Text
getType (DataFrame -> Vector Column
columns DataFrame
d)
        getType :: Column -> T.Text
        getType :: Column -> Text
getType (BoxedColumn (Vector a
column :: V.Vector a)) = String -> Text
T.pack (String -> Text) -> String -> Text
forall a b. (a -> b) -> a -> b
$ TypeRep a -> String
forall a. Show a => a -> String
show (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a)
        getType (UnboxedColumn (Vector a
column :: VU.Vector a)) = String -> Text
T.pack (String -> Text) -> String -> Text
forall a b. (a -> b) -> a -> b
$ TypeRep a -> String
forall a. Show a => a -> String
show (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a)
        getType (OptionalColumn (Vector (Maybe a)
column :: V.Vector a)) = String -> Text
T.pack (String -> Text) -> String -> Text
forall a b. (a -> b) -> a -> b
$ TypeRep (Maybe a) -> String
forall a. Show a => a -> String
show (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a)
        -- Separate out cases dynamically so we don't end up making round trip string
        -- copies.
        get :: Maybe Column -> V.Vector T.Text
        get :: Maybe Column -> Vector Text
get (Just (BoxedColumn (Vector a
column :: V.Vector a))) = case TypeRep a -> TypeRep Text -> Maybe (a :~: Text)
forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @T.Text) of
            Just a :~: Text
Refl -> Vector a
Vector Text
column
            Maybe (a :~: Text)
Nothing -> case TypeRep a -> TypeRep String -> Maybe (a :~: String)
forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @String) of
                Just a :~: String
Refl -> (String -> Text) -> Vector String -> Vector Text
forall a b. (a -> b) -> Vector a -> Vector b
V.map String -> Text
T.pack Vector a
Vector String
column
                Maybe (a :~: String)
Nothing -> (a -> Text) -> Vector a -> Vector Text
forall a b. (a -> b) -> Vector a -> Vector b
V.map (String -> Text
T.pack (String -> Text) -> (a -> String) -> a -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> String
forall a. Show a => a -> String
show) Vector a
column
        get (Just (UnboxedColumn Vector a
column)) = (a -> Text) -> Vector a -> Vector Text
forall a b. (a -> b) -> Vector a -> Vector b
V.map (String -> Text
T.pack (String -> Text) -> (a -> String) -> a -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> String
forall a. Show a => a -> String
show) (Vector a -> Vector a
forall (v :: * -> *) a (w :: * -> *).
(Vector v a, Vector w a) =>
v a -> w a
V.convert Vector a
column)
        get (Just (OptionalColumn Vector (Maybe a)
column)) = (Maybe a -> Text) -> Vector (Maybe a) -> Vector Text
forall a b. (a -> b) -> Vector a -> Vector b
V.map (String -> Text
T.pack (String -> Text) -> (Maybe a -> String) -> Maybe a -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Maybe a -> String
forall a. Show a => a -> String
show) Vector (Maybe a)
column
        get Maybe Column
Nothing = Vector Text
forall a. Vector a
V.empty
        getTextColumnFromFrame :: DataFrame -> (Integer, Text) -> Vector Text
getTextColumnFromFrame DataFrame
df (Integer
i, Text
name) =
            if Integer
i Integer -> Integer -> Bool
forall a. Eq a => a -> a -> Bool
== Integer
0
                then [Text] -> Vector Text
forall a. [a] -> Vector a
V.fromList ((Int -> Text) -> [Int] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map (String -> Text
T.pack (String -> Text) -> (Int -> String) -> Int -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> String
forall a. Show a => a -> String
show) [Int
0 .. ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df) Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)])
                else Maybe Column -> Vector Text
get (Maybe Column -> Vector Text) -> Maybe Column -> Vector Text
forall a b. (a -> b) -> a -> b
$ Vector Column -> Int -> Maybe Column
forall a. Vector a -> Int -> Maybe a
(V.!?) (DataFrame -> Vector Column
columns DataFrame
d) (Map Text Int -> Text -> Int
forall k a. Ord k => Map k a -> k -> a
(M.!) (DataFrame -> Map Text Int
columnIndices DataFrame
d) Text
name)
        rows :: [[Text]]
rows =
            [[Text]] -> [[Text]]
forall a. [[a]] -> [[a]]
transpose ([[Text]] -> [[Text]]) -> [[Text]] -> [[Text]]
forall a b. (a -> b) -> a -> b
$
                (Integer -> Text -> [Text]) -> [Integer] -> [Text] -> [[Text]]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith (((Integer, Text) -> [Text]) -> Integer -> Text -> [Text]
forall a b c. ((a, b) -> c) -> a -> b -> c
curry (Vector Text -> [Text]
forall a. Vector a -> [a]
V.toList (Vector Text -> [Text])
-> ((Integer, Text) -> Vector Text) -> (Integer, Text) -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. DataFrame -> (Integer, Text) -> Vector Text
getTextColumnFromFrame DataFrame
d)) [Integer
0 ..] [Text]
header
     in Bool -> [Text] -> [Text] -> [[Text]] -> Text
showTable Bool
properMarkdown [Text]
header (Text
"Int" Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
types) [[Text]]
rows

-- | O(1) Creates an empty dataframe
empty :: DataFrame
empty :: DataFrame
empty =
    DataFrame
        { columns :: Vector Column
columns = Vector Column
forall a. Vector a
V.empty
        , columnIndices :: Map Text Int
columnIndices = Map Text Int
forall k a. Map k a
M.empty
        , dataframeDimensions :: (Int, Int)
dataframeDimensions = (Int
0, Int
0)
        }

getColumn :: T.Text -> DataFrame -> Maybe Column
getColumn :: Text -> DataFrame -> Maybe Column
getColumn Text
name DataFrame
df = do
    Int
i <- DataFrame -> Map Text Int
columnIndices DataFrame
df Map Text Int -> Text -> Maybe Int
forall k a. Ord k => Map k a -> k -> Maybe a
M.!? Text
name
    DataFrame -> Vector Column
columns DataFrame
df Vector Column -> Int -> Maybe Column
forall a. Vector a -> Int -> Maybe a
V.!? Int
i

unsafeGetColumn :: T.Text -> DataFrame -> Column
unsafeGetColumn :: Text -> DataFrame -> Column
unsafeGetColumn Text
name DataFrame
df = case Text -> DataFrame -> Maybe Column
getColumn Text
name DataFrame
df of
    Maybe Column
Nothing -> DataFrameException -> Column
forall a e. Exception e => e -> a
throw (DataFrameException -> Column) -> DataFrameException -> Column
forall a b. (a -> b) -> a -> b
$ Text -> Text -> [Text] -> DataFrameException
ColumnNotFoundException Text
name Text
"" (Map Text Int -> [Text]
forall k a. Map k a -> [k]
M.keys (Map Text Int -> [Text]) -> Map Text Int -> [Text]
forall a b. (a -> b) -> a -> b
$ DataFrame -> Map Text Int
columnIndices DataFrame
df)
    Just Column
col -> Column
col

null :: DataFrame -> Bool
null :: DataFrame -> Bool
null DataFrame
df = Vector Column -> Bool
forall a. Vector a -> Bool
V.null (DataFrame -> Vector Column
columns DataFrame
df)

{- | Returns a dataframe as a two dimentions vector of floats.

All entries in the dataframe must be doubles.
This is useful for handing data over into ML systems.
-}
toMatrix :: DataFrame -> Either DataFrameException (V.Vector (VU.Vector Float))
toMatrix :: DataFrame -> Either DataFrameException (Vector (Vector Float))
toMatrix DataFrame
df = case (Either DataFrameException (Vector (Vector Double))
 -> Column -> Either DataFrameException (Vector (Vector Double)))
-> Either DataFrameException (Vector (Vector Double))
-> Vector Column
-> Either DataFrameException (Vector (Vector Double))
forall a b. (a -> b -> a) -> a -> Vector b -> a
V.foldl'
    (\Either DataFrameException (Vector (Vector Double))
acc Column
c -> Vector (Vector Double) -> Vector Double -> Vector (Vector Double)
forall a. Vector a -> a -> Vector a
V.snoc (Vector (Vector Double) -> Vector Double -> Vector (Vector Double))
-> Either DataFrameException (Vector (Vector Double))
-> Either
     DataFrameException (Vector Double -> Vector (Vector Double))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Either DataFrameException (Vector (Vector Double))
acc Either DataFrameException (Vector Double -> Vector (Vector Double))
-> Either DataFrameException (Vector Double)
-> Either DataFrameException (Vector (Vector Double))
forall a b.
Either DataFrameException (a -> b)
-> Either DataFrameException a -> Either DataFrameException b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> (forall a (v :: * -> *).
(Vector v a, Columnable a) =>
Column -> Either DataFrameException (v a)
toVector @Double Column
c))
    (Vector (Vector Double)
-> Either DataFrameException (Vector (Vector Double))
forall a b. b -> Either a b
Right Vector (Vector Double)
forall a. Vector a
V.empty :: Either DataFrameException (V.Vector (V.Vector Double)))
    (DataFrame -> Vector Column
columns DataFrame
df) of
    Left DataFrameException
e -> DataFrameException
-> Either DataFrameException (Vector (Vector Float))
forall a b. a -> Either a b
Left DataFrameException
e
    Right Vector (Vector Double)
m ->
        Vector (Vector Float)
-> Either DataFrameException (Vector (Vector Float))
forall a. a -> Either DataFrameException a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Vector (Vector Float)
 -> Either DataFrameException (Vector (Vector Float)))
-> Vector (Vector Float)
-> Either DataFrameException (Vector (Vector Float))
forall a b. (a -> b) -> a -> b
$
            Int -> (Int -> Vector Float) -> Vector (Vector Float)
forall a. Int -> (Int -> a) -> Vector a
V.generate
                ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df))
                ( \Int
i ->
                    (Vector Float -> Int -> Vector Float)
-> Vector Float -> [Int] -> Vector Float
forall b a. (b -> a -> b) -> b -> [a] -> b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl
                        (\Vector Float
acc Int
j -> Vector Float
acc Vector Float -> Float -> Vector Float
forall a. Unbox a => Vector a -> a -> Vector a
`VU.snoc` Double -> Float
forall a b. (Real a, Fractional b) => a -> b
realToFrac ((Vector (Vector Double)
m Vector (Vector Double) -> Int -> Vector Double
forall a. Vector a -> Int -> a
V.! Int
j) Vector Double -> Int -> Double
forall a. Vector a -> Int -> a
V.! Int
i))
                        Vector Float
forall a. Unbox a => Vector a
VU.empty
                        [Int
0 .. (Vector (Vector Double) -> Int
forall a. Vector a -> Int
V.length Vector (Vector Double)
m Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)]
                )

{- | Get a specific column as a vector.

You must specify the type via type applications.
-}
columnAsVector :: forall a. (Columnable a) => T.Text -> DataFrame -> V.Vector a
columnAsVector :: forall a. Columnable a => Text -> DataFrame -> Vector a
columnAsVector Text
name DataFrame
df = case Text -> DataFrame -> Column
unsafeGetColumn Text
name DataFrame
df of
    (BoxedColumn (Vector a
col :: V.Vector b)) -> case TypeRep a -> TypeRep a -> Maybe (a :~: a)
forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @b) of
        Maybe (a :~: a)
Nothing -> String -> Vector a
forall a. HasCallStack => String -> a
error String
"Type error"
        Just a :~: a
Refl -> Vector a
Vector a
col
    (OptionalColumn (Vector (Maybe a)
col :: V.Vector b)) -> case TypeRep a -> TypeRep (Maybe a) -> Maybe (a :~: Maybe a)
forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @b) of
        Maybe (a :~: Maybe a)
Nothing -> String -> Vector a
forall a. HasCallStack => String -> a
error String
"Type error"
        Just a :~: Maybe a
Refl -> Vector a
Vector (Maybe a)
col
    (UnboxedColumn (Vector a
col :: VU.Vector b)) -> case TypeRep a -> TypeRep a -> Maybe (a :~: a)
forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @b) of
        Maybe (a :~: a)
Nothing -> String -> Vector a
forall a. HasCallStack => String -> a
error String
"Type error"
        Just a :~: a
Refl -> Vector a -> Vector a
forall (v :: * -> *) a (w :: * -> *).
(Vector v a, Vector w a) =>
v a -> w a
VG.convert Vector a
Vector a
col