{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}

module DataFrame.Operations.Subset where

import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Unboxed as VU
import qualified Prelude

import Control.Exception (throw)
import Data.Function ((&))
import Data.Maybe (
    fromJust,
    fromMaybe,
    isJust,
    isNothing,
 )
import Data.Type.Equality (TestEquality (..))
import DataFrame.Errors (
    DataFrameException (..),
    TypeErrorContext (..),
 )
import DataFrame.Internal.Column
import DataFrame.Internal.DataFrame (
    DataFrame (..),
    empty,
    getColumn,
 )
import DataFrame.Internal.Expression
import DataFrame.Internal.Interpreter
import DataFrame.Operations.Core
import DataFrame.Operations.Transformations (apply)
import System.Random
import Type.Reflection
import Prelude hiding (filter, take)

{- | O(k * n) Take the first n rows of a DataFrame.

The requested count is clamped to the range @[0, rows]@, so a negative
count yields no rows and an oversized count yields the whole frame.

> take 10 df
-}
take :: Int -> DataFrame -> DataFrame
take n d = d{columns = V.map (takeColumn n') (columns d), dataframeDimensions = (n', c)}
  where
    (r, c) = dataframeDimensions d
    -- Never ask a column for more rows than the frame has.
    n' = clip n 0 r

{- | O(k * n) Take the last n rows of a DataFrame.

The requested count is clamped to the range @[0, rows]@.

> takeLast 10 df
-}
takeLast :: Int -> DataFrame -> DataFrame
takeLast n d =
    d
        { columns = V.map (takeLastColumn n') (columns d)
        , dataframeDimensions = (n', c)
        }
  where
    (r, c) = dataframeDimensions d
    -- Never ask a column for more rows than the frame has.
    n' = clip n 0 r

{- | O(k * n) Drop the first n rows of a DataFrame.

The number of rows to drop is clamped to the range @[0, rows]@, so dropping
more rows than exist yields a frame with zero rows.

> drop 10 df
-}
drop :: Int -> DataFrame -> DataFrame
drop n d =
    d
        { columns = V.map (sliceColumn n' remaining) (columns d)
        , dataframeDimensions = (remaining, c)
        }
  where
    (r, c) = dataframeDimensions d
    n' = clip n 0 r
    -- n' is already within [0, r], so this can never be negative.
    remaining = r - n'

{- | O(k * n) Drop the last n rows of a DataFrame.

The number of rows kept (@rows - n@) is clamped to the range @[0, rows]@.

> dropLast 10 df
-}
dropLast :: Int -> DataFrame -> DataFrame
dropLast n d =
    d{columns = V.map (sliceColumn 0 n') (columns d), dataframeDimensions = (n', c)}
  where
    (r, c) = dataframeDimensions d
    -- Number of leading rows that survive.
    n' = clip (r - n) 0 r

{- | O(k * n) Take a range of rows of a DataFrame.

The range is half-open: @range (start, end)@ keeps the rows at positions
@start@ up to, but not including, @end@. Both bounds are clamped to the
number of rows in the frame, so an out-of-range or inverted interval yields
fewer (possibly zero) rows rather than slicing past the end of a column.

> range (2, 5) df
-}
range :: (Int, Int) -> DataFrame -> DataFrame
range (start, end) d =
    d
        { columns = V.map (sliceColumn start' n') (columns d)
        , dataframeDimensions = (n', c)
        }
  where
    (r, c) = dataframeDimensions d
    -- Clamp the start into [0, r] and the end into [start', r]. Computing the
    -- length from the clamped endpoints guarantees start' + n' <= r.
    start' = clip start 0 r
    end' = clip end start' r
    n' = end' - start'

{- | Clamp a value into the closed interval @[left, right]@.

The lower bound is applied first and the upper bound last, so if the bounds
are inverted (@left > right@) the result is @right@.
-}
clip :: Int -> Int -> Int -> Int
clip n left right = min right (max n left)

{- | O(n * k) Filter rows by a given condition.

When the expression is a plain column reference the column is looked up
directly; any other expression is normalised and interpreted first and the
condition is applied to the resulting column. Rows for which the condition
holds are kept, in their original order, across every column of the frame.

Throws a 'ColumnNotFoundException' if a directly referenced column does not
exist, a 'TypeMismatchException' if the column's element type differs from
@a@, and rethrows any exception reported while interpreting the expression.

> filter "x" even df
-}
filter ::
    forall a.
    (Columnable a) =>
    -- | Column to filter by
    Expr a ->
    -- | Filter condition
    (a -> Bool) ->
    -- | Dataframe to filter
    DataFrame ->
    DataFrame
filter (Col filterColumnName) condition df = case getColumn filterColumnName df of
    Nothing ->
        throw $
            ColumnNotFoundException filterColumnName "filter" (M.keys $ columnIndices df)
    -- The pattern signatures bring the stored element type into scope so that
    -- filterByVector can compare it against the caller's type.
    Just (BoxedColumn (column :: V.Vector b)) -> filterByVector filterColumnName column condition df
    Just (OptionalColumn (column :: V.Vector b)) -> filterByVector filterColumnName column condition df
    Just (UnboxedColumn (column :: VU.Vector b)) -> filterByVector filterColumnName column condition df
filter expr condition df =
    let
        (TColumn col) = case interpret @a df (normalize expr) of
            Left e -> throw e
            Right c -> c
        -- Positions of the rows that satisfy the condition.
        indexes = case findIndices condition col of
            Right ixs -> ixs
            Left e -> throw e
        c' = snd $ dataframeDimensions df
     in
        df
            { columns = V.map (atIndicesStable indexes) (columns df)
            , dataframeDimensions = (VU.length indexes, c')
            }

{- | Filter a dataframe using the values of one already-extracted column.

The predicate is written against type @a@ while the column holds values of
type @b@; the two are compared at runtime. If they differ a
'TypeMismatchException' is thrown (reported on behalf of @filter@),
otherwise the rows at which the predicate holds are kept in every column.
-}
filterByVector ::
    forall a b v.
    (VG.Vector v b, VG.Vector v Int, Columnable a, Columnable b) =>
    T.Text -> v b -> (a -> Bool) -> DataFrame -> DataFrame
filterByVector filterColumnName column condition df = case testEquality (typeRep @a) (typeRep @b) of
    Nothing ->
        throw $
            TypeMismatchException
                ( MkTypeErrorContext
                    { userType = Right $ typeRep @a
                    , expectedType = Right $ typeRep @b
                    , errorColumnName = Just (T.unpack filterColumnName)
                    , callingFunctionName = Just "filter"
                    }
                )
    -- Matching on Refl proves a ~ b, which lets the predicate run on the column.
    Just Refl ->
        let
            ixs = VG.convert (VG.findIndices condition column)
         in
            df
                { columns = V.map (atIndicesStable ixs) (columns df)
                , dataframeDimensions = (VG.length ixs, snd (dataframeDimensions df))
                }

{- | A version of 'filter' where the predicate comes first.

This is 'filter' with its first two arguments swapped, so it has the same
cost and throws the same exceptions.

> filterBy even "x" df
-}
filterBy :: (Columnable a) => (a -> Bool) -> Expr a -> DataFrame -> DataFrame
filterBy = flip filter

{- | O(n * k) Filters the dataframe with a boolean expression.

The expression is normalised and interpreted against the frame; rows for
which it evaluates to 'True' are kept in every column. Any exception
reported while interpreting the expression is rethrown.

> filterWhere (F.col @Int x + F.col y F.> 5) df
-}
filterWhere :: Expr Bool -> DataFrame -> DataFrame
filterWhere expr df =
    let
        (TColumn col) = case interpret @Bool df (normalize expr) of
            Left e -> throw e
            Right c -> c
        -- The interpreted column already holds booleans, so the predicate is id.
        indexes = case findIndices id col of
            Right ixs -> ixs
            Left e -> throw e
        c' = snd $ dataframeDimensions df
     in
        df
            { columns = V.map (atIndicesStable indexes) (columns df)
            , dataframeDimensions = (VU.length indexes, c')
            }

{- | O(n * k) Removes all rows with `Nothing` in a given column from the dataframe.

For an optional column the rows holding `Nothing` are removed and 'fromJust'
is then applied to the remaining values of that column. A column that is not
optional leaves the dataframe unchanged. Throws a 'ColumnNotFoundException'
if the column does not exist.

> filterJust "col" df
-}
filterJust :: T.Text -> DataFrame -> DataFrame
filterJust name df = case getColumn name df of
    Nothing ->
        throw $ ColumnNotFoundException name "filterJust" (M.keys $ columnIndices df)
    -- The pattern signature only serves to bring the element type into scope.
    Just (OptionalColumn (_ :: V.Vector (Maybe a))) ->
        -- fromJust is safe here: every Nothing was removed by the filter.
        filter (Col @(Maybe a) name) isJust df & apply @(Maybe a) fromJust name
    Just _ -> df

{- | O(n * k) Returns all rows with `Nothing` in a given column.

A column that is not optional leaves the dataframe unchanged. Throws a
'ColumnNotFoundException' if the column does not exist.

> filterNothing "col" df
-}
filterNothing :: T.Text -> DataFrame -> DataFrame
filterNothing name df = case getColumn name df of
    Nothing ->
        throw $ ColumnNotFoundException name "filterNothing" (M.keys $ columnIndices df)
    -- The pattern signature only serves to bring the element type into scope.
    Just (OptionalColumn (_ :: V.Vector (Maybe a))) ->
        filter (Col @(Maybe a) name) isNothing df
    _ -> df

{- | O(n * k) Removes all rows with `Nothing` from the dataframe.

'filterJust' is applied once per column, so a row survives only if none of
its optional columns holds `Nothing`.

> filterAllJust df
-}
filterAllJust :: DataFrame -> DataFrame
filterAllJust df = foldr filterJust df (columnNames df)

{- | O(n * k) Keeps the rows made up of null values.

'filterNothing' is applied once per column, each application narrowing the
result of the previous one. A row therefore survives only if it holds
`Nothing` in /every/ optional column; columns that are not optional do not
affect the result.

> filterAllNothing df
-}
filterAllNothing :: DataFrame -> DataFrame
filterAllNothing df = foldr filterNothing df (columnNames df)

{- | Cuts the dataframe in a cube of size (a, b) where
  a is the length (number of rows) and b is the width (number of columns).

The first @b@ columns are selected and then the first @a@ rows are taken.

> cube (10, 5) df
-}
cube :: (Int, Int) -> DataFrame -> DataFrame
cube (rowCount, colCount) =
    -- ColumnIndexRange is inclusive on both ends, hence the (colCount - 1).
    take rowCount . selectBy [ColumnIndexRange (0, colCount - 1)]

{- | O(n) Selects a number of columns in a given dataframe.

The resulting frame is built by inserting the requested columns, in the
order given, into an empty dataframe. An empty list of names yields the
empty dataframe. Throws a 'ColumnNotFoundException' listing the missing
names if any requested column does not exist.

> select ["name", "age"] df
-}
select ::
    [T.Text] ->
    DataFrame ->
    DataFrame
select cs df
    | L.null cs = empty
    | any (`notElem` columnNames df) cs =
        throw $
            ColumnNotFoundException
                (T.pack $ show $ cs L.\\ columnNames df)
                "select"
                (columnNames df)
    | otherwise = L.foldl' addKeyValue empty cs
  where
    -- Copy column k from the source frame into the frame being built. The
    -- guard above ensures every name exists, so the fallback is not expected
    -- to be used.
    addKeyValue d k = fromMaybe df $ do
        col <- getColumn k df
        pure $ insertColumn k col d

-- | A rule describing which columns of a dataframe 'selectBy' should keep.
data SelectionCriteria
    = ColumnProperty (Column -> Bool)
    -- ^ Columns whose contents satisfy the predicate.
    | ColumnNameProperty (T.Text -> Bool)
    -- ^ Columns whose name satisfies the predicate.
    | ColumnTextRange (T.Text, T.Text)
    -- ^ Columns between two named columns (both ends included).
    | ColumnIndexRange (Int, Int)
    -- ^ Columns whose position lies in the given range (both ends included).
    | ColumnName T.Text
    -- ^ The column with exactly this name.

{- | Criteria for selecting a column by name.

> selectBy [byName "Age"] df

equivalent to:

> select ["Age"] df
-}
byName :: T.Text -> SelectionCriteria
byName = ColumnName

{- | Criteria for selecting columns that satisfy the given predicate.

> selectBy [byProperty isNumeric] df
-}
byProperty :: (Column -> Bool) -> SelectionCriteria
byProperty = ColumnProperty

{- | Criteria for selecting columns whose name satisfies the given predicate.

> selectBy [byNameProperty (T.isPrefixOf "weight")] df
-}
byNameProperty :: (T.Text -> Bool) -> SelectionCriteria
byNameProperty = ColumnNameProperty

{- | Criteria for selecting the columns that lie between two named columns (inclusive).

NOTE(review): 'selectBy' resolves this by walking the dataframe's column
list from the first name to the second, i.e. by column position rather than
by comparing names lexicographically - confirm this is the intended meaning.

> selectBy [byNameRange ("a", "c")] df
-}
byNameRange :: (T.Text, T.Text) -> SelectionCriteria
byNameRange = ColumnTextRange

{- | Criteria for selecting columns whose indices are in the given (inclusive) range.

> selectBy [byIndexRange (0, 5)] df
-}
byIndexRange :: (Int, Int) -> SelectionCriteria
byIndexRange = ColumnIndexRange

{- | O(n) Select the columns that match at least one of the given criteria.

The matches of all criteria are pooled, and the selected columns keep the
order they have in the original dataframe. If nothing matches, the result is
whatever 'select' returns for an empty list of names.

> selectBy [byName "Age", byNameProperty (T.isPrefixOf "weight")] df
-}
selectBy :: [SelectionCriteria] -> DataFrame -> DataFrame
selectBy xs df = select finalSelection df
  where
    allNames :: [T.Text]
    allNames = columnNames df

    -- Walk the frame's own column list so that the original order is kept and
    -- only names present in the frame are passed on to 'select'.
    finalSelection :: [T.Text]
    finalSelection = Prelude.filter (`S.member` columnsWithProperties) allNames

    columnsWithProperties :: S.Set T.Text
    columnsWithProperties = S.fromList (concatMap matching xs)

    -- Names matched by a single criterion.
    matching :: SelectionCriteria -> [T.Text]
    matching (ColumnName name) = [name]
    matching (ColumnNameProperty f) = L.filter f allNames
    matching (ColumnTextRange (from, to)) =
        -- Drop everything before the first `from`, then everything after the
        -- last `to`; if either name is absent nothing is matched.
        reverse
            (Prelude.dropWhile (to /=) $ reverse $ Prelude.dropWhile (from /=) allNames)
    matching (ColumnIndexRange (from, to)) =
        Prelude.take (to - from + 1) (Prelude.drop from allNames)
    matching (ColumnProperty f) =
        [k | (k, v) <- M.toAscList (columnIndices df), v `S.member` ixs]
      where
        -- Positions of the columns that satisfy the predicate.
        ixs = V.ifoldl' (\acc i c -> if f c then S.insert i acc else acc) S.empty (columns df)

{- | O(n) Inverse of 'select': keeps every column except the named ones.

Names that do not occur in the dataframe are ignored.

> exclude ["Name"] df
-}
exclude ::
    [T.Text] ->
    DataFrame ->
    DataFrame
exclude cs df = select keysToKeep df
  where
    keysToKeep = columnNames df L.\\ cs

{- | Sample a dataframe. The double parameter must be between 0 and 1 (inclusive).

A temporary @__rand__@ column holding one random value per row is added, the
rows whose value is at least @1 - p@ are kept, and the temporary column is
removed again.

NOTE(review): this presumably keeps roughly a fraction @p@ of the rows,
assuming the generated values are uniform on the unit interval - confirm
against @generateRandomVector@.

==== __Example__
@
ghci> import System.Random
ghci> D.sample (mkStdGen 137) 0.1 df

@
-}
sample :: (RandomGen g) => g -> Double -> DataFrame -> DataFrame
sample pureGen p df =
    let
        -- One random draw per row of the frame.
        rand = generateRandomVector pureGen (fst (dataframeDimensions df))
     in
        df
            & insertUnboxedVector "__rand__" rand
            & filterWhere
                ( Binary
                    ( MkBinaryOp
                        { binaryFn = (>=)
                        , binaryName = "geq"
                        , binarySymbol = Just ">="
                        , binaryCommutative = False
                        , binaryPrecedence = 1
                        }
                    )
                    (Col @Double "__rand__")
                    (Lit (1 - p))
                )
            & exclude ["__rand__"]

{- | Split a dataset into two. The first in the tuple gets a sample of p (0 <= p <= 1) and the second gets (1 - p). This is useful for creating test and train splits.

A temporary @__rand__@ column holding one random value per row is added.
Rows whose value is at most @p@ go to the first frame and all remaining rows
go to the second, so every row ends up in exactly one of the two frames. The
temporary column is removed from both results.

==== __Example__
@
ghci> import System.Random
ghci> D.randomSplit (mkStdGen 137) 0.9 df

@
-}
randomSplit ::
    (RandomGen g) => g -> Double -> DataFrame -> (DataFrame, DataFrame)
randomSplit pureGen p df =
    let
        -- One random draw per row of the frame.
        rand = generateRandomVector pureGen (fst (dataframeDimensions df))
        withRand = df & insertUnboxedVector "__rand__" rand
        -- Keep the rows whose random draw compares as requested against p,
        -- then drop the helper column again.
        part :: (Double -> Double -> Bool) -> T.Text -> T.Text -> DataFrame
        part fn name symbol =
            withRand
                & filterWhere
                    ( Binary
                        ( MkBinaryOp
                            { binaryFn = fn
                            , binaryName = name
                            , binarySymbol = Just symbol
                            , binaryCommutative = False
                            , binaryPrecedence = 1
                            }
                        )
                        (Col @Double "__rand__")
                        (Lit p)
                    )
                & exclude ["__rand__"]
     in
        (part (<=) "leq" "<=", part (>) "gt" ">")

{- | Splits a dataframe into @folds@ disjoint folds by random assignment.

Every row is tagged with a random number produced by 'generateRandomVector'
(uniform over @(0, 1)@) in a temporary @__rand__@ column. Fold @i@
(for @i@ in @0 .. folds - 1@) receives the rows whose random number is
@>= i / folds@ and below the lower bound of fold @i + 1@; the last fold has no
upper bound. Because membership is decided per row at random, the folds are
only approximately equal in size and some may be empty.

The result lists the folds from the highest interval down to the lowest, and
the temporary @__rand__@ column is excluded from each of them.

A non-positive @folds@ yields an empty list.

NOTE(review): a column already named @__rand__@ in the input would presumably
be replaced and then dropped - confirm against 'insertUnboxedVector'.

==== __Example__
@
ghci> import System.Random
ghci> D.kFolds (mkStdGen 137) 5 df

@
-}
kFolds :: (RandomGen g) => g -> Int -> DataFrame -> [DataFrame]
kFolds pureGen folds df
    -- Guard first: the recursion below counts down from @folds - 1@, so a
    -- negative fold count must not be allowed to reach it.
    | folds <= 0 = []
    | otherwise = map (exclude ["__rand__"]) (go (folds - 1) withRand)
  where
    -- One random number per row of the input.
    rand = generateRandomVector pureGen (fst (dataframeDimensions df))
    withRand = df & insertUnboxedVector "__rand__" rand

    -- Width of the random-number interval owned by each fold.
    partitionSize :: Double
    partitionSize = 1 / fromIntegral folds

    -- Keep the rows whose random number compares (via @cmp@) against the
    -- lower bound of fold @n@.
    compareRand ::
        (Double -> Double -> Bool) ->
        T.Text ->
        T.Text ->
        Int ->
        DataFrame ->
        DataFrame
    compareRand cmp name symbol n =
        filterWhere
            ( Binary
                ( MkBinaryOp
                    { binaryFn = cmp
                    , binaryName = name
                    , binarySymbol = Just symbol
                    , binaryCommutative = False
                    , binaryPrecedence = 1
                    }
                )
                (Col @Double "__rand__")
                (Lit (fromIntegral n * partitionSize))
            )

    -- Peel off fold @n@ (everything at or above its lower bound) and recurse
    -- on the remaining rows, which all lie strictly below that bound.
    go :: Int -> DataFrame -> [DataFrame]
    go n d
        | n < 0 = []
        | otherwise =
            let
                thisFold = compareRand (>=) "geq" ">=" n d
                remaining = compareRand (<) "lt" "<" n d
             in
                thisFold : go (n - 1) remaining

{- | Builds an unboxed vector of @k@ random 'Double's.

Each element is drawn with @uniformR (0, 1)@, threading the generator from one
draw to the next, so the same generator and length always give the same
vector. A non-positive @k@ yields an empty vector.
-}
generateRandomVector :: (RandomGen g) => g -> Int -> VU.Vector Double
generateRandomVector pureGen k =
    -- unfoldrN stops after k elements (or immediately when k <= 0) and fills
    -- the vector directly, without building an intermediate list.
    VU.unfoldrN k (Just . uniformR (0 :: Double, 1 :: Double)) pureGen