{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}

module DataFrame.Operations.Subset where

import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector.Unboxed.Mutable as VUM
import qualified Prelude

import Control.Exception (throw)
import Control.Monad.ST
import Data.Function ((&))
import Data.Maybe (fromJust, fromMaybe, isJust, isNothing)
import Data.Type.Equality (TestEquality (..))
import DataFrame.Errors (DataFrameException (..), TypeErrorContext (..))
import DataFrame.Internal.Column
import DataFrame.Internal.DataFrame (DataFrame (..), empty, getColumn)
import DataFrame.Internal.Expression
import DataFrame.Operations.Core
import DataFrame.Operations.Transformations (apply)
import System.Random
import Type.Reflection
import Prelude hiding (filter, take)

-- | O(k * n) Take the first n rows of a DataFrame.
take :: Int -> DataFrame -> DataFrame
take :: Int -> DataFrame -> DataFrame
take Int
n DataFrame
d = DataFrame
d{columns = V.map (takeColumn n') (columns d), dataframeDimensions = (n', c)}
  where
    (Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
    n' :: Int
n' = Int -> Int -> Int -> Int
clip Int
n Int
0 Int
r

-- | O(k * n) Take the last n rows of a DataFrame.
takeLast :: Int -> DataFrame -> DataFrame
takeLast :: Int -> DataFrame -> DataFrame
takeLast Int
n DataFrame
d =
    DataFrame
d
        { columns = V.map (takeLastColumn n') (columns d)
        , dataframeDimensions = (n', c)
        }
  where
    (Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
    n' :: Int
n' = Int -> Int -> Int -> Int
clip Int
n Int
0 Int
r

-- | O(k * n) Drop the first n rows of a DataFrame.
drop :: Int -> DataFrame -> DataFrame
drop :: Int -> DataFrame -> DataFrame
drop Int
n DataFrame
d =
    DataFrame
d
        { columns = V.map (sliceColumn n' (max (r - n') 0)) (columns d)
        , dataframeDimensions = (max (r - n') 0, c)
        }
  where
    (Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
    n' :: Int
n' = Int -> Int -> Int -> Int
clip Int
n Int
0 Int
r

-- | O(k * n) Drop the last n rows of a DataFrame.
dropLast :: Int -> DataFrame -> DataFrame
dropLast :: Int -> DataFrame -> DataFrame
dropLast Int
n DataFrame
d =
    DataFrame
d{columns = V.map (sliceColumn 0 n') (columns d), dataframeDimensions = (n', c)}
  where
    (Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
    n' :: Int
n' = Int -> Int -> Int -> Int
clip (Int
r Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
n) Int
0 Int
r

-- | O(k * n) Take a range of rows of a DataFrame.
range :: (Int, Int) -> DataFrame -> DataFrame
range :: (Int, Int) -> DataFrame -> DataFrame
range (Int
start, Int
end) DataFrame
d =
    DataFrame
d
        { columns = V.map (sliceColumn (clip start 0 r) n') (columns d)
        , dataframeDimensions = (n', c)
        }
  where
    (Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
    n' :: Int
n' = Int -> Int -> Int -> Int
clip (Int
end Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
start) Int
0 Int
r

clip :: Int -> Int -> Int -> Int
clip :: Int -> Int -> Int -> Int
clip Int
n Int
left Int
right = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
right (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
n Int
left

{- | O(n * k) Filter rows by a given condition.

> filter "x" even df
-}
filter ::
    forall a.
    (Columnable a) =>
    -- | Column to filter by
    T.Text ->
    -- | Filter condition
    (a -> Bool) ->
    -- | Dataframe to filter
    DataFrame ->
    DataFrame
filter :: forall a.
Columnable a =>
Text -> (a -> Bool) -> DataFrame -> DataFrame
filter Text
filterColumnName a -> Bool
condition DataFrame
df = case Text -> DataFrame -> Maybe Column
getColumn Text
filterColumnName DataFrame
df of
    Maybe Column
Nothing ->
        DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$
            Text -> Text -> [Text] -> DataFrameException
ColumnNotFoundException Text
filterColumnName Text
"filter" (Map Text Int -> [Text]
forall k a. Map k a -> [k]
M.keys (Map Text Int -> [Text]) -> Map Text Int -> [Text]
forall a b. (a -> b) -> a -> b
$ DataFrame -> Map Text Int
columnIndices DataFrame
df)
    Just (BoxedColumn (Vector a
column :: V.Vector b)) -> Text -> Vector a -> (a -> Bool) -> DataFrame -> DataFrame
forall a b (v :: * -> *).
(Vector v b, Columnable a, Columnable b) =>
Text -> v b -> (a -> Bool) -> DataFrame -> DataFrame
filterByVector Text
filterColumnName Vector a
column a -> Bool
condition DataFrame
df
    Just (OptionalColumn (Vector (Maybe a)
column :: V.Vector b)) -> Text -> Vector (Maybe a) -> (a -> Bool) -> DataFrame -> DataFrame
forall a b (v :: * -> *).
(Vector v b, Columnable a, Columnable b) =>
Text -> v b -> (a -> Bool) -> DataFrame -> DataFrame
filterByVector Text
filterColumnName Vector (Maybe a)
column a -> Bool
condition DataFrame
df
    Just (UnboxedColumn (Vector a
column :: VU.Vector b)) -> Text -> Vector a -> (a -> Bool) -> DataFrame -> DataFrame
forall a b (v :: * -> *).
(Vector v b, Columnable a, Columnable b) =>
Text -> v b -> (a -> Bool) -> DataFrame -> DataFrame
filterByVector Text
filterColumnName Vector a
column a -> Bool
condition DataFrame
df

filterByVector ::
    forall a b v.
    (VG.Vector v b, Columnable a, Columnable b) =>
    T.Text -> v b -> (a -> Bool) -> DataFrame -> DataFrame
filterByVector :: forall a b (v :: * -> *).
(Vector v b, Columnable a, Columnable b) =>
Text -> v b -> (a -> Bool) -> DataFrame -> DataFrame
filterByVector Text
filterColumnName v b
column a -> Bool
condition DataFrame
df = case TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @b) of
    Maybe (a :~: b)
Nothing ->
        DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$
            TypeErrorContext a b -> DataFrameException
forall a b.
(Typeable a, Typeable b) =>
TypeErrorContext a b -> DataFrameException
TypeMismatchException
                ( MkTypeErrorContext
                    { userType :: Either String (TypeRep a)
userType = TypeRep a -> Either String (TypeRep a)
forall a b. b -> Either a b
Right (TypeRep a -> Either String (TypeRep a))
-> TypeRep a -> Either String (TypeRep a)
forall a b. (a -> b) -> a -> b
$ forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a
                    , expectedType :: Either String (TypeRep b)
expectedType = TypeRep b -> Either String (TypeRep b)
forall a b. b -> Either a b
Right (TypeRep b -> Either String (TypeRep b))
-> TypeRep b -> Either String (TypeRep b)
forall a b. (a -> b) -> a -> b
$ forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @b
                    , errorColumnName :: Maybe String
errorColumnName = String -> Maybe String
forall a. a -> Maybe a
Just (Text -> String
T.unpack Text
filterColumnName)
                    , callingFunctionName :: Maybe String
callingFunctionName = String -> Maybe String
forall a. a -> Maybe a
Just String
"filter"
                    }
                )
    Just a :~: b
Refl ->
        let
            ixs :: Vector Int
ixs = (a -> Bool) -> v a -> Vector Int
forall (v :: * -> *) a.
Vector v a =>
(a -> Bool) -> v a -> Vector Int
indexes a -> Bool
condition v a
v b
column
         in
            DataFrame
df
                { columns = V.map (atIndicesStable ixs) (columns df)
                , dataframeDimensions = (VG.length ixs, snd (dataframeDimensions df))
                }

indexes :: (VG.Vector v a) => (a -> Bool) -> v a -> VU.Vector Int
indexes :: forall (v :: * -> *) a.
Vector v a =>
(a -> Bool) -> v a -> Vector Int
indexes a -> Bool
condition v a
cols = (forall s. ST s (Vector Int)) -> Vector Int
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s (Vector Int)) -> Vector Int)
-> (forall s. ST s (Vector Int)) -> Vector Int
forall a b. (a -> b) -> a -> b
$ do
    MVector s Int
ixs <- Int -> ST s (MVector (PrimState (ST s)) Int)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
Int -> m (MVector (PrimState m) a)
VUM.new Int
8192
    (!Int
icount, Int
_, Int
_, !MVector s Int
ixs') <-
        ((Int, Int, Int, MVector s Int)
 -> a -> ST s (Int, Int, Int, MVector s Int))
-> (Int, Int, Int, MVector s Int)
-> v a
-> ST s (Int, Int, Int, MVector s Int)
forall (m :: * -> *) (v :: * -> *) b a.
(Monad m, Vector v b) =>
(a -> b -> m a) -> a -> v b -> m a
VG.foldM
            ( \(!Int
icount, !Int
vcount, !Int
cap, MVector s Int
mv) a
v -> do
                if Bool -> Bool
not (a -> Bool
condition a
v)
                    then
                        (Int, Int, Int, MVector s Int)
-> ST s (Int, Int, Int, MVector s Int)
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Int
icount, Int
vcount Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1, Int
cap, MVector s Int
mv)
                    else do
                        let shouldGrow :: Bool
shouldGrow = Int
icount Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
cap
                        MVector s Int
mv' <- if Bool
shouldGrow then MVector (PrimState (ST s)) Int
-> Int -> ST s (MVector (PrimState (ST s)) Int)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m (MVector (PrimState m) a)
VUM.grow MVector s Int
MVector (PrimState (ST s)) Int
mv Int
cap else MVector s Int -> ST s (MVector s Int)
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure MVector s Int
mv
                        MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.write MVector s Int
MVector (PrimState (ST s)) Int
mv' Int
icount Int
vcount
                        (Int, Int, Int, MVector s Int)
-> ST s (Int, Int, Int, MVector s Int)
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Int
icount Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1, Int
vcount Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1, Int
cap Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
cap Int -> Int -> Int
forall a. Num a => a -> a -> a
* Bool -> Int
forall a. Enum a => a -> Int
fromEnum Bool
shouldGrow), MVector s Int
mv')
            )
            (Int
0, Int
0, Int
8192, MVector s Int
ixs)
            v a
cols
    MVector (PrimState (ST s)) Int -> ST s (Vector Int)
forall a (m :: * -> *).
(Unbox a, PrimMonad m) =>
MVector (PrimState m) a -> m (Vector a)
VU.freeze (Int -> Int -> MVector s Int -> MVector s Int
forall a s. Unbox a => Int -> Int -> MVector s a -> MVector s a
VUM.slice Int
0 Int
icount MVector s Int
ixs')

{- | O(k) a version of filter where the predicate comes first.

> filterBy even "x" df
-}
filterBy :: (Columnable a) => (a -> Bool) -> T.Text -> DataFrame -> DataFrame
filterBy :: forall a.
Columnable a =>
(a -> Bool) -> Text -> DataFrame -> DataFrame
filterBy = (Text -> (a -> Bool) -> DataFrame -> DataFrame)
-> (a -> Bool) -> Text -> DataFrame -> DataFrame
forall a b c. (a -> b -> c) -> b -> a -> c
flip Text -> (a -> Bool) -> DataFrame -> DataFrame
forall a.
Columnable a =>
Text -> (a -> Bool) -> DataFrame -> DataFrame
filter

{- | O(k) filters the dataframe with a boolean expression.

> filterWhere (F.col @Int x + F.col y F.> 5) df
-}
filterWhere :: Expr Bool -> DataFrame -> DataFrame
filterWhere :: Expr Bool -> DataFrame -> DataFrame
filterWhere Expr Bool
expr DataFrame
df =
    let
        (TColumn Column
col) = case forall a.
Columnable a =>
DataFrame -> Expr a -> Either DataFrameException (TypedColumn a)
interpret @Bool DataFrame
df Expr Bool
expr of
            Left DataFrameException
e -> DataFrameException -> TypedColumn Bool
forall a e. Exception e => e -> a
throw DataFrameException
e
            Right TypedColumn Bool
c -> TypedColumn Bool
c
        indexes :: Vector Int
indexes = case (Bool -> Bool) -> Column -> Either DataFrameException (Vector Int)
forall a.
Columnable a =>
(a -> Bool) -> Column -> Either DataFrameException (Vector Int)
findIndices (Bool -> Bool -> Bool
forall a. Eq a => a -> a -> Bool
== Bool
True) Column
col of
            Right Vector Int
ixs -> Vector Int
ixs
            Left DataFrameException
e -> DataFrameException -> Vector Int
forall a e. Exception e => e -> a
throw DataFrameException
e
        c' :: Int
c' = (Int, Int) -> Int
forall a b. (a, b) -> b
snd ((Int, Int) -> Int) -> (Int, Int) -> Int
forall a b. (a -> b) -> a -> b
$ DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df
     in
        DataFrame
df
            { columns = V.map (atIndicesStable indexes) (columns df)
            , dataframeDimensions = (VU.length indexes, c')
            }

{- | O(k) removes all rows with `Nothing` in a given column from the dataframe.

> filterJust "col" df
-}
filterJust :: T.Text -> DataFrame -> DataFrame
filterJust :: Text -> DataFrame -> DataFrame
filterJust Text
name DataFrame
df = case Text -> DataFrame -> Maybe Column
getColumn Text
name DataFrame
df of
    Maybe Column
Nothing ->
        DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$ Text -> Text -> [Text] -> DataFrameException
ColumnNotFoundException Text
name Text
"filterJust" (Map Text Int -> [Text]
forall k a. Map k a -> [k]
M.keys (Map Text Int -> [Text]) -> Map Text Int -> [Text]
forall a b. (a -> b) -> a -> b
$ DataFrame -> Map Text Int
columnIndices DataFrame
df)
    Just column :: Column
column@(OptionalColumn (Vector (Maybe a)
col :: V.Vector (Maybe a))) -> forall a.
Columnable a =>
Text -> (a -> Bool) -> DataFrame -> DataFrame
filter @(Maybe a) Text
name Maybe a -> Bool
forall a. Maybe a -> Bool
isJust DataFrame
df DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& forall b c.
(Columnable b, Columnable c) =>
(b -> c) -> Text -> DataFrame -> DataFrame
apply @(Maybe a) Maybe a -> a
forall a. HasCallStack => Maybe a -> a
fromJust Text
name
    Just Column
column -> DataFrame
df

{- | O(k) returns all rows with `Nothing` in a give column.

> filterNothing "col" df
-}
filterNothing :: T.Text -> DataFrame -> DataFrame
filterNothing :: Text -> DataFrame -> DataFrame
filterNothing Text
name DataFrame
df = case Text -> DataFrame -> Maybe Column
getColumn Text
name DataFrame
df of
    Maybe Column
Nothing ->
        DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$ Text -> Text -> [Text] -> DataFrameException
ColumnNotFoundException Text
name Text
"filterNothing" (Map Text Int -> [Text]
forall k a. Map k a -> [k]
M.keys (Map Text Int -> [Text]) -> Map Text Int -> [Text]
forall a b. (a -> b) -> a -> b
$ DataFrame -> Map Text Int
columnIndices DataFrame
df)
    Just (OptionalColumn (Vector (Maybe a)
col :: V.Vector (Maybe a))) -> forall a.
Columnable a =>
Text -> (a -> Bool) -> DataFrame -> DataFrame
filter @(Maybe a) Text
name Maybe a -> Bool
forall a. Maybe a -> Bool
isNothing DataFrame
df
    Maybe Column
_ -> DataFrame
df

{- | O(n * k) removes all rows with `Nothing` from the dataframe.

> filterAllJust df
-}
filterAllJust :: DataFrame -> DataFrame
filterAllJust :: DataFrame -> DataFrame
filterAllJust DataFrame
df = (Text -> DataFrame -> DataFrame)
-> DataFrame -> [Text] -> DataFrame
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Text -> DataFrame -> DataFrame
filterJust DataFrame
df (DataFrame -> [Text]
columnNames DataFrame
df)

{- | O(k) cuts the dataframe in a cube of size (a, b) where
  a is the length and b is the width.

> cube (10, 5) df
-}
cube :: (Int, Int) -> DataFrame -> DataFrame
cube :: (Int, Int) -> DataFrame -> DataFrame
cube (Int
length, Int
width) = Int -> DataFrame -> DataFrame
take Int
length (DataFrame -> DataFrame)
-> (DataFrame -> DataFrame) -> DataFrame -> DataFrame
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [SelectionCriteria] -> DataFrame -> DataFrame
selectBy [(Int, Int) -> SelectionCriteria
ColumnIndexRange (Int
0, Int
width Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)]

{- | O(n) Selects a number of columns in a given dataframe.

> select ["name", "age"] df
-}
select ::
    [T.Text] ->
    DataFrame ->
    DataFrame
select :: [Text] -> DataFrame -> DataFrame
select [Text]
cs DataFrame
df
    | [Text] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
L.null [Text]
cs = DataFrame
empty
    | (Text -> Bool) -> [Text] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Text -> [Text] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`notElem` DataFrame -> [Text]
columnNames DataFrame
df) [Text]
cs =
        DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$
            Text -> Text -> [Text] -> DataFrameException
ColumnNotFoundException
                (String -> Text
T.pack (String -> Text) -> String -> Text
forall a b. (a -> b) -> a -> b
$ [Text] -> String
forall a. Show a => a -> String
show ([Text] -> String) -> [Text] -> String
forall a b. (a -> b) -> a -> b
$ [Text]
cs [Text] -> [Text] -> [Text]
forall a. Eq a => [a] -> [a] -> [a]
L.\\ DataFrame -> [Text]
columnNames DataFrame
df)
                Text
"select"
                (DataFrame -> [Text]
columnNames DataFrame
df)
    | Bool
otherwise = (DataFrame -> Text -> DataFrame)
-> DataFrame -> [Text] -> DataFrame
forall b a. (b -> a -> b) -> b -> [a] -> b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
L.foldl' DataFrame -> Text -> DataFrame
addKeyValue DataFrame
empty [Text]
cs
  where
    addKeyValue :: DataFrame -> Text -> DataFrame
addKeyValue DataFrame
d Text
k = DataFrame -> Maybe DataFrame -> DataFrame
forall a. a -> Maybe a -> a
fromMaybe DataFrame
df (Maybe DataFrame -> DataFrame) -> Maybe DataFrame -> DataFrame
forall a b. (a -> b) -> a -> b
$ do
        Column
col <- Text -> DataFrame -> Maybe Column
getColumn Text
k DataFrame
df
        DataFrame -> Maybe DataFrame
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (DataFrame -> Maybe DataFrame) -> DataFrame -> Maybe DataFrame
forall a b. (a -> b) -> a -> b
$ Text -> Column -> DataFrame -> DataFrame
insertColumn Text
k Column
col DataFrame
d

data SelectionCriteria
    = ColumnProperty (Column -> Bool)
    | ColumnNameProperty (T.Text -> Bool)
    | ColumnTextRange (T.Text, T.Text)
    | ColumnIndexRange (Int, Int)
    | ColumnName T.Text

{- | Criteria for selecting a column by name.

> selectBy [byName "Age"] df

equivalent to:

> select ["Age"] df
-}
byName :: T.Text -> SelectionCriteria
byName :: Text -> SelectionCriteria
byName = Text -> SelectionCriteria
ColumnName

{- | Criteria for selecting columns whose property satisfies given predicate.

> selectBy [byProperty isNumeric] df
-}
byProperty :: (Column -> Bool) -> SelectionCriteria
byProperty :: (Column -> Bool) -> SelectionCriteria
byProperty = (Column -> Bool) -> SelectionCriteria
ColumnProperty

{- | Criteria for selecting columns whose name satisfies given predicate.

> selectBy [byNameProperty (T.isPrefixOf "weight")] df
-}
byNameProperty :: (T.Text -> Bool) -> SelectionCriteria
byNameProperty :: (Text -> Bool) -> SelectionCriteria
byNameProperty = (Text -> Bool) -> SelectionCriteria
ColumnNameProperty

{- | Criteria for selecting columns whose names are in the given lexicographic range (inclusive).

> selectBy [byNameRange ("a", "c")] df
-}
byNameRange :: (T.Text, T.Text) -> SelectionCriteria
byNameRange :: (Text, Text) -> SelectionCriteria
byNameRange = (Text, Text) -> SelectionCriteria
ColumnTextRange

{- | Criteria for selecting columns whose indices are in the given (inclusive) range.

> selectBy [byIndexRange (0, 5)] df
-}
byIndexRange :: (Int, Int) -> SelectionCriteria
byIndexRange :: (Int, Int) -> SelectionCriteria
byIndexRange = (Int, Int) -> SelectionCriteria
ColumnIndexRange

-- | O(n) select columns by column predicate name.
selectBy :: [SelectionCriteria] -> DataFrame -> DataFrame
selectBy :: [SelectionCriteria] -> DataFrame -> DataFrame
selectBy [SelectionCriteria]
xs DataFrame
df = [Text] -> DataFrame -> DataFrame
select [Text]
columnsWithProperties DataFrame
df
  where
    columnsWithProperties :: [Text]
columnsWithProperties = ([Text] -> SelectionCriteria -> [Text])
-> [Text] -> [SelectionCriteria] -> [Text]
forall b a. (b -> a -> b) -> b -> [a] -> b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
L.foldl' [Text] -> SelectionCriteria -> [Text]
columnWithProperty [] [SelectionCriteria]
xs
    columnWithProperty :: [Text] -> SelectionCriteria -> [Text]
columnWithProperty [Text]
acc (ColumnName Text
name) = [Text]
acc [Text] -> [Text] -> [Text]
forall a. [a] -> [a] -> [a]
++ [Text
name]
    columnWithProperty [Text]
acc (ColumnNameProperty Text -> Bool
f) = [Text]
acc [Text] -> [Text] -> [Text]
forall a. [a] -> [a] -> [a]
++ (Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
L.filter Text -> Bool
f (DataFrame -> [Text]
columnNames DataFrame
df)
    columnWithProperty [Text]
acc (ColumnTextRange (Text
from, Text
to)) =
        [Text]
acc
            [Text] -> [Text] -> [Text]
forall a. [a] -> [a] -> [a]
++ [Text] -> [Text]
forall a. [a] -> [a]
reverse
                ((Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
Prelude.dropWhile (Text
to Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
/=) ([Text] -> [Text]) -> [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ [Text] -> [Text]
forall a. [a] -> [a]
reverse ([Text] -> [Text]) -> [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ (Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
dropWhile (Text
from Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
/=) (DataFrame -> [Text]
columnNames DataFrame
df))
    columnWithProperty [Text]
acc (ColumnIndexRange (Int
from, Int
to)) = [Text]
acc [Text] -> [Text] -> [Text]
forall a. [a] -> [a] -> [a]
++ Int -> [Text] -> [Text]
forall a. Int -> [a] -> [a]
Prelude.take (Int
to Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
from Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int -> [Text] -> [Text]
forall a. Int -> [a] -> [a]
Prelude.drop Int
from (DataFrame -> [Text]
columnNames DataFrame
df))
    columnWithProperty [Text]
acc (ColumnProperty Column -> Bool
f) =
        [Text]
acc
            [Text] -> [Text] -> [Text]
forall a. [a] -> [a] -> [a]
++ ((Text, Int) -> Text) -> [(Text, Int)] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map (Text, Int) -> Text
forall a b. (a, b) -> a
fst (((Text, Int) -> Bool) -> [(Text, Int)] -> [(Text, Int)]
forall a. (a -> Bool) -> [a] -> [a]
L.filter (\(Text
k, Int
v) -> Int
v Int -> [Int] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Int]
ixs) (Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toAscList (DataFrame -> Map Text Int
columnIndices DataFrame
df)))
      where
        ixs :: [Int]
ixs = ([Int] -> Int -> Column -> [Int])
-> [Int] -> Vector Column -> [Int]
forall a b. (a -> Int -> b -> a) -> a -> Vector b -> a
V.ifoldl' (\[Int]
acc Int
i Column
c -> if Column -> Bool
f Column
c then Int
i Int -> [Int] -> [Int]
forall a. a -> [a] -> [a]
: [Int]
acc else [Int]
acc) [] (DataFrame -> Vector Column
columns DataFrame
df)

{- | O(n) inverse of select

> exclude ["Name"] df
-}
exclude ::
    [T.Text] ->
    DataFrame ->
    DataFrame
exclude :: [Text] -> DataFrame -> DataFrame
exclude [Text]
cs DataFrame
df =
    let keysToKeep :: [Text]
keysToKeep = DataFrame -> [Text]
columnNames DataFrame
df [Text] -> [Text] -> [Text]
forall a. Eq a => [a] -> [a] -> [a]
L.\\ [Text]
cs
     in [Text] -> DataFrame -> DataFrame
select [Text]
keysToKeep DataFrame
df

{- | Sample a dataframe. The double parameter must be between 0 and 1 (inclusive).

==== __Example__
@
ghci> import System.Random
ghci> D.sample (mkStdGen 137) 0.1 df

@
-}
sample :: (RandomGen g) => g -> Double -> DataFrame -> DataFrame
sample :: forall g. RandomGen g => g -> Double -> DataFrame -> DataFrame
sample g
pureGen Double
p DataFrame
df =
    let
        rand :: Vector Double
rand = g -> Int -> Vector Double
forall g. RandomGen g => g -> Int -> Vector Double
generateRandomVector g
pureGen ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df))
     in
        DataFrame
df
            DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Text -> Vector Double -> DataFrame -> DataFrame
forall a.
(Columnable a, Unbox a) =>
Text -> Vector a -> DataFrame -> DataFrame
insertUnboxedVector Text
"__rand__" Vector Double
rand
            DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Expr Bool -> DataFrame -> DataFrame
filterWhere (Text
-> (Double -> Double -> Bool)
-> Expr Double
-> Expr Double
-> Expr Bool
forall c b a.
(Columnable c, Columnable b, Columnable a) =>
Text -> (c -> b -> a) -> Expr c -> Expr b -> Expr a
BinaryOp Text
"geq" Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
(>=) (forall a. Columnable a => Text -> Expr a
Col @Double Text
"__rand__") (Double -> Expr Double
forall a. Columnable a => a -> Expr a
Lit (Double
1 Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
p)))
            DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& [Text] -> DataFrame -> DataFrame
exclude [Text
"__rand__"]

{- | Split a dataset into two. The first in the tuple gets a sample of p (0 <= p <= 1) and the second gets (1 - p). This is useful for creating test and train splits.

==== __Example__
@
ghci> import System.Random
ghci> D.randomSplit (mkStdGen 137) 0.9 df

@
-}
randomSplit ::
    (RandomGen g) => g -> Double -> DataFrame -> (DataFrame, DataFrame)
randomSplit :: forall g.
RandomGen g =>
g -> Double -> DataFrame -> (DataFrame, DataFrame)
randomSplit g
pureGen Double
p DataFrame
df =
    let
        rand :: Vector Double
rand = g -> Int -> Vector Double
forall g. RandomGen g => g -> Int -> Vector Double
generateRandomVector g
pureGen ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df))
        withRand :: DataFrame
withRand = DataFrame
df DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Text -> Vector Double -> DataFrame -> DataFrame
forall a.
(Columnable a, Unbox a) =>
Text -> Vector a -> DataFrame -> DataFrame
insertUnboxedVector Text
"__rand__" Vector Double
rand
     in
        ( DataFrame
withRand
            DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Expr Bool -> DataFrame -> DataFrame
filterWhere (Text
-> (Double -> Double -> Bool)
-> Expr Double
-> Expr Double
-> Expr Bool
forall c b a.
(Columnable c, Columnable b, Columnable a) =>
Text -> (c -> b -> a) -> Expr c -> Expr b -> Expr a
BinaryOp Text
"leq" Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
(<=) (forall a. Columnable a => Text -> Expr a
Col @Double Text
"__rand__") (Double -> Expr Double
forall a. Columnable a => a -> Expr a
Lit Double
p))
            DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& [Text] -> DataFrame -> DataFrame
exclude [Text
"__rand__"]
        , DataFrame
withRand
            DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Expr Bool -> DataFrame -> DataFrame
filterWhere (Text
-> (Double -> Double -> Bool)
-> Expr Double
-> Expr Double
-> Expr Bool
forall c b a.
(Columnable c, Columnable b, Columnable a) =>
Text -> (c -> b -> a) -> Expr c -> Expr b -> Expr a
BinaryOp Text
"gt" Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
(>) (forall a. Columnable a => Text -> Expr a
Col @Double Text
"__rand__") (Double -> Expr Double
forall a. Columnable a => a -> Expr a
Lit Double
p))
            DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& [Text] -> DataFrame -> DataFrame
exclude [Text
"__rand__"]
        )

{- | Creates n folds of a dataframe.

==== __Example__
@
ghci> import System.Random
ghci> D.kFolds (mkStdGen 137) 5 df

@
-}
kFolds :: (RandomGen g) => g -> Int -> DataFrame -> [DataFrame]
kFolds :: forall g. RandomGen g => g -> Int -> DataFrame -> [DataFrame]
kFolds g
pureGen Int
folds DataFrame
df =
    let
        rand :: Vector Double
rand = g -> Int -> Vector Double
forall g. RandomGen g => g -> Int -> Vector Double
generateRandomVector g
pureGen ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df))
        withRand :: DataFrame
withRand = DataFrame
df DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Text -> Vector Double -> DataFrame -> DataFrame
forall a.
(Columnable a, Unbox a) =>
Text -> Vector a -> DataFrame -> DataFrame
insertUnboxedVector Text
"__rand__" Vector Double
rand
        partitionSize :: Double
partitionSize = Double
1 Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
folds)
        singleFold :: Int -> DataFrame -> DataFrame
singleFold Int
n DataFrame
d =
            DataFrame
d
                DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Expr Bool -> DataFrame -> DataFrame
filterWhere
                    ( Text
-> (Double -> Double -> Bool)
-> Expr Double
-> Expr Double
-> Expr Bool
forall c b a.
(Columnable c, Columnable b, Columnable a) =>
Text -> (c -> b -> a) -> Expr c -> Expr b -> Expr a
BinaryOp
                        Text
"geq"
                        Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
(>=)
                        (forall a. Columnable a => Text -> Expr a
Col @Double Text
"__rand__")
                        (Double -> Expr Double
forall a. Columnable a => a -> Expr a
Lit (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
partitionSize))
                    )
        go :: Int -> DataFrame -> [DataFrame]
go (-1) DataFrame
_ = []
        go Int
n DataFrame
d =
            let
                d' :: DataFrame
d' = Int -> DataFrame -> DataFrame
singleFold Int
n DataFrame
d
                d'' :: DataFrame
d'' =
                    DataFrame
d
                        DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Expr Bool -> DataFrame -> DataFrame
filterWhere
                            ( Text
-> (Double -> Double -> Bool)
-> Expr Double
-> Expr Double
-> Expr Bool
forall c b a.
(Columnable c, Columnable b, Columnable a) =>
Text -> (c -> b -> a) -> Expr c -> Expr b -> Expr a
BinaryOp
                                Text
"lt"
                                Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
(<)
                                (forall a. Columnable a => Text -> Expr a
Col @Double Text
"__rand__")
                                (Double -> Expr Double
forall a. Columnable a => a -> Expr a
Lit (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
partitionSize))
                            )
             in
                DataFrame
d' DataFrame -> [DataFrame] -> [DataFrame]
forall a. a -> [a] -> [a]
: Int -> DataFrame -> [DataFrame]
go (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) DataFrame
d''
     in
        (DataFrame -> DataFrame) -> [DataFrame] -> [DataFrame]
forall a b. (a -> b) -> [a] -> [b]
map ([Text] -> DataFrame -> DataFrame
exclude [Text
"__rand__"]) (Int -> DataFrame -> [DataFrame]
go (Int
folds Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) DataFrame
withRand)

generateRandomVector :: (RandomGen g) => g -> Int -> VU.Vector Double
generateRandomVector :: forall g. RandomGen g => g -> Int -> Vector Double
generateRandomVector g
pureGen Int
k = [Double] -> Vector Double
forall a. Unbox a => [a] -> Vector a
VU.fromList ([Double] -> Vector Double) -> [Double] -> Vector Double
forall a b. (a -> b) -> a -> b
$ g -> Int -> [Double]
forall {t} {t}. (Eq t, Num t, RandomGen t) => t -> t -> [Double]
go g
pureGen Int
k
  where
    go :: t -> t -> [Double]
go t
g t
0 = []
    go t
g t
n =
        let
            (Double
v, t
g') = (Double, Double) -> t -> (Double, t)
forall a g. (UniformRange a, RandomGen g) => (a, a) -> g -> (a, g)
uniformR (Double
0 :: Double, Double
1 :: Double) t
g
         in
            Double
v Double -> [Double] -> [Double]
forall a. a -> [a] -> [a]
: t -> t -> [Double]
go t
g' (t
n t -> t -> t
forall a. Num a => a -> a -> a
- t
1)