{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
module DataFrame.Operations.Subset where
import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Unboxed as VU
import qualified Prelude
import Control.Exception (throw)
import Data.Function ((&))
import Data.Maybe (
fromJust,
fromMaybe,
isJust,
isNothing,
)
import Data.Type.Equality (TestEquality (..))
import DataFrame.Errors (
DataFrameException (..),
TypeErrorContext (..),
)
import DataFrame.Internal.Column
import DataFrame.Internal.DataFrame (
DataFrame (..),
empty,
getColumn,
)
import DataFrame.Internal.Expression
import DataFrame.Internal.Interpreter
import DataFrame.Operations.Core
import DataFrame.Operations.Transformations (apply)
import System.Random
import Type.Reflection
import Prelude hiding (filter, take)
take :: Int -> DataFrame -> DataFrame
take :: Int -> DataFrame -> DataFrame
take Int
n DataFrame
d = DataFrame
d{columns = V.map (takeColumn n') (columns d), dataframeDimensions = (n', c)}
where
(Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
n' :: Int
n' = Int -> Int -> Int -> Int
clip Int
n Int
0 Int
r
takeLast :: Int -> DataFrame -> DataFrame
takeLast :: Int -> DataFrame -> DataFrame
takeLast Int
n DataFrame
d =
DataFrame
d
{ columns = V.map (takeLastColumn n') (columns d)
, dataframeDimensions = (n', c)
}
where
(Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
n' :: Int
n' = Int -> Int -> Int -> Int
clip Int
n Int
0 Int
r
drop :: Int -> DataFrame -> DataFrame
drop :: Int -> DataFrame -> DataFrame
drop Int
n DataFrame
d =
DataFrame
d
{ columns = V.map (sliceColumn n' (max (r - n') 0)) (columns d)
, dataframeDimensions = (max (r - n') 0, c)
}
where
(Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
n' :: Int
n' = Int -> Int -> Int -> Int
clip Int
n Int
0 Int
r
dropLast :: Int -> DataFrame -> DataFrame
dropLast :: Int -> DataFrame -> DataFrame
dropLast Int
n DataFrame
d =
DataFrame
d{columns = V.map (sliceColumn 0 n') (columns d), dataframeDimensions = (n', c)}
where
(Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
n' :: Int
n' = Int -> Int -> Int -> Int
clip (Int
r Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
n) Int
0 Int
r
range :: (Int, Int) -> DataFrame -> DataFrame
range :: (Int, Int) -> DataFrame -> DataFrame
range (Int
start, Int
end) DataFrame
d =
DataFrame
d
{ columns = V.map (sliceColumn (clip start 0 r) n') (columns d)
, dataframeDimensions = (n', c)
}
where
(Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
n' :: Int
n' = Int -> Int -> Int -> Int
clip (Int
end Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
start) Int
0 Int
r
clip :: Int -> Int -> Int -> Int
clip :: Int -> Int -> Int -> Int
clip Int
n Int
left Int
right = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
right (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
n Int
left
filter ::
forall a.
(Columnable a) =>
Expr a ->
(a -> Bool) ->
DataFrame ->
DataFrame
filter :: forall a.
Columnable a =>
Expr a -> (a -> Bool) -> DataFrame -> DataFrame
filter (Col Text
filterColumnName) a -> Bool
condition DataFrame
df = case Text -> DataFrame -> Maybe Column
getColumn Text
filterColumnName DataFrame
df of
Maybe Column
Nothing ->
DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$
Text -> Text -> [Text] -> DataFrameException
ColumnNotFoundException Text
filterColumnName Text
"filter" (Map Text Int -> [Text]
forall k a. Map k a -> [k]
M.keys (Map Text Int -> [Text]) -> Map Text Int -> [Text]
forall a b. (a -> b) -> a -> b
$ DataFrame -> Map Text Int
columnIndices DataFrame
df)
Just (BoxedColumn (Vector a
column :: V.Vector b)) -> Text -> Vector a -> (a -> Bool) -> DataFrame -> DataFrame
forall a b (v :: * -> *).
(Vector v b, Vector v Int, Columnable a, Columnable b) =>
Text -> v b -> (a -> Bool) -> DataFrame -> DataFrame
filterByVector Text
filterColumnName Vector a
column a -> Bool
condition DataFrame
df
Just (OptionalColumn (Vector (Maybe a)
column :: V.Vector b)) -> Text -> Vector (Maybe a) -> (a -> Bool) -> DataFrame -> DataFrame
forall a b (v :: * -> *).
(Vector v b, Vector v Int, Columnable a, Columnable b) =>
Text -> v b -> (a -> Bool) -> DataFrame -> DataFrame
filterByVector Text
filterColumnName Vector (Maybe a)
column a -> Bool
condition DataFrame
df
Just (UnboxedColumn (Vector a
column :: VU.Vector b)) -> Text -> Vector a -> (a -> Bool) -> DataFrame -> DataFrame
forall a b (v :: * -> *).
(Vector v b, Vector v Int, Columnable a, Columnable b) =>
Text -> v b -> (a -> Bool) -> DataFrame -> DataFrame
filterByVector Text
filterColumnName Vector a
column a -> Bool
condition DataFrame
df
filter Expr a
expr a -> Bool
condition DataFrame
df =
let
(TColumn Column
col) = case forall a.
Columnable a =>
DataFrame -> Expr a -> Either DataFrameException (TypedColumn a)
interpret @a DataFrame
df (Expr a -> Expr a
forall a. (Eq a, Ord a, Show a, Typeable a) => Expr a -> Expr a
normalize Expr a
expr) of
Left DataFrameException
e -> DataFrameException -> TypedColumn a
forall a e. Exception e => e -> a
throw DataFrameException
e
Right TypedColumn a
c -> TypedColumn a
c
indexes :: Vector Int
indexes = case (a -> Bool) -> Column -> Either DataFrameException (Vector Int)
forall a.
Columnable a =>
(a -> Bool) -> Column -> Either DataFrameException (Vector Int)
findIndices a -> Bool
condition Column
col of
Right Vector Int
ixs -> Vector Int
ixs
Left DataFrameException
e -> DataFrameException -> Vector Int
forall a e. Exception e => e -> a
throw DataFrameException
e
c' :: Int
c' = (Int, Int) -> Int
forall a b. (a, b) -> b
snd ((Int, Int) -> Int) -> (Int, Int) -> Int
forall a b. (a -> b) -> a -> b
$ DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df
in
DataFrame
df
{ columns = V.map (atIndicesStable indexes) (columns df)
, dataframeDimensions = (VU.length indexes, c')
}
filterByVector ::
forall a b v.
(VG.Vector v b, VG.Vector v Int, Columnable a, Columnable b) =>
T.Text -> v b -> (a -> Bool) -> DataFrame -> DataFrame
filterByVector :: forall a b (v :: * -> *).
(Vector v b, Vector v Int, Columnable a, Columnable b) =>
Text -> v b -> (a -> Bool) -> DataFrame -> DataFrame
filterByVector Text
filterColumnName v b
column a -> Bool
condition DataFrame
df = case TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @b) of
Maybe (a :~: b)
Nothing ->
DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$
TypeErrorContext a b -> DataFrameException
forall a b.
(Typeable a, Typeable b) =>
TypeErrorContext a b -> DataFrameException
TypeMismatchException
( MkTypeErrorContext
{ userType :: Either String (TypeRep a)
userType = TypeRep a -> Either String (TypeRep a)
forall a b. b -> Either a b
Right (TypeRep a -> Either String (TypeRep a))
-> TypeRep a -> Either String (TypeRep a)
forall a b. (a -> b) -> a -> b
$ forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a
, expectedType :: Either String (TypeRep b)
expectedType = TypeRep b -> Either String (TypeRep b)
forall a b. b -> Either a b
Right (TypeRep b -> Either String (TypeRep b))
-> TypeRep b -> Either String (TypeRep b)
forall a b. (a -> b) -> a -> b
$ forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @b
, errorColumnName :: Maybe String
errorColumnName = String -> Maybe String
forall a. a -> Maybe a
Just (Text -> String
T.unpack Text
filterColumnName)
, callingFunctionName :: Maybe String
callingFunctionName = String -> Maybe String
forall a. a -> Maybe a
Just String
"filter"
}
)
Just a :~: b
Refl ->
let
ixs :: Vector Int
ixs = v Int -> Vector Int
forall (v :: * -> *) a (w :: * -> *).
(Vector v a, Vector w a) =>
v a -> w a
VG.convert ((a -> Bool) -> v a -> v Int
forall (v :: * -> *) a.
(Vector v a, Vector v Int) =>
(a -> Bool) -> v a -> v Int
VG.findIndices a -> Bool
condition v a
v b
column)
in
DataFrame
df
{ columns = V.map (atIndicesStable ixs) (columns df)
, dataframeDimensions = (VG.length ixs, snd (dataframeDimensions df))
}
filterBy :: (Columnable a) => (a -> Bool) -> Expr a -> DataFrame -> DataFrame
filterBy :: forall a.
Columnable a =>
(a -> Bool) -> Expr a -> DataFrame -> DataFrame
filterBy = (Expr a -> (a -> Bool) -> DataFrame -> DataFrame)
-> (a -> Bool) -> Expr a -> DataFrame -> DataFrame
forall a b c. (a -> b -> c) -> b -> a -> c
flip Expr a -> (a -> Bool) -> DataFrame -> DataFrame
forall a.
Columnable a =>
Expr a -> (a -> Bool) -> DataFrame -> DataFrame
filter
filterWhere :: Expr Bool -> DataFrame -> DataFrame
filterWhere :: Expr Bool -> DataFrame -> DataFrame
filterWhere Expr Bool
expr DataFrame
df =
let
(TColumn Column
col) = case forall a.
Columnable a =>
DataFrame -> Expr a -> Either DataFrameException (TypedColumn a)
interpret @Bool DataFrame
df (Expr Bool -> Expr Bool
forall a. (Eq a, Ord a, Show a, Typeable a) => Expr a -> Expr a
normalize Expr Bool
expr) of
Left DataFrameException
e -> DataFrameException -> TypedColumn Bool
forall a e. Exception e => e -> a
throw DataFrameException
e
Right TypedColumn Bool
c -> TypedColumn Bool
c
indexes :: Vector Int
indexes = case (Bool -> Bool) -> Column -> Either DataFrameException (Vector Int)
forall a.
Columnable a =>
(a -> Bool) -> Column -> Either DataFrameException (Vector Int)
findIndices Bool -> Bool
forall a. a -> a
id Column
col of
Right Vector Int
ixs -> Vector Int
ixs
Left DataFrameException
e -> DataFrameException -> Vector Int
forall a e. Exception e => e -> a
throw DataFrameException
e
c' :: Int
c' = (Int, Int) -> Int
forall a b. (a, b) -> b
snd ((Int, Int) -> Int) -> (Int, Int) -> Int
forall a b. (a -> b) -> a -> b
$ DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df
in
DataFrame
df
{ columns = V.map (atIndicesStable indexes) (columns df)
, dataframeDimensions = (VU.length indexes, c')
}
filterJust :: T.Text -> DataFrame -> DataFrame
filterJust :: Text -> DataFrame -> DataFrame
filterJust Text
name DataFrame
df = case Text -> DataFrame -> Maybe Column
getColumn Text
name DataFrame
df of
Maybe Column
Nothing ->
DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$ Text -> Text -> [Text] -> DataFrameException
ColumnNotFoundException Text
name Text
"filterJust" (Map Text Int -> [Text]
forall k a. Map k a -> [k]
M.keys (Map Text Int -> [Text]) -> Map Text Int -> [Text]
forall a b. (a -> b) -> a -> b
$ DataFrame -> Map Text Int
columnIndices DataFrame
df)
Just column :: Column
column@(OptionalColumn (Vector (Maybe a)
col :: V.Vector (Maybe a))) -> Expr (Maybe a) -> (Maybe a -> Bool) -> DataFrame -> DataFrame
forall a.
Columnable a =>
Expr a -> (a -> Bool) -> DataFrame -> DataFrame
filter (forall a. Columnable a => Text -> Expr a
Col @(Maybe a) Text
name) Maybe a -> Bool
forall a. Maybe a -> Bool
isJust DataFrame
df DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& forall b c.
(Columnable b, Columnable c) =>
(b -> c) -> Text -> DataFrame -> DataFrame
apply @(Maybe a) Maybe a -> a
forall a. HasCallStack => Maybe a -> a
fromJust Text
name
Just Column
column -> DataFrame
df
filterNothing :: T.Text -> DataFrame -> DataFrame
filterNothing :: Text -> DataFrame -> DataFrame
filterNothing Text
name DataFrame
df = case Text -> DataFrame -> Maybe Column
getColumn Text
name DataFrame
df of
Maybe Column
Nothing ->
DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$ Text -> Text -> [Text] -> DataFrameException
ColumnNotFoundException Text
name Text
"filterNothing" (Map Text Int -> [Text]
forall k a. Map k a -> [k]
M.keys (Map Text Int -> [Text]) -> Map Text Int -> [Text]
forall a b. (a -> b) -> a -> b
$ DataFrame -> Map Text Int
columnIndices DataFrame
df)
Just (OptionalColumn (Vector (Maybe a)
col :: V.Vector (Maybe a))) -> Expr (Maybe a) -> (Maybe a -> Bool) -> DataFrame -> DataFrame
forall a.
Columnable a =>
Expr a -> (a -> Bool) -> DataFrame -> DataFrame
filter (forall a. Columnable a => Text -> Expr a
Col @(Maybe a) Text
name) Maybe a -> Bool
forall a. Maybe a -> Bool
isNothing DataFrame
df
Maybe Column
_ -> DataFrame
df
filterAllJust :: DataFrame -> DataFrame
filterAllJust :: DataFrame -> DataFrame
filterAllJust DataFrame
df = (Text -> DataFrame -> DataFrame)
-> DataFrame -> [Text] -> DataFrame
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Text -> DataFrame -> DataFrame
filterJust DataFrame
df (DataFrame -> [Text]
columnNames DataFrame
df)
filterAllNothing :: DataFrame -> DataFrame
filterAllNothing :: DataFrame -> DataFrame
filterAllNothing DataFrame
df = (Text -> DataFrame -> DataFrame)
-> DataFrame -> [Text] -> DataFrame
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Text -> DataFrame -> DataFrame
filterNothing DataFrame
df (DataFrame -> [Text]
columnNames DataFrame
df)
cube :: (Int, Int) -> DataFrame -> DataFrame
cube :: (Int, Int) -> DataFrame -> DataFrame
cube (Int
length, Int
width) = Int -> DataFrame -> DataFrame
take Int
length (DataFrame -> DataFrame)
-> (DataFrame -> DataFrame) -> DataFrame -> DataFrame
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [SelectionCriteria] -> DataFrame -> DataFrame
selectBy [(Int, Int) -> SelectionCriteria
ColumnIndexRange (Int
0, Int
width Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)]
select ::
[T.Text] ->
DataFrame ->
DataFrame
select :: [Text] -> DataFrame -> DataFrame
select [Text]
cs DataFrame
df
| [Text] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
L.null [Text]
cs = DataFrame
empty
| (Text -> Bool) -> [Text] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Text -> [Text] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`notElem` DataFrame -> [Text]
columnNames DataFrame
df) [Text]
cs =
DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$
Text -> Text -> [Text] -> DataFrameException
ColumnNotFoundException
(String -> Text
T.pack (String -> Text) -> String -> Text
forall a b. (a -> b) -> a -> b
$ [Text] -> String
forall a. Show a => a -> String
show ([Text] -> String) -> [Text] -> String
forall a b. (a -> b) -> a -> b
$ [Text]
cs [Text] -> [Text] -> [Text]
forall a. Eq a => [a] -> [a] -> [a]
L.\\ DataFrame -> [Text]
columnNames DataFrame
df)
Text
"select"
(DataFrame -> [Text]
columnNames DataFrame
df)
| Bool
otherwise = (DataFrame -> Text -> DataFrame)
-> DataFrame -> [Text] -> DataFrame
forall b a. (b -> a -> b) -> b -> [a] -> b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
L.foldl' DataFrame -> Text -> DataFrame
addKeyValue DataFrame
empty [Text]
cs
where
addKeyValue :: DataFrame -> Text -> DataFrame
addKeyValue DataFrame
d Text
k = DataFrame -> Maybe DataFrame -> DataFrame
forall a. a -> Maybe a -> a
fromMaybe DataFrame
df (Maybe DataFrame -> DataFrame) -> Maybe DataFrame -> DataFrame
forall a b. (a -> b) -> a -> b
$ do
Column
col <- Text -> DataFrame -> Maybe Column
getColumn Text
k DataFrame
df
DataFrame -> Maybe DataFrame
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (DataFrame -> Maybe DataFrame) -> DataFrame -> Maybe DataFrame
forall a b. (a -> b) -> a -> b
$ Text -> Column -> DataFrame -> DataFrame
insertColumn Text
k Column
col DataFrame
d
data SelectionCriteria
= ColumnProperty (Column -> Bool)
| ColumnNameProperty (T.Text -> Bool)
| ColumnTextRange (T.Text, T.Text)
| ColumnIndexRange (Int, Int)
| ColumnName T.Text
byName :: T.Text -> SelectionCriteria
byName :: Text -> SelectionCriteria
byName = Text -> SelectionCriteria
ColumnName
byProperty :: (Column -> Bool) -> SelectionCriteria
byProperty :: (Column -> Bool) -> SelectionCriteria
byProperty = (Column -> Bool) -> SelectionCriteria
ColumnProperty
byNameProperty :: (T.Text -> Bool) -> SelectionCriteria
byNameProperty :: (Text -> Bool) -> SelectionCriteria
byNameProperty = (Text -> Bool) -> SelectionCriteria
ColumnNameProperty
byNameRange :: (T.Text, T.Text) -> SelectionCriteria
byNameRange :: (Text, Text) -> SelectionCriteria
byNameRange = (Text, Text) -> SelectionCriteria
ColumnTextRange
byIndexRange :: (Int, Int) -> SelectionCriteria
byIndexRange :: (Int, Int) -> SelectionCriteria
byIndexRange = (Int, Int) -> SelectionCriteria
ColumnIndexRange
selectBy :: [SelectionCriteria] -> DataFrame -> DataFrame
selectBy :: [SelectionCriteria] -> DataFrame -> DataFrame
selectBy [SelectionCriteria]
xs DataFrame
df = [Text] -> DataFrame -> DataFrame
select [Text]
finalSelection DataFrame
df
where
finalSelection :: [Text]
finalSelection = (Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
Prelude.filter (Text -> Set Text -> Bool
forall a. Ord a => a -> Set a -> Bool
`S.member` Set Text
columnsWithProperties) (DataFrame -> [Text]
columnNames DataFrame
df)
columnsWithProperties :: Set Text
columnsWithProperties = [Text] -> Set Text
forall a. Ord a => [a] -> Set a
S.fromList (([Text] -> SelectionCriteria -> [Text])
-> [Text] -> [SelectionCriteria] -> [Text]
forall b a. (b -> a -> b) -> b -> [a] -> b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
L.foldl' [Text] -> SelectionCriteria -> [Text]
columnWithProperty [] [SelectionCriteria]
xs)
columnWithProperty :: [Text] -> SelectionCriteria -> [Text]
columnWithProperty [Text]
acc (ColumnName Text
name) = [Text]
acc [Text] -> [Text] -> [Text]
forall a. [a] -> [a] -> [a]
++ [Text
name]
columnWithProperty [Text]
acc (ColumnNameProperty Text -> Bool
f) = [Text]
acc [Text] -> [Text] -> [Text]
forall a. [a] -> [a] -> [a]
++ (Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
L.filter Text -> Bool
f (DataFrame -> [Text]
columnNames DataFrame
df)
columnWithProperty [Text]
acc (ColumnTextRange (Text
from, Text
to)) =
[Text]
acc
[Text] -> [Text] -> [Text]
forall a. [a] -> [a] -> [a]
++ [Text] -> [Text]
forall a. [a] -> [a]
reverse
((Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
Prelude.dropWhile (Text
to Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
/=) ([Text] -> [Text]) -> [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ [Text] -> [Text]
forall a. [a] -> [a]
reverse ([Text] -> [Text]) -> [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ (Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
dropWhile (Text
from Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
/=) (DataFrame -> [Text]
columnNames DataFrame
df))
columnWithProperty [Text]
acc (ColumnIndexRange (Int
from, Int
to)) = [Text]
acc [Text] -> [Text] -> [Text]
forall a. [a] -> [a] -> [a]
++ Int -> [Text] -> [Text]
forall a. Int -> [a] -> [a]
Prelude.take (Int
to Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
from Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int -> [Text] -> [Text]
forall a. Int -> [a] -> [a]
Prelude.drop Int
from (DataFrame -> [Text]
columnNames DataFrame
df))
columnWithProperty [Text]
acc (ColumnProperty Column -> Bool
f) =
[Text]
acc
[Text] -> [Text] -> [Text]
forall a. [a] -> [a] -> [a]
++ ((Text, Int) -> Text) -> [(Text, Int)] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map (Text, Int) -> Text
forall a b. (a, b) -> a
fst (((Text, Int) -> Bool) -> [(Text, Int)] -> [(Text, Int)]
forall a. (a -> Bool) -> [a] -> [a]
L.filter (\(Text
k, Int
v) -> Int
v Int -> [Int] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Int]
ixs) (Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toAscList (DataFrame -> Map Text Int
columnIndices DataFrame
df)))
where
ixs :: [Int]
ixs = ([Int] -> Int -> Column -> [Int])
-> [Int] -> Vector Column -> [Int]
forall a b. (a -> Int -> b -> a) -> a -> Vector b -> a
V.ifoldl' (\[Int]
acc Int
i Column
c -> if Column -> Bool
f Column
c then Int
i Int -> [Int] -> [Int]
forall a. a -> [a] -> [a]
: [Int]
acc else [Int]
acc) [] (DataFrame -> Vector Column
columns DataFrame
df)
exclude ::
[T.Text] ->
DataFrame ->
DataFrame
exclude :: [Text] -> DataFrame -> DataFrame
exclude [Text]
cs DataFrame
df =
let keysToKeep :: [Text]
keysToKeep = DataFrame -> [Text]
columnNames DataFrame
df [Text] -> [Text] -> [Text]
forall a. Eq a => [a] -> [a] -> [a]
L.\\ [Text]
cs
in [Text] -> DataFrame -> DataFrame
select [Text]
keysToKeep DataFrame
df
sample :: (RandomGen g) => g -> Double -> DataFrame -> DataFrame
sample :: forall g. RandomGen g => g -> Double -> DataFrame -> DataFrame
sample g
pureGen Double
p DataFrame
df =
let
rand :: Vector Double
rand = g -> Int -> Vector Double
forall g. RandomGen g => g -> Int -> Vector Double
generateRandomVector g
pureGen ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df))
in
DataFrame
df
DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Text -> Vector Double -> DataFrame -> DataFrame
forall a.
(Columnable a, Unbox a) =>
Text -> Vector a -> DataFrame -> DataFrame
insertUnboxedVector Text
"__rand__" Vector Double
rand
DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Expr Bool -> DataFrame -> DataFrame
filterWhere
( BinaryOp Double Double Bool
-> Expr Double -> Expr Double -> Expr Bool
forall c b a.
(Columnable c, Columnable b, Columnable a) =>
BinaryOp c b a -> Expr c -> Expr b -> Expr a
Binary
( MkBinaryOp
{ binaryFn :: Double -> Double -> Bool
binaryFn = Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
(>=)
, binaryName :: Text
binaryName = Text
"geq"
, binarySymbol :: Maybe Text
binarySymbol = Text -> Maybe Text
forall a. a -> Maybe a
Just Text
">="
, binaryCommutative :: Bool
binaryCommutative = Bool
False
, binaryPrecedence :: Int
binaryPrecedence = Int
1
}
)
(forall a. Columnable a => Text -> Expr a
Col @Double Text
"__rand__")
(Double -> Expr Double
forall a. Columnable a => a -> Expr a
Lit (Double
1 Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
p))
)
DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& [Text] -> DataFrame -> DataFrame
exclude [Text
"__rand__"]
randomSplit ::
(RandomGen g) => g -> Double -> DataFrame -> (DataFrame, DataFrame)
randomSplit :: forall g.
RandomGen g =>
g -> Double -> DataFrame -> (DataFrame, DataFrame)
randomSplit g
pureGen Double
p DataFrame
df =
let
rand :: Vector Double
rand = g -> Int -> Vector Double
forall g. RandomGen g => g -> Int -> Vector Double
generateRandomVector g
pureGen ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df))
withRand :: DataFrame
withRand = DataFrame
df DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Text -> Vector Double -> DataFrame -> DataFrame
forall a.
(Columnable a, Unbox a) =>
Text -> Vector a -> DataFrame -> DataFrame
insertUnboxedVector Text
"__rand__" Vector Double
rand
in
( DataFrame
withRand
DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Expr Bool -> DataFrame -> DataFrame
filterWhere
( BinaryOp Double Double Bool
-> Expr Double -> Expr Double -> Expr Bool
forall c b a.
(Columnable c, Columnable b, Columnable a) =>
BinaryOp c b a -> Expr c -> Expr b -> Expr a
Binary
( MkBinaryOp
{ binaryFn :: Double -> Double -> Bool
binaryFn = Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
(<=)
, binaryName :: Text
binaryName = Text
"leq"
, binarySymbol :: Maybe Text
binarySymbol = Text -> Maybe Text
forall a. a -> Maybe a
Just Text
"<="
, binaryCommutative :: Bool
binaryCommutative = Bool
False
, binaryPrecedence :: Int
binaryPrecedence = Int
1
}
)
(forall a. Columnable a => Text -> Expr a
Col @Double Text
"__rand__")
(Double -> Expr Double
forall a. Columnable a => a -> Expr a
Lit Double
p)
)
DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& [Text] -> DataFrame -> DataFrame
exclude [Text
"__rand__"]
, DataFrame
withRand
DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Expr Bool -> DataFrame -> DataFrame
filterWhere
( BinaryOp Double Double Bool
-> Expr Double -> Expr Double -> Expr Bool
forall c b a.
(Columnable c, Columnable b, Columnable a) =>
BinaryOp c b a -> Expr c -> Expr b -> Expr a
Binary
( MkBinaryOp
{ binaryFn :: Double -> Double -> Bool
binaryFn = Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
(>)
, binaryName :: Text
binaryName = Text
"gt"
, binarySymbol :: Maybe Text
binarySymbol = Text -> Maybe Text
forall a. a -> Maybe a
Just Text
">"
, binaryCommutative :: Bool
binaryCommutative = Bool
False
, binaryPrecedence :: Int
binaryPrecedence = Int
1
}
)
(forall a. Columnable a => Text -> Expr a
Col @Double Text
"__rand__")
(Double -> Expr Double
forall a. Columnable a => a -> Expr a
Lit Double
p)
)
DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& [Text] -> DataFrame -> DataFrame
exclude [Text
"__rand__"]
)
kFolds :: (RandomGen g) => g -> Int -> DataFrame -> [DataFrame]
kFolds :: forall g. RandomGen g => g -> Int -> DataFrame -> [DataFrame]
kFolds g
pureGen Int
folds DataFrame
df =
let
rand :: Vector Double
rand = g -> Int -> Vector Double
forall g. RandomGen g => g -> Int -> Vector Double
generateRandomVector g
pureGen ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df))
withRand :: DataFrame
withRand = DataFrame
df DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Text -> Vector Double -> DataFrame -> DataFrame
forall a.
(Columnable a, Unbox a) =>
Text -> Vector a -> DataFrame -> DataFrame
insertUnboxedVector Text
"__rand__" Vector Double
rand
partitionSize :: Double
partitionSize = Double
1 Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
folds
singleFold :: Int -> DataFrame -> DataFrame
singleFold Int
n DataFrame
d =
DataFrame
d
DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Expr Bool -> DataFrame -> DataFrame
filterWhere
( BinaryOp Double Double Bool
-> Expr Double -> Expr Double -> Expr Bool
forall c b a.
(Columnable c, Columnable b, Columnable a) =>
BinaryOp c b a -> Expr c -> Expr b -> Expr a
Binary
( MkBinaryOp
{ binaryFn :: Double -> Double -> Bool
binaryFn = Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
(>=)
, binaryName :: Text
binaryName = Text
"geq"
, binarySymbol :: Maybe Text
binarySymbol = Text -> Maybe Text
forall a. a -> Maybe a
Just Text
">="
, binaryCommutative :: Bool
binaryCommutative = Bool
False
, binaryPrecedence :: Int
binaryPrecedence = Int
1
}
)
(forall a. Columnable a => Text -> Expr a
Col @Double Text
"__rand__")
(Double -> Expr Double
forall a. Columnable a => a -> Expr a
Lit (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
partitionSize))
)
go :: Int -> DataFrame -> [DataFrame]
go (-1) DataFrame
_ = []
go Int
n DataFrame
d =
let
d' :: DataFrame
d' = Int -> DataFrame -> DataFrame
singleFold Int
n DataFrame
d
d'' :: DataFrame
d'' =
DataFrame
d
DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& Expr Bool -> DataFrame -> DataFrame
filterWhere
( BinaryOp Double Double Bool
-> Expr Double -> Expr Double -> Expr Bool
forall c b a.
(Columnable c, Columnable b, Columnable a) =>
BinaryOp c b a -> Expr c -> Expr b -> Expr a
Binary
( MkBinaryOp
{ binaryFn :: Double -> Double -> Bool
binaryFn = Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
(<)
, binaryName :: Text
binaryName = Text
"lt"
, binarySymbol :: Maybe Text
binarySymbol = Text -> Maybe Text
forall a. a -> Maybe a
Just Text
"<"
, binaryCommutative :: Bool
binaryCommutative = Bool
False
, binaryPrecedence :: Int
binaryPrecedence = Int
1
}
)
(forall a. Columnable a => Text -> Expr a
Col @Double Text
"__rand__")
(Double -> Expr Double
forall a. Columnable a => a -> Expr a
Lit (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
partitionSize))
)
in
DataFrame
d' DataFrame -> [DataFrame] -> [DataFrame]
forall a. a -> [a] -> [a]
: Int -> DataFrame -> [DataFrame]
go (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) DataFrame
d''
in
(DataFrame -> DataFrame) -> [DataFrame] -> [DataFrame]
forall a b. (a -> b) -> [a] -> [b]
map ([Text] -> DataFrame -> DataFrame
exclude [Text
"__rand__"]) (Int -> DataFrame -> [DataFrame]
go (Int
folds Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) DataFrame
withRand)
generateRandomVector :: (RandomGen g) => g -> Int -> VU.Vector Double
generateRandomVector :: forall g. RandomGen g => g -> Int -> Vector Double
generateRandomVector g
pureGen Int
k = [Double] -> Vector Double
forall a. Unbox a => [a] -> Vector a
VU.fromList ([Double] -> Vector Double) -> [Double] -> Vector Double
forall a b. (a -> b) -> a -> b
$ g -> Int -> [Double]
forall {t} {t}. (Eq t, Num t, RandomGen t) => t -> t -> [Double]
go g
pureGen Int
k
where
go :: t -> t -> [Double]
go t
g t
0 = []
go t
g t
n =
let
(Double
v, t
g') = (Double, Double) -> t -> (Double, t)
forall a g. (UniformRange a, RandomGen g) => (a, a) -> g -> (a, g)
uniformR (Double
0 :: Double, Double
1 :: Double) t
g
in
Double
v Double -> [Double] -> [Double]
forall a. a -> [a] -> [a]
: t -> t -> [Double]
go t
g' (t
n t -> t -> t
forall a. Num a => a -> a -> a
- t
1)