{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE FlexibleContexts #-}
module DataFrame.Operations.Subset where

import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector.Generic as VG
import qualified Prelude

import Control.Exception (throw)
import DataFrame.Errors (DataFrameException(..))
import DataFrame.Internal.Column
import DataFrame.Internal.DataFrame (DataFrame(..), getColumn, empty)
import DataFrame.Internal.Expression
import DataFrame.Internal.Row (mkRowFromArgs)
import DataFrame.Internal.Types (RowValue, toRowValue)
import DataFrame.Operations.Core
import DataFrame.Operations.Transformations (apply)
import Data.Function ((&))
import Data.Maybe (isJust, fromJust, fromMaybe)
import Prelude hiding (filter, take)
import Type.Reflection

-- | O(k * n) Take the first n rows of a DataFrame.
take :: Int -> DataFrame -> DataFrame
take :: Int -> DataFrame -> DataFrame
take Int
n DataFrame
d = DataFrame
d {columns = V.map (takeColumn n' <$>) (columns d), dataframeDimensions = (n', c)}
  where
    (Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
    n' :: Int
n' = Int -> Int -> Int -> Int
clip Int
n Int
0 Int
r

takeLast :: Int -> DataFrame -> DataFrame
takeLast :: Int -> DataFrame -> DataFrame
takeLast Int
n DataFrame
d = DataFrame
d {columns = V.map (takeLastColumn n' <$>) (columns d), dataframeDimensions = (n', c)}
  where
    (Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
    n' :: Int
n' = Int -> Int -> Int -> Int
clip Int
n Int
0 Int
r

drop :: Int -> DataFrame -> DataFrame
drop :: Int -> DataFrame -> DataFrame
drop Int
n DataFrame
d = DataFrame
d {columns = V.map (sliceColumn n' (max (r - n') 0) <$>) (columns d), dataframeDimensions = (max (r - n') 0, c)}
  where
    (Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
    n' :: Int
n' = Int -> Int -> Int -> Int
clip Int
n Int
0 Int
r

dropLast :: Int -> DataFrame -> DataFrame
dropLast :: Int -> DataFrame -> DataFrame
dropLast Int
n DataFrame
d = DataFrame
d {columns = V.map (sliceColumn 0 n' <$>) (columns d), dataframeDimensions = (n', c)}
  where
    (Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
    n' :: Int
n' = Int -> Int -> Int -> Int
clip (Int
r Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
n) Int
0 Int
r

-- | O(k * n) Take a range of rows of a DataFrame.
range :: (Int, Int) -> DataFrame -> DataFrame
range :: (Int, Int) -> DataFrame -> DataFrame
range (Int
start, Int
end) DataFrame
d = DataFrame
d {columns = V.map (sliceColumn (clip start 0 r) n' <$>) (columns d), dataframeDimensions = (n', c)}
  where
    (Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
d
    n' :: Int
n' = Int -> Int -> Int -> Int
clip (Int
end Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
start) Int
0 Int
r

clip :: Int -> Int -> Int -> Int
clip :: Int -> Int -> Int -> Int
clip Int
n Int
left Int
right = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
right (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
n Int
left

-- | O(n * k) Filter rows by a given condition.
--
-- filter "x" even df
filter ::
  forall a.
  (Columnable a) =>
  -- | Column to filter by
  T.Text ->
  -- | Filter condition
  (a -> Bool) ->
  -- | Dataframe to filter
  DataFrame ->
  DataFrame
filter :: forall a.
Columnable a =>
Text -> (a -> Bool) -> DataFrame -> DataFrame
filter Text
filterColumnName a -> Bool
condition DataFrame
df = case Text -> DataFrame -> Maybe Column
getColumn Text
filterColumnName DataFrame
df of
  Maybe Column
Nothing -> DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$ Text -> Text -> [Text] -> DataFrameException
ColumnNotFoundException Text
filterColumnName Text
"filter" (((Text, Int) -> Text) -> [(Text, Int)] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map (Text, Int) -> Text
forall a b. (a, b) -> a
fst ([(Text, Int)] -> [Text]) -> [(Text, Int)] -> [Text]
forall a b. (a -> b) -> a -> b
$ Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toList (Map Text Int -> [(Text, Int)]) -> Map Text Int -> [(Text, Int)]
forall a b. (a -> b) -> a -> b
$ DataFrame -> Map Text Int
columnIndices DataFrame
df)
  Just Column
column -> case (Set Int -> Int -> a -> Set Int)
-> Set Int -> Column -> Maybe (Set Int)
forall a b.
(Columnable a, Columnable b) =>
(b -> Int -> a -> b) -> b -> Column -> Maybe b
ifoldlColumn (\Set Int
s Int
i a
v -> if a -> Bool
condition a
v then Int -> Set Int -> Set Int
forall a. Ord a => a -> Set a -> Set a
S.insert Int
i Set Int
s else Set Int
s) Set Int
forall a. Set a
S.empty Column
column of
    Maybe (Set Int)
Nothing -> DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$ TypeRep a -> String -> Text -> Text -> DataFrameException
forall a.
Typeable a =>
TypeRep a -> String -> Text -> Text -> DataFrameException
TypeMismatchException' (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (Column -> String
columnTypeString Column
column) Text
filterColumnName Text
"filter"
    Just Set Int
indexes -> let
        c' :: Int
c' = (Int, Int) -> Int
forall a b. (a, b) -> b
snd ((Int, Int) -> Int) -> (Int, Int) -> Int
forall a b. (a -> b) -> a -> b
$ DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df
        pick :: Set Int -> f Column -> f Column
pick Set Int
idxs f Column
col = Set Int -> Column -> Column
atIndices Set Int
idxs (Column -> Column) -> f Column -> f Column
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> f Column
col
      in DataFrame
df {columns = V.map (pick indexes) (columns df), dataframeDimensions = (S.size indexes, c')}

-- | O(k) a version of filter where the predicate comes first.
--
-- > filterBy even "x" df
filterBy :: (Columnable a) => (a -> Bool) -> T.Text -> DataFrame -> DataFrame
filterBy :: forall a.
Columnable a =>
(a -> Bool) -> Text -> DataFrame -> DataFrame
filterBy = (Text -> (a -> Bool) -> DataFrame -> DataFrame)
-> (a -> Bool) -> Text -> DataFrame -> DataFrame
forall a b c. (a -> b -> c) -> b -> a -> c
flip Text -> (a -> Bool) -> DataFrame -> DataFrame
forall a.
Columnable a =>
Text -> (a -> Bool) -> DataFrame -> DataFrame
filter

-- | O(k) filters the dataframe with a row predicate. The arguments in the function
--   must appear in the same order as they do in the list.
--
-- > filterWhere (["x", "y"], func (\x y -> x + y > 5)) df
filterWhere :: Expr Bool -> DataFrame -> DataFrame
filterWhere :: Expr Bool -> DataFrame -> DataFrame
filterWhere Expr Bool
expr DataFrame
df = let
    (TColumn Column
col) = forall a b. Columnable a => DataFrame -> Expr a -> TypedColumn a
interpret @Bool DataFrame
df Expr Bool
expr
    (Just Set Int
indexes) = (Set Int -> Int -> Bool -> Set Int)
-> Set Int -> Column -> Maybe (Set Int)
forall a b.
(Columnable a, Columnable b) =>
(b -> Int -> a -> b) -> b -> Column -> Maybe b
ifoldlColumn (\Set Int
s Int
i Bool
satisfied -> if Bool
satisfied then Int -> Set Int -> Set Int
forall a. Ord a => a -> Set a -> Set a
S.insert Int
i Set Int
s else Set Int
s) Set Int
forall a. Set a
S.empty Column
col
    c' :: Int
c' = (Int, Int) -> Int
forall a b. (a, b) -> b
snd ((Int, Int) -> Int) -> (Int, Int) -> Int
forall a b. (a -> b) -> a -> b
$ DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df
    pick :: Set Int -> f Column -> f Column
pick Set Int
idxs f Column
col = Set Int -> Column -> Column
atIndices Set Int
idxs (Column -> Column) -> f Column -> f Column
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> f Column
col
  in DataFrame
df {columns = V.map (pick indexes) (columns df), dataframeDimensions = (S.size indexes, c')}


-- | O(k) removes all rows with `Nothing` in a given column from the dataframe.
--
-- > filterJust df
filterJust :: T.Text -> DataFrame -> DataFrame
filterJust :: Text -> DataFrame -> DataFrame
filterJust Text
name DataFrame
df = case Text -> DataFrame -> Maybe Column
getColumn Text
name DataFrame
df of
  Maybe Column
Nothing -> DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$ Text -> Text -> [Text] -> DataFrameException
ColumnNotFoundException Text
name Text
"filterJust" (((Text, Int) -> Text) -> [(Text, Int)] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map (Text, Int) -> Text
forall a b. (a, b) -> a
fst ([(Text, Int)] -> [Text]) -> [(Text, Int)] -> [Text]
forall a b. (a -> b) -> a -> b
$ Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toList (Map Text Int -> [(Text, Int)]) -> Map Text Int -> [(Text, Int)]
forall a b. (a -> b) -> a -> b
$ DataFrame -> Map Text Int
columnIndices DataFrame
df)
  Just column :: Column
column@(OptionalColumn (Vector (Maybe a)
col :: V.Vector (Maybe a))) -> forall a.
Columnable a =>
Text -> (a -> Bool) -> DataFrame -> DataFrame
filter @(Maybe a) Text
name Maybe a -> Bool
forall a. Maybe a -> Bool
isJust DataFrame
df DataFrame -> (DataFrame -> DataFrame) -> DataFrame
forall a b. a -> (a -> b) -> b
& forall b c.
(Columnable b, Columnable c) =>
(b -> c) -> Text -> DataFrame -> DataFrame
apply @(Maybe a) Maybe a -> a
forall a. HasCallStack => Maybe a -> a
fromJust Text
name
  Just Column
column -> DataFrame
df

-- | O(n * k) removes all rows with `Nothing` from the dataframe.
--
-- > filterJust df
filterAllJust :: DataFrame -> DataFrame
filterAllJust :: DataFrame -> DataFrame
filterAllJust DataFrame
df = (Text -> DataFrame -> DataFrame)
-> DataFrame -> [Text] -> DataFrame
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Text -> DataFrame -> DataFrame
filterJust DataFrame
df (DataFrame -> [Text]
columnNames DataFrame
df)

-- | O(k) cuts the dataframe in a cube of size (a, b) where
--   a is the length and b is the width.   
--
-- > cube (10, 5) df
cube :: (Int, Int) -> DataFrame -> DataFrame
cube :: (Int, Int) -> DataFrame -> DataFrame
cube (Int
length, Int
width) = Int -> DataFrame -> DataFrame
take Int
length (DataFrame -> DataFrame)
-> (DataFrame -> DataFrame) -> DataFrame -> DataFrame
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int, Int) -> DataFrame -> DataFrame
selectIntRange (Int
0, Int
width Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)

-- | O(n) Selects a number of columns in a given dataframe.
--
-- > select ["name", "age"] df
select ::
  [T.Text] ->
  DataFrame ->
  DataFrame
select :: [Text] -> DataFrame -> DataFrame
select [Text]
cs DataFrame
df
  | [Text] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
L.null [Text]
cs = DataFrame
empty
  | (Text -> Bool) -> [Text] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Text -> [Text] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`notElem` DataFrame -> [Text]
columnNames DataFrame
df) [Text]
cs = DataFrameException -> DataFrame
forall a e. Exception e => e -> a
throw (DataFrameException -> DataFrame)
-> DataFrameException -> DataFrame
forall a b. (a -> b) -> a -> b
$ Text -> Text -> [Text] -> DataFrameException
ColumnNotFoundException (String -> Text
T.pack (String -> Text) -> String -> Text
forall a b. (a -> b) -> a -> b
$ [Text] -> String
forall a. Show a => a -> String
show ([Text] -> String) -> [Text] -> String
forall a b. (a -> b) -> a -> b
$ [Text]
cs [Text] -> [Text] -> [Text]
forall a. Eq a => [a] -> [a] -> [a]
L.\\ DataFrame -> [Text]
columnNames DataFrame
df) Text
"select" (DataFrame -> [Text]
columnNames DataFrame
df)
  | Bool
otherwise = (DataFrame -> Text -> DataFrame)
-> DataFrame -> [Text] -> DataFrame
forall b a. (b -> a -> b) -> b -> [a] -> b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
L.foldl' DataFrame -> Text -> DataFrame
addKeyValue DataFrame
empty [Text]
cs
  where
    cIndexAssoc :: [(Text, Int)]
cIndexAssoc = Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toList (Map Text Int -> [(Text, Int)]) -> Map Text Int -> [(Text, Int)]
forall a b. (a -> b) -> a -> b
$ DataFrame -> Map Text Int
columnIndices DataFrame
df
    remaining :: [(Text, Int)]
remaining = ((Text, Int) -> Bool) -> [(Text, Int)] -> [(Text, Int)]
forall a. (a -> Bool) -> [a] -> [a]
L.filter (\(!Text
c, Int
_) -> Text
c Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
cs) [(Text, Int)]
cIndexAssoc
    removed :: [(Text, Int)]
removed = [(Text, Int)]
cIndexAssoc [(Text, Int)] -> [(Text, Int)] -> [(Text, Int)]
forall a. Eq a => [a] -> [a] -> [a]
L.\\ [(Text, Int)]
remaining
    indexes :: [Int]
indexes = ((Text, Int) -> Int) -> [(Text, Int)] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map (Text, Int) -> Int
forall a b. (a, b) -> b
snd [(Text, Int)]
remaining
    (Int
r, Int
c) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df
    addKeyValue :: DataFrame -> Text -> DataFrame
addKeyValue DataFrame
d Text
k =
      DataFrame
d
        { columns = V.imap (\Int
i Maybe Column
v -> if Int
i Int -> [Int] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`notElem` [Int]
indexes then Maybe Column
forall a. Maybe a
Nothing else Maybe Column
v) (columns df),
          columnIndices = M.fromList remaining,
          freeIndices = map snd removed ++ freeIndices df,
          dataframeDimensions = (r, L.length remaining)
        }

-- | O(n) select columns by index range of column names.
selectIntRange :: (Int, Int) -> DataFrame -> DataFrame
selectIntRange :: (Int, Int) -> DataFrame -> DataFrame
selectIntRange (Int
from, Int
to) DataFrame
df = [Text] -> DataFrame -> DataFrame
select (Int -> [Text] -> [Text]
forall a. Int -> [a] -> [a]
Prelude.take (Int
to Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
from Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) ([Text] -> [Text]) -> [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ Int -> [Text] -> [Text]
forall a. Int -> [a] -> [a]
Prelude.drop Int
from (DataFrame -> [Text]
columnNames DataFrame
df)) DataFrame
df

-- | O(n) select columns by index range of column names.
selectRange :: (T.Text, T.Text) -> DataFrame -> DataFrame
selectRange :: (Text, Text) -> DataFrame -> DataFrame
selectRange (Text
from, Text
to) DataFrame
df = [Text] -> DataFrame -> DataFrame
select ([Text] -> [Text]
forall a. [a] -> [a]
reverse ([Text] -> [Text]) -> [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ (Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
Prelude.dropWhile (Text
to Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
/=) ([Text] -> [Text]) -> [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ [Text] -> [Text]
forall a. [a] -> [a]
reverse ([Text] -> [Text]) -> [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ (Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
dropWhile (Text
from Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
/=) (DataFrame -> [Text]
columnNames DataFrame
df)) DataFrame
df

-- | O(n) select columns by column predicate name.
selectBy :: (T.Text -> Bool) -> DataFrame -> DataFrame
selectBy :: (Text -> Bool) -> DataFrame -> DataFrame
selectBy Text -> Bool
f DataFrame
df = [Text] -> DataFrame -> DataFrame
select ((Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
L.filter Text -> Bool
f (DataFrame -> [Text]
columnNames DataFrame
df)) DataFrame
df

-- | O(n) inverse of select
--
-- > exclude ["Name"] df
exclude ::
  [T.Text] ->
  DataFrame ->
  DataFrame
exclude :: [Text] -> DataFrame -> DataFrame
exclude [Text]
cs DataFrame
df =
  let keysToKeep :: [Text]
keysToKeep = DataFrame -> [Text]
columnNames DataFrame
df [Text] -> [Text] -> [Text]
forall a. Eq a => [a] -> [a] -> [a]
L.\\ [Text]
cs
   in [Text] -> DataFrame -> DataFrame
select [Text]
keysToKeep DataFrame
df