{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE BangPatterns #-}
module DataFrame.Operations.Core where

import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Map.Strict as MS
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Vector.Generic as VG
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU

import Control.Exception ( throw )
import DataFrame.Errors
import DataFrame.Internal.Column ( Column(..), toColumn', toColumn, columnLength, columnTypeString, expandColumn, Columnable)
import DataFrame.Internal.DataFrame (DataFrame(..), getColumn, null, empty)
import DataFrame.Internal.Parsing (isNullish)
import Data.Either
import Data.Function (on, (&))
import Data.Maybe
import Data.Type.Equality (type (:~:)(Refl), TestEquality(..))
import Type.Reflection
import Prelude hiding (null)

-- | O(1) Report the shape of a 'DataFrame' as @(rows, columns)@.
--
-- This simply reads the 'dataframeDimensions' field stored on the frame.
dimensions :: DataFrame -> (Int, Int)
dimensions df = dataframeDimensions df
{-# INLINE dimensions #-}

-- | O(k) List the column names of the 'DataFrame'.
--
-- Names are ordered by the index recorded for them in 'columnIndices'
-- (ascending), which is how insertion order is recovered.
columnNames :: DataFrame -> [T.Text]
columnNames df =
  [name | (name, _) <- L.sortOn snd (M.toList (columnIndices df))]
{-# INLINE columnNames #-}

-- | /O(n)/ Insert a boxed vector into the dataframe under the given name.
--
-- The vector is converted with 'toColumn'' and handed to 'insertColumn''.
insertColumn ::
  forall a.
  Columnable a =>
  -- | Column Name
  T.Text ->
  -- | Vector to add to column
  V.Vector a ->
  -- | DataFrame to add column to
  DataFrame ->
  DataFrame
insertColumn name = insertColumn' name . Just . toColumn'
{-# INLINE insertColumn #-}

-- | Copy the column called @original@ into the dataframe under the name @new@.
--
-- Throws 'ColumnNotFoundException' (tagged with @"cloneColumn"@ and the list
-- of names currently in 'columnIndices') when @original@ is not present.
cloneColumn :: T.Text -> T.Text -> DataFrame -> DataFrame
cloneColumn original new df =
  case getColumn original df of
    Just column -> insertColumn' new (Just column) df
    Nothing ->
      throw (ColumnNotFoundException original "cloneColumn" (M.keys (columnIndices df)))

-- | /O(n)/ Insert an unboxed vector into the dataframe under the given name.
--
-- The vector is wrapped directly in 'UnboxedColumn' (no conversion step) and
-- handed to 'insertColumn''.
insertUnboxedColumn ::
  forall a.
  (Columnable a, VU.Unbox a) =>
  -- | Column Name
  T.Text ->
  -- | Unboxed vector to add to column
  VU.Vector a ->
  -- | DataFrame to add to column
  DataFrame ->
  DataFrame
insertUnboxedColumn name = insertColumn' name . Just . UnboxedColumn

-- | /O(n)/ Add a column to the dataframe. Not meant for external use.
--
-- Behaviour by case:
--
-- * A 'Nothing' column leaves the dataframe unchanged.
-- * If @name@ is already in 'columnIndices', the stored column is overwritten
--   in place; 'dataframeDimensions' is not touched and no length adjustment
--   is applied.
-- * If the frame already has rows and the new column is longer than the
--   current row count, every existing column is passed through
--   'expandColumn' and the frame is rebuilt with 'fromList', new column last.
-- * Otherwise the column is written to the next free slot of 'columns'
--   (growing the vector when no slot is free) and the bookkeeping fields are
--   updated.
insertColumn' ::
  -- | Column Name
  T.Text ->
  -- | Column to add
  Maybe Column ->
  -- | DataFrame to add to column
  DataFrame ->
  DataFrame
insertColumn' _ Nothing d = d
insertColumn' name optCol@(Just column) d =
  case M.lookup name (columnIndices d) of
    Just i -> d { columns = columns d V.// [(i, optCol)] }
    Nothing
      | r > 0 && l > r -> rebuildWithLongerColumn
      | otherwise -> appendToFreeSlot
  where
    l :: Int
    l = columnLength column
    (r, c) = dataframeDimensions d
    -- Absolute difference between the new column's length and the row count.
    diff :: Int
    diff = abs (l - r)

    -- If we have a non-empty dataframe and we have more rows in the new column
    -- than the other columns, we expand all the other columns first
    -- (presumably 'expandColumn' pads with nulls -- confirm against its
    -- definition) and then add the new column.
    rebuildWithLongerColumn :: DataFrame
    rebuildWithLongerColumn =
      let ordered = L.sortOn snd (M.toList (columnIndices d))
          -- Every index in 'columnIndices' is expected to hold a column.
          widen i = maybe (error "Unexpected") (expandColumn diff) (columns d V.! i)
          -- Single linear pass; no repeated list appends.
          existing = [(nm, widen i) | (nm, i) <- ordered]
       in fromList (existing ++ [(name, column)])

    appendToFreeSlot :: DataFrame
    appendToFreeSlot =
      let capacity = VG.length (columns d)
          -- Double the slot vector when full, but always add at least one
          -- slot so an empty vector cannot leave us without an index.
          growBy = max 1 capacity
          (n, rest, columns') = case freeIndices d of
            (slot : others) -> (slot, others, columns d)
            [] ->
              ( capacity
              , [capacity + 1 .. capacity + growBy - 1]
              , columns d V.++ V.replicate growBy Nothing
              )
          -- A shorter column is expanded by the missing row count unless the
          -- frame is empty or the lengths already agree.
          xs'
            | diff <= 0 || null d = optCol
            | otherwise = expandColumn diff <$> optCol
       in d
            { columns = columns' V.// [(n, xs')]
            , columnIndices = M.insert name n (columnIndices d)
            , freeIndices = rest
            , dataframeDimensions = (max l r, c + 1)
            }

-- | /O(k)/ Add a column to the dataframe, padding it with a default value.
--
-- The supplied vector is extended at the end with copies of the default until
-- it reaches the frame's current row count (taken from 'dataframeDimensions'),
-- then converted with 'toColumn'' and inserted via 'insertColumn''. The cost
-- is dominated by the length of the resulting vector, k.
insertColumnWithDefault ::
  forall a.
  (Columnable a) =>
  -- | Default Value
  a ->
  -- | Column name
  T.Text ->
  -- | Data to add to column
  V.Vector a ->
  -- | DataFrame to add to column
  DataFrame ->
  DataFrame
insertColumnWithDefault defaultValue name xs d =
  insertColumn' name (Just (toColumn' padded)) d
  where
    rows = fst (dataframeDimensions d)
    -- Number of trailing defaults needed to reach the current row count.
    padding = V.replicate (rows - V.length xs) defaultValue
    padded = xs V.++ padding

-- | Rename the column @orig@ to @new@.
--
-- Only 'columnIndices' is rewritten: the entry for @orig@ is removed and its
-- index is re-inserted under @new@. Throws 'ColumnNotFoundException' (tagged
-- with @"rename"@) when @orig@ is not present.
--
-- TODO(review): there is no check for @new@ already naming a different
-- column; in that case its index mapping is overwritten here.
rename :: T.Text -> T.Text -> DataFrame -> DataFrame
rename orig new df =
  case M.lookup orig indices of
    Nothing -> throw (ColumnNotFoundException orig "rename" (M.keys indices))
    Just columnIndex ->
      df { columnIndices = M.insert new columnIndex (M.delete orig indices) }
  where
    indices = columnIndices df

-- | O(1) Number of elements in the named column, or 'Nothing' when
-- 'getColumn' does not find the column.
columnSize :: T.Text -> DataFrame -> Maybe Int
columnSize name = fmap columnLength . getColumn name

-- | One summary record per column, as assembled by 'columnInfo'.
-- All fields are strict.
data ColumnInfo = ColumnInfo
  { -- | Name the column is registered under.
    nameOfColumn :: !T.Text
  , -- | Column length minus the number of nulls counted.
    nonNullValues :: !Int
  , -- | Number of null entries counted for the column.
    nullValues :: !Int
  , -- | Number of entries counted by 'partiallyParsed'.
    partiallyParsedValues :: !Int
  , -- | Number of distinct values in the column.
    uniqueValues :: !Int
  , -- | Textual rendering of the column's element type.
    typeOfColumn :: !T.Text
  }

-- | O(n) Build a summary dataframe describing every named column.
--
-- The result has one row per column and the columns @"Column Name"@,
-- @"# Non-null Values"@, @"# Null Values"@, @"# Partially parsed"@,
-- @"# Unique Values"@ and @"Type"@. Rows are sorted by ascending non-null
-- count. Empty slots in 'columns', and slots whose index has no entry in
-- 'columnIndices', are skipped.
columnInfo :: DataFrame -> DataFrame
columnInfo df =
  empty
    & textField "Column Name" nameOfColumn
    & intField "# Non-null Values" nonNullValues
    & intField "# Null Values" nullValues
    & intField "# Partially parsed" partiallyParsedValues
    & intField "# Unique Values" uniqueValues
    & textField "Type" typeOfColumn
  where
    -- Project one field out of every summary record and insert it as a column.
    textField :: T.Text -> (ColumnInfo -> T.Text) -> DataFrame -> DataFrame
    textField label field = insertColumn' label (Just $! toColumn (map field infos))

    intField :: T.Text -> (ColumnInfo -> Int) -> DataFrame -> DataFrame
    intField label field = insertColumn' label (Just $! toColumn (map field infos))

    infos :: [ColumnInfo]
    infos = L.sortOn nonNullValues (V.ifoldl' go [] (columns df))

    -- Reverse of 'columnIndices': slot index -> column name.
    indexMap :: M.Map Int T.Text
    indexMap = M.fromList [(ix, nm) | (nm, ix) <- M.toList (columnIndices df)]

    -- Prepend a summary for slot @i@ when that slot has a name; otherwise
    -- leave the accumulator as it is.
    record :: Int -> [ColumnInfo] -> (T.Text -> ColumnInfo) -> [ColumnInfo]
    record i acc build = maybe acc (\cname -> build cname : acc) (M.lookup i indexMap)

    go :: [ColumnInfo] -> Int -> Maybe Column -> [ColumnInfo]
    go acc _ Nothing = acc
    go acc i (Just col@(OptionalColumn (c :: V.Vector a))) =
      record i acc $ \cname ->
        ColumnInfo
          cname
          (columnLength col - missing)
          missing
          (partiallyParsed col)
          (S.size (VG.foldr S.insert S.empty c))
          (T.pack (show (typeRep @a)))
      where
        missing = nulls col
    go acc i (Just col@(BoxedColumn (c :: V.Vector a))) =
      record i acc $ \cname ->
        ColumnInfo
          cname
          (columnLength col)
          0
          (partiallyParsed col)
          (S.size (VG.foldr S.insert S.empty c))
          (T.pack (show (typeRep @a)))
    go acc i (Just col@(UnboxedColumn c)) =
      -- Unboxed columns cannot have nulls since Maybe
      -- is not an instance of Unbox a
      record i acc $ \cname ->
        ColumnInfo
          cname
          (columnLength col)
          0
          0
          (S.size (VG.foldr S.insert S.empty c))
          (T.pack (columnTypeString col))

-- | Count the null entries of a column.
--
-- * 'OptionalColumn': number of 'Nothing' values.
-- * 'BoxedColumn' of 'T.Text': number of values accepted by 'isNullish'.
-- * 'BoxedColumn' of 'String': same test after packing to 'T.Text'.
-- * 'BoxedColumn' whose element type is @Maybe x@: number of 'Nothing' values.
-- * Anything else: 0.
nulls :: Column -> Int
nulls col = case col of
  OptionalColumn xs -> tally isNothing xs
  BoxedColumn (xs :: V.Vector a)
    | Just Refl <- testEquality (typeRep @a) (typeRep @T.Text) ->
        tally isNullish xs
    | Just Refl <- testEquality (typeRep @a) (typeRep @String) ->
        tally (isNullish . T.pack) xs
    -- Element type is some application @f x@; only @f ~ Maybe@ counts.
    | App t1 _ <- typeRep @a
    , Just HRefl <- eqTypeRep t1 (typeRep @Maybe) ->
        tally isNothing xs
    | otherwise -> 0
  _ -> 0
  where
    -- Number of elements satisfying the predicate.
    tally :: (e -> Bool) -> V.Vector e -> Int
    tally p = VG.length . VG.filter p

-- | Count the entries of a column that are 'Left' values.
--
-- Only a 'BoxedColumn' whose element type is an application of 'Either'
-- contributes; every other column yields 0.
partiallyParsed :: Column -> Int
partiallyParsed (BoxedColumn (xs :: V.Vector a))
  -- Element type must have the shape @tycon t1 t2@ with @tycon ~ Either@.
  | App (App tycon _) _ <- typeRep @a
  , Just HRefl <- eqTypeRep tycon (typeRep @Either) =
      VG.length (VG.filter isLeft xs)
partiallyParsed _ = 0

-- | Build a 'DataFrame' from @(name, column)@ pairs.
--
-- Starting from 'empty', the pairs are inserted left to right with
-- 'insertColumn''; each name and column is forced before insertion.
fromList :: [(T.Text, Column)] -> DataFrame
fromList pairs = L.foldl' step empty pairs
  where
    step :: DataFrame -> (T.Text, Column) -> DataFrame
    step df (!name, !column) = insertColumn' name (Just column) df

-- | Build a 'DataFrame' from unnamed columns.
--
-- Columns are named after their position in the list: @"0"@, @"1"@, @"2"@, ...
fromColumnList :: [Column] -> DataFrame
fromColumnList cols = fromList (zip positionalNames cols)
  where
    positionalNames :: [T.Text]
    positionalNames = [T.pack (show i) | i <- [0 :: Integer ..]]

-- | Counts the occurrences of each distinct value in a given column.
--
-- One pass is made over the column, inserting every element into a
-- strict 'M.Map' keyed by value, so the cost is one map insertion per row.
--
-- The result is a list of @(value, count)@ pairs in ascending order of
-- @value@ (it is produced with 'M.toAscList').
--
-- The requested element type @a@ must be exactly the element type stored
-- in the column. For an 'OptionalColumn' that means @a@ has to be the
-- @Maybe@-wrapped type, since the stored elements are @Maybe@ values.
--
-- Throws (via 'throw', i.e. as an imprecise exception):
--
-- * 'ColumnNotFoundException' when no column named @columnName@ exists;
--   the exception carries the names currently present in the frame.
-- * 'TypeMismatchException' when @a@ differs from the stored element type.
valueCounts :: forall a. (Columnable a) => T.Text -> DataFrame -> [(a, Int)]
valueCounts columnName df = case getColumn columnName df of
  Nothing ->
    -- Report this function's own name as the caller (previously "sortBy").
    throw $
      ColumnNotFoundException
        columnName
        "valueCounts"
        (map fst $ M.toList $ columnIndices df)
  Just (BoxedColumn (column' :: V.Vector c)) ->
    case typeRep @a `testEquality` typeRep @c of
      Nothing ->
        throw $
          TypeMismatchException (typeRep @a) (typeRep @c) columnName "valueCounts"
      Just Refl -> M.toAscList (V.foldl' tally M.empty column')
  Just (OptionalColumn (column' :: V.Vector c)) ->
    -- Here @c@ is the full element type of the vector (a @Maybe@ type).
    case typeRep @a `testEquality` typeRep @c of
      Nothing ->
        throw $
          TypeMismatchException (typeRep @a) (typeRep @c) columnName "valueCounts"
      Just Refl -> M.toAscList (V.foldl' tally M.empty column')
  Just (UnboxedColumn (column' :: VU.Vector c)) ->
    case typeRep @a `testEquality` typeRep @c of
      Nothing ->
        throw $
          TypeMismatchException (typeRep @a) (typeRep @c) columnName "valueCounts"
      -- Fold the unboxed vector directly instead of first converting it to
      -- a boxed vector; the resulting counts are the same.
      Just Refl -> M.toAscList (VU.foldl' tally M.empty column')
  where
    -- Bump the counter for one observed value (inserting 1 on first sight).
    tally :: Ord k => M.Map k Int -> k -> M.Map k Int
    tally counts value = MS.insertWith (+) value (1 :: Int) counts

-- | Threads a 'DataFrame' through a list of inputs from left to right.
--
-- Starting from @initial@, each element of the list is combined with the
-- frame produced so far using the supplied step function; the frame left
-- after the last element is returned. An empty list returns @initial@
-- unchanged. The fold is a strict left fold ('L.foldl'').
fold :: (a -> DataFrame -> DataFrame) -> [a] -> DataFrame -> DataFrame
fold step items initial =
  L.foldl' (\frame item -> step item frame) initial items