{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}

module DataFrame.Operations.Statistics where

import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Unboxed as VU

import Prelude as P

import Control.Exception (throw)
import Data.Function ((&))
import Data.Maybe (fromMaybe, isJust)
import Data.Type.Equality (TestEquality (testEquality), type (:~:) (Refl))
import DataFrame.Errors (DataFrameException (..))
import DataFrame.Internal.Column
import DataFrame.Internal.DataFrame (
    DataFrame (..),
    empty,
    getColumn,
 )
import DataFrame.Internal.Expression
import DataFrame.Internal.Interpreter
import DataFrame.Internal.Row (showValue, toAny)
import DataFrame.Internal.Statistics
import DataFrame.Internal.Types
import DataFrame.Operations.Core
import DataFrame.Operations.Subset (filterJust)
import DataFrame.Operations.Transformations (impute)
import Text.Printf (printf)
import Type.Reflection (typeRep)

{- | Show a frequency table for a categorical feature.

The result has a @Statistic@ column holding the row labels @Count@ and
@Percentage (%)@, followed by one column per distinct value returned by
'valueCounts' for the expression.

Throws the 'DataFrameException' reported by 'columnAsVector' when the
expression cannot be read from the frame as a column of type @a@.

__Examples:__

@
ghci> df <- D.readCsv ".\/data\/housing.csv"

ghci> D.frequencies "ocean_proximity" df

---------------------------------------------------------------------
   Statistic    | <1H OCEAN | INLAND | ISLAND | NEAR BAY | NEAR OCEAN
----------------|-----------|--------|--------|----------|-----------
      Text      |    Any    |  Any   |  Any   |   Any    |    Any
----------------|-----------|--------|--------|----------|-----------
 Count          | 9136      | 6551   | 5      | 2290     | 2658
 Percentage (%) | 44.26%    | 31.74% | 0.02%  | 11.09%   | 12.88%
@
-}
frequencies :: forall a. (Columnable a) => Expr a -> DataFrame -> DataFrame
frequencies expr df = case columnAsVector expr df of
    Left err -> throw err
    -- The vector itself is not needed; the lookup only validates that the
    -- expression resolves to a column of the requested type.
    Right _ -> L.foldl' addCategory initDf counts
  where
    counts :: [(a, Int)]
    counts = valueCounts expr df

    -- Total number of observations, computed once rather than once per
    -- category.
    total :: Int
    total = P.sum (map snd counts)

    percentageOf :: Int -> String
    percentageOf k = toPct2dp (fromIntegral k / fromIntegral total)

    -- Frame containing only the row labels; category columns are appended.
    initDf :: DataFrame
    initDf =
        empty
            & insertVector "Statistic" (V.fromList ["Count" :: T.Text, "Percentage (%)"])

    -- Append one column named after the category, holding its count and
    -- its share of the total.
    addCategory :: DataFrame -> (a, Int) -> DataFrame
    addCategory acc (value, k) =
        insertVector
            (showValue @a value)
            (V.fromList [toAny k, toAny (percentageOf k)])
            acc

{- | Calculates the mean of a given column as a standalone value.

A bare column reference ('Col') is converted to a vector of 'Double's with
'_getColumnAsDouble' and averaged with 'meanDouble''. Any other expression
is interpreted against the frame and its unboxed result is passed to 'mean''.

Throws a 'DataFrameException' when the column is missing or when
interpreting or unboxing the expression fails, and calls 'error' when a
referenced column cannot be converted to 'Double'.
-}
mean ::
    forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double
mean (Col name) df = case _getColumnAsDouble name df of
    Just xs -> meanDouble' xs
    Nothing -> error "[INTERNAL ERROR] Column is non-numeric"
mean expr df = case interpret df expr of
    Left e -> throw e
    Right (TColumn col) -> either throw mean' (toUnboxedVector @a col)

{- | Calculates the mean of a given column (containing optional values) as a
standalone value.

@Nothing@ entries are dropped by 'optionalToDoubleVector' before 'mean'' is
applied, so they do not contribute to the result.

Throws the 'DataFrameException' produced when the column cannot be read or
the expression cannot be interpreted.
-}
meanMaybe ::
    forall a. (Columnable a, Real a) => Expr (Maybe a) -> DataFrame -> Double
meanMaybe (Col name) df =
    either
        throw
        (mean' . optionalToDoubleVector)
        (columnAsVector (Col @(Maybe a) name) df)
meanMaybe expr df = case interpret @(Maybe a) df expr of
    Left e -> throw e
    Right (TColumn col) ->
        either throw (mean' . optionalToDoubleVector) (toVector @(Maybe a) col)

{- | Calculates the median of a given column as a standalone value.

A bare column reference ('Col') is read with 'columnAsUnboxedVector'; any
other expression is interpreted against the frame and unboxed with
'toUnboxedVector'. In both cases the values are passed to 'median''.

Throws the 'DataFrameException' reported by any of those steps.
-}
median ::
    forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double
median (Col name) df =
    either throw median' (columnAsUnboxedVector (Col @a name) df)
median expr df = case interpret df expr of
    Left e -> throw e
    Right (TColumn col) -> either throw median' (toUnboxedVector @a col)

{- | Calculates the median of a given column (containing optional values) as
a standalone value.

@Nothing@ entries are dropped by 'optionalToDoubleVector' before 'median''
is applied.

Throws the 'DataFrameException' produced when the column cannot be read or
the expression cannot be interpreted.
-}
medianMaybe ::
    forall a. (Columnable a, Real a) => Expr (Maybe a) -> DataFrame -> Double
medianMaybe (Col name) df =
    either
        throw
        (median' . optionalToDoubleVector)
        (columnAsVector (Col @(Maybe a) name) df)
medianMaybe expr df = case interpret @(Maybe a) df expr of
    Left e -> throw e
    Right (TColumn col) ->
        either throw (median' . optionalToDoubleVector) (toVector @(Maybe a) col)

{- | Calculates the nth percentile of a given column as a standalone value.

The first argument is forwarded unchanged to 'percentile''. A bare column
reference ('Col') is read with 'columnAsUnboxedVector'; any other expression
is interpreted against the frame and unboxed with 'toUnboxedVector'.

Throws the 'DataFrameException' reported by any of those steps.
-}
percentile ::
    forall a.
    (Columnable a, Real a, VU.Unbox a) => Int -> Expr a -> DataFrame -> Double
percentile n (Col name) df =
    either throw (percentile' n) (columnAsUnboxedVector (Col @a name) df)
percentile n expr df = case interpret df expr of
    Left e -> throw e
    Right (TColumn col) -> either throw (percentile' n) (toUnboxedVector @a col)

{- | Calculates the nth percentile of a column of any ordered type, returning
an element of that type rather than a 'Double'.

The first argument is forwarded unchanged to 'percentileOrd''. A bare column
reference ('Col') is read with 'columnAsVector'; any other expression is
interpreted against the frame and converted with 'toVector'.

Throws the 'DataFrameException' reported by any of those steps.
-}
genericPercentile ::
    forall a.
    (Columnable a, Ord a) => Int -> Expr a -> DataFrame -> a
genericPercentile n (Col name) df =
    either throw (percentileOrd' n) (columnAsVector (Col @a name) df)
genericPercentile n expr df = case interpret df expr of
    Left e -> throw e
    Right (TColumn col) -> either throw (percentileOrd' n) (toVector @a col)

{- | Calculates the standard deviation of a given column as a standalone
value, computed as the square root of 'variance''.

A bare column reference ('Col') is read with 'columnAsUnboxedVector'; any
other expression is interpreted against the frame and unboxed with
'toUnboxedVector'.

Throws the 'DataFrameException' reported by any of those steps.
-}
standardDeviation ::
    forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double
standardDeviation (Col name) df =
    either throw (sqrt . variance') (columnAsUnboxedVector (Col @a name) df)
standardDeviation expr df = case interpret df expr of
    Left e -> throw e
    Right (TColumn col) ->
        either throw (sqrt . variance') (toUnboxedVector @a col)

{- | Calculates the skewness of a given column as a standalone value.

A bare column reference ('Col') is read with 'columnAsUnboxedVector'; any
other expression is interpreted against the frame and unboxed with
'toUnboxedVector'. In both cases the values are passed to 'skewness''.

Throws the 'DataFrameException' reported by any of those steps.
-}
skewness ::
    forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double
skewness (Col name) df =
    either throw skewness' (columnAsUnboxedVector (Col @a name) df)
skewness expr df = case interpret df expr of
    Left e -> throw e
    Right (TColumn col) -> either throw skewness' (toUnboxedVector @a col)

{- | Calculates the variance of a given column as a standalone value.

A bare column reference ('Col') is converted to a vector of 'Double's with
'_getColumnAsDouble' and passed to 'varianceDouble''. Any other expression
is interpreted against the frame and its unboxed result is passed to
'variance''.

Throws a 'DataFrameException' when the column is missing or when
interpreting or unboxing the expression fails, and calls 'error' when a
referenced column cannot be converted to 'Double'.
-}
variance ::
    forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double
variance (Col name) df = case _getColumnAsDouble name df of
    Just xs -> varianceDouble' xs
    Nothing -> error "[INTERNAL ERROR] Column is non-numeric"
variance expr df = case interpret df expr of
    Left e -> throw e
    Right (TColumn col) -> either throw variance' (toUnboxedVector @a col)

{- | Calculates the inter-quartile range of a given column as a standalone
value.

A bare column reference ('Col') is read with 'columnAsUnboxedVector'; any
other expression is interpreted against the frame and unboxed with
'toUnboxedVector'. In both cases the values are passed to
'interQuartileRange''.

Throws the 'DataFrameException' reported by any of those steps.
-}
interQuartileRange ::
    forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double
interQuartileRange (Col name) df =
    either throw interQuartileRange' (columnAsUnboxedVector (Col @a name) df)
interQuartileRange expr df = case interpret df expr of
    Left e -> throw e
    Right (TColumn col) ->
        either throw interQuartileRange' (toUnboxedVector @a col)

{- | Calculates the Pearson's correlation coefficient between two given
columns as a standalone value.

Returns @Nothing@ when either column cannot be converted to 'Double's by
'_getColumnAsDouble', or when 'correlation'' itself yields @Nothing@.
A column name that is not present in the frame is not reported as
@Nothing@: '_getColumnAsDouble' throws a 'ColumnNotFoundException' instead.
-}
correlation :: T.Text -> T.Text -> DataFrame -> Maybe Double
correlation first second df = do
    xs <- _getColumnAsDouble first df
    ys <- _getColumnAsDouble second df
    correlation' xs ys

{- | Fetch a column by name as an unboxed vector of 'Double's.

* An unboxed column that already holds 'Double's is returned as is.
* An unboxed column of an integral type is converted with 'fromIntegral'.
* An unboxed column of a floating type is converted with 'realToFrac'.
* Any other column yields @Nothing@.

Throws a 'ColumnNotFoundException' when the frame has no column with the
given name.
-}
_getColumnAsDouble :: T.Text -> DataFrame -> Maybe (VU.Vector Double)
_getColumnAsDouble name df = case getColumn name df of
    Just (UnboxedColumn (f :: VU.Vector a)) ->
        case testEquality (typeRep @a) (typeRep @Double) of
            -- Already the right element type: no copy needed.
            Just Refl -> Just f
            Nothing -> case sIntegral @a of
                STrue -> Just (VU.map fromIntegral f)
                SFalse -> case sFloating @a of
                    STrue -> Just (VU.map realToFrac f)
                    SFalse -> Nothing
    Nothing ->
        throw $
            ColumnNotFoundException name "_getColumnAsDouble" (M.keys $ columnIndices df)
    _ -> Nothing -- Return a type mismatch error here.
{-# INLINE _getColumnAsDouble #-}

{- | Collect the present values of an optional column as 'Double's.

@Nothing@ entries are dropped. Because the values are accumulated by
prepending during a left fold, the result holds them in the reverse of their
original order.
-}
optionalToDoubleVector :: (Real a) => V.Vector (Maybe a) -> VU.Vector Double
optionalToDoubleVector = VU.fromList . V.foldl' keepPresent []
  where
    keepPresent acc (Just x) = realToFrac x : acc
    keepPresent acc Nothing = acc

{- | Calculates the sum of a given column as a standalone value.

For a bare column reference ('Col') the stored column is inspected
directly: unboxed and boxed columns are summed as is, and optional columns
are summed with every @Nothing@ counted as @0@. Any other expression is
interpreted against the frame and its result is summed.

Throws a 'ColumnNotFoundException' when a referenced column is missing, or
the 'DataFrameException' produced while interpreting or converting an
expression.

NOTE(review): when a referenced column's element type differs from @a@ the
result is silently @0@ rather than an error; callers may depend on this, so
it is preserved here.
-}
sum ::
    forall a. (Columnable a, Num a) => Expr a -> DataFrame -> a
sum (Col name) df = case getColumn name df of
    Nothing -> throw $ ColumnNotFoundException name "sum" (M.keys $ columnIndices df)
    Just (UnboxedColumn (column :: VU.Vector a')) ->
        case testEquality (typeRep @a') (typeRep @a) of
            Just Refl -> VG.sum column
            Nothing -> 0
    Just (BoxedColumn (column :: V.Vector a')) ->
        case testEquality (typeRep @a') (typeRep @a) of
            Just Refl -> VG.sum column
            Nothing -> 0
    Just (OptionalColumn (column :: V.Vector (Maybe a'))) ->
        case testEquality (typeRep @a') (typeRep @a) of
            -- Missing values contribute nothing to the total.
            Just Refl -> VG.sum (VG.map (fromMaybe 0) column)
            Nothing -> 0
sum expr df = case interpret df expr of
    Left e -> throw e
    Right (TColumn col) -> either throw VG.sum (toVector @a @V.Vector col)

{- | /O(n)/ Impute missing values in a column using a derived scalar.

Given

* an expression @f :: 'Expr' b -> 'Expr' b@ that, when interpreted over a
  non-nullable column, produces the same value in every row (for example a
  mean, median, or other aggregate), and
* a nullable column @'Expr' ('Maybe' b)@

this function:

1. Drops all @Nothing@ values from the target column.
2. Interprets @f@ on the remaining non-null values.
3. Checks that the resulting column contains a single repeated value.
4. Uses that value to impute all @Nothing@s in the original column.

If the second argument is anything other than a bare column reference
('Col'), the frame is returned unchanged.

==== __Throws__

* 'DataFrameException' - if interpreting @f@ over the filtered column fails,
  or if the first value of the result cannot be read (for example because
  the column does not exist or no non-null values remain).
* Calls 'error' if the interpreted expression does not produce the same
  value in every row.

==== __Example__
@
>>> :set -XOverloadedStrings
>>> import qualified DataFrame as D
>>> let df =
...       D.fromNamedColumns
...         [ ("age", D.fromList [Just 10, Nothing, Just 20 :: Maybe Int]) ]
>>>
>>> -- Impute missing ages with the mean of the observed ages
>>> D.imputeWith F.mean "age" df
-- age
-- ----
-- 10
-- 15
-- 20
@
-}
imputeWith ::
    forall b.
    (Columnable b) =>
    (Expr b -> Expr b) ->
    Expr (Maybe b) ->
    DataFrame ->
    DataFrame
imputeWith f col@(Col columnName) df =
    -- Evaluate the aggregate over the rows where the column is present.
    case interpret @b (filterJust columnName df) (f (Col @b columnName)) of
        Left e -> throw e
        Right (TColumn value) -> case headColumn @b value of
            Left e -> throw e
            Right h
                -- Only a constant column can serve as a fill value.
                | all (== h) (toList @b value) -> impute col h df
                | otherwise -> error "Impute expression returned more than one value"
imputeWith _ _ df = df

{- | Apply a scalar statistic to the named column.

The frame is first passed through 'filterJust' for that column, and the
column is then read with '_getColumnAsDouble'. The result is @Nothing@ when
that read yields @Nothing@, or when the statistic itself evaluates to NaN.
-}
applyStatistic ::
    (VU.Vector Double -> Double) -> T.Text -> DataFrame -> Maybe Double
applyStatistic f name df =
    _getColumnAsDouble name (filterJust name df) >>= rejectNaN . f
  where
    -- A NaN statistic is reported as "no value" rather than as a number.
    rejectNaN :: Double -> Maybe Double
    rejectNaN result
        | isNaN result = Nothing
        | otherwise = Just result
{-# INLINE applyStatistic #-}

{- | Apply a vector-valued statistic to the named column.

The frame is first passed through 'filterJust' for that column, and the
column is then read with '_getColumnAsDouble'; the statistic is mapped over
whatever that read returns, so a @Nothing@ from the read stays @Nothing@.
-}
applyStatistics ::
    (VU.Vector Double -> VU.Vector Double) ->
    T.Text ->
    DataFrame ->
    Maybe (VU.Vector Double)
applyStatistics f name df = f <$> observed
  where
    observed = _getColumnAsDouble name (filterJust name df)

{- | Descriptive statistics of the numeric columns.

The result starts with a @Statistic@ column labelling ten rows (count, mean,
minimum, quartiles, median, maximum, standard deviation, inter-quartile range
and skewness). For every column of the input whose ten statistics are all
available, a column of the same name is added holding those values rounded to
two decimal places with 'roundTo'. Columns for which any statistic is
unavailable are left out.

Note that the count comes from 'numElements' on the column as stored in the
input frame, while every other statistic is computed from the column after
'filterJust' has been applied to it.
-}
summarize :: DataFrame -> DataFrame
summarize df = fold columnStats (columnNames df) labels
  where
    -- The seed frame: a single column naming each statistic row.
    labels :: DataFrame
    labels =
        fromNamedColumns
            [
                ( "Statistic"
                , fromList
                    [ "Count" :: T.Text
                    , "Mean"
                    , "Minimum"
                    , "25%"
                    , "Median"
                    , "75%"
                    , "Max"
                    , "StdDev"
                    , "IQR"
                    , "Skewness"
                    ]
                )
            ]

    -- Add the statistics column for @name@, or leave the accumulator alone
    -- when any statistic is missing. 'sequence' evaluates the list once and
    -- short-circuits on the first @Nothing@.
    columnStats :: T.Text -> DataFrame -> DataFrame
    columnStats name acc =
        case sequence (stats name) of
            Just values ->
                insertUnboxedVector
                    name
                    (VU.fromList (map (roundTo 2) values))
                    acc
            Nothing -> acc

    -- The ten statistics for one column, in the same order as 'labels'.
    stats :: T.Text -> [Maybe Double]
    stats name =
        [ count
        , mean' <$> observed
        , quantileAt 0
        , quantileAt 1
        , quantileAt 2
        , quantileAt 3
        , quantileAt 4
        , sqrt . variance' <$> observed
        , (-) <$> quantileAt 3 <*> quantileAt 1
        , skewness' <$> observed
        ]
      where
        count :: Maybe Double
        count = fromIntegral . numElements <$> getColumn name df

        -- The filtered column as Doubles, built once and shared by every
        -- statistic below.
        observed :: Maybe (VU.Vector Double)
        observed = _getColumnAsDouble name (filterJust name df)

        -- Quantiles 0/4 .. 4/4: minimum, quartiles, median, maximum.
        quantiles :: Maybe (VU.Vector Double)
        quantiles = quantiles' (VU.fromList [0, 1, 2, 3, 4]) 4 <$> observed

        quantileAt :: Int -> Maybe Double
        quantileAt i = (VG.! i) <$> quantiles

{- | Round a 'Double' to the given number of decimal places.

@roundTo n x@ scales @x@ by @10 ^^ n@, rounds to the nearest integer using
'round' (so exact halves go to the even neighbour), and scales back.

* NaN and infinite inputs are returned unchanged. Converting them through
  'round' would otherwise produce an arbitrary huge integer (NaN used to come
  back as negative infinity).
* A negative @n@ rounds to the left of the decimal point (tens, hundreds, ...)
  up to floating-point representation error, instead of failing with a
  negative-exponent error.
-}
roundTo :: Int -> Double -> Double
roundTo n x
    | isNaN x || isInfinite x = x
    | otherwise = fromInteger (round (x * scale)) / scale
  where
    -- '^^' accepts negative exponents; for n >= 0 it equals 10 ^ n.
    scale :: Double
    scale = 10 ^^ n

{- | Render a fraction as a percentage with two decimal places.

The input is multiplied by 100 and formatted as @%.2f%%@. Anything below
@0.00005@ is shown as @<0.01%@ instead.
-}
toPct2dp :: Double -> String
toPct2dp fraction =
    if fraction < 0.00005
        then "<0.01%"
        else printf "%.2f%%" (100 * fraction)