| Safe Haskell | None |
|---|---|
| Language | Haskell2010 |
DataFrame.DecisionTree
Synopsis
- data TreeConfig = TreeConfig {}
- data SynthConfig = SynthConfig {}
- defaultSynthConfig :: SynthConfig
- defaultTreeConfig :: TreeConfig
- data Tree a
- treeDepth :: Tree a -> Int
- treeToExpr :: Columnable a => Tree a -> Expr a
- fitDecisionTree :: Columnable a => TreeConfig -> Expr a -> DataFrame -> Expr a
- taoOptimize :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Vector Int -> Tree a -> Tree a
- taoIteration :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Vector Int -> Tree a -> Tree a
- optimizeDepthLevel :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Vector Int -> Tree a -> Int -> Tree a
- optimizeAtDepth :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Vector Int -> Tree a -> Int -> Int -> Tree a
- optimizeNode :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Vector Int -> Tree a -> Tree a
- findBestSplitTAO :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Vector Int -> Tree a -> Tree a -> Expr Bool -> Expr Bool
- data CarePoint = CarePoint {
- cpIndex :: !Int,
- cpCorrectDir :: !Direction }
- data Direction
- identifyCarePoints :: Columnable a => Text -> DataFrame -> Vector Int -> Tree a -> Tree a -> [CarePoint]
- predictWithTree :: Columnable a => Text -> DataFrame -> Int -> Tree a -> a
- countCarePointErrors :: Expr Bool -> DataFrame -> [CarePoint] -> Int
- partitionIndices :: Expr Bool -> DataFrame -> Vector Int -> (Vector Int, Vector Int)
- majorityValueFromIndices :: Columnable a => Text -> DataFrame -> Vector Int -> a
- computeTreeLoss :: Columnable a => Text -> DataFrame -> Vector Int -> Tree a -> Double
- pruneDead :: Tree a -> Tree a
- pruneExpr :: (Columnable a, Eq a) => Expr a -> Expr a
- buildGreedyTree :: Columnable a => TreeConfig -> Int -> Text -> [Expr Bool] -> DataFrame -> Tree a
- findBestGreedySplit :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Maybe (Expr Bool)
- data NumExpr
- numExprCols :: NumExpr -> [Text]
- numExprEq :: NumExpr -> NumExpr -> Bool
- combineNumExprs :: NumExpr -> NumExpr -> [NumExpr]
- numericConditions :: TreeConfig -> DataFrame -> [Expr Bool]
- generateNumericConds :: TreeConfig -> DataFrame -> [Expr Bool]
- numericExprsWithTerms :: SynthConfig -> DataFrame -> [NumExpr]
- numericCols :: DataFrame -> [NumExpr]
- numericExprs :: SynthConfig -> DataFrame -> [NumExpr] -> Int -> Int -> [NumExpr]
- boolExprs :: DataFrame -> [Expr Bool] -> [Expr Bool] -> Int -> Int -> [Expr Bool]
- generateConditionsOld :: TreeConfig -> DataFrame -> [Expr Bool]
- partitionDataFrame :: Expr Bool -> DataFrame -> (DataFrame, DataFrame)
- calculateGini :: Columnable a => Text -> DataFrame -> Double
- majorityValue :: Columnable a => Text -> DataFrame -> a
- getCounts :: Columnable a => Text -> DataFrame -> Map a Int
- percentile :: Int -> Expr Double -> DataFrame -> Double
- buildTree :: Columnable a => TreeConfig -> Int -> Text -> [Expr Bool] -> DataFrame -> Expr a
- findBestSplit :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Maybe (Expr Bool)
- pruneTree :: (Columnable a, Eq a) => Expr a -> Expr a
- type ProbTree a = Tree (Map a Double)
- probsFromIndices :: Columnable a => Text -> DataFrame -> Vector Int -> Map a Double
- buildProbTree :: Columnable a => Tree a -> Text -> DataFrame -> Vector Int -> ProbTree a
- fitProbTree :: Columnable a => TreeConfig -> Expr a -> DataFrame -> Map a (Expr Double)
- probExprs :: Columnable a => ProbTree a -> Map a (Expr Double)
Documentation
data TreeConfig Source #
Constructors
| TreeConfig | |
Fields
| |
Instances
| Show TreeConfig Source # | |
Defined in DataFrame.DecisionTree Methods showsPrec :: Int -> TreeConfig -> ShowS # show :: TreeConfig -> String # showList :: [TreeConfig] -> ShowS # | |
| Eq TreeConfig Source # | |
Defined in DataFrame.DecisionTree | |
data SynthConfig Source #
Constructors
| SynthConfig | |
Fields
| |
Instances
| Show SynthConfig Source # | |
Defined in DataFrame.DecisionTree Methods showsPrec :: Int -> SynthConfig -> ShowS # show :: SynthConfig -> String # showList :: [SynthConfig] -> ShowS # | |
| Eq SynthConfig Source # | |
Defined in DataFrame.DecisionTree | |
treeToExpr :: Columnable a => Tree a -> Expr a Source #
fitDecisionTree :: Columnable a => TreeConfig -> Expr a -> DataFrame -> Expr a Source #
Fit a TAO decision tree
taoOptimize :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Vector Int -> Tree a -> Tree a Source #
taoIteration :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Vector Int -> Tree a -> Tree a Source #
optimizeDepthLevel :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Vector Int -> Tree a -> Int -> Tree a Source #
optimizeAtDepth :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Vector Int -> Tree a -> Int -> Int -> Tree a Source #
optimizeNode :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Vector Int -> Tree a -> Tree a Source #
findBestSplitTAO :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Vector Int -> Tree a -> Tree a -> Expr Bool -> Expr Bool Source #
data CarePoint Source #
A care point with its index and which direction leads to correct classification
Constructors
| CarePoint | |
Fields
| |
identifyCarePoints :: Columnable a => Text -> DataFrame -> Vector Int -> Tree a -> Tree a -> [CarePoint] Source #
Identify care points: points where exactly one subtree classifies correctly
For each point reaching the node:
1. Compute what label the left subtree would predict
2. Compute what label the right subtree would predict
3. If exactly one matches the true label, it's a care point
4. Record which direction leads to correct classification
predictWithTree :: Columnable a => Text -> DataFrame -> Int -> Tree a -> a Source #
Predict the label for a single point using a fixed tree
majorityValueFromIndices :: Columnable a => Text -> DataFrame -> Vector Int -> a Source #
computeTreeLoss :: Columnable a => Text -> DataFrame -> Vector Int -> Tree a -> Double Source #
buildGreedyTree :: Columnable a => TreeConfig -> Int -> Text -> [Expr Bool] -> DataFrame -> Tree a Source #
findBestGreedySplit :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Maybe (Expr Bool) Source #
data NumExpr Source #
Unifies non-nullable and nullable Double expressions for feature generation.
numExprCols :: NumExpr -> [Text] Source #
numericConditions :: TreeConfig -> DataFrame -> [Expr Bool] Source #
generateNumericConds :: TreeConfig -> DataFrame -> [Expr Bool] Source #
numericExprsWithTerms :: SynthConfig -> DataFrame -> [NumExpr] Source #
numericCols :: DataFrame -> [NumExpr] Source #
numericExprs :: SynthConfig -> DataFrame -> [NumExpr] -> Int -> Int -> [NumExpr] Source #
generateConditionsOld :: TreeConfig -> DataFrame -> [Expr Bool] Source #
calculateGini :: Columnable a => Text -> DataFrame -> Double Source #
majorityValue :: Columnable a => Text -> DataFrame -> a Source #
buildTree :: Columnable a => TreeConfig -> Int -> Text -> [Expr Bool] -> DataFrame -> Expr a Source #
findBestSplit :: Columnable a => TreeConfig -> Text -> [Expr Bool] -> DataFrame -> Maybe (Expr Bool) Source #
type ProbTree a = Tree (Map a Double) Source #
A tree where each leaf stores a class-probability distribution.
probsFromIndices :: Columnable a => Text -> DataFrame -> Vector Int -> Map a Double Source #
Compute normalised class probabilities from a subset of training rows.
buildProbTree :: Columnable a => Tree a -> Text -> DataFrame -> Vector Int -> ProbTree a Source #
Annotate a fitted 'Tree a' with class distributions by routing the training data through it. The split conditions are preserved; only the leaf values change from a majority label to a probability map.
fitProbTree :: Columnable a => TreeConfig -> Expr a -> DataFrame -> Map a (Expr Double) Source #
Fit a TAO decision tree and return one Expr Double per class.
Each (c, e) pair in the result map means: evaluate e on a DataFrame
row to get the predicted probability of class c. You can insert these
as new columns with derive or evaluate them with interpret.
Example:
let pes = fitProbTree @T.Text cfg (Col "species") trainDf
-- pes M.! "setosa" :: Expr Double
df' = M.foldlWithKey' (\d cls e -> D.derive (cls <> "_prob") e d) testDf pes
probExprs :: Columnable a => ProbTree a -> Map a (Expr Double) Source #
Convert a ProbTree into one 'Expr Double' per class.
Each (c, e) pair means: evaluate e on a DataFrame row to get the
predicted probability of class c. You can insert these as new columns
with derive or evaluate them with interpret.
Example:
let pt = buildProbTree fittedTree "species" trainDf trainIdxs
    pes = probExprs pt
-- pes M.! "setosa" :: Expr Double
df' = M.foldlWithKey' (\d cls e -> D.derive (cls <> "_prob") e d) testDf pes