{-# LANGUAGE OverloadedStrings #-} {- | Module : DataFrame Copyright : (c) 2025 License : GPL-3.0 Maintainer : mschavinda@gmail.com Stability : experimental Portability : POSIX Batteries-included entry point for the DataFrame library. This module re-exports the most commonly used pieces of the @dataframe@ library so you can get productive fast in GHCi, IHaskell, or scripts. __Naming convention__ * Use the @D.@ (\"DataFrame\") prefix for core table operations. * Use the @F.@ (\"Functions\") prefix for the expression DSL (columns, math, aggregations). Example session: @ -- GHCi quality-of-life: ghci> :set -XOverloadedStrings -XTypeApplications ghci> import qualified DataFrame as D ghci> import qualified DataFrame.Functions as F ghci> import Data.Text (Text) @ = Quick start Load a CSV, select a few columns, filter, derive a column, then group + aggregate: @ -- 1) Load data ghci> df0 <- D.readCsv "data/housing.csv" ghci> D.describeColumns df0 -------------------------------------------------------------------------------------------------------------------- index | Column Name | # Non-null Values | # Null Values | # Partially parsed | # Unique Values | Type ------|--------------------|-------------------|---------------|--------------------|-----------------|------------- Int | Text | Int | Int | Int | Int | Text ------|--------------------|-------------------|---------------|--------------------|-----------------|------------- 0 | ocean_proximity | 20640 | 0 | 0 | 5 | Text 1 | median_house_value | 20640 | 0 | 0 | 3842 | Double 2 | median_income | 20640 | 0 | 0 | 12928 | Double 3 | households | 20640 | 0 | 0 | 1815 | Double 4 | population | 20640 | 0 | 0 | 3888 | Double 5 | total_bedrooms | 20640 | 0 | 0 | 1924 | Maybe Double 6 | total_rooms | 20640 | 0 | 0 | 5926 | Double 7 | housing_median_age | 20640 | 0 | 0 | 52 | Double 8 | latitude | 20640 | 0 | 0 | 862 | Double 9 | longitude | 20640 | 0 | 0 | 844 | Double -- 2) Project & filter ghci> let df1 = D.filter @Text "ocean_proximity" (== "ISLAND") df0 
D.|> D.select ["median_house_value", "median_income", "ocean_proximity"] -- 3) Add a derived column using the expression DSL -- (col types are explicit via TypeApplications) ghci> df2 = D.derive "rooms_per_household" (F.col @Double "total_rooms" / F.col @Double "households") df0 -- 4) Group + aggregate ghci> let grouped = D.groupBy ["ocean_proximity"] df0 ghci> let summary = D.aggregate [ F.maximum (F.col @Double "median_house_value") `F.as` "max_house_value"] grouped ghci> D.take 5 summary ----------------------------------------- index | ocean_proximity | max_house_value ------|-----------------|---------------- Int | Text | Double ------|-----------------|---------------- 0 | <1H OCEAN | 500001.0 1 | INLAND | 500001.0 2 | ISLAND | 450000.0 3 | NEAR BAY | 500001.0 4 | NEAR OCEAN | 500001.0 @ == Simple operations (cheat sheet) Most users only need a handful of verbs: __I/O__ * @D.readCsv :: FilePath -> IO DataFrame@ * @D.writeCsv :: FilePath -> DataFrame -> IO ()@ * @D.readParquet :: FilePath -> IO DataFrame@ __Exploration__ * @D.take :: Int -> DataFrame -> DataFrame@ * @D.takeLast :: Int -> DataFrame -> DataFrame@ * @D.describeColumns :: DataFrame -> DataFrame@ * @D.summarize :: DataFrame -> DataFrame@ __Row ops__ * @D.filter :: Columnable a => Text -> (a -> Bool) -> DataFrame -> DataFrame@ * @D.sortBy :: SortOrder -> [Text] -> DataFrame -> DataFrame@ __Column ops__ * @D.select :: [Text] -> DataFrame -> DataFrame@ * @D.exclude :: [Text] -> DataFrame -> DataFrame@ * @D.rename :: [(Text,Text)] -> DataFrame -> DataFrame@ * @D.derive :: Text -> D.Expr a -> DataFrame -> DataFrame@ __Group & aggregate__ * @D.groupBy :: [Text] -> DataFrame -> GroupedDataFrame@ * @D.aggregate :: [(Text, F.UExpr)] -> GroupedDataFrame -> DataFrame@ __Joins__ * @D.innerJoin / D.leftJoin / D.rightJoin / D.fullJoin@ == Expression DSL (F.*) at a glance Columns (typed): @ F.col @Text "ocean_proximity" F.col @Double "total_rooms" F.lit @Double 1.0 @ Math & comparisons (overloaded by type): @ 
(+), (-), (*), (/), abs, log, exp, round (F.eq), (F.gt), (F.geq), (F.lt), (F.leq) @ Aggregations (for 'D.aggregate'): @ F.count @a (F.col @a "c") F.sum @Double (F.col @Double "x") F.mean @Double (F.col @Double "x") F.min @t (F.col @t "x") F.max @t (F.col @t "x") @ == REPL power-tool: ':exposeColumns' Use @:exposeColumns <df>@ in GHCi/IHaskell to turn each column of a bound 'DataFrame' into a local binding with the same (mangled if needed) name, typed as an @Expr@ of the column's element type. This is great for quick ad-hoc analysis, plotting, or hand-rolled checks. @ -- Suppose df has columns: "passengers" :: Int, "fare" :: Double, "payment" :: Text ghci> :set -XTemplateHaskell ghci> :exposeColumns df -- Now you have in scope: ghci> :type passengers passengers :: Expr Int ghci> :type fare fare :: Expr Double ghci> :type payment payment :: Expr Text -- You can use them directly: ghci> D.derive "fare_with_tip" (fare * F.lit 1.2) df @ Notes: * Name mangling: spaces and non-identifier characters are replaced (e.g. @"trip id"@ -> @trip_id@). * Optional/nullable columns are exposed as @Expr (Maybe a)@. 
-}
module DataFrame (
    -- * Core data structures
    module Dataframe,
    module Column,
    module Expression,

    -- * Core dataframe operations
    module Core,

    -- * I/O
    module CSV,
    module Parquet,

    -- * Operations
    module Subset,
    module Transformations,
    module Aggregation,
    module Sorting,
    module Merge,
    module Join,
    module Statistics,

    -- * Errors
    module Errors,

    -- * Plotting
    module Plot,

    -- * Convenience functions
    (|>),
) where

import DataFrame.Display.Terminal.Plot as Plot
import DataFrame.Errors as Errors
import DataFrame.IO.CSV as CSV (
    ReadOptions (..),
    defaultOptions,
    readCsv,
    readSeparated,
    readTsv,
 )
import DataFrame.IO.Parquet as Parquet (readParquet)
import DataFrame.Internal.Column as Column (
    Column,
    fromList,
    fromUnboxedVector,
    fromVector,
    toList,
    toVector,
 )
import DataFrame.Internal.DataFrame as Dataframe (
    DataFrame,
    GroupedDataFrame,
    columnAsVector,
    empty,
    toMatrix,
 )
import DataFrame.Internal.Expression as Expression (Expr)
import DataFrame.Operations.Aggregation as Aggregation (
    aggregate,
    distinct,
    groupBy,
 )
import DataFrame.Operations.Core as Core hiding (
    ColumnInfo (..),
    nulls,
    partiallyParsed,
    renameSafe,
 )
import DataFrame.Operations.Join as Join
import DataFrame.Operations.Merge as Merge
import DataFrame.Operations.Sorting as Sorting
import DataFrame.Operations.Statistics as Statistics (
    correlation,
    frequencies,
    interQuartileRange,
    mean,
    median,
    skewness,
    standardDeviation,
    sum,
    summarize,
    variance,
 )
import DataFrame.Operations.Subset as Subset (
    cube,
    drop,
    dropLast,
    exclude,
    filter,
    filterAllJust,
    filterBy,
    filterJust,
    filterWhere,
    range,
    select,
    selectBy,
    selectIntRange,
    selectRange,
    take,
    takeLast,
 )
import DataFrame.Operations.Transformations as Transformations

import Data.Function
import Data.List
import qualified Data.Text as T
import qualified Data.Text.IO as T

{- | Left-to-right pipe: @x |> f@ is @f x@.

Defined as '&' from "Data.Function", so a chain of dataframe operations can
be written in the order it runs:

@
df |> D.select ["a", "b"] |> D.take 5
@

No fixity is declared for this operator, so it takes Haskell's default
(@infixl 9@) rather than the @infixl 1@ of '&'. Ordinary function
application still binds tighter, so each stage may be a partially applied
function as in the example above.
-}
(|>) :: a -> (a -> b) -> b
(|>) = (&)