{- | Module : DataFrame Copyright : (c) 2025 License : GPL-3.0 Maintainer : mschavinda@gmail.com Stability : experimental Portability : POSIX Batteries-included entry point for the DataFrame library. This module re-exports the most commonly used pieces of the @dataframe@ library so you can get productive fast in GHCi, IHaskell, or scripts. __Naming convention__ * Use the @D.@ (\"DataFrame\") prefix for core table operations. * Use the @F.@ (\"Functions\") prefix for the expression DSL (columns, math, aggregations). Example session: We provide a script that imports the core functionality and defines helpful macros for writing safe code. @ \$ curl --output dataframe \"https:\/\/raw.githubusercontent.com\/mchav\/dataframe\/refs\/heads\/main\/scripts\/dataframe.sh\" \$ chmod +x dataframe \$ export PATH=$PATH:$PWD/dataframe \$ dataframe Configuring library for fake-package-0... Warning: No exposed modules GHCi, version 9.6.7: https:\/\/www.haskell.org\/ghc\/ :? for help Loaded GHCi configuration from \/tmp\/cabal-repl.-242816\/setcwd.ghci ======================================== 📦Dataframe ======================================== ✨ Modules were automatically imported. 💡 Use prefix 'D' for core functionality. ● E.g. D.readCsv \"\/path\/to\/file\" 💡 Use prefix 'F' for expression functions. ● E.g. F.sum (F.col \@Int \"value\") ✅ Ready. Loaded GHCi configuration from ./dataframe.ghci ghci> @ = Quick start Load a CSV, select a few columns, filter, derive a column, then group + aggregate: @ -- 1) Load data ghci> df0 <- D.readCsv "data/housing.csv" ghci> D.describeColumns df0 -------------------------------------------------------------------------------------------------------------------- index | Column Name | # Non-null Values | # Null Values | # Partially parsed | # Unique Values | Type ------|--------------------|-------------------|---------------|--------------------|-----------------|------------- Int | Text | Int | Int | Int | Int | Text ------|--------------------|-------------------|---------------|--------------------|-----------------|------------- 0 | ocean_proximity | 20640 | 0 | 0 | 5 | Text 1 | median_house_value | 20640 | 0 | 0 | 3842 | Double 2 | median_income | 20640 | 0 | 0 | 12928 | Double 3 | households | 20640 | 0 | 0 | 1815 | Double 4 | population | 20640 | 0 | 0 | 3888 | Double 5 | total_bedrooms | 20640 | 0 | 0 | 1924 | Maybe Double 6 | total_rooms | 20640 | 0 | 0 | 5926 | Double 7 | housing_median_age | 20640 | 0 | 0 | 52 | Double 8 | latitude | 20640 | 0 | 0 | 862 | Double 9 | longitude | 20640 | 0 | 0 | 844 | Double -- 2) Project & filter ghci> :exposeColumn df ghci> df1 = D.filterWhere (ocean_proximity F.== F.lit \"ISLAND\") df0 D.|> D.select [F.name median_house_value, F.name median_income, F.name ocean_proximity] -- 3) Add a derived column using the expression DSL -- (col types are explicit via TypeApplications) ghci> df2 = D.derive "rooms_per_household" (total_rooms / households) df0 -- 4) Group + aggregate ghci> let grouped = D.groupBy ["ocean_proximity"] df0 ghci> let summary = D.aggregate [ F.maximum median_house_value \`F.as\` "max_house_value"] grouped ghci> D.take 5 summary ----------------------------------------- index | ocean_proximity | max_house_value ------|-----------------|---------------- Int | Text | Double ------|-----------------|---------------- 0 | <1H OCEAN | 500001.0 1 | INLAND | 500001.0 2 | ISLAND | 450000.0 3 | NEAR BAY | 500001.0 4 | NEAR OCEAN | 500001.0 @ == Simple operations (cheat sheet) Most users only need a handful of verbs: __I/O__ * @D.readCsv :: FilePath -> IO DataFrame@ * @D.readTsv :: FilePath -> IO DataFrame@ * @D.writeCsv :: FilePath -> DataFrame -> IO ()@ * @D.readParquet :: FilePath -> IO DataFrame@ __Exploration__ * @D.take :: Int -> DataFrame -> DataFrame@ * @D.takeLast :: Int -> DataFrame -> DataFrame@ * @D.describeColumns :: DataFrame -> DataFrame@ * @D.summarize :: DataFrame -> DataFrame@ __Row ops__ * @D.filterWhere :: Expr Bool -> DataFrame -> DataFrame@ * @D.sortBy :: SortOrder -> [Text] -> DataFrame -> DataFrame@ __Column ops__ * @D.select :: [Text] -> DataFrame -> DataFrame@ * @D.exclude :: [Text] -> DataFrame -> DataFrame@ * @D.rename :: [(Text,Text)] -> DataFrame -> DataFrame@ * @D.derive :: Text -> D.Expr a -> DataFrame -> DataFrame@ __Group & aggregate__ * @D.groupBy :: [Text] -> DataFrame -> GroupedDataFrame@ * @D.aggregate :: [(Text, F.UExpr)] -> GroupedDataFrame -> DataFrame@ __Joins__ * @D.innerJoin \/ D.leftJoin \/ D.rightJoin \/ D.fullJoin@ == Expression DSL (F.*) at a glance Columns (typed): @ F.col \@Text "ocean_proximity" F.col \@Double "total_rooms" F.lit \@Double 1.0 @ Math & comparisons (overloaded by type): @ (+), (-), (*), (/), abs, log, exp, round (F.eq), (F.gt), (F.geq), (F.lt), (F.leq) @ Aggregations (for D.'aggregate'): @ F.count \@a (F.col \@a "c") F.sum \@Double (F.col \@Double "x") F.mean \@Double (F.col \@Double "x") F.min \@t (F.col \@t "x") F.max \@t (F.col \@t "x") @ == REPL power-tool: ':exposeColumns' Use @:exposeColumns <df>@ in GHCi/IHaskell to turn each column of a bound 'DataFrame' into a local binding with the same (mangled if needed) name and the column's concrete vector type. This is great for quick ad-hoc analysis, plotting, or hand-rolled checks. @ -- Suppose df has columns: "passengers" :: Int, "fare" :: Double, "payment" :: Text ghci> :set -XTemplateHaskell ghci> :exposeColumns df -- Now you have in scope: ghci> :type passengers passengers :: Expr Int ghci> :type fare fare :: Expr Double ghci> :type payment payment :: Expr Text -- You can use them directly: ghci> D.derive "fare_with_tip" (fare * F.lit 1.2) @ Notes: * Name mangling: spaces and non-identifier characters are replaced (e.g. @"trip id"@ -> @trip_id@). * Optional/nullable columns are exposed as @Expr (Maybe a)@. -} module DataFrame ( -- * Core data structures module Dataframe, module Column, module Row, module Expression, -- * Core dataframe operations module Core, -- * I/O module CSV, module Parquet, -- * Operations module Subset, module Transformations, module Aggregation, module Sorting, module Merge, module Join, module Statistics, -- * Errors module Errors, -- * Plotting module Plot, -- * Convenience functions (|>), ) where import DataFrame.Display.Terminal.Plot as Plot import DataFrame.Errors as Errors import DataFrame.IO.CSV as CSV ( ReadOptions (..), defaultReadOptions, readCsv, readCsvWithOpts, readSeparated, readTsv, writeCsv, writeSeparated, ) import DataFrame.IO.Parquet as Parquet (readParquet) import DataFrame.Internal.Column as Column ( Column, fromList, fromUnboxedVector, fromVector, hasElemType, hasMissing, isNumeric, toList, toVector, ) import DataFrame.Internal.DataFrame as Dataframe ( DataFrame, GroupedDataFrame, columnAsVector, empty, toMarkdownTable, toMatrix, ) import DataFrame.Internal.Expression as Expression (Expr) import DataFrame.Internal.Row as Row (Row, fromAny, toAny, toRowList) import DataFrame.Operations.Aggregation as Aggregation ( aggregate, distinct, groupBy, ) import DataFrame.Operations.Core as Core hiding ( ColumnInfo (..), nulls, partiallyParsed, renameSafe, ) import DataFrame.Operations.Join as Join import DataFrame.Operations.Merge as Merge import DataFrame.Operations.Sorting as Sorting import DataFrame.Operations.Statistics as Statistics ( correlation, frequencies, interQuartileRange, mean, median, skewness, standardDeviation, sum, summarize, variance, ) import DataFrame.Operations.Subset as Subset ( SelectionCriteria, byIndexRange, byName, byNameProperty, byNameRange, byProperty, cube, drop, dropLast, exclude, filter, filterAllJust, filterBy, filterJust, filterNothing, filterWhere, kFolds, randomSplit, range, sample, select, selectBy, take, takeLast, ) import DataFrame.Operations.Transformations as Transformations import Data.Function ((&)) |> :: a -> (a -> b) -> b (|>) = a -> (a -> b) -> b forall a b. a -> (a -> b) -> b (&)