Safe Haskell	None
Language	Haskell2010

DataFrame.IO.Parquet

Synopsis

data ParquetReadOptions = ParquetReadOptions
    { selectedColumns :: Maybe [Text]
    , predicate :: Maybe (Expr Bool)
    , rowRange :: Maybe (Int, Int)
    }
defaultParquetReadOptions :: ParquetReadOptions
readParquet :: FilePath -> IO DataFrame
cleanColPath :: [SNode] -> [String] -> [String]
readParquetWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame
readParquetFiles :: FilePath -> IO DataFrame
readParquetFilesWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame
applyRowRange :: ParquetReadOptions -> DataFrame -> DataFrame
applySelectedColumns :: ParquetReadOptions -> DataFrame -> DataFrame
applyPredicate :: ParquetReadOptions -> DataFrame -> DataFrame
applyReadOptions :: ParquetReadOptions -> DataFrame -> DataFrame
readMetadataFromPath :: FilePath -> IO (FileMetadata, ByteString)
readMetadataSizeFromFooter :: ByteString -> (Int, ByteString)
getColumnPaths :: [SchemaElement] -> [(Text, Int)]
findLeafSchema :: [SchemaElement] -> [String] -> Maybe SchemaElement
processColumnPages :: (Int, Int) -> [Page] -> ParquetType -> ParquetEncoding -> Maybe Int32 -> LogicalType -> IO Column
decodePageData :: Maybe DictVals -> (Int, Int) -> ParquetType -> Maybe Int32 -> ParquetEncoding -> [Int] -> [Int] -> Int -> ByteString -> String -> IO Column
applyLogicalType :: LogicalType -> Column -> Column
microsecondsToUTCTime :: Int64 -> UTCTime
unitDivisor :: TimeUnit -> Int64
applyScale :: Int32 -> Int32 -> Double

Documentation

data ParquetReadOptions Source #

Options for reading Parquet data.

These options are applied in this order:

1. predicate filtering
2. column projection
3. row range

Column selection for selectedColumns uses leaf column names only.

Constructors

ParquetReadOptions

Fields

selectedColumns :: Maybe [Text]
Columns to keep in the final dataframe. If set, only these columns are returned. Predicate-referenced columns are read automatically when needed and projected out after filtering.
predicate :: Maybe (Expr Bool)
Optional row filter expression applied before projection.
rowRange :: Maybe (Int, Int)
Optional row slice (start, end) with start-inclusive/end-exclusive semantics.

Instances

Instances details

Show ParquetReadOptions Source #
Instance details
Defined in DataFrame.IO.Parquet
Methods
showsPrec :: Int -> ParquetReadOptions -> ShowS #
show :: ParquetReadOptions -> String #
showList :: [ParquetReadOptions] -> ShowS #
Eq ParquetReadOptions Source #
Instance details
Defined in DataFrame.IO.Parquet
Methods
(==) :: ParquetReadOptions -> ParquetReadOptions -> Bool #
(/=) :: ParquetReadOptions -> ParquetReadOptions -> Bool #

defaultParquetReadOptions :: ParquetReadOptions Source #

Default Parquet read options.

Equivalent to:

ParquetReadOptions
    { selectedColumns = Nothing
    , predicate = Nothing
    , rowRange = Nothing
    }

readParquet :: FilePath -> IO DataFrame Source #

Read a Parquet file from the given path and load it into a dataframe.

Example

Expand

ghci> D.readParquet "./data/mtcars.parquet"

cleanColPath :: [SNode] -> [String] -> [String] Source #

Strip Parquet encoding artifact names (REPEATED wrappers and their single list-element children) from a raw column path, leaving user-visible names.

readParquetWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame Source #

Read a Parquet file using explicit read options.

Example

Expand

ghci> D.readParquetWithOpts
ghci|   (D.defaultParquetReadOptions{D.selectedColumns = Just ["id"], D.rowRange = Just (0, 10)})
ghci|   "./tests/data/alltypes_plain.parquet"

When selectedColumns is set and predicate references other columns, those predicate columns are auto-included for decoding, then projected back to the requested output columns.

readParquetFiles :: FilePath -> IO DataFrame Source #

Read Parquet files from a directory or glob path.

This is equivalent to calling readParquetFilesWithOpts with defaultParquetReadOptions.

readParquetFilesWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame Source #

Read multiple Parquet files (directory or glob) using explicit options.

If path is a directory, all non-directory entries are read. If path is a glob, matching files are read.

For multi-file reads, rowRange is applied once after concatenation (global range semantics).

Example

Expand

ghci> D.readParquetFilesWithOpts
ghci|   (D.defaultParquetReadOptions{D.selectedColumns = Just ["id"], D.rowRange = Just (0, 5)})
ghci|   "./tests/data/alltypes_plain*.parquet"

applyRowRange :: ParquetReadOptions -> DataFrame -> DataFrame Source #

applySelectedColumns :: ParquetReadOptions -> DataFrame -> DataFrame Source #

applyPredicate :: ParquetReadOptions -> DataFrame -> DataFrame Source #

applyReadOptions :: ParquetReadOptions -> DataFrame -> DataFrame Source #

readMetadataFromPath :: FilePath -> IO (FileMetadata, ByteString) Source #

readMetadataSizeFromFooter :: ByteString -> (Int, ByteString) Source #

getColumnPaths :: [SchemaElement] -> [(Text, Int)] Source #

findLeafSchema :: [SchemaElement] -> [String] -> Maybe SchemaElement Source #

processColumnPages :: (Int, Int) -> [Page] -> ParquetType -> ParquetEncoding -> Maybe Int32 -> LogicalType -> IO Column Source #

decodePageData :: Maybe DictVals -> (Int, Int) -> ParquetType -> Maybe Int32 -> ParquetEncoding -> [Int] -> [Int] -> Int -> ByteString -> String -> IO Column Source #

applyLogicalType :: LogicalType -> Column -> Column Source #

microsecondsToUTCTime :: Int64 -> UTCTime Source #

unitDivisor :: TimeUnit -> Int64 Source #

applyScale :: Int32 -> Int32 -> Double Source #