| Safe Haskell | None |
|---|---|
| Language | Haskell2010 |
DataFrame.IO.Parquet
Synopsis
- data ParquetReadOptions = ParquetReadOptions {}
- defaultParquetReadOptions :: ParquetReadOptions
- readParquet :: FilePath -> IO DataFrame
- cleanColPath :: [SNode] -> [String] -> [String]
- readParquetWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame
- readParquetFiles :: FilePath -> IO DataFrame
- readParquetFilesWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame
- applyRowRange :: ParquetReadOptions -> DataFrame -> DataFrame
- applySelectedColumns :: ParquetReadOptions -> DataFrame -> DataFrame
- applyPredicate :: ParquetReadOptions -> DataFrame -> DataFrame
- applyReadOptions :: ParquetReadOptions -> DataFrame -> DataFrame
- readMetadataFromPath :: FilePath -> IO (FileMetadata, ByteString)
- readMetadataSizeFromFooter :: ByteString -> (Int, ByteString)
- getColumnPaths :: [SchemaElement] -> [(Text, Int)]
- findLeafSchema :: [SchemaElement] -> [String] -> Maybe SchemaElement
- processColumnPages :: (Int, Int) -> [Page] -> ParquetType -> ParquetEncoding -> Maybe Int32 -> LogicalType -> IO Column
- decodePageData :: Maybe DictVals -> (Int, Int) -> ParquetType -> Maybe Int32 -> ParquetEncoding -> [Int] -> [Int] -> Int -> ByteString -> String -> IO Column
- applyLogicalType :: LogicalType -> Column -> Column
- microsecondsToUTCTime :: Int64 -> UTCTime
- unitDivisor :: TimeUnit -> Int64
- applyScale :: Int32 -> Int32 -> Double
Documentation
data ParquetReadOptions Source #
Options for reading Parquet data.
These options are applied in this order:
- predicate filtering
- column projection
- row range
Column selection for selectedColumns uses leaf column names only.
Constructors
| ParquetReadOptions | |
Fields
| selectedColumns, predicate, rowRange | |
Instances
| Show ParquetReadOptions Source # | |
Defined in DataFrame.IO.Parquet Methods showsPrec :: Int -> ParquetReadOptions -> ShowS # show :: ParquetReadOptions -> String # showList :: [ParquetReadOptions] -> ShowS # | |
| Eq ParquetReadOptions Source # | |
Defined in DataFrame.IO.Parquet Methods (==) :: ParquetReadOptions -> ParquetReadOptions -> Bool # (/=) :: ParquetReadOptions -> ParquetReadOptions -> Bool # | |
defaultParquetReadOptions :: ParquetReadOptions Source #
Default Parquet read options.
Equivalent to:
ParquetReadOptions
{ selectedColumns = Nothing
, predicate = Nothing
, rowRange = Nothing
}
readParquet :: FilePath -> IO DataFrame Source #
Read a Parquet file from the given path and load it into a dataframe.
Example
ghci> D.readParquet "./data/mtcars.parquet"
cleanColPath :: [SNode] -> [String] -> [String] Source #
Strip Parquet encoding artifact names (REPEATED wrappers and their single list-element children) from a raw column path, leaving user-visible names.
readParquetWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame Source #
Read a Parquet file using explicit read options.
Example
ghci> D.readParquetWithOpts
ghci| (D.defaultParquetReadOptions{D.selectedColumns = Just ["id"], D.rowRange = Just (0, 10)})
ghci| "./tests/data/alltypes_plain.parquet"
When selectedColumns is set and predicate references other columns, those predicate columns
are automatically included for decoding, and the result is then projected back to the requested output columns.
readParquetFiles :: FilePath -> IO DataFrame Source #
Read Parquet files from a directory or glob path.
This is equivalent to calling readParquetFilesWithOpts with defaultParquetReadOptions.
readParquetFilesWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame Source #
Read multiple Parquet files (directory or glob) using explicit options.
If path is a directory, all non-directory entries are read.
If path is a glob, matching files are read.
For multi-file reads, rowRange is applied once after concatenation (global range semantics).
Example
ghci> D.readParquetFilesWithOpts
ghci| (D.defaultParquetReadOptions{D.selectedColumns = Just ["id"], D.rowRange = Just (0, 5)})
ghci| "./tests/data/alltypes_plain*.parquet"
applyRowRange :: ParquetReadOptions -> DataFrame -> DataFrame Source #
readMetadataFromPath :: FilePath -> IO (FileMetadata, ByteString) Source #
readMetadataSizeFromFooter :: ByteString -> (Int, ByteString) Source #
getColumnPaths :: [SchemaElement] -> [(Text, Int)] Source #
findLeafSchema :: [SchemaElement] -> [String] -> Maybe SchemaElement Source #
processColumnPages :: (Int, Int) -> [Page] -> ParquetType -> ParquetEncoding -> Maybe Int32 -> LogicalType -> IO Column Source #
decodePageData :: Maybe DictVals -> (Int, Int) -> ParquetType -> Maybe Int32 -> ParquetEncoding -> [Int] -> [Int] -> Int -> ByteString -> String -> IO Column Source #
applyLogicalType :: LogicalType -> Column -> Column Source #
microsecondsToUTCTime :: Int64 -> UTCTime Source #
unitDivisor :: TimeUnit -> Int64 Source #