Safe Haskell	None
Language	Haskell2010

DataFrame.IO.Parquet

Synopsis

data ParquetReadOptions = ParquetReadOptions {
- selectedColumns :: Maybe [Text]
- predicate :: Maybe (Expr Bool)
- rowRange :: Maybe (Int, Int)
}
defaultParquetReadOptions :: ParquetReadOptions
readParquet :: FilePath -> IO DataFrame
cleanColPath :: [SNode] -> [String] -> [String]
readParquetWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame
_readParquetWithOpts :: ForceNonSeekable -> ParquetReadOptions -> FilePath -> IO DataFrame
readParquetFiles :: FilePath -> IO DataFrame
readParquetFilesWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame
applyRowRange :: ParquetReadOptions -> DataFrame -> DataFrame
applySelectedColumns :: ParquetReadOptions -> DataFrame -> DataFrame
applyPredicate :: ParquetReadOptions -> DataFrame -> DataFrame
applyReadOptions :: ParquetReadOptions -> DataFrame -> DataFrame
readMetadataFromPath :: FilePath -> IO (FileMetadata, ByteString)
readMetadataFromHandle :: FileBufferedOrSeekable -> IO FileMetadata
readMetadataSizeFromFooterSlice :: ByteString -> (Int, ByteString)
readMetadataSizeFromFooter :: ByteString -> (Int, ByteString)
getColumnPaths :: [SchemaElement] -> [(Text, Int)]
findLeafSchema :: [SchemaElement] -> [String] -> Maybe SchemaElement
processColumnPages :: (Int, Int) -> [Page] -> ParquetType -> ParquetEncoding -> Maybe Int32 -> LogicalType -> IO Column
decodePageData :: Maybe DictVals -> (Int, Int) -> ParquetType -> Maybe Int32 -> ParquetEncoding -> [Int] -> [Int] -> Int -> ByteString -> String -> IO Column
applyLogicalType :: LogicalType -> Column -> Column
microsecondsToUTCTime :: Int64 -> UTCTime
unitDivisor :: TimeUnit -> Int64
applyScale :: Int32 -> Int32 -> Double
data HFRef = HFRef {
- hfOwner :: Text
- hfDataset :: Text
- hfGlob :: Text
}
data HFParquetFile = HFParquetFile {
- hfpUrl :: Text
- hfpConfig :: Text
- hfpSplit :: Text
- hfpFilename :: Text
}
newtype HFParquetResponse = HFParquetResponse {
- hfParquetFiles :: [HFParquetFile]
}
isHFUri :: FilePath -> Bool
parseHFUri :: FilePath -> Either String HFRef
getHFToken :: IO (Maybe ByteString)
hfUrlRepoPath :: HFParquetFile -> String
matchesGlob :: Text -> HFParquetFile -> Bool
resolveHFUrls :: Maybe ByteString -> HFRef -> IO [HFParquetFile]
downloadHFFiles :: Maybe ByteString -> [HFParquetFile] -> IO [FilePath]
hasGlob :: Text -> Bool
directHFUrl :: HFRef -> Text
fetchHFParquetFiles :: FilePath -> IO [FilePath]

Documentation

data ParquetReadOptions Source #

Options for reading Parquet data.

These options are applied in this order:

predicate filtering
column projection
row range

Column selection for selectedColumns uses leaf column names only.

Constructors

ParquetReadOptions

Fields

selectedColumns :: Maybe [Text]
Columns to keep in the final dataframe. If set, only these columns are returned. Predicate-referenced columns are read automatically when needed and projected out after filtering.
predicate :: Maybe (Expr Bool)
Optional row filter expression applied before projection.
rowRange :: Maybe (Int, Int)
Optional row slice (start, end) with start-inclusive/end-exclusive semantics.

Instances

Instances details

Show ParquetReadOptions Source #
Instance details Defined in DataFrame.IO.Parquet Methods showsPrec :: Int -> ParquetReadOptions -> ShowS # show :: ParquetReadOptions -> String # showList :: [ParquetReadOptions] -> ShowS #
Eq ParquetReadOptions Source #
Instance details Defined in DataFrame.IO.Parquet Methods (==) :: ParquetReadOptions -> ParquetReadOptions -> Bool # (/=) :: ParquetReadOptions -> ParquetReadOptions -> Bool #

defaultParquetReadOptions :: ParquetReadOptions Source #

Default Parquet read options.

Equivalent to:

ParquetReadOptions
    { selectedColumns = Nothing
    , predicate = Nothing
    , rowRange = Nothing
    }

readParquet :: FilePath -> IO DataFrame Source #

Read a parquet file from path and load it into a dataframe.

Example

Expand

ghci> D.readParquet "./data/mtcars.parquet"

cleanColPath :: [SNode] -> [String] -> [String] Source #

Read a Parquet file using explicit read options.

Example

Expand

ghci> D.readParquetWithOpts
ghci|   (D.defaultParquetReadOptions{D.selectedColumns = Just ["id"], D.rowRange = Just (0, 10)})
ghci|   "./tests/data/alltypes_plain.parquet"

When selectedColumns is set and predicate references other columns, those predicate columns are auto-included for decoding, then projected back to the requested output columns.

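cleanColPath strips Parquet encoding artifact names (REPEATED wrappers and their single list-element children) from a raw column path, leaving only the user-visible names. (The preceding description and example in this entry document readParquetWithOpts, declared next.)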

readParquetWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame Source #

_readParquetWithOpts :: ForceNonSeekable -> ParquetReadOptions -> FilePath -> IO DataFrame Source #

Internal function to pass testing parameters

readParquetFiles :: FilePath -> IO DataFrame Source #

Read Parquet files from a directory or glob path.

This is equivalent to calling readParquetFilesWithOpts with defaultParquetReadOptions.

readParquetFilesWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame Source #

Read multiple Parquet files (directory or glob) using explicit options.

If path is a directory, all non-directory entries are read. If path is a glob, matching files are read.

For multi-file reads, rowRange is applied once after concatenation (global range semantics).

Example

Expand

ghci> D.readParquetFilesWithOpts
ghci|   (D.defaultParquetReadOptions{D.selectedColumns = Just ["id"], D.rowRange = Just (0, 5)})
ghci|   "./tests/data/alltypes_plain*.parquet"

applyRowRange :: ParquetReadOptions -> DataFrame -> DataFrame Source #

applySelectedColumns :: ParquetReadOptions -> DataFrame -> DataFrame Source #

applyPredicate :: ParquetReadOptions -> DataFrame -> DataFrame Source #

applyReadOptions :: ParquetReadOptions -> DataFrame -> DataFrame Source #

readMetadataFromPath :: FilePath -> IO (FileMetadata, ByteString) Source #

Read the whole file into memory at once, parse the magic string and metadata, and return the metadata together with the entire file contents as a ByteString.

readMetadataFromHandle :: FileBufferedOrSeekable -> IO FileMetadata Source #

Read from the end of the file, parse the magic string, and return the file metadata.

readMetadataSizeFromFooterSlice :: ByteString -> (Int, ByteString) Source #

Takes the last 8 bytes of the file and parses the metadata size and the magic string from them.

readMetadataSizeFromFooter :: ByteString -> (Int, ByteString) Source #

getColumnPaths :: [SchemaElement] -> [(Text, Int)] Source #

findLeafSchema :: [SchemaElement] -> [String] -> Maybe SchemaElement Source #

processColumnPages :: (Int, Int) -> [Page] -> ParquetType -> ParquetEncoding -> Maybe Int32 -> LogicalType -> IO Column Source #

decodePageData :: Maybe DictVals -> (Int, Int) -> ParquetType -> Maybe Int32 -> ParquetEncoding -> [Int] -> [Int] -> Int -> ByteString -> String -> IO Column Source #

applyLogicalType :: LogicalType -> Column -> Column Source #

microsecondsToUTCTime :: Int64 -> UTCTime Source #

unitDivisor :: TimeUnit -> Int64 Source #

applyScale :: Int32 -> Int32 -> Double Source #

data HFRef Source #

Constructors

HFRef
Fields hfOwner :: Text hfDataset :: Text hfGlob :: Text

data HFParquetFile Source #

Constructors

HFParquetFile
Fields hfpUrl :: Text hfpConfig :: Text hfpSplit :: Text hfpFilename :: Text

Instances

Instances details

FromJSON HFParquetFile Source #
Instance details Defined in DataFrame.IO.Parquet Methods parseJSON :: Value -> Parser HFParquetFile # parseJSONList :: Value -> Parser [HFParquetFile] # omittedField :: Maybe HFParquetFile #
Show HFParquetFile Source #
Instance details Defined in DataFrame.IO.Parquet Methods showsPrec :: Int -> HFParquetFile -> ShowS # show :: HFParquetFile -> String # showList :: [HFParquetFile] -> ShowS #

newtype HFParquetResponse Source #

Constructors

HFParquetResponse
Fields hfParquetFiles :: [HFParquetFile]

Instances

Instances details

FromJSON HFParquetResponse Source #
Instance details Defined in DataFrame.IO.Parquet Methods parseJSON :: Value -> Parser HFParquetResponse # parseJSONList :: Value -> Parser [HFParquetResponse] # omittedField :: Maybe HFParquetResponse #

isHFUri :: FilePath -> Bool Source #

parseHFUri :: FilePath -> Either String HFRef Source #

getHFToken :: IO (Maybe ByteString) Source #

hfUrlRepoPath :: HFParquetFile -> String Source #

Extract the repo-relative path from a HuggingFace download URL. URL format: https://huggingface.co/datasets/{owner}/{dataset}/resolve/{ref}/{path} Returns the {path} portion (e.g. "data/train-00000-of-00001.parquet").

matchesGlob :: Text -> HFParquetFile -> Bool Source #

resolveHFUrls :: Maybe ByteString -> HFRef -> IO [HFParquetFile] Source #

downloadHFFiles :: Maybe ByteString -> [HFParquetFile] -> IO [FilePath] Source #

hasGlob :: Text -> Bool Source #

True when the path contains glob wildcard characters.

directHFUrl :: HFRef -> Text Source #

Build the direct HF repo download URL for a path with no wildcards. Format: https://huggingface.co/datasets/{owner}/{dataset}/resolve/main/{path}

fetchHFParquetFiles :: FilePath -> IO [FilePath] Source #