| Safe Haskell | None |
|---|---|
| Language | Haskell2010 |
DataFrame.IO.Parquet
Synopsis
- data ParquetReadOptions = ParquetReadOptions {}
- defaultParquetReadOptions :: ParquetReadOptions
- readParquet :: FilePath -> IO DataFrame
- cleanColPath :: [SNode] -> [String] -> [String]
- readParquetWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame
- _readParquetWithOpts :: ForceNonSeekable -> ParquetReadOptions -> FilePath -> IO DataFrame
- readParquetFiles :: FilePath -> IO DataFrame
- readParquetFilesWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame
- applyRowRange :: ParquetReadOptions -> DataFrame -> DataFrame
- applySelectedColumns :: ParquetReadOptions -> DataFrame -> DataFrame
- applyPredicate :: ParquetReadOptions -> DataFrame -> DataFrame
- applyReadOptions :: ParquetReadOptions -> DataFrame -> DataFrame
- readMetadataFromPath :: FilePath -> IO (FileMetadata, ByteString)
- readMetadataFromHandle :: FileBufferedOrSeekable -> IO FileMetadata
- readMetadataSizeFromFooterSlice :: ByteString -> (Int, ByteString)
- readMetadataSizeFromFooter :: ByteString -> (Int, ByteString)
- getColumnPaths :: [SchemaElement] -> [(Text, Int)]
- findLeafSchema :: [SchemaElement] -> [String] -> Maybe SchemaElement
- processColumnPages :: (Int, Int) -> [Page] -> ParquetType -> ParquetEncoding -> Maybe Int32 -> LogicalType -> IO Column
- decodePageData :: Maybe DictVals -> (Int, Int) -> ParquetType -> Maybe Int32 -> ParquetEncoding -> [Int] -> [Int] -> Int -> ByteString -> String -> IO Column
- applyLogicalType :: LogicalType -> Column -> Column
- microsecondsToUTCTime :: Int64 -> UTCTime
- unitDivisor :: TimeUnit -> Int64
- applyScale :: Int32 -> Int32 -> Double
- data HFRef = HFRef {}
- data HFParquetFile = HFParquetFile {}
- newtype HFParquetResponse = HFParquetResponse {}
- isHFUri :: FilePath -> Bool
- parseHFUri :: FilePath -> Either String HFRef
- getHFToken :: IO (Maybe ByteString)
- hfUrlRepoPath :: HFParquetFile -> String
- matchesGlob :: Text -> HFParquetFile -> Bool
- resolveHFUrls :: Maybe ByteString -> HFRef -> IO [HFParquetFile]
- downloadHFFiles :: Maybe ByteString -> [HFParquetFile] -> IO [FilePath]
- hasGlob :: Text -> Bool
- directHFUrl :: HFRef -> Text
- fetchHFParquetFiles :: FilePath -> IO [FilePath]
Documentation
data ParquetReadOptions Source #
Options for reading Parquet data.
These options are applied in this order:
- predicate filtering
- column projection
- row range
Column selection for selectedColumns uses leaf column names only.
Constructors
| ParquetReadOptions | |
Fields
| |
Instances
| Show ParquetReadOptions Source # | |
Defined in DataFrame.IO.Parquet Methods showsPrec :: Int -> ParquetReadOptions -> ShowS # show :: ParquetReadOptions -> String # showList :: [ParquetReadOptions] -> ShowS # | |
| Eq ParquetReadOptions Source # | |
Defined in DataFrame.IO.Parquet Methods (==) :: ParquetReadOptions -> ParquetReadOptions -> Bool # (/=) :: ParquetReadOptions -> ParquetReadOptions -> Bool # | |
defaultParquetReadOptions :: ParquetReadOptions Source #
Default Parquet read options.
Equivalent to:
ParquetReadOptions
{ selectedColumns = Nothing
, predicate = Nothing
, rowRange = Nothing
}
readParquet :: FilePath -> IO DataFrame Source #
Read a parquet file from path and load it into a dataframe.
Example
ghci> D.readParquet "./data/mtcars.parquet"
cleanColPath :: [SNode] -> [String] -> [String] Source #
Strip Parquet encoding artifact names (REPEATED wrappers and their single list-element children) from a raw column path, leaving user-visible names.
readParquetWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame Source #
Read a Parquet file using explicit read options.
Example
ghci> D.readParquetWithOpts
ghci| (D.defaultParquetReadOptions{D.selectedColumns = Just ["id"], D.rowRange = Just (0, 10)})
ghci| "./tests/data/alltypes_plain.parquet"
When selectedColumns is set and predicate references other columns, those predicate columns
are auto-included for decoding, then projected back to the requested output columns.
_readParquetWithOpts :: ForceNonSeekable -> ParquetReadOptions -> FilePath -> IO DataFrame Source #
Internal variant of readParquetWithOpts that additionally takes a ForceNonSeekable argument; used to pass testing parameters.
readParquetFiles :: FilePath -> IO DataFrame Source #
Read Parquet files from a directory or glob path.
This is equivalent to calling readParquetFilesWithOpts with defaultParquetReadOptions.
readParquetFilesWithOpts :: ParquetReadOptions -> FilePath -> IO DataFrame Source #
Read multiple Parquet files (directory or glob) using explicit options.
If path is a directory, all non-directory entries are read.
If path is a glob, matching files are read.
For multi-file reads, rowRange is applied once after concatenation (global range semantics).
Example
ghci> D.readParquetFilesWithOpts
ghci| (D.defaultParquetReadOptions{D.selectedColumns = Just ["id"], D.rowRange = Just (0, 5)})
ghci| "./tests/data/alltypes_plain*.parquet"
applyRowRange :: ParquetReadOptions -> DataFrame -> DataFrame Source #
readMetadataFromPath :: FilePath -> IO (FileMetadata, ByteString) Source #
Read the whole file into memory at once, parse the magic string and footer metadata, and return the file metadata together with the entire file contents as a ByteString.
readMetadataFromHandle :: FileBufferedOrSeekable -> IO FileMetadata Source #
Read from the end of the file, parse the magic string and footer metadata, and return the file metadata.
readMetadataSizeFromFooterSlice :: ByteString -> (Int, ByteString) Source #
Takes the last 8 bytes of the file and parses the metadata size and the magic string from them.
readMetadataSizeFromFooter :: ByteString -> (Int, ByteString) Source #
getColumnPaths :: [SchemaElement] -> [(Text, Int)] Source #
findLeafSchema :: [SchemaElement] -> [String] -> Maybe SchemaElement Source #
processColumnPages :: (Int, Int) -> [Page] -> ParquetType -> ParquetEncoding -> Maybe Int32 -> LogicalType -> IO Column Source #
decodePageData :: Maybe DictVals -> (Int, Int) -> ParquetType -> Maybe Int32 -> ParquetEncoding -> [Int] -> [Int] -> Int -> ByteString -> String -> IO Column Source #
applyLogicalType :: LogicalType -> Column -> Column Source #
microsecondsToUTCTime :: Int64 -> UTCTime Source #
unitDivisor :: TimeUnit -> Int64 Source #
data HFParquetFile Source #
Constructors
| HFParquetFile | |
Instances
| FromJSON HFParquetFile Source # | |
Defined in DataFrame.IO.Parquet Methods parseJSON :: Value -> Parser HFParquetFile # parseJSONList :: Value -> Parser [HFParquetFile] # | |
| Show HFParquetFile Source # | |
Defined in DataFrame.IO.Parquet Methods showsPrec :: Int -> HFParquetFile -> ShowS # show :: HFParquetFile -> String # showList :: [HFParquetFile] -> ShowS # | |
newtype HFParquetResponse Source #
Constructors
| HFParquetResponse | |
Fields | |
Instances
| FromJSON HFParquetResponse Source # | |
Defined in DataFrame.IO.Parquet Methods parseJSON :: Value -> Parser HFParquetResponse # parseJSONList :: Value -> Parser [HFParquetResponse] # | |
getHFToken :: IO (Maybe ByteString) Source #
hfUrlRepoPath :: HFParquetFile -> String Source #
Extract the repo-relative path from a HuggingFace download URL. URL format: https://huggingface.co/datasets/{owner}/{dataset}/resolve/{ref}/{path} Returns the {path} portion (e.g. "data/train-00000-of-00001.parquet").
matchesGlob :: Text -> HFParquetFile -> Bool Source #
resolveHFUrls :: Maybe ByteString -> HFRef -> IO [HFParquetFile] Source #
downloadHFFiles :: Maybe ByteString -> [HFParquetFile] -> IO [FilePath] Source #
directHFUrl :: HFRef -> Text Source #
Build the direct HF repo download URL for a path with no wildcards. Format: https://huggingface.co/datasets/{owner}/{dataset}/resolve/main/{path}