{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE NumericUnderscores #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
module DataFrame.IO.CSV where
import qualified Data.ByteString.Lazy as BL
import qualified Data.List as L
import qualified Data.Map.Strict as M
import qualified Data.Proxy as P
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.IO as TIO
import qualified Data.Vector as V
import qualified Data.Vector.Mutable as VM
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector.Unboxed.Mutable as VUM
import Data.Csv.Streaming (Records (..))
import qualified Data.Csv.Streaming as CsvStream
import Control.Monad
import Data.Char
import qualified Data.Csv as Csv
import Data.Either
import Data.Function (on)
import Data.Functor
import Data.IORef
import Data.Maybe
import Data.Type.Equality (TestEquality (testEquality))
import Data.Word (Word8)
import DataFrame.Internal.Column
import DataFrame.Internal.DataFrame (DataFrame (..))
import DataFrame.Internal.Parsing
import DataFrame.Internal.Schema
import DataFrame.Operations.Typing
import System.IO
import Type.Reflection
import Prelude hiding (concat, takeWhile)
-- | Number of elements held by one page of a 'PagedVector' or
-- 'PagedUnboxedVector'. Every active page is allocated with exactly this
-- capacity, and a page is frozen and replaced once it holds this many elements.
chunkSize :: Int
chunkSize = 16_384
-- | Append-only growable buffer of boxed values, stored as a list of full
-- immutable pages plus one mutable page that is currently being filled.
data PagedVector a = PagedVector
    { pvChunks :: !(IORef [V.Vector a])
    -- ^ Completed pages, most recently filled first.
    , pvActive :: !(IORef (VM.IOVector a))
    -- ^ The mutable page currently receiving writes.
    , pvCount :: !(IORef Int)
    -- ^ Number of elements already written to the active page.
    }
-- | Unboxed counterpart of 'PagedVector': an append-only buffer stored as a
-- list of full immutable unboxed pages plus one mutable page being filled.
data PagedUnboxedVector a = PagedUnboxedVector
    { puvChunks :: !(IORef [VU.Vector a])
    -- ^ Completed pages, most recently filled first.
    , puvActive :: !(IORef (VUM.IOVector a))
    -- ^ The mutable page currently receiving writes.
    , puvCount :: !(IORef Int)
    -- ^ Number of elements already written to the active page.
    }
-- | A column under construction while the file is streamed.
--
-- Each constructor pairs the value buffer with a 'Word8' validity buffer of
-- the same length: 'processRow' appends @1@ when a cell was parsed and @0@
-- (together with a placeholder value) when the cell is treated as null.
data BuilderColumn
    = BuilderInt !(PagedUnboxedVector Int) !(PagedUnboxedVector Word8)
    | BuilderDouble !(PagedUnboxedVector Double) !(PagedUnboxedVector Word8)
    | BuilderText !(PagedVector T.Text) !(PagedUnboxedVector Word8)
-- | Create an empty 'PagedVector': no completed pages, a freshly allocated
-- active page of 'chunkSize' slots, and a write position of zero.
newPagedVector :: IO (PagedVector a)
newPagedVector = do
    -- The slots are uninitialised; only indices below the stored count are
    -- ever read back.
    active <- VM.unsafeNew chunkSize
    PagedVector <$> newIORef [] <*> newIORef active <*> newIORef 0
-- | Create an empty 'PagedUnboxedVector': no completed pages, a freshly
-- allocated active page of 'chunkSize' slots, and a write position of zero.
newPagedUnboxedVector :: (VUM.Unbox a) => IO (PagedUnboxedVector a)
newPagedUnboxedVector = do
    -- The slots are uninitialised; only indices below the stored count are
    -- ever read back.
    active <- VUM.unsafeNew chunkSize
    PagedUnboxedVector <$> newIORef [] <*> newIORef active <*> newIORef 0
-- | Append one value (forced to WHNF) to the end of a 'PagedVector'.
--
-- While the active page has room the value is written in place. When the
-- page is full it is frozen, pushed onto the list of completed pages, and a
-- new page is started with the value in slot 0.
appendPagedVector :: PagedVector a -> a -> IO ()
appendPagedVector (PagedVector chunksRef activeRef countRef) !val = do
    count <- readIORef countRef
    active <- readIORef activeRef
    if count < chunkSize
        then do
            VM.unsafeWrite active count val
            writeIORef countRef $! count + 1
        else do
            -- The full page is never written again: the only writes to
            -- 'active' happen in the branch above, and 'activeRef' is pointed
            -- at a fresh page below. Freezing without a copy is therefore
            -- safe and avoids duplicating 'chunkSize' elements per page.
            frozen <- V.unsafeFreeze active
            modifyIORef' chunksRef (frozen :)
            newActive <- VM.unsafeNew chunkSize
            VM.unsafeWrite newActive 0 val
            writeIORef activeRef newActive
            writeIORef countRef 1
{-# INLINE appendPagedVector #-}
-- | Append one value (forced to WHNF) to the end of a 'PagedUnboxedVector'.
--
-- While the active page has room the value is written in place. When the
-- page is full it is frozen, pushed onto the list of completed pages, and a
-- new page is started with the value in slot 0.
appendPagedUnboxedVector :: (VUM.Unbox a) => PagedUnboxedVector a -> a -> IO ()
appendPagedUnboxedVector (PagedUnboxedVector chunksRef activeRef countRef) !val = do
    count <- readIORef countRef
    active <- readIORef activeRef
    if count < chunkSize
        then do
            VUM.unsafeWrite active count val
            writeIORef countRef $! count + 1
        else do
            -- The full page is never written again: the only writes to
            -- 'active' happen in the branch above, and 'activeRef' is pointed
            -- at a fresh page below. Freezing without a copy is therefore
            -- safe and avoids duplicating 'chunkSize' elements per page.
            frozen <- VU.unsafeFreeze active
            modifyIORef' chunksRef (frozen :)
            newActive <- VUM.unsafeNew chunkSize
            VUM.unsafeWrite newActive 0 val
            writeIORef activeRef newActive
            writeIORef countRef 1
{-# INLINE appendPagedUnboxedVector #-}
-- | Snapshot a 'PagedVector' as a single immutable vector holding every
-- appended element in insertion order.
--
-- The filled prefix of the active page is copied (so the builder can still be
-- appended to afterwards) and concatenated after the completed pages. The
-- completed pages are stored newest-first, hence the 'reverse'.
freezePagedVector :: PagedVector a -> IO (V.Vector a)
freezePagedVector (PagedVector chunksRef activeRef countRef) = do
    count <- readIORef countRef
    active <- readIORef activeRef
    chunks <- readIORef chunksRef
    -- Only the first 'count' slots of the active page have been written.
    lastChunk <- V.freeze (VM.slice 0 count active)
    return $! V.concat (reverse (lastChunk : chunks))
-- | Snapshot a 'PagedUnboxedVector' as a single immutable unboxed vector
-- holding every appended element in insertion order.
--
-- The filled prefix of the active page is copied (so the builder can still be
-- appended to afterwards) and concatenated after the completed pages. The
-- completed pages are stored newest-first, hence the 'reverse'.
freezePagedUnboxedVector ::
    (VUM.Unbox a) => PagedUnboxedVector a -> IO (VU.Vector a)
freezePagedUnboxedVector (PagedUnboxedVector chunksRef activeRef countRef) = do
    count <- readIORef countRef
    active <- readIORef activeRef
    chunks <- readIORef chunksRef
    -- Only the first 'count' slots of the active page have been written.
    lastChunk <- VU.freeze (VUM.slice 0 count active)
    return $! VU.concat (reverse (lastChunk : chunks))
-- | Where the column names of a file come from.
data HeaderSpec
    = -- | The file has no header row. Columns are named by their zero-based
      -- position and the first row is read as data.
      NoHeader
    | -- | The first row holds the column names (whitespace-stripped) and is
      -- not read as data.
      UseFirstRow
    | -- | Use the given names. Columns beyond the end of the list are named
      -- by their zero-based position, and the first row is read as data.
      ProvideNames [T.Text]
    deriving (Eq, Show)
-- | How column types are decided when a file is read by 'readSeparated'.
--
-- * @InferFromSample n@: columns are built as text and the finished frame is
--   passed to 'parseDefaults' with @n@ as its first argument.
-- * @SpecifyTypes ts@: @ts@ is applied positionally when the builder columns
--   are created (columns past the end of @ts@ are text), and a non-empty
--   @ts@ is additionally passed to 'parseWithTypes'.
-- * @NoInference@: columns are built as text and the frame is returned as is.
data TypeSpec = InferFromSample Int | SpecifyTypes [SchemaType] | NoInference
-- | Options controlling how 'readSeparated' reads a file.
data ReadOptions = ReadOptions
    { headerSpec :: HeaderSpec
    -- ^ Where the column names come from.
    , typeSpec :: TypeSpec
    -- ^ How column types are decided.
    , safeRead :: Bool
    -- ^ Forwarded unchanged to 'parseDefaults' when types are inferred.
    -- NOTE(review): exact semantics are defined by 'parseDefaults', which is
    -- not visible here.
    , dateFormat :: String
    -- ^ Forwarded unchanged to 'parseDefaults' when types are inferred;
    -- presumably a time format string such as the default @"%Y-%m-%d"@.
    }
-- | 'True' exactly when the spec is 'InferFromSample'.
shouldInferFromSample :: TypeSpec -> Bool
shouldInferFromSample (InferFromSample _) = True
shouldInferFromSample _ = False
-- | The explicitly requested column types: the list carried by
-- 'SpecifyTypes', or the empty list for every other spec.
schemaTypes :: TypeSpec -> [SchemaType]
schemaTypes (SpecifyTypes xs) = xs
schemaTypes _ = []
-- | The sample size carried by 'InferFromSample', or @0@ for every other spec.
typeInferenceSampleSize :: TypeSpec -> Int
typeInferenceSampleSize (InferFromSample n) = n
typeInferenceSampleSize _ = 0
-- | Default options: take column names from the first row, infer types with
-- a sample size of 100, @safeRead@ enabled, and the date format
-- @"%Y-%m-%d"@.
defaultReadOptions :: ReadOptions
defaultReadOptions =
    ReadOptions
        { headerSpec = UseFirstRow
        , typeSpec = InferFromSample 100
        , safeRead = True
        , dateFormat = "%Y-%m-%d"
        }
-- | Read a comma-separated file using 'defaultReadOptions'.
readCsv :: FilePath -> IO DataFrame
readCsv = readSeparated ',' defaultReadOptions
-- | Read a comma-separated file using the supplied 'ReadOptions'.
readCsvWithOpts :: ReadOptions -> FilePath -> IO DataFrame
readCsvWithOpts = readSeparated ','
-- | Read a tab-separated file using 'defaultReadOptions'.
readTsv :: FilePath -> IO DataFrame
readTsv = readSeparated '\t' defaultReadOptions
-- | Read a delimiter-separated file into a 'DataFrame'.
--
-- The file is decoded as a stream of raw records. Column names are chosen
-- according to 'headerSpec', every record is appended to per-column builders,
-- and the finished frame is post-processed according to 'typeSpec'
-- ('parseDefaults' for 'InferFromSample', 'parseWithTypes' for a non-empty
-- 'SpecifyTypes', nothing otherwise).
--
-- The separator is converted with @fromIntegral . ord@ to a single byte, so
-- it is expected to be an ASCII character.
--
-- Calls 'error' when the file contains no records at all
-- (@"Empty CSV file"@) or when a record fails to parse. A file that contains
-- only a header row (with 'UseFirstRow') yields a frame with the header's
-- columns and zero rows.
readSeparated :: Char -> ReadOptions -> FilePath -> IO DataFrame
readSeparated !sep !opts !path = do
    csvData <- BL.readFile path
    let decodeOpts =
            Csv.defaultDecodeOptions{Csv.decDelimiter = fromIntegral (ord sep)}
        stream :: Records (V.Vector BL.ByteString)
        stream = CsvStream.decodeWith decodeOpts Csv.NoHeader csvData

    (firstRowRaw, dataStream) <- peekStream stream

    let positionalNames = map (T.pack . show) [0 .. V.length firstRowRaw - 1]
        -- The stream with the already-consumed first record put back.
        withFirstRow = Cons (Right firstRowRaw) dataStream
        (columnNames, rowsToProcess) = case headerSpec opts of
            NoHeader -> (positionalNames, withFirstRow)
            UseFirstRow ->
                ( map
                    (T.strip . TE.decodeUtf8Lenient . BL.toStrict)
                    (V.toList firstRowRaw)
                , dataStream
                )
            ProvideNames ns ->
                (ns ++ drop (length ns) positionalNames, withFirstRow)

    -- The sample row only determines how many builder columns are created.
    -- When the header is the sole record there is no data row to peek at, so
    -- fall back to the header's width instead of failing with
    -- "Empty CSV file".
    sampleRow <- case rowsToProcess of
        Nil Nothing _ -> return firstRowRaw
        _ -> fst <$> peekStream rowsToProcess

    builderCols <- initializeColumns (V.toList sampleRow) opts
    processStream rowsToProcess builderCols
    frozenCols <- V.fromList <$> mapM freezeBuilderColumn builderCols

    let numRows = maybe 0 columnLength (frozenCols V.!? 0)
        df =
            DataFrame
                frozenCols
                (M.fromList (zip columnNames [0 ..]))
                (numRows, V.length frozenCols)
    return $ case typeSpec opts of
        InferFromSample n ->
            parseDefaults n (safeRead opts) (dateFormat opts) df
        SpecifyTypes ts
            | not (null ts) -> parseWithTypes ts df
        _ -> df
  where
    -- Split off the first record of a stream, failing on a parse error or on
    -- an exhausted stream.
    peekStream :: Records a -> IO (a, Records a)
    peekStream (Cons (Right row) rest) = return (row, rest)
    peekStream (Cons (Left err) _) = error $ "Error parsing CSV header: " ++ err
    peekStream (Nil Nothing _) = error "Empty CSV file"
    peekStream (Nil (Just err) _) = error err
-- | Create one empty 'BuilderColumn' per field of the given sample row.
--
-- Only the length of the row is used. Column kinds come positionally from
-- the types listed in a 'SpecifyTypes' spec; positions past the end of that
-- list (and every position for the other specs) are text columns. A listed
-- type of 'Int' gives a 'BuilderInt', 'Double' gives a 'BuilderDouble', and
-- any other type gives a 'BuilderText'.
initializeColumns :: [BL.ByteString] -> ReadOptions -> IO [BuilderColumn]
initializeColumns row opts =
    mapM initColumn (take width (expandTypes (schemaTypes (typeSpec opts))))
  where
    width = length row

    -- Pad the requested types with text up to the row width ('replicate' of a
    -- non-positive count adds nothing when more types than fields are given).
    expandTypes :: [SchemaType] -> [SchemaType]
    expandTypes xs = xs ++ replicate (width - length xs) (schemaType @T.Text)

    initColumn :: SchemaType -> IO BuilderColumn
    initColumn (SType (_ :: P.Proxy a)) = do
        validityRef <- newPagedUnboxedVector
        case testEquality (typeRep @a) (typeRep @Int) of
            Just Refl -> BuilderInt <$> newPagedUnboxedVector <*> pure validityRef
            Nothing -> case testEquality (typeRep @a) (typeRep @Double) of
                Just Refl ->
                    BuilderDouble <$> newPagedUnboxedVector <*> pure validityRef
                Nothing -> BuilderText <$> newPagedVector <*> pure validityRef
-- | Feed every record of the stream to 'processRow', in order.
--
-- Calls 'error' with a @"CSV Parse Error: "@ prefix when a record fails to
-- convert, and also when the stream ends because the underlying parser
-- failed. The latter used to be ignored, which silently returned a truncated
-- frame for malformed input.
processStream ::
    CsvStream.Records (V.Vector BL.ByteString) -> [BuilderColumn] -> IO ()
processStream (Cons (Right row) rest) cols =
    processRow row cols >> processStream rest cols
processStream (Cons (Left err) _) _ = error ("CSV Parse Error: " ++ err)
processStream (Nil (Just err) _) _ = error ("CSV Parse Error: " ++ err)
processStream (Nil Nothing _) _ = return ()
-- | Append one record to the builder columns, field @i@ going to column @i@.
--
-- Every column receives exactly one entry per record so that all columns
-- stay the same length: a record with fewer fields than columns appends a
-- null to each remaining column (previously those columns were skipped,
-- shifting every later value up by one row). Fields beyond the last column
-- are ignored.
--
-- A field that cannot be parsed for an 'Int' or 'Double' column, or that
-- 'isNull' for a text column, is stored as a null.
processRow :: V.Vector BL.ByteString -> [BuilderColumn] -> IO ()
processRow !vals !cols = go 0 cols
  where
    go :: Int -> [BuilderColumn] -> IO ()
    go _ [] = return ()
    go !ix (col : rest) = do
        maybe (appendNull col) (`processValue` col) (vals V.!? ix)
        go (ix + 1) rest

    processValue :: BL.ByteString -> BuilderColumn -> IO ()
    processValue !bs !col = do
        let bs' = BL.toStrict bs
        case col of
            BuilderInt gv valid -> case readByteStringInt bs' of
                Just !i ->
                    appendPagedUnboxedVector gv i
                        >> appendPagedUnboxedVector valid 1
                Nothing -> appendNull col
            BuilderDouble gv valid -> case readByteStringDouble bs' of
                Just !d ->
                    appendPagedUnboxedVector gv d
                        >> appendPagedUnboxedVector valid 1
                Nothing -> appendNull col
            BuilderText gv valid -> do
                let !val = T.strip (TE.decodeUtf8Lenient bs')
                if isNull val
                    then appendNull col
                    else
                        appendPagedVector gv val
                            >> appendPagedUnboxedVector valid 1

    -- Store a placeholder value and mark the entry invalid.
    appendNull :: BuilderColumn -> IO ()
    appendNull (BuilderInt gv valid) =
        appendPagedUnboxedVector gv 0 >> appendPagedUnboxedVector valid 0
    appendNull (BuilderDouble gv valid) =
        appendPagedUnboxedVector gv 0.0 >> appendPagedUnboxedVector valid 0
    appendNull (BuilderText gv valid) =
        appendPagedVector gv T.empty >> appendPagedUnboxedVector valid 0
-- | Decide whether a (already stripped) cell should be treated as missing.
--
-- A cell counts as missing when it is empty or is exactly one of the
-- markers @NA@, @NULL@ or @null@. The comparison is case-sensitive.
isNull :: T.Text -> Bool
isNull t = T.null t || t `elem` nullMarkers
  where
    nullMarkers = ["NA", "NULL", "null"]
-- | Turn a mutable column builder into an immutable 'Column'.
--
-- Both the value buffer and its validity buffer are frozen. When every
-- validity flag equals @1@ the values are returned as a plain column
-- ('UnboxedColumn' for ints and doubles, 'BoxedColumn' for text).
-- Otherwise the values are wrapped in 'Maybe' via 'constructOptional' /
-- 'constructOptionalBoxed', which consult the validity flags per row.
freezeBuilderColumn :: BuilderColumn -> IO Column
freezeBuilderColumn builder = case builder of
    BuilderInt cells flags -> do
        values <- freezePagedUnboxedVector cells
        mask <- freezePagedUnboxedVector flags
        if fullyValid mask
            then pure (UnboxedColumn values)
            else constructOptional values mask
    BuilderDouble cells flags -> do
        values <- freezePagedUnboxedVector cells
        mask <- freezePagedUnboxedVector flags
        if fullyValid mask
            then pure (UnboxedColumn values)
            else constructOptional values mask
    BuilderText cells flags -> do
        values <- freezePagedVector cells
        mask <- freezePagedUnboxedVector flags
        if fullyValid mask
            then pure (BoxedColumn values)
            else constructOptionalBoxed values mask
  where
    -- True when no row was recorded with a flag other than 1.
    fullyValid :: VU.Vector Word8 -> Bool
    fullyValid = VU.all (== 1)
-- | Build an 'OptionalColumn' from unboxed values and a validity mask.
--
-- The result has as many rows as @vec@. Row @n@ is 'Nothing' when
-- @valid@ holds @0@ at position @n@, and @Just@ the value from @vec@
-- for any other flag. @valid@ is indexed with 'VU.!' for every position
-- of @vec@, so it must be at least as long as @vec@.
constructOptional ::
    (VU.Unbox a, Columnable a) => VU.Vector a -> VU.Vector Word8 -> IO Column
constructOptional vec valid = do
    let total = VU.length vec
    slots <- VM.new total
    forM_ [0 .. total - 1] $ \ix ->
        -- The flag is inspected here, in IO, before the slot is written.
        case valid VU.! ix of
            0 -> VM.write slots ix Nothing
            _ -> VM.write slots ix (Just (vec VU.! ix))
    frozen <- V.freeze slots
    pure (OptionalColumn frozen)
-- | Build an 'OptionalColumn' of text from boxed values and a validity mask.
--
-- The result has as many rows as @vec@. Row @n@ is 'Nothing' when
-- @valid@ holds @0@ at position @n@, and @Just@ the text from @vec@
-- for any other flag. @valid@ is indexed with 'VU.!' for every position
-- of @vec@, so it must be at least as long as @vec@.
constructOptionalBoxed :: V.Vector T.Text -> VU.Vector Word8 -> IO Column
constructOptionalBoxed vec valid = do
    let total = V.length vec
    slots <- VM.new total
    forM_ [0 .. total - 1] $ \ix ->
        -- The flag is inspected here, in IO, before the slot is written.
        case valid VU.! ix of
            0 -> VM.write slots ix Nothing
            _ -> VM.write slots ix (Just (vec V.! ix))
    frozen <- V.freeze slots
    pure (OptionalColumn frozen)
-- | Write a 'DataFrame' to the given path as comma-separated text.
--
-- This is 'writeSeparated' with the separator fixed to @','@.
writeCsv :: FilePath -> DataFrame -> IO ()
writeCsv path frame = writeSeparated ',' path frame
-- | Write a 'DataFrame' to @filepath@ as delimiter-separated text.
--
-- The file is opened in 'WriteMode'. The first line holds the column
-- names ordered by their position in 'columnIndices'; each following
-- line holds one row as rendered by 'getRowAsText'. Fields on every
-- line, header included, are joined with the separator character @c@.
--
-- Fields are written verbatim: this function applies no quoting or
-- escaping, so values containing the separator are not protected.
writeSeparated ::
    -- | Separator placed between fields
    Char ->
    -- | Path of the file to write
    FilePath ->
    -- | Data to write
    DataFrame ->
    IO ()
writeSeparated c filepath df = withFile filepath WriteMode $ \handle -> do
    let (rows, _) = dataframeDimensions df
        -- Use the caller's separator for both header and rows. The header
        -- used to be joined with a hard-coded ", " and the rows with ",",
        -- which ignored @c@ and put a stray space in front of every
        -- column name after the first.
        delimiter = T.singleton c
        headers = map fst (L.sortBy (compare `on` snd) (M.toList (columnIndices df)))
    TIO.hPutStrLn handle (T.intercalate delimiter headers)
    forM_ [0 .. rows - 1] $ \i ->
        TIO.hPutStrLn handle (T.intercalate delimiter (getRowAsText df i))
-- | Render row @i@ of a 'DataFrame' as one 'T.Text' per column, in the
-- order of the 'columns' vector.
--
-- Rendering rules, per column constructor:
--
-- * 'BoxedColumn' of 'T.Text': the text itself.
-- * 'BoxedColumn' of @Maybe Text@: the text, or @null@ for 'Nothing'.
-- * 'BoxedColumn' of any other @Maybe x@: the 'show' output with a
--   leading @"Just "@ removed, or @null@ when that prefix is absent.
-- * Any other 'BoxedColumn', and every 'UnboxedColumn': the 'show' output.
-- * 'OptionalColumn' of 'T.Text': the text, or @Nothing@ when missing.
-- * Any other 'OptionalColumn': the 'show' output of the 'Maybe' value.
--
-- Calls 'error' when a column has no element at index @i@.
getRowAsText :: DataFrame -> Int -> [T.Text]
getRowAsText df i = V.ifoldr renderCell [] (columns df)
  where
    -- Column position -> column name, used only for the error message.
    namesByPosition =
        M.fromList [(pos, name) | (name, pos) <- M.toList (columnIndices df)]

    -- Shared failure for a column that is shorter than the requested row.
    shortColumn :: Int -> [T.Text]
    shortColumn pos =
        error $
            "Column "
                ++ T.unpack (namesByPosition M.! pos)
                ++ " has less items than "
                ++ "the other columns at index "
                ++ show i

    renderCell :: Int -> Column -> [T.Text] -> [T.Text]
    renderCell pos (BoxedColumn (cells :: V.Vector a)) rest = case cells V.!? i of
        Nothing -> shortColumn pos
        Just cell -> rendered : rest
          where
            rendered = case testEquality (typeRep @a) (typeRep @T.Text) of
                Just Refl -> cell
                Nothing -> case typeRep @a of
                    -- Element type is an application @outer inner@; check
                    -- whether it is @Maybe something@.
                    App outer inner -> case eqTypeRep outer (typeRep @Maybe) of
                        Just HRefl -> case testEquality inner (typeRep @T.Text) of
                            Just Refl -> fromMaybe "null" cell
                            Nothing ->
                                fromMaybe "null" (T.stripPrefix "Just " (T.pack (show cell)))
                        Nothing -> T.pack (show cell)
                    _ -> T.pack (show cell)
    renderCell pos (UnboxedColumn cells) rest = case cells VU.!? i of
        Nothing -> shortColumn pos
        Just cell -> T.pack (show cell) : rest
    renderCell pos (OptionalColumn (cells :: V.Vector (Maybe a))) rest = case cells V.!? i of
        Nothing -> shortColumn pos
        Just cell -> rendered : rest
          where
            rendered = case testEquality (typeRep @a) (typeRep @T.Text) of
                Just Refl -> fromMaybe "Nothing" cell
                Nothing -> T.pack (show cell)
-- | Remove one pair of surrounding double quotes from a text value.
--
-- The input is returned unchanged unless it both starts and ends with
-- a @"@ character (two distinct characters, so a lone @"@ is left as is).
stripQuotes :: T.Text -> T.Text
stripQuotes txt = fromMaybe txt unquoted
  where
    unquoted = T.stripPrefix "\"" txt >>= T.stripSuffix "\""