{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE NumericUnderscores #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}

module DataFrame.IO.CSV where

import qualified Data.ByteString.Lazy as BL
import qualified Data.List as L
import qualified Data.Map.Strict as M
import qualified Data.Proxy as P
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.IO as TIO
import qualified Data.Vector as V
import qualified Data.Vector.Mutable as VM
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector.Unboxed.Mutable as VUM

import Data.Csv.Streaming (Records (..))
import qualified Data.Csv.Streaming as CsvStream

import Control.Monad
import Data.Char
import qualified Data.Csv as Csv
import Data.Either
import Data.Function (on)
import Data.Functor
import Data.IORef
import Data.Maybe
import Data.Type.Equality (TestEquality (testEquality))
import Data.Word (Word8)
import DataFrame.Internal.Column
import DataFrame.Internal.DataFrame (DataFrame (..))
import DataFrame.Internal.Parsing
import DataFrame.Internal.Schema
import DataFrame.Operations.Typing
import System.IO
import Type.Reflection
import Prelude hiding (concat, takeWhile)

-- | Capacity, in elements, of every chunk allocated by the paged vectors below.
chunkSize :: Int
chunkSize = 16_384

{- | Append-only buffer of boxed values that grows one 'chunkSize'-sized
chunk at a time. All state lives in 'IORef's, so it is only usable from 'IO'.
-}
data PagedVector a = PagedVector
    { pvChunks :: !(IORef [V.Vector a])
    -- ^ Finished chunks (reverse order)
    , pvActive :: !(IORef (VM.IOVector a))
    -- ^ Current mutable chunk
    , pvCount :: !(IORef Int)
    -- ^ Items written in current chunk
    }

{- | Unboxed counterpart of 'PagedVector': an append-only buffer that grows
one 'chunkSize'-sized chunk at a time, with its state held in 'IORef's.
-}
data PagedUnboxedVector a = PagedUnboxedVector
    { puvChunks :: !(IORef [VU.Vector a])
    -- ^ Finished chunks (reverse order)
    , puvActive :: !(IORef (VUM.IOVector a))
    -- ^ Current mutable chunk
    , puvCount :: !(IORef Int)
    -- ^ Items written in current chunk
    }

{- | Per-column accumulator filled while rows are streamed in.

Every constructor pairs a paged vector of values with a paged 'Word8' vector
that receives one flag per appended value. For the 'Int' and 'Double'
builders the row-processing code writes @1@ when the field parsed and @0@
when a placeholder (@0@ / @0.0@) was stored instead.
NOTE(review): the text builder presumably follows the same convention;
confirm against the row-processing code.
-}
data BuilderColumn
    = BuilderInt !(PagedUnboxedVector Int) !(PagedUnboxedVector Word8)
    | BuilderDouble !(PagedUnboxedVector Double) !(PagedUnboxedVector Word8)
    | BuilderText !(PagedVector T.Text) !(PagedUnboxedVector Word8)
{- | Create an empty 'PagedVector': no finished chunks, one freshly
allocated (uninitialised) active chunk of 'chunkSize' slots, and a write
count of zero.
-}
newPagedVector :: IO (PagedVector a)
newPagedVector = do
    active <- VM.unsafeNew chunkSize
    PagedVector <$> newIORef [] <*> newIORef active <*> newIORef 0

{- | Create an empty 'PagedUnboxedVector': no finished chunks, one freshly
allocated (uninitialised) active chunk of 'chunkSize' slots, and a write
count of zero.
-}
newPagedUnboxedVector :: (VUM.Unbox a) => IO (PagedUnboxedVector a)
newPagedUnboxedVector = do
    active <- VUM.unsafeNew chunkSize
    PagedUnboxedVector <$> newIORef [] <*> newIORef active <*> newIORef 0

{- | Append one value (forced to WHNF by the bang pattern) to a 'PagedVector'.

While the active chunk has room the value is written in place and the count
is bumped. Once the active chunk holds 'chunkSize' items it is frozen and
pushed onto the finished-chunk list, a new active chunk is allocated, and the
value becomes that chunk's first element.
-}
appendPagedVector :: PagedVector a -> a -> IO ()
appendPagedVector (PagedVector chunksRef activeRef countRef) !val = do
    count <- readIORef countRef
    active <- readIORef activeRef

    if count < chunkSize
        then do
            -- Unchecked write: count < chunkSize was tested just above.
            VM.unsafeWrite active count val
            writeIORef countRef $! count + 1
        else do
            -- Active chunk is full: retire it (newest chunk goes first).
            frozen <- V.freeze active
            modifyIORef' chunksRef (frozen :)

            newActive <- VM.unsafeNew chunkSize
            VM.unsafeWrite newActive 0 val

            writeIORef activeRef newActive
            writeIORef countRef 1
{-# INLINE appendPagedVector #-}

{- | Append one value (forced by the bang pattern) to a 'PagedUnboxedVector'.

While the active chunk has room the value is written in place and the count
is bumped. Once the active chunk holds 'chunkSize' items it is frozen and
pushed onto the finished-chunk list, a new active chunk is allocated, and the
value becomes that chunk's first element.
-}
appendPagedUnboxedVector :: (VUM.Unbox a) => PagedUnboxedVector a -> a -> IO ()
appendPagedUnboxedVector (PagedUnboxedVector chunksRef activeRef countRef) !val = do
    count <- readIORef countRef
    active <- readIORef activeRef

    if count < chunkSize
        then do
            -- Unchecked write: count < chunkSize was tested just above.
            VUM.unsafeWrite active count val
            writeIORef countRef $! count + 1
        else do
            -- Active chunk is full: retire it (newest chunk goes first).
            frozen <- VU.freeze active
            modifyIORef' chunksRef (frozen :)

            newActive <- VUM.unsafeNew chunkSize
            VUM.unsafeWrite newActive 0 val

            writeIORef activeRef newActive
            writeIORef countRef 1
{-# INLINE appendPagedUnboxedVector #-}

{- | Snapshot a 'PagedVector' as a single immutable vector.

Only the first @count@ slots of the active chunk are taken (the rest were
never written). The finished chunks are stored newest-first, so the list is
reversed before concatenation to restore insertion order.
-}
freezePagedVector :: PagedVector a -> IO (V.Vector a)
freezePagedVector (PagedVector chunksRef activeRef countRef) = do
    count <- readIORef countRef
    active <- readIORef activeRef
    chunks <- readIORef chunksRef

    lastChunk <- V.freeze (VM.slice 0 count active)

    return $! V.concat (reverse (lastChunk : chunks))

{- | Snapshot a 'PagedUnboxedVector' as a single immutable unboxed vector.

Only the first @count@ slots of the active chunk are taken (the rest were
never written). The finished chunks are stored newest-first, so the list is
reversed before concatenation to restore insertion order.
-}
freezePagedUnboxedVector ::
    (VUM.Unbox a) => PagedUnboxedVector a -> IO (VU.Vector a)
freezePagedUnboxedVector (PagedUnboxedVector chunksRef activeRef countRef) = do
    count <- readIORef countRef
    active <- readIORef activeRef
    chunks <- readIORef chunksRef

    lastChunk <- VU.freeze (VUM.slice 0 count active)
    return $! VU.concat (reverse (lastChunk : chunks))

-- STANDARD CONFIG TYPES

{- | Where column names come from.

* 'NoHeader': the first row is data; columns are named by their index.
* 'UseFirstRow': the first row holds the column names and is not data.
* 'ProvideNames': the first row is data; the given names are used, and any
  remaining columns are named by their index.
-}
data HeaderSpec = NoHeader | UseFirstRow | ProvideNames [T.Text]
    deriving (Eq, Show)

{- | How column types are decided after the raw read.

* @InferFromSample n@: the frame is passed through @parseDefaults@ with @n@
  as the sample size.
* @SpecifyTypes ts@: when @ts@ is non-empty the frame is passed through
  @parseWithTypes ts@.
* @NoInference@: the frame is returned as built, with neither step applied.
-}
data TypeSpec = InferFromSample Int | SpecifyTypes [SchemaType] | NoInference

-- | CSV read parameters. See 'defaultReadOptions' for the default values.
data ReadOptions = ReadOptions
    { headerSpec :: HeaderSpec
    -- ^ Where to get the headers from. (default: UseFirstRow)
    , typeSpec :: TypeSpec
    -- ^ Whether/how to infer types. (default: InferFromSample 100)
    , safeRead :: Bool
    -- ^ Whether to partially parse values into `Maybe`/`Either`. (default: True)
    , dateFormat :: String
    {- ^ Format of date fields as recognized by the Data.Time.Format module.
    (default: @"%Y-%m-%d"@)

    __Examples:__

    @
    > parseTimeM True defaultTimeLocale "%Y/%-m/%-d" "2010/3/04" :: Maybe Day
    Just 2010-03-04
    > parseTimeM True defaultTimeLocale "%d/%-m/%-Y" "04/3/2010" :: Maybe Day
    Just 2010-03-04
    @
    -}
    }

-- | 'True' only for 'InferFromSample'; 'False' for every other 'TypeSpec'.
shouldInferFromSample :: TypeSpec -> Bool
shouldInferFromSample (InferFromSample _) = True
shouldInferFromSample _ = False

{- | The explicitly requested column types: the list carried by
'SpecifyTypes', or the empty list for every other 'TypeSpec'.
-}
schemaTypes :: TypeSpec -> [SchemaType]
schemaTypes (SpecifyTypes xs) = xs
schemaTypes _ = []

{- | The sample size carried by 'InferFromSample', or @0@ for every other
'TypeSpec'.
-}
typeInferenceSampleSize :: TypeSpec -> Int
typeInferenceSampleSize (InferFromSample n) = n
typeInferenceSampleSize _ = 0

{- | Default read options: take names from the first row, infer types from a
100-row sample, read safely, and expect dates formatted as @%Y-%m-%d@.
-}
defaultReadOptions :: ReadOptions
defaultReadOptions =
    ReadOptions
        { headerSpec = UseFirstRow
        , typeSpec = InferFromSample 100
        , safeRead = True
        , dateFormat = "%Y-%m-%d"
        }

{- | Read CSV file from path and load it into a dataframe, using
'defaultReadOptions' and @','@ as the delimiter.

==== __Example__
@
ghci> D.readCsv ".\/data\/taxi.csv"

@
-}
readCsv :: FilePath -> IO DataFrame
readCsv = readSeparated ',' defaultReadOptions

{- | Read CSV file from path and load it into a dataframe, using the given
'ReadOptions' and @','@ as the delimiter. The options come first, the path
second.

==== __Example__
@
ghci> D.readCsvWithOpts (D.defaultReadOptions { dateFormat = "%d/%-m/%-Y" }) ".\/data\/taxi.csv"

@
-}
readCsvWithOpts :: ReadOptions -> FilePath -> IO DataFrame
readCsvWithOpts = readSeparated ','

{- | Read TSV (tab separated) file from path and load it into a dataframe,
using 'defaultReadOptions'.

==== __Example__
@
ghci> D.readTsv ".\/data\/taxi.tsv"

@
-}
readTsv :: FilePath -> IO DataFrame
readTsv = readSeparated '\t' defaultReadOptions

{- | Read text file with specified delimiter into a dataframe.

The file is read lazily and decoded as a stream of records. The first record
is always consumed up front: depending on 'headerSpec' it either supplies the
column names ('UseFirstRow') or is pushed back onto the stream as data
('NoHeader', 'ProvideNames'). Rows are accumulated into per-column builders,
frozen into columns, and finally passed through type inference / explicit
typing as requested by 'typeSpec'.

The delimiter is narrowed with 'fromIntegral' to fit the decoder's
delimiter field, so it should be a single-byte character.

Calls 'error' when the file has no records at all (@"Empty CSV file"@) or
when a record cannot be parsed. A file that contains only a header row
(with 'UseFirstRow') yields a dataframe with the header's columns and zero
rows.

==== __Example__
@
ghci> D.readSeparated ';' D.defaultReadOptions ".\/data\/taxi.txt"

@
-}
readSeparated :: Char -> ReadOptions -> FilePath -> IO DataFrame
readSeparated !sep !opts !path = do
    csvData <- BL.readFile path
    let decodeOpts = Csv.defaultDecodeOptions{Csv.decDelimiter = fromIntegral (ord sep)}
    -- Decode without cassava's own header handling; headers are handled below.
    let stream = CsvStream.decodeWith decodeOpts Csv.NoHeader csvData

    -- Split off the next record, failing loudly if there is none.
    let peekStream (Cons (Right row) rest) = return (row, rest)
        peekStream (Cons (Left err) _) = error $ "Error parsing CSV header: " ++ err
        peekStream (Nil Nothing _) = error "Empty CSV file"
        peekStream (Nil (Just err) _) = error err

    (firstRowRaw, dataStream) <- peekStream stream

    let (columnNames, rowsToProcess) = case headerSpec opts of
            NoHeader ->
                ( map (T.pack . show) [0 .. V.length firstRowRaw - 1]
                , Cons (Right firstRowRaw) dataStream
                )
            UseFirstRow ->
                ( map (T.strip . TE.decodeUtf8Lenient . BL.toStrict) (V.toList firstRowRaw)
                , dataStream
                )
            ProvideNames ns ->
                -- Pad the supplied names with positional names for any extra columns.
                ( ns ++ drop (length ns) (map (T.pack . show) [0 .. V.length firstRowRaw - 1])
                , Cons (Right firstRowRaw) dataStream
                )

    -- The sample row only determines how many builders to create. When the
    -- header was the sole record there is no data row to look at, so fall
    -- back to the header's width instead of reporting an "empty" file.
    sampleRow <- case rowsToProcess of
        Nil Nothing _ -> return firstRowRaw
        _ -> fst <$> peekStream rowsToProcess
    builderCols <- initializeColumns (V.toList sampleRow) opts
    processStream rowsToProcess builderCols

    frozenCols <- V.fromList <$> mapM freezeBuilderColumn builderCols
    let numRows = maybe 0 columnLength (frozenCols V.!? 0)

    let df =
            DataFrame
                frozenCols
                (M.fromList (zip columnNames [0 ..]))
                (numRows, V.length frozenCols)

    return $
        if shouldInferFromSample (typeSpec opts)
            then
                parseDefaults
                    (typeInferenceSampleSize (typeSpec opts))
                    (safeRead opts)
                    (dateFormat opts)
                    df
            else
                if not (null (schemaTypes (typeSpec opts)))
                    then parseWithTypes (schemaTypes (typeSpec opts)) df
                    else df

{- | Create one empty 'BuilderColumn' per field of the given sample row.

Only the /length/ of the row is used; the field contents are ignored. The
types listed by 'SpecifyTypes' are applied positionally and padded with
'T.Text' up to the row's width; the other 'TypeSpec's start every column as
'T.Text'. A requested type of 'Int' or 'Double' gets the matching unboxed
builder; any other type gets a text builder.
-}
initializeColumns :: [BL.ByteString] -> ReadOptions -> IO [BuilderColumn]
initializeColumns row opts =
    -- 'schemaTypes' is empty for everything but 'SpecifyTypes', so one call
    -- covers all three 'TypeSpec' cases.
    zipWithM initColumn row (expandTypes (schemaTypes (typeSpec opts)))
  where
    -- Pad the requested types with Text so every field has a type.
    expandTypes :: [SchemaType] -> [SchemaType]
    expandTypes xs = xs ++ replicate (length row - length xs) (schemaType @T.Text)

    initColumn :: BL.ByteString -> SchemaType -> IO BuilderColumn
    initColumn _ t = do
        validityRef <- newPagedUnboxedVector
        case t of
            SType (_ :: P.Proxy a) -> case testEquality (typeRep @a) (typeRep @Int) of
                Just Refl -> BuilderInt <$> newPagedUnboxedVector <*> pure validityRef
                Nothing -> case testEquality (typeRep @a) (typeRep @Double) of
                    Just Refl -> BuilderDouble <$> newPagedUnboxedVector <*> pure validityRef
                    Nothing -> BuilderText <$> newPagedVector <*> pure validityRef

{- | Feed every record of the stream to 'processRow', in order.

Calls 'error' with a @"CSV Parse Error: "@ message when a record fails to
parse, including when the stream terminates because of a parse error
(@Nil (Just err)@) rather than at a clean end of input.
-}
processStream ::
    CsvStream.Records (V.Vector BL.ByteString) -> [BuilderColumn] -> IO ()
processStream (Cons (Right row) rest) cols = processRow row cols >> processStream rest cols
processStream (Cons (Left err) _) _ = error ("CSV Parse Error: " ++ err)
-- A terminal parse error must not be mistaken for end-of-input, otherwise
-- the remainder of the file is dropped silently.
processStream (Nil (Just err) _) _ = error ("CSV Parse Error: " ++ err)
processStream (Nil Nothing _) _ = return ()

-- | Append one parsed CSV record to the column builders.
--
-- The field at position @n@ of the record is fed to the @n@-th builder.
-- Every builder receives exactly one value and one validity flag per record,
-- so all columns keep the same length:
--
-- * a field that parses (or, for text, is not a null marker according to
--   'isNull') is stored with validity flag @1@;
-- * a field that fails to parse, is a null marker, or is missing because the
--   record has fewer fields than there are builders, is stored as a
--   placeholder (@0@, @0.0@ or empty text) with validity flag @0@.
--
-- Fields beyond the last builder are ignored.
processRow :: V.Vector BL.ByteString -> [BuilderColumn] -> IO ()
processRow !vals !cols =
    forM_ (zip [0 ..] cols) $ \(idx, col) ->
        case vals V.!? idx of
            Just field -> processValue (BL.toStrict field) col
            -- Short record: pad so this column does not fall behind the others.
            Nothing -> appendNull col
  where
    -- Store a placeholder value flagged as invalid.
    appendNull col = case col of
        BuilderInt gv valid ->
            appendPagedUnboxedVector gv 0 >> appendPagedUnboxedVector valid 0
        BuilderDouble gv valid ->
            appendPagedUnboxedVector gv 0.0 >> appendPagedUnboxedVector valid 0
        BuilderText gv valid ->
            appendPagedVector gv T.empty >> appendPagedUnboxedVector valid 0

    -- Parse a strict field according to the builder's element type.
    processValue !bs' !col = case col of
        BuilderInt gv valid -> case readByteStringInt bs' of
            Just !i ->
                appendPagedUnboxedVector gv i >> appendPagedUnboxedVector valid 1
            Nothing -> appendNull col
        BuilderDouble gv valid -> case readByteStringDouble bs' of
            Just !d ->
                appendPagedUnboxedVector gv d >> appendPagedUnboxedVector valid 1
            Nothing -> appendNull col
        BuilderText gv valid -> do
            let !val = T.strip (TE.decodeUtf8Lenient bs')
            if isNull val
                then appendNull col
                else appendPagedVector gv val >> appendPagedUnboxedVector valid 1

-- | Whether a cell's text denotes a missing value: the empty string or one of
-- the exact (case-sensitive) markers @NA@, @NULL@ and @null@.
isNull :: T.Text -> Bool
isNull t
    | T.null t = True
    | otherwise = t `elem` nullMarkers
  where
    nullMarkers :: [T.Text]
    nullMarkers = ["NA", "NULL", "null"]

-- | Turn a finished column builder into an immutable 'Column'.
--
-- The value pages are frozen first, then the validity flags. When every flag
-- equals @1@ the values are returned directly ('UnboxedColumn' for numeric
-- builders, 'BoxedColumn' for text); otherwise the values are combined with
-- the flags into an optional column via 'constructOptional' or
-- 'constructOptionalBoxed'.
freezeBuilderColumn :: BuilderColumn -> IO Column
freezeBuilderColumn builder = case builder of
    BuilderInt pages flags -> finishUnboxed pages flags
    BuilderDouble pages flags -> finishUnboxed pages flags
    BuilderText pages flags -> do
        values <- freezePagedVector pages
        mask <- freezePagedUnboxedVector flags
        if allValid mask
            then pure (BoxedColumn values)
            else constructOptionalBoxed values mask
  where
    -- True when no entry was flagged as missing.
    allValid :: VU.Vector Word8 -> Bool
    allValid = VU.all (== 1)

    -- Shared path for the two unboxed (numeric) builders.
    finishUnboxed ::
        (VU.Unbox a, Columnable a) =>
        PagedUnboxedVector a ->
        PagedUnboxedVector Word8 ->
        IO Column
    finishUnboxed pages flags = do
        values <- freezePagedUnboxedVector pages
        mask <- freezePagedUnboxedVector flags
        if allValid mask
            then pure (UnboxedColumn values)
            else constructOptional values mask

-- | Build an 'OptionalColumn' from unboxed values and their validity flags.
--
-- Position @i@ of the result is 'Nothing' when @valid VU.! i@ is @0@ and
-- @Just (vec VU.! i)@ otherwise. The result has the length of @vec@; @valid@
-- is indexed at every position of @vec@.
constructOptional ::
    (VU.Unbox a, Columnable a) => VU.Vector a -> VU.Vector Word8 -> IO Column
constructOptional vec valid = do
    slots <- VM.new (VU.length vec)
    let fill idx x
            | valid VU.! idx == 0 = VM.write slots idx Nothing
            | otherwise = VM.write slots idx (Just x)
    VU.imapM_ fill vec
    OptionalColumn <$> V.freeze slots

-- | Build an 'OptionalColumn' of text from boxed values and validity flags.
--
-- Position @i@ of the result is 'Nothing' when @valid VU.! i@ is @0@ and
-- @Just (vec V.! i)@ otherwise. The result has the length of @vec@; @valid@
-- is indexed at every position of @vec@.
constructOptionalBoxed :: V.Vector T.Text -> VU.Vector Word8 -> IO Column
constructOptionalBoxed vec valid = do
    slots <- VM.new (V.length vec)
    let fill idx txt
            | valid VU.! idx == 0 = VM.write slots idx Nothing
            | otherwise = VM.write slots idx (Just txt)
    V.imapM_ fill vec
    OptionalColumn <$> V.freeze slots

-- | Write a 'DataFrame' to the given path using a comma as the separator.
-- This is 'writeSeparated' specialised to @','@.
writeCsv :: FilePath -> DataFrame -> IO ()
writeCsv path frame = writeSeparated ',' path frame

-- | Write a 'DataFrame' to a file as separator-delimited text.
--
-- The file is opened in 'WriteMode'. The first line holds the column names
-- ordered by their position in 'columnIndices'; each following line holds
-- one row as produced by 'getRowAsText'. Header and row fields are both
-- joined with the given separator character.
--
-- Fields are written verbatim: no quoting or escaping is applied, so a value
-- that contains the separator or a newline is not protected.
writeSeparated ::
    -- | Separator
    Char ->
    -- | Path to write to
    FilePath ->
    DataFrame ->
    IO ()
writeSeparated c filepath df = withFile filepath WriteMode $ \handle -> do
    let (rows, _) = dataframeDimensions df
        sep = T.singleton c
        -- Column names in positional order.
        headers = map fst (L.sortOn snd (M.toList (columnIndices df)))
        putRecord = TIO.hPutStrLn handle . T.intercalate sep
    putRecord headers
    forM_ [0 .. rows - 1] $ \i ->
        putRecord (getRowAsText df i)

-- | Render row @i@ of a 'DataFrame' as one 'T.Text' cell per column, in the
-- order of the 'columns' vector.
--
-- Rendering depends on the column constructor and element type:
--
-- * 'BoxedColumn' of 'T.Text': the text itself.
-- * 'BoxedColumn' of @Maybe Text@: the text, or @null@ for 'Nothing'.
-- * 'BoxedColumn' of any other @Maybe x@: 'show' of the value with the
--   leading @Just @ removed, or @null@ when there is no such prefix.
-- * Any other 'BoxedColumn', and every 'UnboxedColumn': 'show' of the value.
-- * 'OptionalColumn' of 'T.Text': the text, or @Nothing@ for 'Nothing'.
-- * Any other 'OptionalColumn': 'show' of the @Maybe@ value.
--
-- Calls 'error' if some column has no element at index @i@.
getRowAsText :: DataFrame -> Int -> [T.Text]
getRowAsText df i = V.ifoldr renderCell [] (columns df)
  where
    -- Column position -> column name; only consulted for the error message.
    namesByIndex =
        M.fromList [(pos, name) | (name, pos) <- M.toList (columnIndices df)]

    -- Failure raised when the column at @pos@ is too short for row @i@.
    tooShort :: Int -> [T.Text]
    tooShort pos =
        error $
            "Column "
                ++ T.unpack (namesByIndex M.! pos)
                ++ " has less items than "
                ++ "the other columns at index "
                ++ show i

    renderCell pos (BoxedColumn (cells :: V.Vector a)) rest =
        case cells V.!? i of
            Nothing -> tooShort pos
            Just cell -> cellText : rest
              where
                cellText :: T.Text
                cellText = case testEquality (typeRep @a) (typeRep @T.Text) of
                    Just Refl -> cell
                    Nothing -> case typeRep @a of
                        -- Is the element type an application @Maybe x@?
                        App outer inner ->
                            case eqTypeRep outer (typeRep @Maybe) of
                                Just HRefl ->
                                    case testEquality inner (typeRep @T.Text) of
                                        Just Refl -> fromMaybe "null" cell
                                        Nothing -> unwrapShown (T.pack (show cell))
                                Nothing -> T.pack (show cell)
                        _ -> T.pack (show cell)
                -- Strip the @Just @ that 'show' puts in front of a present
                -- value; anything else is written as @null@.
                unwrapShown shown
                    | T.isPrefixOf "Just " shown =
                        T.drop (T.length "Just ") shown
                    | otherwise = "null"
    renderCell pos (UnboxedColumn cells) rest =
        case cells VU.!? i of
            Nothing -> tooShort pos
            Just cell -> T.pack (show cell) : rest
    renderCell pos (OptionalColumn (cells :: V.Vector (Maybe a))) rest =
        case cells V.!? i of
            Nothing -> tooShort pos
            Just cell -> cellText : rest
              where
                cellText :: T.Text
                cellText = case testEquality (typeRep @a) (typeRep @T.Text) of
                    Just Refl -> fromMaybe "Nothing" cell
                    Nothing -> T.pack (show cell)

-- | Remove one pair of enclosing double quotes.
--
-- The text is returned unchanged unless it both starts and ends with @"@ and
-- those are two distinct characters (a lone @"@ is left as is). Only the
-- outermost pair is removed.
stripQuotes :: T.Text -> T.Text
stripQuotes txt = fromMaybe txt unquoted
  where
    quote = T.singleton '"'
    unquoted = T.stripPrefix quote txt >>= T.stripSuffix quote