{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE Strict #-}
module DataFrame.IO.CSV where

import qualified Data.ByteString.Char8 as C
import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TLIO
import qualified Data.Text.IO as TIO
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector.Mutable as VM
import qualified Data.Vector.Unboxed.Mutable as VUM

import Control.Applicative ((<$>), (<|>), (<*>), (<*), (*>), many)
import Control.Monad (forM_, zipWithM_, unless, void)
import Data.Attoparsec.Text
import Data.Char
import DataFrame.Internal.Column (Column(..), freezeColumn', writeColumn, columnLength)
import DataFrame.Internal.DataFrame (DataFrame(..))
import DataFrame.Internal.Parsing
import DataFrame.Operations.Typing
import Data.Foldable (fold)
import Data.Function (on)
import Data.IORef
import Data.Maybe
import Data.Text.Encoding (decodeUtf8Lenient)
import Data.Type.Equality
  ( TestEquality (testEquality),
    type (:~:) (Refl)
  )
import GHC.IO.Handle (Handle)
import Prelude hiding (concat, takeWhile)
import System.IO
import Type.Reflection

-- | Record for CSV read options.
data ReadOptions = ReadOptions {
    ReadOptions -> Bool
hasHeader :: Bool,
    ReadOptions -> Bool
inferTypes :: Bool,
    ReadOptions -> Bool
safeRead :: Bool
}

-- | By default we assume the file has a header, we infer the types on read
-- and we convert any rows with nullish objects into Maybe (safeRead).
defaultOptions :: ReadOptions
defaultOptions :: ReadOptions
defaultOptions = ReadOptions { hasHeader :: Bool
hasHeader = Bool
True, inferTypes :: Bool
inferTypes = Bool
True, safeRead :: Bool
safeRead = Bool
True }

-- | Reads a CSV file from the given path.
-- Note this file stores intermediate temporary files
-- while converting the CSV from a row to a columnar format.
readCsv :: String -> IO DataFrame
readCsv :: String -> IO DataFrame
readCsv = Char -> ReadOptions -> String -> IO DataFrame
readSeparated Char
',' ReadOptions
defaultOptions

-- | Reads a tab separated file from the given path.
-- Note this file stores intermediate temporary files
-- while converting the CSV from a row to a columnar format.
readTsv :: String -> IO DataFrame
readTsv :: String -> IO DataFrame
readTsv = Char -> ReadOptions -> String -> IO DataFrame
readSeparated Char
'\t' ReadOptions
defaultOptions

-- | Reads a character separated file into a dataframe using mutable vectors.
readSeparated :: Char -> ReadOptions -> String -> IO DataFrame
readSeparated :: Char -> ReadOptions -> String -> IO DataFrame
readSeparated Char
c ReadOptions
opts String
path = do
    Int
totalRows <- Char -> String -> IO Int
countRows Char
c String
path
    String -> IOMode -> (Handle -> IO DataFrame) -> IO DataFrame
forall r. String -> IOMode -> (Handle -> IO r) -> IO r
withFile String
path IOMode
ReadMode ((Handle -> IO DataFrame) -> IO DataFrame)
-> (Handle -> IO DataFrame) -> IO DataFrame
forall a b. (a -> b) -> a -> b
$ \Handle
handle -> do
        [Text]
firstRow <- (Text -> Text) -> [Text] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map Text -> Text
T.strip ([Text] -> [Text]) -> (Text -> [Text]) -> Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Text -> [Text]
parseSep Char
c (Text -> [Text]) -> IO Text -> IO [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Handle -> IO Text
TIO.hGetLine Handle
handle
        let columnNames :: [Text]
columnNames = if ReadOptions -> Bool
hasHeader ReadOptions
opts
                        then (Text -> Text) -> [Text] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map ((Char -> Bool) -> Text -> Text
T.filter (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\"')) [Text]
firstRow
                        else (Int -> Text) -> [Int] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map (Char -> Text
T.singleton (Char -> Text) -> (Int -> Char) -> Int -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Char
intToDigit) [Int
0..([Text] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Text]
firstRow Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)]
        -- If there was no header rewind the file cursor.
        Bool -> IO () -> IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (ReadOptions -> Bool
hasHeader ReadOptions
opts) (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ Handle -> SeekMode -> Integer -> IO ()
hSeek Handle
handle SeekMode
AbsoluteSeek Integer
0

        -- Initialize mutable vectors for each column
        let numColumns :: Int
numColumns = [Text] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Text]
columnNames
        let numRows :: Int
numRows = if ReadOptions -> Bool
hasHeader ReadOptions
opts then Int
totalRows Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1 else Int
totalRows
        -- Use this row to infer the types of the rest of the column.
        -- TODO: this isn't robust but in so far as this is a guess anyway
        -- it's probably fine. But we should probably sample n rows and pick
        -- the most likely type from the sample.
        [Text]
dataRow <- (Text -> Text) -> [Text] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map Text -> Text
T.strip ([Text] -> [Text]) -> (Text -> [Text]) -> Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Text -> [Text]
parseSep Char
c (Text -> [Text]) -> IO Text -> IO [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Handle -> IO Text
TIO.hGetLine Handle
handle

        -- This array will track the indices of all null values for each column.
        -- If any exist then the column will be an optional type.
        IOVector [(Int, Text)]
nullIndices <- Int -> IO (MVector (PrimState IO) [(Int, Text)])
forall (m :: * -> *) a.
PrimMonad m =>
Int -> m (MVector (PrimState m) a)
VM.unsafeNew Int
numColumns
        MVector (PrimState IO) [(Int, Text)] -> [(Int, Text)] -> IO ()
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> a -> m ()
VM.set IOVector [(Int, Text)]
MVector (PrimState IO) [(Int, Text)]
nullIndices []
        IOVector Column
mutableCols <- Int -> IO (MVector (PrimState IO) Column)
forall (m :: * -> *) a.
PrimMonad m =>
Int -> m (MVector (PrimState m) a)
VM.unsafeNew Int
numColumns
        Int -> IOVector Column -> [Text] -> IO ()
getInitialDataVectors Int
numRows IOVector Column
mutableCols [Text]
dataRow

        -- Read rows into the mutable vectors
        Int
-> Char
-> IOVector Column
-> IOVector [(Int, Text)]
-> Handle
-> IO ()
fillColumns Int
numRows Char
c IOVector Column
mutableCols IOVector [(Int, Text)]
nullIndices Handle
handle

        -- Freeze the mutable vectors into immutable ones
        Vector [(Int, Text)]
nulls' <- MVector (PrimState IO) [(Int, Text)] -> IO (Vector [(Int, Text)])
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> m (Vector a)
V.unsafeFreeze IOVector [(Int, Text)]
MVector (PrimState IO) [(Int, Text)]
nullIndices
        Vector (Maybe Column)
cols <- (Int -> IO (Maybe Column))
-> Vector Int -> IO (Vector (Maybe Column))
forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Vector a -> m (Vector b)
V.mapM (IOVector Column
-> Vector [(Int, Text)] -> ReadOptions -> Int -> IO (Maybe Column)
freezeColumn IOVector Column
mutableCols Vector [(Int, Text)]
nulls' ReadOptions
opts) (Int -> (Int -> Int) -> Vector Int
forall a. Int -> (Int -> a) -> Vector a
V.generate Int
numColumns Int -> Int
forall a. a -> a
id)
        DataFrame -> IO DataFrame
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (DataFrame -> IO DataFrame) -> DataFrame -> IO DataFrame
forall a b. (a -> b) -> a -> b
$ DataFrame {
                columns :: Vector (Maybe Column)
columns = Vector (Maybe Column)
cols,
                freeIndices :: [Int]
freeIndices = [],
                columnIndices :: Map Text Int
columnIndices = [(Text, Int)] -> Map Text Int
forall k a. Ord k => [(k, a)] -> Map k a
M.fromList ([Text] -> [Int] -> [(Text, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Text]
columnNames [Int
0..]),
                dataframeDimensions :: (Int, Int)
dataframeDimensions = (Int -> (Column -> Int) -> Maybe Column -> Int
forall b a. b -> (a -> b) -> Maybe a -> b
maybe Int
0 Column -> Int
columnLength (Vector (Maybe Column)
cols Vector (Maybe Column) -> Int -> Maybe Column
forall a. Vector a -> Int -> a
V.! Int
0), Vector (Maybe Column) -> Int
forall a. Vector a -> Int
V.length Vector (Maybe Column)
cols)
            }
{-# INLINE readSeparated #-}

getInitialDataVectors :: Int -> VM.IOVector Column -> [T.Text] -> IO ()
getInitialDataVectors :: Int -> IOVector Column -> [Text] -> IO ()
getInitialDataVectors Int
n IOVector Column
mCol [Text]
xs = do
    [(Int, Text)] -> ((Int, Text) -> IO ()) -> IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([Int] -> [Text] -> [(Int, Text)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Int
0..] [Text]
xs) (((Int, Text) -> IO ()) -> IO ())
-> ((Int, Text) -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \(Int
i, Text
x) -> do
        Column
col <- case Text -> Text
inferValueType Text
x of
                Text
"Int" -> IOVector Int -> Column
forall a. (Columnable a, Unbox a) => IOVector a -> Column
MutableUnboxedColumn (IOVector Int -> Column) -> IO (IOVector Int) -> IO Column
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>  ((Int -> IO (MVector (PrimState IO) Int)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
Int -> m (MVector (PrimState m) a)
VUM.unsafeNew Int
n :: IO (VUM.IOVector Int)) IO (IOVector Int)
-> (IOVector Int -> IO (IOVector Int)) -> IO (IOVector Int)
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \IOVector Int
c -> MVector (PrimState IO) Int -> Int -> Int -> IO ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite IOVector Int
MVector (PrimState IO) Int
c Int
0 (Int -> Maybe Int -> Int
forall a. a -> Maybe a -> a
fromMaybe Int
0 (Maybe Int -> Int) -> Maybe Int -> Int
forall a b. (a -> b) -> a -> b
$ HasCallStack => Text -> Maybe Int
Text -> Maybe Int
readInt Text
x) IO () -> IO (IOVector Int) -> IO (IOVector Int)
forall a b. IO a -> IO b -> IO b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> IOVector Int -> IO (IOVector Int)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return IOVector Int
c)
                Text
"Double" -> IOVector Double -> Column
forall a. (Columnable a, Unbox a) => IOVector a -> Column
MutableUnboxedColumn (IOVector Double -> Column) -> IO (IOVector Double) -> IO Column
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ((Int -> IO (MVector (PrimState IO) Double)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
Int -> m (MVector (PrimState m) a)
VUM.unsafeNew Int
n :: IO (VUM.IOVector Double)) IO (IOVector Double)
-> (IOVector Double -> IO (IOVector Double))
-> IO (IOVector Double)
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \IOVector Double
c -> MVector (PrimState IO) Double -> Int -> Double -> IO ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite IOVector Double
MVector (PrimState IO) Double
c Int
0 (Double -> Maybe Double -> Double
forall a. a -> Maybe a -> a
fromMaybe Double
0 (Maybe Double -> Double) -> Maybe Double -> Double
forall a b. (a -> b) -> a -> b
$ HasCallStack => Text -> Maybe Double
Text -> Maybe Double
readDouble Text
x) IO () -> IO (IOVector Double) -> IO (IOVector Double)
forall a b. IO a -> IO b -> IO b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> IOVector Double -> IO (IOVector Double)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return IOVector Double
c)
                Text
_ -> IOVector Text -> Column
forall a. Columnable a => IOVector a -> Column
MutableBoxedColumn (IOVector Text -> Column) -> IO (IOVector Text) -> IO Column
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ((Int -> IO (MVector (PrimState IO) Text)
forall (m :: * -> *) a.
PrimMonad m =>
Int -> m (MVector (PrimState m) a)
VM.unsafeNew Int
n :: IO (VM.IOVector T.Text)) IO (IOVector Text)
-> (IOVector Text -> IO (IOVector Text)) -> IO (IOVector Text)
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \IOVector Text
c -> MVector (PrimState IO) Text -> Int -> Text -> IO ()
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> Int -> a -> m ()
VM.unsafeWrite IOVector Text
MVector (PrimState IO) Text
c Int
0 Text
x IO () -> IO (IOVector Text) -> IO (IOVector Text)
forall a b. IO a -> IO b -> IO b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> IOVector Text -> IO (IOVector Text)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return IOVector Text
c)
        MVector (PrimState IO) Column -> Int -> Column -> IO ()
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> Int -> a -> m ()
VM.unsafeWrite IOVector Column
MVector (PrimState IO) Column
mCol Int
i Column
col
{-# INLINE getInitialDataVectors #-}

inferValueType :: T.Text -> T.Text
inferValueType :: Text -> Text
inferValueType Text
s = let
        example :: Text
example = Text
s
    in case HasCallStack => Text -> Maybe Int
Text -> Maybe Int
readInt Text
example of
        Just Int
_ -> Text
"Int"
        Maybe Int
Nothing -> case HasCallStack => Text -> Maybe Double
Text -> Maybe Double
readDouble Text
example of
            Just Double
_ -> Text
"Double"
            Maybe Double
Nothing -> Text
"Other"
{-# INLINE inferValueType #-}

-- | Reads rows from the handle and stores values in mutable vectors.
fillColumns :: Int -> Char -> VM.IOVector Column -> VM.IOVector [(Int, T.Text)] -> Handle -> IO ()
fillColumns :: Int
-> Char
-> IOVector Column
-> IOVector [(Int, Text)]
-> Handle
-> IO ()
fillColumns Int
n Char
c IOVector Column
mutableCols IOVector [(Int, Text)]
nullIndices Handle
handle = do
    IORef Text
input <- Text -> IO (IORef Text)
forall a. a -> IO (IORef a)
newIORef (Text
forall a. Monoid a => a
mempty :: T.Text)
    [Int] -> (Int -> IO ()) -> IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ [Int
1..Int
n] ((Int -> IO ()) -> IO ()) -> (Int -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \Int
i -> do
        Bool
isEOF <- Handle -> IO Bool
hIsEOF Handle
handle
        Text
input' <- IORef Text -> IO Text
forall a. IORef a -> IO a
readIORef IORef Text
input
        Bool -> IO () -> IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (Bool
isEOF Bool -> Bool -> Bool
&& Text
input' Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text
forall a. Monoid a => a
mempty) (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ do
              IO Text -> Parser [Text] -> Text -> IO (Result [Text])
forall (m :: * -> *) a.
Monad m =>
m Text -> Parser a -> Text -> m (Result a)
parseWith (Handle -> IO Text
TIO.hGetChunk Handle
handle) (Char -> Parser [Text]
parseRow Char
c) Text
input' IO (Result [Text]) -> (Result [Text] -> IO ()) -> IO ()
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
                Fail Text
unconsumed [String]
ctx String
er -> do
                  Integer
erpos <- Handle -> IO Integer
hTell Handle
handle
                  String -> IO ()
forall a. String -> IO a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail (String -> IO ()) -> String -> IO ()
forall a b. (a -> b) -> a -> b
$ String
"Failed to parse CSV file around " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> Integer -> String
forall a. Show a => a -> String
show Integer
erpos String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
" byte; due: "
                    String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String -> String
forall a. Show a => a -> String
show String
er String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
"; context: " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> [String] -> String
forall a. Show a => a -> String
show [String]
ctx
                Partial Text -> Result [Text]
c -> do
                  String -> IO ()
forall a. String -> IO a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"Partial handler is called"
                Done (Text
unconsumed :: T.Text) ([Text]
row :: [T.Text]) -> do
                  IORef Text -> Text -> IO ()
forall a. IORef a -> a -> IO ()
writeIORef IORef Text
input Text
unconsumed
                  (Int -> Text -> IO ()) -> [Int] -> [Text] -> IO ()
forall (m :: * -> *) a b c.
Applicative m =>
(a -> b -> m c) -> [a] -> [b] -> m ()
zipWithM_ (IOVector Column
-> IOVector [(Int, Text)] -> Int -> Int -> Text -> IO ()
writeValue IOVector Column
mutableCols IOVector [(Int, Text)]
nullIndices Int
i) [Int
0..] [Text]
row
{-# INLINE fillColumns #-}

-- | Writes a value into the appropriate column, resizing the vector if necessary.
writeValue :: VM.IOVector Column -> VM.IOVector [(Int, T.Text)] -> Int -> Int -> T.Text -> IO ()
writeValue :: IOVector Column
-> IOVector [(Int, Text)] -> Int -> Int -> Text -> IO ()
writeValue IOVector Column
mutableCols IOVector [(Int, Text)]
nullIndices Int
count Int
colIndex Text
value = do
    Column
col <- MVector (PrimState IO) Column -> Int -> IO Column
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> Int -> m a
VM.unsafeRead IOVector Column
MVector (PrimState IO) Column
mutableCols Int
colIndex
    Either Text Bool
res <- Int -> Text -> Column -> IO (Either Text Bool)
writeColumn Int
count Text
value Column
col
    let modify :: Text -> IO ()
modify Text
value = MVector (PrimState IO) [(Int, Text)]
-> ([(Int, Text)] -> [(Int, Text)]) -> Int -> IO ()
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> (a -> a) -> Int -> m ()
VM.unsafeModify IOVector [(Int, Text)]
MVector (PrimState IO) [(Int, Text)]
nullIndices ((Int
count, Text
value) (Int, Text) -> [(Int, Text)] -> [(Int, Text)]
forall a. a -> [a] -> [a]
:) Int
colIndex
    (Text -> IO ()) -> (Bool -> IO ()) -> Either Text Bool -> IO ()
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either Text -> IO ()
modify (IO () -> Bool -> IO ()
forall a b. a -> b -> a
const (() -> IO ()
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return ())) Either Text Bool
res
{-# INLINE writeValue #-}

-- | Freezes a mutable vector into an immutable one, trimming it to the actual row count.
freezeColumn :: VM.IOVector Column -> V.Vector [(Int, T.Text)] -> ReadOptions -> Int -> IO (Maybe Column)
freezeColumn :: IOVector Column
-> Vector [(Int, Text)] -> ReadOptions -> Int -> IO (Maybe Column)
freezeColumn IOVector Column
mutableCols Vector [(Int, Text)]
nulls ReadOptions
opts Int
colIndex = do
    Column
col <- MVector (PrimState IO) Column -> Int -> IO Column
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> Int -> m a
VM.unsafeRead IOVector Column
MVector (PrimState IO) Column
mutableCols Int
colIndex
    Column -> Maybe Column
forall a. a -> Maybe a
Just (Column -> Maybe Column) -> IO Column -> IO (Maybe Column)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [(Int, Text)] -> Column -> IO Column
freezeColumn' (Vector [(Int, Text)]
nulls Vector [(Int, Text)] -> Int -> [(Int, Text)]
forall a. Vector a -> Int -> a
V.! Int
colIndex) Column
col
{-# INLINE freezeColumn #-}

parseSep :: Char -> T.Text -> [T.Text]
parseSep :: Char -> Text -> [Text]
parseSep Char
c Text
s = (String -> [Text])
-> ([Text] -> [Text]) -> Either String [Text] -> [Text]
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either String -> [Text]
forall a. HasCallStack => String -> a
error [Text] -> [Text]
forall a. a -> a
id (Parser [Text] -> Text -> Either String [Text]
forall a. Parser a -> Text -> Either String a
parseOnly (Char -> Parser [Text]
record Char
c) Text
s)
{-# INLINE parseSep #-}

record :: Char -> Parser [T.Text]
record :: Char -> Parser [Text]
record Char
c =
   Char -> Parser Text
field Char
c Parser Text -> Parser Text Char -> Parser [Text]
forall (f :: * -> *) a s. Alternative f => f a -> f s -> f [a]
`sepBy1` Char -> Parser Text Char
char Char
c
   Parser [Text] -> String -> Parser [Text]
forall i a. Parser i a -> String -> Parser i a
<?> String
"record"
{-# INLINE record #-}

parseRow :: Char -> Parser [T.Text]
parseRow :: Char -> Parser [Text]
parseRow Char
c = (Char -> Parser [Text]
record Char
c Parser [Text] -> Parser Text () -> Parser [Text]
forall a b. Parser Text a -> Parser Text b -> Parser Text a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser Text ()
lineEnd)  Parser [Text] -> String -> Parser [Text]
forall i a. Parser i a -> String -> Parser i a
<?> String
"record-new-line"

field :: Char -> Parser T.Text
field :: Char -> Parser Text
field Char
c =
   Parser Text
quotedField Parser Text -> Parser Text -> Parser Text
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Char -> Parser Text
unquotedField Char
c
   Parser Text -> String -> Parser Text
forall i a. Parser i a -> String -> Parser i a
<?> String
"field"
{-# INLINE field #-}

unquotedTerminators :: Char -> S.Set Char
unquotedTerminators :: Char -> Set Char
unquotedTerminators Char
sep = String -> Set Char
forall a. Ord a => [a] -> Set a
S.fromList [Char
sep, Char
'\n', Char
'\r', Char
'"']

unquotedField :: Char -> Parser T.Text
unquotedField :: Char -> Parser Text
unquotedField Char
sep =
   (Char -> Bool) -> Parser Text
takeWhile (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Set Char -> Bool
forall a. Ord a => a -> Set a -> Bool
`S.member` Set Char
terminators)) Parser Text -> String -> Parser Text
forall i a. Parser i a -> String -> Parser i a
<?> String
"unquoted field"
   where terminators :: Set Char
terminators = Char -> Set Char
unquotedTerminators Char
sep
{-# INLINE unquotedField #-}

quotedField :: Parser T.Text
quotedField :: Parser Text
quotedField = Char -> Parser Text Char
char Char
'"' Parser Text Char -> Parser Text -> Parser Text
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser Text
contents Parser Text -> Parser Text Char -> Parser Text
forall a b. Parser Text a -> Parser Text b -> Parser Text a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Char -> Parser Text Char
char Char
'"' Parser Text -> String -> Parser Text
forall i a. Parser i a -> String -> Parser i a
<?> String
"quoted field"
    where
        contents :: Parser Text
contents = [Text] -> Text
forall m. Monoid m => [m] -> m
forall (t :: * -> *) m. (Foldable t, Monoid m) => t m -> m
fold ([Text] -> Text) -> Parser [Text] -> Parser Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text -> Parser [Text]
forall a. Parser Text a -> Parser Text [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many (Parser Text
unquote Parser Text -> Parser Text -> Parser Text
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text
unescape)
            where
                unquote :: Parser Text
unquote = (Char -> Bool) -> Parser Text
takeWhile1 (String -> Char -> Bool
notInClass String
"\"\\")
                unescape :: Parser Text
unescape = Char -> Parser Text Char
char Char
'\\' Parser Text Char -> Parser Text -> Parser Text
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> do
                    Char -> Text
T.singleton (Char -> Text) -> Parser Text Char -> Parser Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> do
                        Char -> Parser Text Char
char Char
'\\' Parser Text Char -> Parser Text Char -> Parser Text Char
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Char -> Parser Text Char
char Char
'"'
{-# INLINE quotedField #-}

lineEnd :: Parser ()
lineEnd :: Parser Text ()
lineEnd =
   (Parser Text ()
endOfLine Parser Text () -> Parser Text () -> Parser Text ()
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput)
   Parser Text () -> String -> Parser Text ()
forall i a. Parser i a -> String -> Parser i a
<?> String
"end of line"
{-# INLINE lineEnd #-}

-- | First pass to count rows for exact allocation
countRows :: Char -> FilePath -> IO Int
countRows :: Char -> String -> IO Int
countRows Char
c String
path = String -> IOMode -> (Handle -> IO Int) -> IO Int
forall r. String -> IOMode -> (Handle -> IO r) -> IO r
withFile String
path IOMode
ReadMode ((Handle -> IO Int) -> IO Int) -> (Handle -> IO Int) -> IO Int
forall a b. (a -> b) -> a -> b
$! Int -> Text -> Handle -> IO Int
go Int
0 Text
""
   where
      go :: Int -> Text -> Handle -> IO Int
go !Int
n !Text
input Handle
h = do
         Bool
isEOF <- Handle -> IO Bool
hIsEOF Handle
h
         if Bool
isEOF Bool -> Bool -> Bool
&& Text
input Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text
forall a. Monoid a => a
mempty
            then Int -> IO Int
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Int
n
            else
               IO Text -> Parser [Text] -> Text -> IO (Result [Text])
forall (m :: * -> *) a.
Monad m =>
m Text -> Parser a -> Text -> m (Result a)
parseWith (Handle -> IO Text
TIO.hGetChunk Handle
h) (Char -> Parser [Text]
parseRow Char
c) Text
input IO (Result [Text]) -> (Result [Text] -> IO Int) -> IO Int
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
                  Fail Text
unconsumed [String]
ctx String
er -> do
                    Integer
erpos <- Handle -> IO Integer
hTell Handle
h
                    String -> IO Int
forall a. String -> IO a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail (String -> IO Int) -> String -> IO Int
forall a b. (a -> b) -> a -> b
$ String
"Failed to parse CSV file around " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> Integer -> String
forall a. Show a => a -> String
show Integer
erpos String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
" byte; due: "
                      String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String -> String
forall a. Show a => a -> String
show String
er String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
"; context: " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> [String] -> String
forall a. Show a => a -> String
show [String]
ctx String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
" " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> Text -> String
forall a. Show a => a -> String
show Text
unconsumed
                  Partial Text -> Result [Text]
c -> do
                    String -> IO Int
forall a. String -> IO a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail (String -> IO Int) -> String -> IO Int
forall a b. (a -> b) -> a -> b
$ String
"Partial handler is called; n = " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> Int -> String
forall a. Show a => a -> String
show Int
n
                  Done (Text
unconsumed :: T.Text) [Text]
_ ->
                    Int -> Text -> Handle -> IO Int
go (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) Text
unconsumed Handle
h
{-# INLINE countRows #-}

writeCsv :: String -> DataFrame -> IO ()
writeCsv :: String -> DataFrame -> IO ()
writeCsv = Char -> String -> DataFrame -> IO ()
writeSeparated Char
','

writeSeparated :: Char      -- ^ Separator
               -> String    -- ^ Path to write to
               -> DataFrame
               -> IO ()
writeSeparated :: Char -> String -> DataFrame -> IO ()
writeSeparated Char
c String
filepath DataFrame
df = String -> IOMode -> (Handle -> IO ()) -> IO ()
forall r. String -> IOMode -> (Handle -> IO r) -> IO r
withFile String
filepath IOMode
WriteMode ((Handle -> IO ()) -> IO ()) -> (Handle -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \Handle
handle ->do
    let (Int
rows, Int
columns) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df
    let headers :: [Text]
headers = ((Text, Int) -> Text) -> [(Text, Int)] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map (Text, Int) -> Text
forall a b. (a, b) -> a
fst (((Text, Int) -> (Text, Int) -> Ordering)
-> [(Text, Int)] -> [(Text, Int)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
L.sortBy (Int -> Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Int -> Int -> Ordering)
-> ((Text, Int) -> Int) -> (Text, Int) -> (Text, Int) -> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` (Text, Int) -> Int
forall a b. (a, b) -> b
snd) (Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toList (DataFrame -> Map Text Int
columnIndices DataFrame
df)))
    Handle -> Text -> IO ()
TIO.hPutStrLn Handle
handle (Text -> [Text] -> Text
T.intercalate Text
", " [Text]
headers)
    [Int] -> (Int -> IO ()) -> IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ [Int
0..(Int
rows Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)] ((Int -> IO ()) -> IO ()) -> (Int -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \Int
i -> do
        let row :: [Text]
row = DataFrame -> Int -> [Text]
getRowAsText DataFrame
df Int
i
        Handle -> Text -> IO ()
TIO.hPutStrLn Handle
handle (Text -> [Text] -> Text
T.intercalate Text
", " [Text]
row)

getRowAsText :: DataFrame -> Int -> [T.Text]
getRowAsText :: DataFrame -> Int -> [Text]
getRowAsText DataFrame
df Int
i = (Int -> Maybe Column -> [Text] -> [Text])
-> [Text] -> Vector (Maybe Column) -> [Text]
forall a b. (Int -> a -> b -> b) -> b -> Vector a -> b
V.ifoldr Int -> Maybe Column -> [Text] -> [Text]
go [] (DataFrame -> Vector (Maybe Column)
columns DataFrame
df)
  where
    indexMap :: Map Int Text
indexMap = [(Int, Text)] -> Map Int Text
forall k a. Ord k => [(k, a)] -> Map k a
M.fromList (((Text, Int) -> (Int, Text)) -> [(Text, Int)] -> [(Int, Text)]
forall a b. (a -> b) -> [a] -> [b]
map (\(Text
a, Int
b) -> (Int
b, Text
a)) ([(Text, Int)] -> [(Int, Text)]) -> [(Text, Int)] -> [(Int, Text)]
forall a b. (a -> b) -> a -> b
$ Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toList (DataFrame -> Map Text Int
columnIndices DataFrame
df))
    go :: Int -> Maybe Column -> [Text] -> [Text]
go Int
k Maybe Column
Nothing [Text]
acc = [Text]
acc
    go Int
k (Just (BoxedColumn (Vector a
c :: V.Vector a))) [Text]
acc = case Vector a
c Vector a -> Int -> Maybe a
forall a. Vector a -> Int -> Maybe a
V.!? Int
i of
        Just a
e -> Text
textRep Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc
            where textRep :: Text
textRep = case TypeRep a -> TypeRep Text -> Maybe (a :~: Text)
forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @T.Text) of
                    Just a :~: Text
Refl -> a
Text
e
                    Maybe (a :~: Text)
Nothing   -> case forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a of
                        App TypeRep a
t1 TypeRep b
t2 -> case TypeRep a -> TypeRep Maybe -> Maybe (a :~~: Maybe)
forall k1 k2 (a :: k1) (b :: k2).
TypeRep a -> TypeRep b -> Maybe (a :~~: b)
eqTypeRep TypeRep a
t1 (forall {k} (a :: k). Typeable a => TypeRep a
forall (a :: * -> *). Typeable a => TypeRep a
typeRep @Maybe) of
                            Just a :~~: Maybe
HRefl -> case TypeRep b -> TypeRep Text -> Maybe (b :~: Text)
forall (a :: k1) (b :: k1).
TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality TypeRep b
t2 (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @T.Text) of
                                Just b :~: Text
Refl -> Text -> Maybe Text -> Text
forall a. a -> Maybe a -> a
fromMaybe Text
"null" a
Maybe Text
e
                                Maybe (b :~: Text)
Nothing -> (Text -> Text
fromOptional (Text -> Text) -> (a -> Text) -> a -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (String -> Text
T.pack (String -> Text) -> (a -> String) -> a -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> String
forall a. Show a => a -> String
show)) a
e
                                            where fromOptional :: Text -> Text
fromOptional Text
s
                                                    | Text -> Text -> Bool
T.isPrefixOf Text
"Just " Text
s = Int -> Text -> Text
T.drop (Text -> Int
T.length Text
"Just ") Text
s
                                                    | Bool
otherwise = Text
"null"
                            Maybe (a :~~: Maybe)
Nothing -> (String -> Text
T.pack (String -> Text) -> (a -> String) -> a -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> String
forall a. Show a => a -> String
show) a
e
                        TypeRep a
_ -> (String -> Text
T.pack (String -> Text) -> (a -> String) -> a -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> String
forall a. Show a => a -> String
show) a
e
        Maybe a
Nothing ->
            String -> [Text]
forall a. HasCallStack => String -> a
error (String -> [Text]) -> String -> [Text]
forall a b. (a -> b) -> a -> b
$
                String
"Column "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ Text -> String
T.unpack (Map Int Text
indexMap Map Int Text -> Int -> Text
forall k a. Ord k => Map k a -> k -> a
M.! Int
k)
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
" has less items than "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"the other columns at index "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
i
    go Int
k (Just (UnboxedColumn Vector a
c)) [Text]
acc = case Vector a
c Vector a -> Int -> Maybe a
forall a. Unbox a => Vector a -> Int -> Maybe a
VU.!? Int
i of
        Just a
e -> String -> Text
T.pack (a -> String
forall a. Show a => a -> String
show a
e) Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc
        Maybe a
Nothing ->
            String -> [Text]
forall a. HasCallStack => String -> a
error (String -> [Text]) -> String -> [Text]
forall a b. (a -> b) -> a -> b
$
                String
"Column "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ Text -> String
T.unpack (Map Int Text
indexMap Map Int Text -> Int -> Text
forall k a. Ord k => Map k a -> k -> a
M.! Int
k)
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
" has less items than "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"the other columns at index "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
i
    go Int
k (Just (OptionalColumn (Vector (Maybe a)
c :: V.Vector (Maybe a)))) [Text]
acc = case Vector (Maybe a)
c Vector (Maybe a) -> Int -> Maybe (Maybe a)
forall a. Vector a -> Int -> Maybe a
V.!? Int
i of
        Just Maybe a
e -> Text
textRep Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc
            where textRep :: Text
textRep = case TypeRep a -> TypeRep Text -> Maybe (a :~: Text)
forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @T.Text) of
                    Just a :~: Text
Refl -> a -> Maybe a -> a
forall a. a -> Maybe a -> a
fromMaybe a
"Nothing" Maybe a
e
                    Maybe (a :~: Text)
Nothing   -> (String -> Text
T.pack (String -> Text) -> (Maybe a -> String) -> Maybe a -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Maybe a -> String
forall a. Show a => a -> String
show) Maybe a
e
        Maybe (Maybe a)
Nothing ->
            String -> [Text]
forall a. HasCallStack => String -> a
error (String -> [Text]) -> String -> [Text]
forall a b. (a -> b) -> a -> b
$
                String
"Column "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ Text -> String
T.unpack (Map Int Text
indexMap Map Int Text -> Int -> Text
forall k a. Ord k => Map k a -> k -> a
M.! Int
k)
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
" has less items than "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"the other columns at index "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
i