{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE Strict #-}
module Data.DataFrame.IO.CSV where

import qualified Data.ByteString.Char8 as C
import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TLIO
import qualified Data.Text.IO as TIO
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector.Mutable as VM
import qualified Data.Vector.Unboxed.Mutable as VUM

import Control.Applicative ((<$>), (<|>), (<*>), (<*), (*>), many)
import Control.Monad (forM_, zipWithM_, unless, void)
import Data.Attoparsec.Text
import Data.Char
import Data.DataFrame.Internal.Column (Column(..), freezeColumn', writeColumn, columnLength)
import Data.DataFrame.Internal.DataFrame (DataFrame(..))
import Data.DataFrame.Internal.Parsing
import Data.DataFrame.Operations.Typing
import Data.Function (on)
import Data.IORef
import Data.Maybe
import Data.Text.Encoding (decodeUtf8Lenient)
import Data.Type.Equality
  ( TestEquality (testEquality),
    type (:~:) (Refl)
  )
import GHC.IO.Handle (Handle)
import Prelude hiding (concat, takeWhile)
import System.IO
import Type.Reflection

-- | Record for CSV read options.
data ReadOptions = ReadOptions {
    ReadOptions -> Bool
hasHeader :: Bool,
    ReadOptions -> Bool
inferTypes :: Bool,
    ReadOptions -> Bool
safeRead :: Bool
}

-- | By default we assume the file has a header, we infer the types on read
-- and we convert any rows with nullish objects into Maybe (safeRead).
defaultOptions :: ReadOptions
defaultOptions :: ReadOptions
defaultOptions = ReadOptions { hasHeader :: Bool
hasHeader = Bool
True, inferTypes :: Bool
inferTypes = Bool
True, safeRead :: Bool
safeRead = Bool
True }

-- | Reads a CSV file from the given path.
-- Note this file stores intermediate temporary files
-- while converting the CSV from a row to a columnar format.
readCsv :: String -> IO DataFrame
readCsv :: String -> IO DataFrame
readCsv = Char -> ReadOptions -> String -> IO DataFrame
readSeparated Char
',' ReadOptions
defaultOptions

-- | Reads a tab separated file from the given path.
-- Note this file stores intermediate temporary files
-- while converting the CSV from a row to a columnar format.
readTsv :: String -> IO DataFrame
readTsv :: String -> IO DataFrame
readTsv = Char -> ReadOptions -> String -> IO DataFrame
readSeparated Char
'\t' ReadOptions
defaultOptions

-- | Reads a character separated file into a dataframe using mutable vectors.
readSeparated :: Char -> ReadOptions -> String -> IO DataFrame
readSeparated :: Char -> ReadOptions -> String -> IO DataFrame
readSeparated Char
c ReadOptions
opts String
path = do
    Int
totalRows <- Char -> String -> IO Int
countRows Char
c String
path
    String -> IOMode -> (Handle -> IO DataFrame) -> IO DataFrame
forall r. String -> IOMode -> (Handle -> IO r) -> IO r
withFile String
path IOMode
ReadMode ((Handle -> IO DataFrame) -> IO DataFrame)
-> (Handle -> IO DataFrame) -> IO DataFrame
forall a b. (a -> b) -> a -> b
$ \Handle
handle -> do
        [Text]
firstRow <- (Text -> Text) -> [Text] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map Text -> Text
T.strip ([Text] -> [Text]) -> (Text -> [Text]) -> Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Text -> [Text]
parseSep Char
c (Text -> [Text]) -> IO Text -> IO [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Handle -> IO Text
TIO.hGetLine Handle
handle
        let columnNames :: [Text]
columnNames = if ReadOptions -> Bool
hasHeader ReadOptions
opts
                        then (Text -> Text) -> [Text] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map ((Char -> Bool) -> Text -> Text
T.filter (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\"')) [Text]
firstRow
                        else (Int -> Text) -> [Int] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map (Char -> Text
T.singleton (Char -> Text) -> (Int -> Char) -> Int -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Char
intToDigit) [Int
0..([Text] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Text]
firstRow Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)]
        -- If there was no header rewind the file cursor.
        Bool -> IO () -> IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (ReadOptions -> Bool
hasHeader ReadOptions
opts) (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ Handle -> SeekMode -> Integer -> IO ()
hSeek Handle
handle SeekMode
AbsoluteSeek Integer
0

        -- Initialize mutable vectors for each column
        let numColumns :: Int
numColumns = [Text] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Text]
columnNames
        let numRows :: Int
numRows = if ReadOptions -> Bool
hasHeader ReadOptions
opts then Int
totalRows Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1 else Int
totalRows
        -- Use this row to infer the types of the rest of the column.
        -- TODO: this isn't robust but in so far as this is a guess anyway
        -- it's probably fine. But we should probably sample n rows and pick
        -- the most likely type from the sample.
        [Text]
dataRow <- (Text -> Text) -> [Text] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map Text -> Text
T.strip ([Text] -> [Text]) -> (Text -> [Text]) -> Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Text -> [Text]
parseSep Char
c (Text -> [Text]) -> IO Text -> IO [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Handle -> IO Text
TIO.hGetLine Handle
handle

        -- This array will track the indices of all null values for each column.
        -- If any exist then the column will be an optional type.
        IOVector [(Int, Text)]
nullIndices <- Int -> IO (MVector (PrimState IO) [(Int, Text)])
forall (m :: * -> *) a.
PrimMonad m =>
Int -> m (MVector (PrimState m) a)
VM.unsafeNew Int
numColumns
        MVector (PrimState IO) [(Int, Text)] -> [(Int, Text)] -> IO ()
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> a -> m ()
VM.set IOVector [(Int, Text)]
MVector (PrimState IO) [(Int, Text)]
nullIndices []
        IOVector Column
mutableCols <- Int -> IO (MVector (PrimState IO) Column)
forall (m :: * -> *) a.
PrimMonad m =>
Int -> m (MVector (PrimState m) a)
VM.unsafeNew Int
numColumns
        Int -> IOVector Column -> [Text] -> IO ()
getInitialDataVectors Int
numRows IOVector Column
mutableCols [Text]
dataRow

        -- Read rows into the mutable vectors
        Int
-> Char
-> IOVector Column
-> IOVector [(Int, Text)]
-> Handle
-> IO ()
fillColumns Int
numRows Char
c IOVector Column
mutableCols IOVector [(Int, Text)]
nullIndices Handle
handle

        -- Freeze the mutable vectors into immutable ones
        Vector [(Int, Text)]
nulls' <- MVector (PrimState IO) [(Int, Text)] -> IO (Vector [(Int, Text)])
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> m (Vector a)
V.unsafeFreeze IOVector [(Int, Text)]
MVector (PrimState IO) [(Int, Text)]
nullIndices
        Vector (Maybe Column)
cols <- (Int -> IO (Maybe Column))
-> Vector Int -> IO (Vector (Maybe Column))
forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Vector a -> m (Vector b)
V.mapM (IOVector Column
-> Vector [(Int, Text)] -> ReadOptions -> Int -> IO (Maybe Column)
freezeColumn IOVector Column
mutableCols Vector [(Int, Text)]
nulls' ReadOptions
opts) (Int -> (Int -> Int) -> Vector Int
forall a. Int -> (Int -> a) -> Vector a
V.generate Int
numColumns Int -> Int
forall a. a -> a
id)
        DataFrame -> IO DataFrame
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (DataFrame -> IO DataFrame) -> DataFrame -> IO DataFrame
forall a b. (a -> b) -> a -> b
$ DataFrame {
                columns :: Vector (Maybe Column)
columns = Vector (Maybe Column)
cols,
                freeIndices :: [Int]
freeIndices = [],
                columnIndices :: Map Text Int
columnIndices = [(Text, Int)] -> Map Text Int
forall k a. Ord k => [(k, a)] -> Map k a
M.fromList ([Text] -> [Int] -> [(Text, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Text]
columnNames [Int
0..]),
                dataframeDimensions :: (Int, Int)
dataframeDimensions = (Int -> (Column -> Int) -> Maybe Column -> Int
forall b a. b -> (a -> b) -> Maybe a -> b
maybe Int
0 Column -> Int
columnLength (Vector (Maybe Column)
cols Vector (Maybe Column) -> Int -> Maybe Column
forall a. Vector a -> Int -> a
V.! Int
0), Vector (Maybe Column) -> Int
forall a. Vector a -> Int
V.length Vector (Maybe Column)
cols)
            }
{-# INLINE readSeparated #-}

getInitialDataVectors :: Int -> VM.IOVector Column -> [T.Text] -> IO ()
getInitialDataVectors :: Int -> IOVector Column -> [Text] -> IO ()
getInitialDataVectors Int
n IOVector Column
mCol [Text]
xs = do
    [(Int, Text)] -> ((Int, Text) -> IO ()) -> IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([Int] -> [Text] -> [(Int, Text)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Int
0..] [Text]
xs) (((Int, Text) -> IO ()) -> IO ())
-> ((Int, Text) -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \(Int
i, Text
x) -> do
        Column
col <- case Text -> Text
inferValueType Text
x of
                Text
"Int" -> IOVector Int -> Column
forall a. (Columnable a, Unbox a) => IOVector a -> Column
MutableUnboxedColumn (IOVector Int -> Column) -> IO (IOVector Int) -> IO Column
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>  ((Int -> IO (MVector (PrimState IO) Int)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
Int -> m (MVector (PrimState m) a)
VUM.unsafeNew Int
n :: IO (VUM.IOVector Int)) IO (IOVector Int)
-> (IOVector Int -> IO (IOVector Int)) -> IO (IOVector Int)
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \IOVector Int
c -> MVector (PrimState IO) Int -> Int -> Int -> IO ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite IOVector Int
MVector (PrimState IO) Int
c Int
0 (Int -> Maybe Int -> Int
forall a. a -> Maybe a -> a
fromMaybe Int
0 (Maybe Int -> Int) -> Maybe Int -> Int
forall a b. (a -> b) -> a -> b
$ HasCallStack => Text -> Maybe Int
Text -> Maybe Int
readInt Text
x) IO () -> IO (IOVector Int) -> IO (IOVector Int)
forall a b. IO a -> IO b -> IO b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> IOVector Int -> IO (IOVector Int)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return IOVector Int
c)
                Text
"Double" -> IOVector Double -> Column
forall a. (Columnable a, Unbox a) => IOVector a -> Column
MutableUnboxedColumn (IOVector Double -> Column) -> IO (IOVector Double) -> IO Column
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ((Int -> IO (MVector (PrimState IO) Double)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
Int -> m (MVector (PrimState m) a)
VUM.unsafeNew Int
n :: IO (VUM.IOVector Double)) IO (IOVector Double)
-> (IOVector Double -> IO (IOVector Double))
-> IO (IOVector Double)
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \IOVector Double
c -> MVector (PrimState IO) Double -> Int -> Double -> IO ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite IOVector Double
MVector (PrimState IO) Double
c Int
0 (Double -> Maybe Double -> Double
forall a. a -> Maybe a -> a
fromMaybe Double
0 (Maybe Double -> Double) -> Maybe Double -> Double
forall a b. (a -> b) -> a -> b
$ HasCallStack => Text -> Maybe Double
Text -> Maybe Double
readDouble Text
x) IO () -> IO (IOVector Double) -> IO (IOVector Double)
forall a b. IO a -> IO b -> IO b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> IOVector Double -> IO (IOVector Double)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return IOVector Double
c)
                Text
_ -> IOVector Text -> Column
forall a. Columnable a => IOVector a -> Column
MutableBoxedColumn (IOVector Text -> Column) -> IO (IOVector Text) -> IO Column
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ((Int -> IO (MVector (PrimState IO) Text)
forall (m :: * -> *) a.
PrimMonad m =>
Int -> m (MVector (PrimState m) a)
VM.unsafeNew Int
n :: IO (VM.IOVector T.Text)) IO (IOVector Text)
-> (IOVector Text -> IO (IOVector Text)) -> IO (IOVector Text)
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \IOVector Text
c -> MVector (PrimState IO) Text -> Int -> Text -> IO ()
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> Int -> a -> m ()
VM.unsafeWrite IOVector Text
MVector (PrimState IO) Text
c Int
0 Text
x IO () -> IO (IOVector Text) -> IO (IOVector Text)
forall a b. IO a -> IO b -> IO b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> IOVector Text -> IO (IOVector Text)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return IOVector Text
c)
        MVector (PrimState IO) Column -> Int -> Column -> IO ()
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> Int -> a -> m ()
VM.unsafeWrite IOVector Column
MVector (PrimState IO) Column
mCol Int
i Column
col
{-# INLINE getInitialDataVectors #-}

inferValueType :: T.Text -> T.Text
inferValueType :: Text -> Text
inferValueType Text
s = let
        example :: Text
example = Text
s
    in case HasCallStack => Text -> Maybe Int
Text -> Maybe Int
readInt Text
example of
        Just Int
_ -> Text
"Int"
        Maybe Int
Nothing -> case HasCallStack => Text -> Maybe Double
Text -> Maybe Double
readDouble Text
example of
            Just Double
_ -> Text
"Double"
            Maybe Double
Nothing -> Text
"Other"
{-# INLINE inferValueType #-}

-- | Reads rows from the handle and stores values in mutable vectors.
fillColumns :: Int -> Char -> VM.IOVector Column -> VM.IOVector [(Int, T.Text)] -> Handle -> IO ()
fillColumns :: Int
-> Char
-> IOVector Column
-> IOVector [(Int, Text)]
-> Handle
-> IO ()
fillColumns Int
n Char
c IOVector Column
mutableCols IOVector [(Int, Text)]
nullIndices Handle
handle = do
    IORef Text
input <- Text -> IO (IORef Text)
forall a. a -> IO (IORef a)
newIORef (Text
forall a. Monoid a => a
mempty :: T.Text)
    [Int] -> (Int -> IO ()) -> IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ [Int
1..Int
n] ((Int -> IO ()) -> IO ()) -> (Int -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \Int
i -> do
        Bool
isEOF <- Handle -> IO Bool
hIsEOF Handle
handle
        Text
input' <- IORef Text -> IO Text
forall a. IORef a -> IO a
readIORef IORef Text
input
        Bool -> IO () -> IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (Bool
isEOF Bool -> Bool -> Bool
&& Text
input' Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text
forall a. Monoid a => a
mempty) (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ do
              IO Text -> Parser [Text] -> Text -> IO (Result [Text])
forall (m :: * -> *) a.
Monad m =>
m Text -> Parser a -> Text -> m (Result a)
parseWith (Handle -> IO Text
TIO.hGetChunk Handle
handle) (Char -> Parser [Text]
parseRow Char
c) Text
input' IO (Result [Text]) -> (Result [Text] -> IO ()) -> IO ()
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
                Fail Text
unconsumed [String]
ctx String
er -> do
                  Integer
erpos <- Handle -> IO Integer
hTell Handle
handle
                  String -> IO ()
forall a. String -> IO a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail (String -> IO ()) -> String -> IO ()
forall a b. (a -> b) -> a -> b
$ String
"Failed to parse CSV file around " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> Integer -> String
forall a. Show a => a -> String
show Integer
erpos String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
" byte; due: "
                    String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String -> String
forall a. Show a => a -> String
show String
er String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
"; context: " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> [String] -> String
forall a. Show a => a -> String
show [String]
ctx
                Partial Text -> Result [Text]
c -> do
                  String -> IO ()
forall a. String -> IO a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"Partial handler is called"
                Done (Text
unconsumed :: T.Text) ([Text]
row :: [T.Text]) -> do
                  IORef Text -> Text -> IO ()
forall a. IORef a -> a -> IO ()
writeIORef IORef Text
input Text
unconsumed
                  (Int -> Text -> IO ()) -> [Int] -> [Text] -> IO ()
forall (m :: * -> *) a b c.
Applicative m =>
(a -> b -> m c) -> [a] -> [b] -> m ()
zipWithM_ (IOVector Column
-> IOVector [(Int, Text)] -> Int -> Int -> Text -> IO ()
writeValue IOVector Column
mutableCols IOVector [(Int, Text)]
nullIndices Int
i) [Int
0..] [Text]
row
{-# INLINE fillColumns #-}

-- | Writes a value into the appropriate column, resizing the vector if necessary.
writeValue :: VM.IOVector Column -> VM.IOVector [(Int, T.Text)] -> Int -> Int -> T.Text -> IO ()
writeValue :: IOVector Column
-> IOVector [(Int, Text)] -> Int -> Int -> Text -> IO ()
writeValue IOVector Column
mutableCols IOVector [(Int, Text)]
nullIndices Int
count Int
colIndex Text
value = do
    Column
col <- MVector (PrimState IO) Column -> Int -> IO Column
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> Int -> m a
VM.unsafeRead IOVector Column
MVector (PrimState IO) Column
mutableCols Int
colIndex
    Either Text Bool
res <- Int -> Text -> Column -> IO (Either Text Bool)
writeColumn Int
count Text
value Column
col
    let modify :: Text -> IO ()
modify Text
value = MVector (PrimState IO) [(Int, Text)]
-> ([(Int, Text)] -> [(Int, Text)]) -> Int -> IO ()
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> (a -> a) -> Int -> m ()
VM.unsafeModify IOVector [(Int, Text)]
MVector (PrimState IO) [(Int, Text)]
nullIndices ((Int
count, Text
value) (Int, Text) -> [(Int, Text)] -> [(Int, Text)]
forall a. a -> [a] -> [a]
:) Int
colIndex
    (Text -> IO ()) -> (Bool -> IO ()) -> Either Text Bool -> IO ()
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either Text -> IO ()
modify (IO () -> Bool -> IO ()
forall a b. a -> b -> a
const (() -> IO ()
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return ())) Either Text Bool
res
{-# INLINE writeValue #-}

-- | Freezes a mutable vector into an immutable one, trimming it to the actual row count.
freezeColumn :: VM.IOVector Column -> V.Vector [(Int, T.Text)] -> ReadOptions -> Int -> IO (Maybe Column)
freezeColumn :: IOVector Column
-> Vector [(Int, Text)] -> ReadOptions -> Int -> IO (Maybe Column)
freezeColumn IOVector Column
mutableCols Vector [(Int, Text)]
nulls ReadOptions
opts Int
colIndex = do
    Column
col <- MVector (PrimState IO) Column -> Int -> IO Column
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> Int -> m a
VM.unsafeRead IOVector Column
MVector (PrimState IO) Column
mutableCols Int
colIndex
    Column -> Maybe Column
forall a. a -> Maybe a
Just (Column -> Maybe Column) -> IO Column -> IO (Maybe Column)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [(Int, Text)] -> Column -> IO Column
freezeColumn' (Vector [(Int, Text)]
nulls Vector [(Int, Text)] -> Int -> [(Int, Text)]
forall a. Vector a -> Int -> a
V.! Int
colIndex) Column
col
{-# INLINE freezeColumn #-}

parseSep :: Char -> T.Text -> [T.Text]
parseSep :: Char -> Text -> [Text]
parseSep Char
c Text
s = (String -> [Text])
-> ([Text] -> [Text]) -> Either String [Text] -> [Text]
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either String -> [Text]
forall a. HasCallStack => String -> a
error [Text] -> [Text]
forall a. a -> a
id (Parser [Text] -> Text -> Either String [Text]
forall a. Parser a -> Text -> Either String a
parseOnly (Char -> Parser [Text]
record Char
c) Text
s)
{-# INLINE parseSep #-}

record :: Char -> Parser [T.Text]
record :: Char -> Parser [Text]
record Char
c =
   Char -> Parser Text
field Char
c Parser Text -> Parser Text Char -> Parser [Text]
forall (f :: * -> *) a s. Alternative f => f a -> f s -> f [a]
`sepBy1` Char -> Parser Text Char
char Char
c
   Parser [Text] -> String -> Parser [Text]
forall i a. Parser i a -> String -> Parser i a
<?> String
"record"
{-# INLINE record #-}

parseRow :: Char -> Parser [T.Text]
parseRow :: Char -> Parser [Text]
parseRow Char
c = (Char -> Parser [Text]
record Char
c Parser [Text] -> Parser Text () -> Parser [Text]
forall a b. Parser Text a -> Parser Text b -> Parser Text a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser Text ()
lineEnd)  Parser [Text] -> String -> Parser [Text]
forall i a. Parser i a -> String -> Parser i a
<?> String
"record-new-line"

field :: Char -> Parser T.Text
field :: Char -> Parser Text
field Char
c =
   Parser Text
quotedField Parser Text -> Parser Text -> Parser Text
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Char -> Parser Text
unquotedField Char
c
   Parser Text -> String -> Parser Text
forall i a. Parser i a -> String -> Parser i a
<?> String
"field"
{-# INLINE field #-}

unquotedField :: Char -> Parser T.Text
unquotedField :: Char -> Parser Text
unquotedField Char
sep =
   (Char -> Bool) -> Parser Text
takeWhile Char -> Bool
nonTerminal Parser Text -> String -> Parser Text
forall i a. Parser i a -> String -> Parser i a
<?> String
"unquoted field"
   where nonTerminal :: Char -> Bool
nonTerminal = (Char -> Set Char -> Bool
forall a. Ord a => a -> Set a -> Bool
`S.notMember` String -> Set Char
forall a. Ord a => [a] -> Set a
S.fromList [Char
sep, Char
'\n', Char
'\r', Char
'"'])
{-# INLINE unquotedField #-}

insideQuotes :: Parser T.Text
insideQuotes :: Parser Text
insideQuotes =
   Text -> Text -> Text
T.append (Text -> Text -> Text) -> Parser Text -> Parser Text (Text -> Text)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Char -> Bool) -> Parser Text
takeWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'"')
            Parser Text (Text -> Text) -> Parser Text -> Parser Text
forall a b. Parser Text (a -> b) -> Parser Text a -> Parser Text b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> ([Text] -> Text
T.concat ([Text] -> Text) -> Parser [Text] -> Parser Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text -> Parser [Text]
forall a. Parser Text a -> Parser Text [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many (Char -> Text -> Text
T.cons (Char -> Text -> Text)
-> Parser Text Char -> Parser Text (Text -> Text)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text Char
dquotes Parser Text (Text -> Text) -> Parser Text -> Parser Text
forall a b. Parser Text (a -> b) -> Parser Text a -> Parser Text b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Parser Text
insideQuotes))
   Parser Text -> String -> Parser Text
forall i a. Parser i a -> String -> Parser i a
<?> String
"inside of double quotes"
   where
      dquotes :: Parser Text Char
dquotes =
         Text -> Parser Text
string Text
"\"\"" Parser Text -> Parser Text Char -> Parser Text Char
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Char -> Parser Text Char
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return Char
'"'
         Parser Text Char -> String -> Parser Text Char
forall i a. Parser i a -> String -> Parser i a
<?> String
"paired double quotes"
{-# INLINE insideQuotes #-}

quotedField :: Parser T.Text
quotedField :: Parser Text
quotedField =
   Char -> Parser Text Char
char Char
'"' Parser Text Char -> Parser Text -> Parser Text
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser Text
insideQuotes Parser Text -> Parser Text Char -> Parser Text
forall a b. Parser Text a -> Parser Text b -> Parser Text a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Char -> Parser Text Char
char Char
'"'
   Parser Text -> String -> Parser Text
forall i a. Parser i a -> String -> Parser i a
<?> String
"quoted field"
{-# INLINE quotedField #-}

lineEnd :: Parser ()
lineEnd :: Parser Text ()
lineEnd =
   (Parser Text ()
endOfLine Parser Text () -> Parser Text () -> Parser Text ()
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput)
   Parser Text () -> String -> Parser Text ()
forall i a. Parser i a -> String -> Parser i a
<?> String
"end of line"
{-# INLINE lineEnd #-}

-- | First pass to count rows for exact allocation
countRows :: Char -> FilePath -> IO Int
countRows :: Char -> String -> IO Int
countRows Char
c String
path = String -> IOMode -> (Handle -> IO Int) -> IO Int
forall r. String -> IOMode -> (Handle -> IO r) -> IO r
withFile String
path IOMode
ReadMode ((Handle -> IO Int) -> IO Int) -> (Handle -> IO Int) -> IO Int
forall a b. (a -> b) -> a -> b
$! Int -> Text -> Handle -> IO Int
go Int
0 Text
""
   where
      go :: Int -> Text -> Handle -> IO Int
go !Int
n !Text
input Handle
h = do
         Bool
isEOF <- Handle -> IO Bool
hIsEOF Handle
h
         if Bool
isEOF Bool -> Bool -> Bool
&& Text
input Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text
forall a. Monoid a => a
mempty
            then Int -> IO Int
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Int
n
            else
               IO Text -> Parser [Text] -> Text -> IO (Result [Text])
forall (m :: * -> *) a.
Monad m =>
m Text -> Parser a -> Text -> m (Result a)
parseWith (Handle -> IO Text
TIO.hGetChunk Handle
h) (Char -> Parser [Text]
parseRow Char
c) Text
input IO (Result [Text]) -> (Result [Text] -> IO Int) -> IO Int
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
                  Fail Text
unconsumed [String]
ctx String
er -> do
                    Integer
erpos <- Handle -> IO Integer
hTell Handle
h
                    String -> IO Int
forall a. String -> IO a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail (String -> IO Int) -> String -> IO Int
forall a b. (a -> b) -> a -> b
$ String
"Failed to parse CSV file around " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> Integer -> String
forall a. Show a => a -> String
show Integer
erpos String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
" byte; due: "
                      String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String -> String
forall a. Show a => a -> String
show String
er String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
"; context: " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> [String] -> String
forall a. Show a => a -> String
show [String]
ctx
                  Partial Text -> Result [Text]
c -> do
                    String -> IO Int
forall a. String -> IO a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail (String -> IO Int) -> String -> IO Int
forall a b. (a -> b) -> a -> b
$ String
"Partial handler is called; n = " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> Int -> String
forall a. Show a => a -> String
show Int
n
                  Done (Text
unconsumed :: T.Text) [Text]
_ ->
                    Int -> Text -> Handle -> IO Int
go (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) Text
unconsumed Handle
h
{-# INLINE countRows #-}

writeCsv :: String -> DataFrame -> IO ()
writeCsv :: String -> DataFrame -> IO ()
writeCsv = Char -> String -> DataFrame -> IO ()
writeSeparated Char
','

writeSeparated :: Char      -- ^ Separator
               -> String    -- ^ Path to write to
               -> DataFrame
               -> IO ()
writeSeparated :: Char -> String -> DataFrame -> IO ()
writeSeparated Char
c String
filepath DataFrame
df = String -> IOMode -> (Handle -> IO ()) -> IO ()
forall r. String -> IOMode -> (Handle -> IO r) -> IO r
withFile String
filepath IOMode
WriteMode ((Handle -> IO ()) -> IO ()) -> (Handle -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \Handle
handle ->do
    let (Int
rows, Int
columns) = DataFrame -> (Int, Int)
dataframeDimensions DataFrame
df
    let headers :: [Text]
headers = ((Text, Int) -> Text) -> [(Text, Int)] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map (Text, Int) -> Text
forall a b. (a, b) -> a
fst (((Text, Int) -> (Text, Int) -> Ordering)
-> [(Text, Int)] -> [(Text, Int)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
L.sortBy (Int -> Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Int -> Int -> Ordering)
-> ((Text, Int) -> Int) -> (Text, Int) -> (Text, Int) -> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` (Text, Int) -> Int
forall a b. (a, b) -> b
snd) (Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toList (DataFrame -> Map Text Int
columnIndices DataFrame
df)))
    Handle -> Text -> IO ()
TIO.hPutStrLn Handle
handle (Text -> [Text] -> Text
T.intercalate Text
", " [Text]
headers)
    [Int] -> (Int -> IO ()) -> IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ [Int
0..(Int
rows Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)] ((Int -> IO ()) -> IO ()) -> (Int -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \Int
i -> do
        let row :: [Text]
row = DataFrame -> Int -> [Text]
getRowAsText DataFrame
df Int
i
        Handle -> Text -> IO ()
TIO.hPutStrLn Handle
handle (Text -> [Text] -> Text
T.intercalate Text
", " [Text]
row)

getRowAsText :: DataFrame -> Int -> [T.Text]
getRowAsText :: DataFrame -> Int -> [Text]
getRowAsText DataFrame
df Int
i = (Int -> Maybe Column -> [Text] -> [Text])
-> [Text] -> Vector (Maybe Column) -> [Text]
forall a b. (Int -> a -> b -> b) -> b -> Vector a -> b
V.ifoldr Int -> Maybe Column -> [Text] -> [Text]
go [] (DataFrame -> Vector (Maybe Column)
columns DataFrame
df)
  where
    indexMap :: Map Int Text
indexMap = [(Int, Text)] -> Map Int Text
forall k a. Ord k => [(k, a)] -> Map k a
M.fromList (((Text, Int) -> (Int, Text)) -> [(Text, Int)] -> [(Int, Text)]
forall a b. (a -> b) -> [a] -> [b]
map (\(Text
a, Int
b) -> (Int
b, Text
a)) ([(Text, Int)] -> [(Int, Text)]) -> [(Text, Int)] -> [(Int, Text)]
forall a b. (a -> b) -> a -> b
$ Map Text Int -> [(Text, Int)]
forall k a. Map k a -> [(k, a)]
M.toList (DataFrame -> Map Text Int
columnIndices DataFrame
df))
    go :: Int -> Maybe Column -> [Text] -> [Text]
go Int
k Maybe Column
Nothing [Text]
acc = [Text]
acc
    go Int
k (Just (BoxedColumn (Vector a
c :: V.Vector a))) [Text]
acc = case Vector a
c Vector a -> Int -> Maybe a
forall a. Vector a -> Int -> Maybe a
V.!? Int
i of
        Just a
e -> Text
textRep Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc
            where textRep :: Text
textRep = case TypeRep a -> TypeRep Text -> Maybe (a :~: Text)
forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @T.Text) of
                    Just a :~: Text
Refl -> a
Text
e
                    Maybe (a :~: Text)
Nothing   -> case forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a of
                        App TypeRep a
t1 TypeRep b
t2 -> case TypeRep a -> TypeRep Maybe -> Maybe (a :~~: Maybe)
forall k1 k2 (a :: k1) (b :: k2).
TypeRep a -> TypeRep b -> Maybe (a :~~: b)
eqTypeRep TypeRep a
t1 (forall {k} (a :: k). Typeable a => TypeRep a
forall (a :: * -> *). Typeable a => TypeRep a
typeRep @Maybe) of
                            Just a :~~: Maybe
HRefl -> case TypeRep b -> TypeRep Text -> Maybe (b :~: Text)
forall (a :: k1) (b :: k1).
TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality TypeRep b
t2 (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @T.Text) of
                                Just b :~: Text
Refl -> Text -> Maybe Text -> Text
forall a. a -> Maybe a -> a
fromMaybe Text
"null" a
Maybe Text
e
                                Maybe (b :~: Text)
Nothing -> (Text -> Text
fromOptional (Text -> Text) -> (a -> Text) -> a -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (String -> Text
T.pack (String -> Text) -> (a -> String) -> a -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> String
forall a. Show a => a -> String
show)) a
e
                                            where fromOptional :: Text -> Text
fromOptional Text
s
                                                    | Text -> Text -> Bool
T.isPrefixOf Text
"Just " Text
s = Int -> Text -> Text
T.drop (Text -> Int
T.length Text
"Just ") Text
s
                                                    | Bool
otherwise = Text
"null"
                            Maybe (a :~~: Maybe)
Nothing -> (String -> Text
T.pack (String -> Text) -> (a -> String) -> a -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> String
forall a. Show a => a -> String
show) a
e
                        TypeRep a
_ -> (String -> Text
T.pack (String -> Text) -> (a -> String) -> a -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> String
forall a. Show a => a -> String
show) a
e
        Maybe a
Nothing ->
            String -> [Text]
forall a. HasCallStack => String -> a
error (String -> [Text]) -> String -> [Text]
forall a b. (a -> b) -> a -> b
$
                String
"Column "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ Text -> String
T.unpack (Map Int Text
indexMap Map Int Text -> Int -> Text
forall k a. Ord k => Map k a -> k -> a
M.! Int
k)
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
" has less items than "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"the other columns at index "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
i
    go Int
k (Just (UnboxedColumn Vector a
c)) [Text]
acc = case Vector a
c Vector a -> Int -> Maybe a
forall a. Unbox a => Vector a -> Int -> Maybe a
VU.!? Int
i of
        Just a
e -> String -> Text
T.pack (a -> String
forall a. Show a => a -> String
show a
e) Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc
        Maybe a
Nothing ->
            String -> [Text]
forall a. HasCallStack => String -> a
error (String -> [Text]) -> String -> [Text]
forall a b. (a -> b) -> a -> b
$
                String
"Column "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ Text -> String
T.unpack (Map Int Text
indexMap Map Int Text -> Int -> Text
forall k a. Ord k => Map k a -> k -> a
M.! Int
k)
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
" has less items than "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"the other columns at index "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
i
    go Int
k (Just (OptionalColumn (Vector (Maybe a)
c :: V.Vector (Maybe a)))) [Text]
acc = case Vector (Maybe a)
c Vector (Maybe a) -> Int -> Maybe (Maybe a)
forall a. Vector a -> Int -> Maybe a
V.!? Int
i of
        Just Maybe a
e -> Text
textRep Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc
            where textRep :: Text
textRep = case TypeRep a -> TypeRep Text -> Maybe (a :~: Text)
forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b)
forall {k} (f :: k -> *) (a :: k) (b :: k).
TestEquality f =>
f a -> f b -> Maybe (a :~: b)
testEquality (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @a) (forall a. Typeable a => TypeRep a
forall {k} (a :: k). Typeable a => TypeRep a
typeRep @T.Text) of
                    Just a :~: Text
Refl -> a -> Maybe a -> a
forall a. a -> Maybe a -> a
fromMaybe a
"Nothing" Maybe a
e
                    Maybe (a :~: Text)
Nothing   -> (String -> Text
T.pack (String -> Text) -> (Maybe a -> String) -> Maybe a -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Maybe a -> String
forall a. Show a => a -> String
show) Maybe a
e
        Maybe (Maybe a)
Nothing ->
            String -> [Text]
forall a. HasCallStack => String -> a
error (String -> [Text]) -> String -> [Text]
forall a b. (a -> b) -> a -> b
$
                String
"Column "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ Text -> String
T.unpack (Map Int Text
indexMap Map Int Text -> Int -> Text
forall k a. Ord k => Map k a -> k -> a
M.! Int
k)
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
" has less items than "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"the other columns at index "
                String -> String -> String
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
i