{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CApiFFI #-}
{-# LANGUAGE ForeignFunctionInterface #-}

module DataFrame.IO.Unstable.CSV (
    fastReadCsvUnstable,
    readCsvUnstable,
) where

import qualified Data.Vector as Vector
import qualified Data.Vector.Storable as VS
import Data.Vector.Storable.Mutable (
    grow,
    unsafeFromForeignPtr,
 )
import qualified Data.Vector.Storable.Mutable as VSM
import System.IO.MMap (
    Mode (WriteCopy),
    mmapFileForeignPtr,
 )

import Foreign (
    Ptr,
    castForeignPtr,
    castPtr,
    mallocArray,
    newForeignPtr_,
 )
import Foreign.C.Types

import qualified Data.ByteString as BS
import Data.ByteString.Internal (ByteString (PS))
import qualified Data.Map as M
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Text.Encoding as TextEncoding
import Data.Word (Word8)

import Control.Parallel.Strategies (parList, rpar, using)
import Data.Array.IArray (array, (!))
import Data.Array.Unboxed (UArray)
import Data.Ix (range)

import DataFrame.IO.CSV (
    HeaderSpec (..),
    ReadOptions (..),
    defaultReadOptions,
    shouldInferFromSample,
    typeInferenceSampleSize,
 )
import DataFrame.Internal.DataFrame (DataFrame (..))
import DataFrame.Operations.Typing (parseFromExamples)

fastReadCsvUnstable :: FilePath -> IO DataFrame
fastReadCsvUnstable :: FilePath -> IO DataFrame
fastReadCsvUnstable =
    ReadOptions
-> (Int -> Vector Word8 -> IO (Vector CSize))
-> FilePath
-> IO DataFrame
readCsvUnstable'
        ReadOptions
defaultReadOptions
        Int -> Vector Word8 -> IO (Vector CSize)
getDelimiterIndices

readCsvUnstable :: FilePath -> IO DataFrame
readCsvUnstable :: FilePath -> IO DataFrame
readCsvUnstable =
    ReadOptions
-> (Int -> Vector Word8 -> IO (Vector CSize))
-> FilePath
-> IO DataFrame
readCsvUnstable'
        ReadOptions
defaultReadOptions
        ( \Int
originalLen Vector Word8
v -> do
            Ptr CSize
indices <- Int -> IO (Ptr CSize)
forall a. Storable a => Int -> IO (Ptr a)
mallocArray Int
originalLen
            Int -> Vector Word8 -> Ptr CSize -> IO (Vector CSize)
getDelimiterIndices_ Int
originalLen Vector Word8
v Ptr CSize
indices
        )

readCsvUnstable' ::
    ReadOptions ->
    (Int -> VS.Vector Word8 -> IO (VS.Vector CSize)) ->
    FilePath ->
    IO DataFrame
readCsvUnstable' :: ReadOptions
-> (Int -> Vector Word8 -> IO (Vector CSize))
-> FilePath
-> IO DataFrame
readCsvUnstable' ReadOptions
opts Int -> Vector Word8 -> IO (Vector CSize)
delimiterIndices FilePath
filePath = do
    -- We use write copy mode so that we can append
    -- padding to the end of the memory space
    (ForeignPtr Word8
bufferPtr, Int
offset, Int
len) <-
        FilePath
-> Mode -> Maybe (Int64, Int) -> IO (ForeignPtr Word8, Int, Int)
forall a.
FilePath
-> Mode -> Maybe (Int64, Int) -> IO (ForeignPtr a, Int, Int)
mmapFileForeignPtr
            FilePath
filePath
            Mode
WriteCopy
            Maybe (Int64, Int)
forall a. Maybe a
Nothing
    let mutableFile :: MVector s Word8
mutableFile = ForeignPtr Word8 -> Int -> Int -> MVector s Word8
forall a s. Storable a => ForeignPtr a -> Int -> Int -> MVector s a
unsafeFromForeignPtr ForeignPtr Word8
bufferPtr Int
offset Int
len
    MVector RealWorld Word8
paddedMutableFile <- MVector (PrimState IO) Word8
-> Int -> IO (MVector (PrimState IO) Word8)
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> m (MVector (PrimState m) a)
grow MVector RealWorld Word8
MVector (PrimState IO) Word8
forall {s}. MVector s Word8
mutableFile Int
64
    Vector Word8
paddedCSVFile <- MVector (PrimState IO) Word8 -> IO (Vector Word8)
forall a (m :: * -> *).
(Storable a, PrimMonad m) =>
MVector (PrimState m) a -> m (Vector a)
VS.unsafeFreeze MVector RealWorld Word8
MVector (PrimState IO) Word8
paddedMutableFile
    Vector CSize
indices <- Int -> Vector Word8 -> IO (Vector CSize)
delimiterIndices Int
len Vector Word8
paddedCSVFile
    let numCol :: Int
numCol = Vector Word8 -> Vector CSize -> Int
countColumnsInFirstRow Vector Word8
paddedCSVFile Vector CSize
indices
        totalRows :: Int
totalRows = Vector CSize -> Int
forall a. Storable a => Vector a -> Int
VS.length Vector CSize
indices Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
numCol
        extractField' :: Int -> Text
extractField' = Vector Word8 -> Vector CSize -> Int -> Text
extractField Vector Word8
paddedCSVFile Vector CSize
indices
        (Vector Text
columnNames, Int
dataStartRow) = case ReadOptions -> HeaderSpec
headerSpec ReadOptions
opts of
            HeaderSpec
NoHeader ->
                ( [Text] -> Vector Text
forall a. [a] -> Vector a
Vector.fromList ([Text] -> Vector Text) -> [Text] -> Vector Text
forall a b. (a -> b) -> a -> b
$
                    (Int -> Text) -> [Int] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map (FilePath -> Text
Text.pack (FilePath -> Text) -> (Int -> FilePath) -> Int -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> FilePath
forall a. Show a => a -> FilePath
show) [Int
0 .. Int
numCol Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1]
                , Int
0
                )
            HeaderSpec
UseFirstRow ->
                ( [Text] -> Vector Text
forall a. [a] -> Vector a
Vector.fromList ([Text] -> Vector Text) -> [Text] -> Vector Text
forall a b. (a -> b) -> a -> b
$
                    (Int -> Text) -> [Int] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map Int -> Text
extractField' [Int
0 .. Int
numCol Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1]
                , Int
1
                )
            ProvideNames [Text]
ns ->
                ([Text] -> Vector Text
forall a. [a] -> Vector a
Vector.fromList [Text]
ns, Int
0)
        numRow :: Int
numRow = Int
totalRows Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
dataStartRow
        parseTypes :: Vector Text -> Column
parseTypes Vector Text
col =
            let n :: Int
n =
                    if TypeSpec -> Bool
shouldInferFromSample (ReadOptions -> TypeSpec
typeSpec ReadOptions
opts)
                        then TypeSpec -> Int
typeInferenceSampleSize (ReadOptions -> TypeSpec
typeSpec ReadOptions
opts)
                        else Int
0
             in Int -> FilePath -> Bool -> Vector Text -> Column
parseFromExamples
                    Int
n
                    (ReadOptions -> FilePath
dateFormat ReadOptions
opts)
                    (ReadOptions -> Bool
safeRead ReadOptions
opts)
                    Vector Text
col
        generateColumn :: Int -> Column
generateColumn Int
col =
            Vector Text -> Column
parseTypes (Vector Text -> Column) -> Vector Text -> Column
forall a b. (a -> b) -> a -> b
$
                Int -> [Text] -> Vector Text
forall a. Int -> [a] -> Vector a
Vector.fromListN
                    Int
numRow
                    ( (Int -> Text) -> [Int] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map
                        ( \Int
row ->
                            Int -> Text
extractField'
                                (Int
row Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
numCol Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
col)
                        )
                        [Int
dataStartRow .. Int
totalRows Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1]
                    )
        columns :: Vector Column
columns =
            Int -> [Column] -> Vector Column
forall a. Int -> [a] -> Vector a
Vector.fromListN
                Int
numCol
                ( (Int -> Column) -> [Int] -> [Column]
forall a b. (a -> b) -> [a] -> [b]
map Int -> Column
generateColumn [Int
0 .. Int
numCol Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1]
                    [Column] -> Strategy [Column] -> [Column]
forall a. a -> Strategy a -> a
`using` Strategy Column -> Strategy [Column]
forall a. Strategy a -> Strategy [a]
parList Strategy Column
forall a. Strategy a
rpar
                )
        columnIndices :: Map Text Int
columnIndices =
            [(Text, Int)] -> Map Text Int
forall k a. Ord k => [(k, a)] -> Map k a
M.fromList ([(Text, Int)] -> Map Text Int) -> [(Text, Int)] -> Map Text Int
forall a b. (a -> b) -> a -> b
$
                [Text] -> [Int] -> [(Text, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip (Vector Text -> [Text]
forall a. Vector a -> [a]
Vector.toList Vector Text
columnNames) [Int
0 ..]
        dataframeDimensions :: (Int, Int)
dataframeDimensions = (Int
numRow, Int
numCol)
    DataFrame -> IO DataFrame
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (DataFrame -> IO DataFrame) -> DataFrame -> IO DataFrame
forall a b. (a -> b) -> a -> b
$
        Vector Column -> Map Text Int -> (Int, Int) -> DataFrame
DataFrame Vector Column
columns Map Text Int
columnIndices (Int, Int)
dataframeDimensions

{-# INLINE extractField #-}
extractField ::
    VS.Vector Word8 ->
    VS.Vector CSize ->
    Int ->
    Text
extractField :: Vector Word8 -> Vector CSize -> Int -> Text
extractField Vector Word8
file Vector CSize
indices Int
position =
    ByteString -> Text
TextEncoding.decodeUtf8Lenient
        (ByteString -> Text)
-> (Vector Word8 -> ByteString) -> Vector Word8 -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Vector Word8 -> ByteString
unsafeToByteString
        (Vector Word8 -> Text) -> Vector Word8 -> Text
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Vector Word8 -> Vector Word8
forall a. Storable a => Int -> Int -> Vector a -> Vector a
VS.slice
            Int
previous
            (Int
next Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
previous)
            Vector Word8
file
  where
    previous :: Int
previous =
        if Int
position Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0
            then Int
0
            else Int
1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ CSize -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Vector CSize
indices Vector CSize -> Int -> CSize
forall a. Storable a => Vector a -> Int -> a
VS.! (Int
position Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1))
    next :: Int
next = CSize -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (CSize -> Int) -> CSize -> Int
forall a b. (a -> b) -> a -> b
$ Vector CSize
indices Vector CSize -> Int -> CSize
forall a. Storable a => Vector a -> Int -> a
VS.! Int
position
    unsafeToByteString :: VS.Vector Word8 -> BS.ByteString
    unsafeToByteString :: Vector Word8 -> ByteString
unsafeToByteString Vector Word8
v = ForeignPtr Word8 -> Int -> Int -> ByteString
PS (ForeignPtr Word8 -> ForeignPtr Word8
forall a b. ForeignPtr a -> ForeignPtr b
castForeignPtr ForeignPtr Word8
ptr) Int
0 Int
len
      where
        (ForeignPtr Word8
ptr, Int
len) = Vector Word8 -> (ForeignPtr Word8, Int)
forall a. Vector a -> (ForeignPtr a, Int)
VS.unsafeToForeignPtr0 Vector Word8
v

foreign import capi "process_csv.h get_delimiter_indices"
    get_delimiter_indices ::
        Ptr CUChar -> -- input
        CSize -> -- input size
        Ptr CSize -> -- result array
        IO CSize -- occupancy of result array

{-# INLINE getDelimiterIndices #-}
getDelimiterIndices ::
    Int ->
    VS.Vector Word8 ->
    IO (VS.Vector CSize)
getDelimiterIndices :: Int -> Vector Word8 -> IO (Vector CSize)
getDelimiterIndices Int
originalLen Vector Word8
csvFile =
    Vector Word8
-> (Ptr Word8 -> IO (Vector CSize)) -> IO (Vector CSize)
forall a b. Storable a => Vector a -> (Ptr a -> IO b) -> IO b
VS.unsafeWith Vector Word8
csvFile ((Ptr Word8 -> IO (Vector CSize)) -> IO (Vector CSize))
-> (Ptr Word8 -> IO (Vector CSize)) -> IO (Vector CSize)
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
buffer -> do
        let paddedLen :: Int
paddedLen = Vector Word8 -> Int
forall a. Storable a => Vector a -> Int
VS.length Vector Word8
csvFile
        -- then number of delimiters cannot exceed the size
        -- of the input array (which would be a series of
        -- empty fields)
        Ptr CSize
indices <- Int -> IO (Ptr CSize)
forall a. Storable a => Int -> IO (Ptr a)
mallocArray Int
paddedLen
        CSize
num_fields <-
            Ptr CUChar -> CSize -> Ptr CSize -> IO CSize
get_delimiter_indices
                (Ptr Word8 -> Ptr CUChar
forall a b. Ptr a -> Ptr b
castPtr Ptr Word8
buffer)
                (Int -> CSize
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
paddedLen)
                (Ptr CSize -> Ptr CSize
forall a b. Ptr a -> Ptr b
castPtr Ptr CSize
indices)
        if CSize
num_fields CSize -> CSize -> Bool
forall a. Eq a => a -> a -> Bool
== -CSize
1
            then Int -> Vector Word8 -> Ptr CSize -> IO (Vector CSize)
getDelimiterIndices_ Int
originalLen Vector Word8
csvFile Ptr CSize
indices
            else do
                ForeignPtr CSize
indices' <- Ptr CSize -> IO (ForeignPtr CSize)
forall a. Ptr a -> IO (ForeignPtr a)
newForeignPtr_ Ptr CSize
indices
                let resultVector :: MVector s CSize
resultVector = ForeignPtr CSize -> Int -> MVector s CSize
forall a s. ForeignPtr a -> Int -> MVector s a
VSM.unsafeFromForeignPtr0 ForeignPtr CSize
indices' Int
paddedLen
                -- Handle the case where the file doesn't end with a newline
                -- We need to add a final delimiter for the last field
                Int
finalResultLen <-
                    if Int
originalLen Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 Bool -> Bool -> Bool
&& Vector Word8
csvFile Vector Word8 -> Int -> Word8
forall a. Storable a => Vector a -> Int -> a
VS.! (Int
originalLen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
lf
                        then do
                            MVector (PrimState IO) CSize -> Int -> CSize -> IO ()
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VSM.write MVector RealWorld CSize
MVector (PrimState IO) CSize
forall {s}. MVector s CSize
resultVector (CSize -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CSize
num_fields) (Int -> CSize
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
originalLen)
                            Int -> IO Int
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (CSize -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CSize
num_fields Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
                        else Int -> IO Int
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (CSize -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CSize
num_fields)
                MVector (PrimState IO) CSize -> IO (Vector CSize)
forall a (m :: * -> *).
(Storable a, PrimMonad m) =>
MVector (PrimState m) a -> m (Vector a)
VS.unsafeFreeze (MVector (PrimState IO) CSize -> IO (Vector CSize))
-> MVector (PrimState IO) CSize -> IO (Vector CSize)
forall a b. (a -> b) -> a -> b
$ Int -> Int -> MVector RealWorld CSize -> MVector RealWorld CSize
forall a s. Storable a => Int -> Int -> MVector s a -> MVector s a
VSM.slice Int
0 Int
finalResultLen MVector RealWorld CSize
forall {s}. MVector s CSize
resultVector

-- We have a Native version in case the C version
-- cannot be used. For example if neither ARM_NEON
-- nor AVX2 are available

lf, cr, comma, quote :: Word8
lf :: Word8
lf = Word8
0x0A
cr :: Word8
cr = Word8
0x0D
comma :: Word8
comma = Word8
0x2C
quote :: Word8
quote = Word8
0x22

-- We parse using a state machine
data State
    = UnEscaped -- non quoted
    | Escaped -- quoted
    deriving (Int -> State
State -> Int
State -> [State]
State -> State
State -> State -> [State]
State -> State -> State -> [State]
(State -> State)
-> (State -> State)
-> (Int -> State)
-> (State -> Int)
-> (State -> [State])
-> (State -> State -> [State])
-> (State -> State -> [State])
-> (State -> State -> State -> [State])
-> Enum State
forall a.
(a -> a)
-> (a -> a)
-> (Int -> a)
-> (a -> Int)
-> (a -> [a])
-> (a -> a -> [a])
-> (a -> a -> [a])
-> (a -> a -> a -> [a])
-> Enum a
$csucc :: State -> State
succ :: State -> State
$cpred :: State -> State
pred :: State -> State
$ctoEnum :: Int -> State
toEnum :: Int -> State
$cfromEnum :: State -> Int
fromEnum :: State -> Int
$cenumFrom :: State -> [State]
enumFrom :: State -> [State]
$cenumFromThen :: State -> State -> [State]
enumFromThen :: State -> State -> [State]
$cenumFromTo :: State -> State -> [State]
enumFromTo :: State -> State -> [State]
$cenumFromThenTo :: State -> State -> State -> [State]
enumFromThenTo :: State -> State -> State -> [State]
Enum)

{-# INLINE stateTransitionTable #-}
stateTransitionTable :: UArray (Int, Word8) Int
stateTransitionTable :: UArray (Int, Word8) Int
stateTransitionTable = ((Int, Word8), (Int, Word8))
-> [((Int, Word8), Int)] -> UArray (Int, Word8) Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [(i, e)] -> a i e
array ((Int
0, Word8
0), (Int
1, Word8
255)) [((Int, Word8)
i, (Int, Word8) -> Int
forall {a}. (Eq a, Num a) => (Int, a) -> Int
f (Int, Word8)
i) | (Int, Word8)
i <- ((Int, Word8), (Int, Word8)) -> [(Int, Word8)]
forall a. Ix a => (a, a) -> [a]
range ((Int
0, Word8
0), (Int
1, Word8
255))]
  where
    -- Unescaped newline
    f :: (Int, a) -> Int
f (Int
0, a
0x0A) = State -> Int
forall a. Enum a => a -> Int
fromEnum State
UnEscaped
    -- Unescaped comma
    f (Int
0, a
0x2C) = State -> Int
forall a. Enum a => a -> Int
fromEnum State
UnEscaped
    -- Unescaped quote
    f (Int
0, a
0x22) = State -> Int
forall a. Enum a => a -> Int
fromEnum State
Escaped
    -- Escaped quote
    -- escaped quote in fields are dealt as
    -- consecutive quoted sections of a field
    -- example: If we have
    -- field1, "abc""def""ghi, field3
    -- we end up processing abc, def, and ghi
    -- as consecutive quoted strings.
    f (Int
1, a
0x22) = State -> Int
forall a. Enum a => a -> Int
fromEnum State
UnEscaped
    -- Everything else
    f (Int
state, a
_) = Int
state

{-# INLINE getDelimiterIndices_ #-}
getDelimiterIndices_ ::
    Int ->
    VS.Vector Word8 ->
    Ptr CSize ->
    IO (VS.Vector CSize)
getDelimiterIndices_ :: Int -> Vector Word8 -> Ptr CSize -> IO (Vector CSize)
getDelimiterIndices_ Int
originalLen Vector Word8
csvFile Ptr CSize
resultPtr = do
    MVector RealWorld CSize
resultVector <- IO (MVector RealWorld CSize)
forall {s}. IO (MVector s CSize)
resultVectorM
    (State
_, Int
resultLen) <-
        ((State, Int) -> Int -> Word8 -> IO (State, Int))
-> (State, Int) -> Vector Word8 -> IO (State, Int)
forall (m :: * -> *) b a.
(Monad m, Storable b) =>
(a -> Int -> b -> m a) -> a -> Vector b -> m a
VS.ifoldM'
            (MVector RealWorld CSize
-> (State, Int) -> Int -> Word8 -> IO (State, Int)
processCharacter MVector RealWorld CSize
resultVector)
            (State
UnEscaped, Int
0)
            Vector Word8
csvFile
    -- Handle the case where the file doesn't end with a newline
    -- We need to add a final delimiter for the last field
    Int
finalResultLen <-
        if Int
originalLen Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 Bool -> Bool -> Bool
&& Vector Word8
csvFile Vector Word8 -> Int -> Word8
forall a. Storable a => Vector a -> Int -> a
VS.! (Int
originalLen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
lf
            then do
                MVector (PrimState IO) CSize -> Int -> CSize -> IO ()
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VSM.write MVector RealWorld CSize
MVector (PrimState IO) CSize
resultVector Int
resultLen (Int -> CSize
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
originalLen)
                Int -> IO Int
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Int
resultLen Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
            else Int -> IO Int
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return Int
resultLen
    MVector (PrimState IO) CSize -> IO (Vector CSize)
forall a (m :: * -> *).
(Storable a, PrimMonad m) =>
MVector (PrimState m) a -> m (Vector a)
VS.unsafeFreeze (MVector (PrimState IO) CSize -> IO (Vector CSize))
-> MVector (PrimState IO) CSize -> IO (Vector CSize)
forall a b. (a -> b) -> a -> b
$ Int -> Int -> MVector RealWorld CSize -> MVector RealWorld CSize
forall a s. Storable a => Int -> Int -> MVector s a -> MVector s a
VSM.slice Int
0 Int
finalResultLen MVector RealWorld CSize
resultVector
  where
    paddedLen :: Int
paddedLen = Vector Word8 -> Int
forall a. Storable a => Vector a -> Int
VS.length Vector Word8
csvFile
    resultVectorM :: IO (MVector s CSize)
resultVectorM = do
        ForeignPtr CSize
resultForeignPtr <- Ptr CSize -> IO (ForeignPtr CSize)
forall a. Ptr a -> IO (ForeignPtr a)
newForeignPtr_ Ptr CSize
resultPtr
        MVector s CSize -> IO (MVector s CSize)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (MVector s CSize -> IO (MVector s CSize))
-> MVector s CSize -> IO (MVector s CSize)
forall a b. (a -> b) -> a -> b
$ ForeignPtr CSize -> Int -> MVector s CSize
forall a s. ForeignPtr a -> Int -> MVector s a
VSM.unsafeFromForeignPtr0 ForeignPtr CSize
resultForeignPtr Int
paddedLen
    processCharacter ::
        VSM.IOVector CSize ->
        (State, Int) ->
        Int ->
        Word8 ->
        IO (State, Int)
    processCharacter :: MVector RealWorld CSize
-> (State, Int) -> Int -> Word8 -> IO (State, Int)
processCharacter
        MVector RealWorld CSize
resultVector
        (!State
state, !Int
resultIndex)
        Int
index
        Word8
character =
            case State
state of
                State
UnEscaped ->
                    if Word8
character Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
lf Bool -> Bool -> Bool
|| Word8
character Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
comma
                        then do
                            MVector (PrimState IO) CSize -> Int -> CSize -> IO ()
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VSM.write
                                MVector RealWorld CSize
MVector (PrimState IO) CSize
resultVector
                                Int
resultIndex
                                (Int -> CSize
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
index)
                            (State, Int) -> IO (State, Int)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (State
newState, Int
resultIndex Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
                        else (State, Int) -> IO (State, Int)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (State
newState, Int
resultIndex)
                State
Escaped -> (State, Int) -> IO (State, Int)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (State
newState, Int
resultIndex)
          where
            newState :: State
newState =
                Int -> State
forall a. Enum a => Int -> a
toEnum (Int -> State) -> Int -> State
forall a b. (a -> b) -> a -> b
$
                    UArray (Int, Word8) Int
stateTransitionTable
                        UArray (Int, Word8) Int -> (Int, Word8) -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
! (State -> Int
forall a. Enum a => a -> Int
fromEnum State
state, Word8
character)

{-# INLINE countColumnsInFirstRow #-}
countColumnsInFirstRow ::
    VS.Vector Word8 ->
    VS.Vector CSize ->
    Int
countColumnsInFirstRow :: Vector Word8 -> Vector CSize -> Int
countColumnsInFirstRow Vector Word8
file Vector CSize
indices
    | Vector CSize -> Int
forall a. Storable a => Vector a -> Int
VS.length Vector CSize
indices Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 = Int
0
    | Bool
otherwise =
        Int
1
            Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Vector CSize -> Int
forall a. Storable a => Vector a -> Int
VS.length
                ( (CSize -> Bool) -> Vector CSize -> Vector CSize
forall a. Storable a => (a -> Bool) -> Vector a -> Vector a
VS.takeWhile
                    (\CSize
i -> Vector Word8
file Vector Word8 -> Int -> Word8
forall a. Storable a => Vector a -> Int -> a
VS.! CSize -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CSize
i Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
lf)
                    Vector CSize
indices
                )