{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CApiFFI #-}
{-# LANGUAGE ForeignFunctionInterface #-}

module DataFrame.IO.Unstable.CSV (
    fastReadCsvUnstable,
    readCsvUnstable,
    fastReadTsvUnstable,
    readTsvUnstable,
    getDelimiterIndices,
) where

import qualified Data.Vector as Vector
import qualified Data.Vector.Storable as VS
import Data.Vector.Storable.Mutable (
    grow,
    unsafeFromForeignPtr,
 )
import qualified Data.Vector.Storable.Mutable as VSM
import System.IO.MMap (
    Mode (WriteCopy),
    mmapFileForeignPtr,
 )

import Foreign (
    Ptr,
    castForeignPtr,
    castPtr,
    mallocArray,
    newForeignPtr_,
 )
import Foreign.C.Types

import qualified Data.ByteString as BS
import Data.ByteString.Internal (ByteString (PS))
import qualified Data.Map as M
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Text.Encoding as TextEncoding
import Data.Word (Word8)

import Control.Parallel.Strategies (parList, rpar, using)
import Data.Array.IArray (array, (!))
import Data.Array.Unboxed (UArray)
import Data.Ix (range)

import DataFrame.IO.CSV (
    HeaderSpec (..),
    ReadOptions (..),
    defaultReadOptions,
    shouldInferFromSample,
    stripQuotes,
    typeInferenceSampleSize,
 )
import DataFrame.Internal.DataFrame (DataFrame (..))
import DataFrame.Operations.Typing (parseFromExamples)

-- | Read a delimiter-separated file using 'defaultReadOptions', locating
-- field boundaries with 'getDelimiterIndices' (the FFI-backed scanner).
--
-- The first argument is the separator byte, the second the file to read.
readSeparatedDefaultFast :: Word8 -> FilePath -> IO DataFrame
readSeparatedDefaultFast separator filePath =
    readSeparated separator defaultReadOptions getDelimiterIndices filePath

-- | Read a delimiter-separated file using 'defaultReadOptions', locating
-- field boundaries with the pure-Haskell scanner 'getDelimiterIndices_'.
--
-- The first argument is the separator byte, the second the file to read.
readSeparatedDefault :: Word8 -> FilePath -> IO DataFrame
readSeparatedDefault separator =
    readSeparated separator defaultReadOptions nativeIndices
  where
    nativeIndices :: Word8 -> Int -> VS.Vector Word8 -> IO (VS.Vector CSize)
    nativeIndices sep originalLen bytes = do
        -- The scanner can emit one index per scanned byte plus one trailing
        -- end-of-data index, so reserve a slot for every byte of the
        -- (padded) input and one more. The buffer is a GC-managed storable
        -- vector, so it is released once the result is unreachable.
        buffer <- VSM.unsafeNew (VS.length bytes + 1)
        count <- VSM.unsafeWith buffer $ \out -> do
            found <- getDelimiterIndices_ sep originalLen bytes out
            -- 'found' aliases 'buffer' through a non-owning pointer, so only
            -- its length is taken out of this scope.
            return $! VS.length found
        VS.unsafeFreeze (VSM.slice 0 count buffer)

-- | Read a comma-separated file through 'readSeparatedDefaultFast'.
fastReadCsvUnstable :: FilePath -> IO DataFrame
fastReadCsvUnstable filePath = readSeparatedDefaultFast comma filePath

-- | Read a comma-separated file through 'readSeparatedDefault'.
readCsvUnstable :: FilePath -> IO DataFrame
readCsvUnstable filePath = readSeparatedDefault comma filePath

-- | Read a tab-separated file through 'readSeparatedDefaultFast'.
fastReadTsvUnstable :: FilePath -> IO DataFrame
fastReadTsvUnstable filePath = readSeparatedDefaultFast tab filePath

-- | Read a tab-separated file through 'readSeparatedDefault'.
readTsvUnstable :: FilePath -> IO DataFrame
readTsvUnstable filePath = readSeparatedDefault tab filePath

-- | Read a delimiter-separated file into a 'DataFrame'.
--
-- Arguments, in order:
--
-- * the separator byte;
-- * the 'ReadOptions' controlling header handling and type inference;
-- * the scanner used to locate field boundaries; it receives the separator,
--   the unpadded file length and the padded file contents, and returns the
--   byte offset that terminates each field;
-- * the path of the file to read.
--
-- The number of columns is taken from the first row. If no field boundary is
-- found at all (for example, an empty file) an empty 'DataFrame' with
-- dimensions @(0, 0)@ is returned instead of dividing by zero.
readSeparated ::
    Word8 ->
    ReadOptions ->
    (Word8 -> Int -> VS.Vector Word8 -> IO (VS.Vector CSize)) ->
    FilePath ->
    IO DataFrame
readSeparated separator opts delimiterIndices filePath = do
    -- We use write copy mode so that we can append
    -- padding to the end of the memory space
    (bufferPtr, offset, len) <- mmapFileForeignPtr filePath WriteCopy Nothing
    let mutableFile = unsafeFromForeignPtr bufferPtr offset len
    -- 64 spare bytes at the end of the buffer handed to the scanner.
    paddedMutableFile <- grow mutableFile 64
    paddedCSVFile <- VS.unsafeFreeze paddedMutableFile
    indices <- delimiterIndices separator len paddedCSVFile
    let numCol = countColumnsInFirstRow paddedCSVFile indices
    return $
        if numCol <= 0
            then DataFrame Vector.empty M.empty (0, 0) M.empty
            else assemble paddedCSVFile indices numCol
  where
    -- Build the frame from the raw bytes and the field-boundary offsets.
    -- Only called with a strictly positive column count.
    assemble :: VS.Vector Word8 -> VS.Vector CSize -> Int -> DataFrame
    assemble file indices numCol =
        DataFrame columns columnIndices (numRow, numCol) M.empty
      where
        lastCol = numCol - 1
        -- Rows are counted as complete groups of 'numCol' boundaries.
        totalRows = VS.length indices `div` numCol
        -- Text of the field at a flat (row-major) position, without quotes.
        field = stripQuotes . extractField file indices
        (columnNames, dataStartRow) = case headerSpec opts of
            NoHeader ->
                (Vector.fromList (map (Text.pack . show) [0 .. lastCol]), 0)
            UseFirstRow ->
                (Vector.fromList (map field [0 .. lastCol]), 1)
            ProvideNames ns ->
                (Vector.fromList ns, 0)
        -- Never negative, even when the header consumes the only row.
        numRow = max 0 (totalRows - dataStartRow)
        sampleSize
            | shouldInferFromSample (typeSpec opts) =
                typeInferenceSampleSize (typeSpec opts)
            | otherwise = 0
        parseTypes =
            parseFromExamples
                (missingIndicators opts)
                sampleSize
                (safeRead opts)
                (dateFormat opts)
        generateColumn col =
            parseTypes $
                Vector.generate
                    numRow
                    (\row -> field ((row + dataStartRow) * numCol + col))
        -- One spark per column so columns can be parsed in parallel.
        columns =
            Vector.fromListN
                numCol
                (map generateColumn [0 .. lastCol] `using` parList rpar)
        columnIndices =
            M.fromList (zip (Vector.toList columnNames) [0 ..])

{-# INLINE extractField #-}
-- | Decode the field at a flat position as 'Text'.
--
-- @indices@ holds the byte offset that terminates each field. Field @n@
-- spans from one past the terminator of field @n - 1@ (or from offset 0 for
-- the first field) up to, but excluding, its own terminator. The bytes are
-- decoded leniently as UTF-8 and surrounding whitespace is stripped.
extractField ::
    VS.Vector Word8 ->
    VS.Vector CSize ->
    Int ->
    Text
extractField file indices position =
    Text.strip (TextEncoding.decodeUtf8Lenient fieldBytes)
  where
    terminatorOf :: Int -> Int
    terminatorOf k = fromIntegral (indices VS.! k)

    start :: Int
    start
        | position == 0 = 0
        | otherwise = terminatorOf (position - 1) + 1

    end :: Int
    end = terminatorOf position

    -- View the slice as a ByteString without copying; it shares the
    -- underlying buffer of @file@.
    fieldBytes :: BS.ByteString
    fieldBytes =
        let (ptr, len) =
                VS.unsafeToForeignPtr0 (VS.slice start (end - start) file)
         in PS (castForeignPtr ptr) 0 len

-- Binding to the C routine @get_delimiter_indices@ declared in
-- @process_csv.h@. The Haskell wrapper 'getDelimiterIndices' treats a
-- return value equal to @-1@ converted to 'CSize' as "not handled" and
-- falls back to scanning in Haskell; any other value is used as the number
-- of entries written to the result array.
foreign import capi "process_csv.h get_delimiter_indices"
    get_delimiter_indices ::
        Ptr CUChar -> -- input
        CSize -> -- input size
        CUChar -> -- separator character
        Ptr CSize -> -- result array
        IO CSize -- occupancy of result array

{-# INLINE getDelimiterIndices #-}
-- | Locate the byte offsets that terminate each field of a
-- delimiter-separated buffer.
--
-- Arguments, in order:
--
-- * the separator byte;
-- * @originalLen@, the number of meaningful bytes at the front of the
--   buffer;
-- * the buffer itself, which may be longer than @originalLen@ (padding).
--
-- The C routine @get_delimiter_indices@ is tried first and is given the
-- whole buffer. If it returns the all-ones 'CSize' value, the buffer is
-- scanned in Haskell instead: every newline or separator seen outside a
-- quoted section is recorded, looking only at the first @originalLen@
-- bytes so padding is never interpreted as data.
--
-- When the data does not end with a newline, @originalLen@ is appended as a
-- final entry so that the last field is terminated too.
getDelimiterIndices ::
    Word8 ->
    Int ->
    VS.Vector Word8 ->
    IO (VS.Vector CSize)
getDelimiterIndices separator originalLen csvFile =
    VS.unsafeWith csvFile $ \buffer -> do
        let paddedLen = VS.length csvFile
        -- GC-managed pinned memory: freed automatically, no leak in
        -- streaming use. One slot per input byte plus one for the optional
        -- trailing entry, so the final write can never run past the end.
        resultMV <- VSM.unsafeNew (paddedLen + 1)
        reported <-
            VSM.unsafeWith resultMV $ \indicesPtr ->
                get_delimiter_indices
                    (castPtr buffer)
                    (fromIntegral paddedLen)
                    (fromIntegral separator)
                    indicesPtr
        found <-
            if reported == maxBound -- i.e. (-1) converted to CSize
                then scanInHaskell resultMV
                else return (fromIntegral reported)
        total <- appendFinalDelimiter resultMV found
        VS.unsafeFreeze (VSM.slice 0 total resultMV)
  where
    table :: UArray (Int, Word8) Int
    table = stateTransitionTable separator

    -- Haskell state-machine fallback, writing directly into the result
    -- buffer; returns how many entries were written.
    scanInHaskell :: VSM.IOVector CSize -> IO Int
    scanInHaskell out =
        snd
            <$> VS.ifoldM'
                (step out)
                (UnEscaped, 0)
                (VS.take originalLen csvFile)

    step ::
        VSM.IOVector CSize ->
        (State, Int) ->
        Int ->
        Word8 ->
        IO (State, Int)
    step out (!state, !count) offset byte = do
        let next = toEnum (table ! (fromEnum state, byte))
        case state of
            UnEscaped
                | byte == lf || byte == separator -> do
                    -- count < number of scanned bytes < buffer capacity
                    VSM.unsafeWrite out count (fromIntegral offset)
                    return (next, count + 1)
            _ -> return (next, count)

    -- Terminate the last field when the data does not end with a newline.
    appendFinalDelimiter :: VSM.IOVector CSize -> Int -> IO Int
    appendFinalDelimiter out count
        | originalLen > 0 && csvFile VS.! (originalLen - 1) /= lf = do
            VSM.write out count (fromIntegral originalLen)
            return (count + 1)
        | otherwise = return count

-- A native Haskell scanner is also provided for when the C version
-- cannot be used, for example if neither ARM_NEON nor AVX2 is
-- available.

-- Byte values of the characters the scanner cares about.

-- | Line feed (@\\n@).
lf :: Word8
lf = 0x0A

-- | Carriage return (@\\r@).
cr :: Word8
cr = 0x0D

-- | Comma.
comma :: Word8
comma = 0x2C

-- | Horizontal tab.
tab :: Word8
tab = 0x09

-- | Double quote.
quote :: Word8
quote = 0x22

-- | Scanner state for the delimiter state machine.
--
-- The constructor order matters: the transition table stores states as
-- their 'fromEnum' values (0 and 1).
data State
    = -- | Outside a quoted section.
      UnEscaped
    | -- | Inside a quoted section.
      Escaped
    deriving (Enum)

{-# INLINE stateTransitionTable #-}
-- | Transition table of the quote-tracking state machine, indexed by
-- @(fromEnum state, byte)@ and yielding the 'fromEnum' of the next state.
--
-- * Outside quotes, a double quote enters the quoted state, unless the
--   double quote is itself the separator; every other byte stays outside.
-- * Inside quotes, a double quote leaves the quoted state; every other byte
--   stays inside. A doubled quote inside a field (@"abc""def"@) is thus
--   handled as two consecutive quoted sections.
stateTransitionTable :: Word8 -> UArray (Int, Word8) Int
stateTransitionTable separator =
    array tableBounds [(key, successor key) | key <- range tableBounds]
  where
    tableBounds :: ((Int, Word8), (Int, Word8))
    tableBounds = ((0, 0), (1, 255))

    unescaped, escaped :: Int
    unescaped = fromEnum UnEscaped
    escaped = fromEnum Escaped

    successor :: (Int, Word8) -> Int
    successor (0, byte)
        -- The separator check takes precedence over the quote check.
        | byte == 0x22 && byte /= separator = escaped
        | otherwise = unescaped
    successor (1, 0x22) = unescaped
    successor (current, _) = current

{-# INLINE getDelimiterIndices_ #-}
-- | Pure-Haskell delimiter scanner writing into a caller-supplied buffer.
--
-- Arguments, in order:
--
-- * the separator byte;
-- * @originalLen@, the number of meaningful bytes at the front of the
--   buffer;
-- * the buffer, which may be longer than @originalLen@ (padding);
-- * @resultPtr@, the destination for the offsets. It needs room for up to
--   @originalLen + 1@ entries. No finalizer is attached to it, so the
--   memory is neither owned nor freed here and must outlive the result.
--
-- Every newline or separator seen outside a quoted section is recorded.
-- Only the first @originalLen@ bytes are scanned, so padding is never
-- interpreted as data. The returned vector aliases @resultPtr@.
getDelimiterIndices_ ::
    Word8 ->
    Int ->
    VS.Vector Word8 ->
    Ptr CSize ->
    IO (VS.Vector CSize)
getDelimiterIndices_ separator originalLen csvFile resultPtr = do
    resultForeignPtr <- newForeignPtr_ resultPtr
    let resultVector :: VSM.IOVector CSize
        resultVector =
            VSM.unsafeFromForeignPtr0 resultForeignPtr (VS.length csvFile)
    (_, resultLen) <-
        VS.ifoldM'
            (processCharacter resultVector)
            (UnEscaped, 0)
            (VS.take originalLen csvFile)
    -- Handle the case where the file doesn't end with a newline
    -- We need to add a final delimiter for the last field
    finalResultLen <-
        if originalLen > 0 && csvFile VS.! (originalLen - 1) /= lf
            then do
                VSM.write resultVector resultLen (fromIntegral originalLen)
                return (resultLen + 1)
            else return resultLen
    VS.unsafeFreeze (VSM.slice 0 finalResultLen resultVector)
  where
    transitionTable :: UArray (Int, Word8) Int
    transitionTable = stateTransitionTable separator

    processCharacter ::
        VSM.IOVector CSize ->
        (State, Int) ->
        Int ->
        Word8 ->
        IO (State, Int)
    processCharacter out (!state, !resultIndex) index character = do
        let newState =
                toEnum (transitionTable ! (fromEnum state, character))
        case state of
            UnEscaped
                | character == lf || character == separator -> do
                    VSM.write out resultIndex (fromIntegral index)
                    return (newState, resultIndex + 1)
            _ -> return (newState, resultIndex)

{-# INLINE countColumnsInFirstRow #-}
-- | Number of fields in the first row.
--
-- @indices@ holds the byte offset that terminates each field. Fields are
-- counted up to and including the first terminator that is a newline in
-- @file@. If no terminator is a newline (a single row without a trailing
-- newline), every terminator ends exactly one field of that row, so the
-- count is the number of terminators. An empty @indices@ yields 0.
countColumnsInFirstRow ::
    VS.Vector Word8 ->
    VS.Vector CSize ->
    Int
countColumnsInFirstRow file indices
    | VS.null indices = 0
    | otherwise =
        case VS.findIndex endsRow indices of
            -- k terminators precede the newline; the newline ends field k+1.
            Just k -> k + 1
            -- Last terminator is the end-of-data marker, not a separator.
            Nothing -> VS.length indices
  where
    -- Safe lookup: an end-of-data offset may equal the buffer length.
    endsRow :: CSize -> Bool
    endsRow i = file VS.!? fromIntegral i == Just lf