{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
module DataFrame.Internal.Parsing where
import qualified Data.ByteString.Char8 as C
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
import Control.Applicative (many, (<|>))
import Data.Attoparsec.Text hiding (decimal, double, signed)
import Data.ByteString.Lex.Fractional
import Data.Foldable (fold)
import Data.Maybe (fromMaybe)
import Data.Text.Read (decimal, double, signed)
import Data.Time (Day, defaultTimeLocale, parseTimeM)
import GHC.Stack (HasCallStack)
import System.IO (Handle, IOMode (..), hIsEOF, hTell, withFile)
import Text.Read (readMaybe)
import Prelude hiding (takeWhile)
isNullish :: T.Text -> Bool
isNullish :: Text -> Bool
isNullish =
( Text -> Set Text -> Bool
forall a. Ord a => a -> Set a -> Bool
`S.member`
[Text] -> Set Text
forall a. Ord a => [a] -> Set a
S.fromList
[Text
"Nothing", Text
"NULL", Text
"", Text
" ", Text
"nan", Text
"null", Text
"N/A", Text
"NaN", Text
"NAN", Text
"NA"]
)
isNullishBS :: C.ByteString -> Bool
isNullishBS :: ByteString -> Bool
isNullishBS =
( ByteString -> Set ByteString -> Bool
forall a. Ord a => a -> Set a -> Bool
`S.member`
[ByteString] -> Set ByteString
forall a. Ord a => [a] -> Set a
S.fromList
[ByteString
"Nothing", ByteString
"NULL", ByteString
"", ByteString
" ", ByteString
"nan", ByteString
"null", ByteString
"N/A", ByteString
"NaN", ByteString
"NAN", ByteString
"NA"]
)
isTrueish :: T.Text -> Bool
isTrueish :: Text -> Bool
isTrueish Text
t = Text
t Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"True", Text
"true", Text
"TRUE"]
isFalseish :: T.Text -> Bool
isFalseish :: Text -> Bool
isFalseish Text
t = Text
t Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"False", Text
"false", Text
"FALSE"]
readValue :: (HasCallStack, Read a) => T.Text -> a
readValue :: forall a. (HasCallStack, Read a) => Text -> a
readValue Text
s = case String -> Maybe a
forall a. Read a => String -> Maybe a
readMaybe (Text -> String
T.unpack Text
s) of
Maybe a
Nothing -> String -> a
forall a. HasCallStack => String -> a
error (String
"Could not read value: " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> Text -> String
T.unpack Text
s)
Just a
value -> a
value
readBool :: (HasCallStack) => T.Text -> Maybe Bool
readBool :: HasCallStack => Text -> Maybe Bool
readBool Text
s
| Text -> Bool
isTrueish Text
s = Bool -> Maybe Bool
forall a. a -> Maybe a
Just Bool
True
| Text -> Bool
isFalseish Text
s = Bool -> Maybe Bool
forall a. a -> Maybe a
Just Bool
False
| Bool
otherwise = Maybe Bool
forall a. Maybe a
Nothing
readByteStringBool :: C.ByteString -> Maybe Bool
readByteStringBool :: ByteString -> Maybe Bool
readByteStringBool ByteString
s
| ByteString
s ByteString -> [ByteString] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [ByteString
"True", ByteString
"true", ByteString
"TRUE"] = Bool -> Maybe Bool
forall a. a -> Maybe a
Just Bool
True
| ByteString
s ByteString -> [ByteString] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [ByteString
"False", ByteString
"false", ByteString
"FALSE"] = Bool -> Maybe Bool
forall a. a -> Maybe a
Just Bool
False
| Bool
otherwise = Maybe Bool
forall a. Maybe a
Nothing
readByteStringDate :: String -> C.ByteString -> Maybe Day
readByteStringDate :: String -> ByteString -> Maybe Day
readByteStringDate String
fmt = Bool -> TimeLocale -> String -> String -> Maybe Day
forall (m :: * -> *) t.
(MonadFail m, ParseTime t) =>
Bool -> TimeLocale -> String -> String -> m t
parseTimeM Bool
True TimeLocale
defaultTimeLocale String
fmt (String -> Maybe Day)
-> (ByteString -> String) -> ByteString -> Maybe Day
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> String
C.unpack
readInteger :: (HasCallStack) => T.Text -> Maybe Integer
readInteger :: HasCallStack => Text -> Maybe Integer
readInteger Text
s = case Reader Integer -> Reader Integer
forall a. Num a => Reader a -> Reader a
signed Reader Integer
forall a. Integral a => Reader a
decimal (Text -> Text
T.strip Text
s) of
Left String
_ -> Maybe Integer
forall a. Maybe a
Nothing
Right (Integer
value, Text
"") -> Integer -> Maybe Integer
forall a. a -> Maybe a
Just Integer
value
Right (Integer
value, Text
_) -> Maybe Integer
forall a. Maybe a
Nothing
readInt :: (HasCallStack) => T.Text -> Maybe Int
readInt :: HasCallStack => Text -> Maybe Int
readInt Text
s = case Reader Int -> Reader Int
forall a. Num a => Reader a -> Reader a
signed Reader Int
forall a. Integral a => Reader a
decimal (Text -> Text
T.strip Text
s) of
Left String
_ -> Maybe Int
forall a. Maybe a
Nothing
Right (Int
value, Text
"") -> Int -> Maybe Int
forall a. a -> Maybe a
Just Int
value
Right (Int
value, Text
_) -> Maybe Int
forall a. Maybe a
Nothing
{-# INLINE readInt #-}
readByteStringInt :: (HasCallStack) => C.ByteString -> Maybe Int
readByteStringInt :: HasCallStack => ByteString -> Maybe Int
readByteStringInt ByteString
s = case ByteString -> Maybe (Int, ByteString)
C.readInt (ByteString -> ByteString
C.strip ByteString
s) of
Maybe (Int, ByteString)
Nothing -> Maybe Int
forall a. Maybe a
Nothing
Just (Int
value, ByteString
"") -> Int -> Maybe Int
forall a. a -> Maybe a
Just Int
value
Just (Int
value, ByteString
_) -> Maybe Int
forall a. Maybe a
Nothing
{-# INLINE readByteStringInt #-}
readByteStringDouble :: (HasCallStack) => C.ByteString -> Maybe Double
readByteStringDouble :: HasCallStack => ByteString -> Maybe Double
readByteStringDouble ByteString
s =
let
readFunc :: ByteString -> Maybe (Double, ByteString)
readFunc = if (Char -> Bool) -> ByteString -> Bool
C.any (\Char
c -> Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'e' Bool -> Bool -> Bool
|| Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'E') ByteString
s then ByteString -> Maybe (Double, ByteString)
forall a. Fractional a => ByteString -> Maybe (a, ByteString)
readExponential else ByteString -> Maybe (Double, ByteString)
forall a. Fractional a => ByteString -> Maybe (a, ByteString)
readDecimal
in
case (ByteString -> Maybe (Double, ByteString))
-> ByteString -> Maybe (Double, ByteString)
forall a.
Num a =>
(ByteString -> Maybe (a, ByteString))
-> ByteString -> Maybe (a, ByteString)
readSigned ByteString -> Maybe (Double, ByteString)
readFunc (ByteString -> ByteString
C.strip ByteString
s) of
Maybe (Double, ByteString)
Nothing -> Maybe Double
forall a. Maybe a
Nothing
Just (Double
value, ByteString
"") -> Double -> Maybe Double
forall a. a -> Maybe a
Just Double
value
Just (Double
value, ByteString
_) -> Maybe Double
forall a. Maybe a
Nothing
{-# INLINE readByteStringDouble #-}
readDouble :: (HasCallStack) => T.Text -> Maybe Double
readDouble :: HasCallStack => Text -> Maybe Double
readDouble Text
s =
case Reader Double -> Reader Double
forall a. Num a => Reader a -> Reader a
signed Reader Double
double Text
s of
Left String
_ -> Maybe Double
forall a. Maybe a
Nothing
Right (Double
value, Text
"") -> Double -> Maybe Double
forall a. a -> Maybe a
Just Double
value
Right (Double
value, Text
_) -> Maybe Double
forall a. Maybe a
Nothing
{-# INLINE readDouble #-}
readIntegerEither :: (HasCallStack) => T.Text -> Either T.Text Integer
readIntegerEither :: HasCallStack => Text -> Either Text Integer
readIntegerEither Text
s = case Reader Integer -> Reader Integer
forall a. Num a => Reader a -> Reader a
signed Reader Integer
forall a. Integral a => Reader a
decimal (Text -> Text
T.strip Text
s) of
Left String
_ -> Text -> Either Text Integer
forall a b. a -> Either a b
Left Text
s
Right (Integer
value, Text
"") -> Integer -> Either Text Integer
forall a b. b -> Either a b
Right Integer
value
Right (Integer
value, Text
_) -> Text -> Either Text Integer
forall a b. a -> Either a b
Left Text
s
{-# INLINE readIntegerEither #-}
readIntEither :: (HasCallStack) => T.Text -> Either T.Text Int
readIntEither :: HasCallStack => Text -> Either Text Int
readIntEither Text
s = case Reader Int -> Reader Int
forall a. Num a => Reader a -> Reader a
signed Reader Int
forall a. Integral a => Reader a
decimal (Text -> Text
T.strip Text
s) of
Left String
_ -> Text -> Either Text Int
forall a b. a -> Either a b
Left Text
s
Right (Int
value, Text
"") -> Int -> Either Text Int
forall a b. b -> Either a b
Right Int
value
Right (Int
value, Text
_) -> Text -> Either Text Int
forall a b. a -> Either a b
Left Text
s
{-# INLINE readIntEither #-}
readDoubleEither :: (HasCallStack) => T.Text -> Either T.Text Double
readDoubleEither :: HasCallStack => Text -> Either Text Double
readDoubleEither Text
s =
case Reader Double -> Reader Double
forall a. Num a => Reader a -> Reader a
signed Reader Double
double Text
s of
Left String
_ -> Text -> Either Text Double
forall a b. a -> Either a b
Left Text
s
Right (Double
value, Text
"") -> Double -> Either Text Double
forall a b. b -> Either a b
Right Double
value
Right (Double
value, Text
_) -> Text -> Either Text Double
forall a b. a -> Either a b
Left Text
s
{-# INLINE readDoubleEither #-}
safeReadValue :: (Read a) => T.Text -> Maybe a
safeReadValue :: forall a. Read a => Text -> Maybe a
safeReadValue Text
s = String -> Maybe a
forall a. Read a => String -> Maybe a
readMaybe (Text -> String
T.unpack Text
s)
readWithDefault :: (HasCallStack, Read a) => a -> T.Text -> a
readWithDefault :: forall a. (HasCallStack, Read a) => a -> Text -> a
readWithDefault a
v Text
s = a -> Maybe a -> a
forall a. a -> Maybe a -> a
fromMaybe a
v (String -> Maybe a
forall a. Read a => String -> Maybe a
readMaybe (Text -> String
T.unpack Text
s))
parseSep :: Char -> T.Text -> [T.Text]
parseSep :: Char -> Text -> [Text]
parseSep Char
c Text
s = (String -> [Text])
-> ([Text] -> [Text]) -> Either String [Text] -> [Text]
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either String -> [Text]
forall a. HasCallStack => String -> a
error [Text] -> [Text]
forall a. a -> a
id (Parser [Text] -> Text -> Either String [Text]
forall a. Parser a -> Text -> Either String a
parseOnly (Char -> Parser [Text]
record Char
c) Text
s)
{-# INLINE parseSep #-}
record :: Char -> Parser [T.Text]
record :: Char -> Parser [Text]
record Char
c =
Char -> Parser Text
field Char
c Parser Text -> Parser Text Char -> Parser [Text]
forall (f :: * -> *) a s. Alternative f => f a -> f s -> f [a]
`sepBy1` Char -> Parser Text Char
char Char
c
Parser [Text] -> String -> Parser [Text]
forall i a. Parser i a -> String -> Parser i a
<?> String
"record"
{-# INLINE record #-}
parseRow :: Char -> Parser [T.Text]
parseRow :: Char -> Parser [Text]
parseRow Char
c = (Char -> Parser [Text]
record Char
c Parser [Text] -> Parser Text () -> Parser [Text]
forall a b. Parser Text a -> Parser Text b -> Parser Text a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser Text ()
lineEnd) Parser [Text] -> String -> Parser [Text]
forall i a. Parser i a -> String -> Parser i a
<?> String
"record-new-line"
field :: Char -> Parser T.Text
field :: Char -> Parser Text
field Char
c =
Parser Text
quotedField Parser Text -> Parser Text -> Parser Text
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Char -> Parser Text
unquotedField Char
c
Parser Text -> String -> Parser Text
forall i a. Parser i a -> String -> Parser i a
<?> String
"field"
{-# INLINE field #-}
unquotedTerminators :: Char -> S.Set Char
unquotedTerminators :: Char -> Set Char
unquotedTerminators Char
sep = String -> Set Char
forall a. Ord a => [a] -> Set a
S.fromList [Char
sep, Char
'\n', Char
'\r', Char
'"']
unquotedField :: Char -> Parser T.Text
unquotedField :: Char -> Parser Text
unquotedField Char
sep =
(Char -> Bool) -> Parser Text
takeWhile (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Set Char -> Bool
forall a. Ord a => a -> Set a -> Bool
`S.member` Set Char
terminators)) Parser Text -> String -> Parser Text
forall i a. Parser i a -> String -> Parser i a
<?> String
"unquoted field"
where
terminators :: Set Char
terminators = Char -> Set Char
unquotedTerminators Char
sep
{-# INLINE unquotedField #-}
quotedField :: Parser T.Text
quotedField :: Parser Text
quotedField = Char -> Parser Text Char
char Char
'"' Parser Text Char -> Parser Text -> Parser Text
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser Text
contents Parser Text -> Parser Text Char -> Parser Text
forall a b. Parser Text a -> Parser Text b -> Parser Text a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Char -> Parser Text Char
char Char
'"' Parser Text -> String -> Parser Text
forall i a. Parser i a -> String -> Parser i a
<?> String
"quoted field"
where
contents :: Parser Text
contents = [Text] -> Text
forall m. Monoid m => [m] -> m
forall (t :: * -> *) m. (Foldable t, Monoid m) => t m -> m
fold ([Text] -> Text) -> Parser [Text] -> Parser Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text -> Parser [Text]
forall a. Parser Text a -> Parser Text [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many (Parser Text
unquote Parser Text -> Parser Text -> Parser Text
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text
unescape)
where
unquote :: Parser Text
unquote = (Char -> Bool) -> Parser Text
takeWhile1 (String -> Char -> Bool
notInClass String
"\"\\")
unescape :: Parser Text
unescape =
Char -> Parser Text Char
char Char
'\\' Parser Text Char -> Parser Text -> Parser Text
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> do
Char -> Text
T.singleton (Char -> Text) -> Parser Text Char -> Parser Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> do
Char -> Parser Text Char
char Char
'\\' Parser Text Char -> Parser Text Char -> Parser Text Char
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Char -> Parser Text Char
char Char
'"'
{-# INLINE quotedField #-}
lineEnd :: Parser ()
lineEnd :: Parser Text ()
lineEnd =
(Parser Text ()
endOfLine Parser Text () -> Parser Text () -> Parser Text ()
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput)
Parser Text () -> String -> Parser Text ()
forall i a. Parser i a -> String -> Parser i a
<?> String
"end of line"
{-# INLINE lineEnd #-}
countRows :: Char -> FilePath -> IO Int
countRows :: Char -> String -> IO Int
countRows Char
c String
path = String -> IOMode -> (Handle -> IO Int) -> IO Int
forall r. String -> IOMode -> (Handle -> IO r) -> IO r
withFile String
path IOMode
ReadMode ((Handle -> IO Int) -> IO Int) -> (Handle -> IO Int) -> IO Int
forall a b. (a -> b) -> a -> b
$! Int -> Text -> Handle -> IO Int
forall {t}. (Show t, Num t) => t -> Text -> Handle -> IO t
go Int
0 Text
""
where
go :: t -> Text -> Handle -> IO t
go t
n Text
input Handle
h = do
Bool
isEOF <- Handle -> IO Bool
hIsEOF Handle
h
if Bool
isEOF Bool -> Bool -> Bool
&& Text
input Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text
forall a. Monoid a => a
mempty
then t -> IO t
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure t
n
else
IO Text -> Parser [Text] -> Text -> IO (Result [Text])
forall (m :: * -> *) a.
Monad m =>
m Text -> Parser a -> Text -> m (Result a)
parseWith (Handle -> IO Text
TIO.hGetChunk Handle
h) (Char -> Parser [Text]
parseRow Char
c) Text
input IO (Result [Text]) -> (Result [Text] -> IO t) -> IO t
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
Fail Text
unconsumed [String]
ctx String
er -> do
Integer
erpos <- Handle -> IO Integer
hTell Handle
h
String -> IO t
forall a. String -> IO a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail (String -> IO t) -> String -> IO t
forall a b. (a -> b) -> a -> b
$
String
"Failed to parse CSV file around "
String -> String -> String
forall a. Semigroup a => a -> a -> a
<> Integer -> String
forall a. Show a => a -> String
show Integer
erpos
String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
" byte; due: "
String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String -> String
forall a. Show a => a -> String
show String
er
String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
"; context: "
String -> String -> String
forall a. Semigroup a => a -> a -> a
<> [String] -> String
forall a. Show a => a -> String
show [String]
ctx
String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
" "
String -> String -> String
forall a. Semigroup a => a -> a -> a
<> Text -> String
forall a. Show a => a -> String
show Text
unconsumed
Partial Text -> Result [Text]
_ -> String -> IO t
forall a. String -> IO a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail (String -> IO t) -> String -> IO t
forall a b. (a -> b) -> a -> b
$ String
"Partial handler is called; n = " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> t -> String
forall a. Show a => a -> String
show t
n
Done (Text
unconsumed :: T.Text) [Text]
_ ->
t -> Text -> Handle -> IO t
go (t
n t -> t -> t
forall a. Num a => a -> a -> a
+ t
1) Text
unconsumed Handle
h
{-# INLINE countRows #-}
inferValueType :: T.Text -> T.Text
inferValueType :: Text -> Text
inferValueType Text
s = case HasCallStack => Text -> Maybe Int
Text -> Maybe Int
readInt Text
s of
Just Int
_ -> Text
"Int"
Maybe Int
Nothing -> case HasCallStack => Text -> Maybe Double
Text -> Maybe Double
readDouble Text
s of
Just Double
_ -> Text
"Double"
Maybe Double
Nothing -> Text
"Other"
{-# INLINE inferValueType #-}
readSingleLine :: Char -> T.Text -> Handle -> IO ([T.Text], T.Text)
readSingleLine :: Char -> Text -> Handle -> IO ([Text], Text)
readSingleLine Char
c Text
unused Handle
handle =
IO Text -> Parser [Text] -> Text -> IO (Result [Text])
forall (m :: * -> *) a.
Monad m =>
m Text -> Parser a -> Text -> m (Result a)
parseWith (Handle -> IO Text
TIO.hGetChunk Handle
handle) (Char -> Parser [Text]
parseRow Char
c) Text
unused IO (Result [Text])
-> (Result [Text] -> IO ([Text], Text)) -> IO ([Text], Text)
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
Fail Text
unconsumed [String]
ctx String
er -> do
Integer
erpos <- Handle -> IO Integer
hTell Handle
handle
String -> IO ([Text], Text)
forall a. String -> IO a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail (String -> IO ([Text], Text)) -> String -> IO ([Text], Text)
forall a b. (a -> b) -> a -> b
$
String
"Failed to parse CSV file around "
String -> String -> String
forall a. Semigroup a => a -> a -> a
<> Integer -> String
forall a. Show a => a -> String
show Integer
erpos
String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
" byte; due: "
String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String -> String
forall a. Show a => a -> String
show String
er
String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
"; context: "
String -> String -> String
forall a. Semigroup a => a -> a -> a
<> [String] -> String
forall a. Show a => a -> String
show [String]
ctx
Partial Text -> Result [Text]
_ -> String -> IO ([Text], Text)
forall a. String -> IO a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"Partial handler is called"
Done (Text
unconsumed :: T.Text) ([Text]
row :: [T.Text]) ->
([Text], Text) -> IO ([Text], Text)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return ([Text]
row, Text
unconsumed)