{- Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. -} {-# OPTIONS_GHC -fprof-auto #-} {-# LANGUAGE ApplicativeDo #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE NondecreasingIndentation #-} module Thrift.Protocol.JSON.String ( parseJSONString ) where import Control.Exception import Control.Monad import Thrift.Binary.Parser import Data.Bits import qualified Data.ByteString as ByteString import Data.ByteString.Internal (ByteString(..)) import Data.ByteString.Unsafe import Data.Char import Data.Text (Text) import Data.Text.Encoding import Data.Word import Data.Word8 import Foreign.ForeignPtr import Foreign.Ptr import Foreign.Storable import System.IO.Unsafe import Util.FFI parseJSONString :: Parser Text parseJSONString = do skipSpaces *> word8 _quotedbl rawBytes <- peekBS case unescape rawBytes of Left (StringError err) -> fail err Right (UnescapeResult bytes inLen) -> do skipN (inLen + 1) -- Skip over the string and closing " case decodeUtf8' bytes of Left err -> fail $ show err Right txt -> return txt -- NOTE: The cpp2 implementation only accepts escaped "unicode" characters of -- the form "\u00XX" where the X's are hex digits, so we will do the same here. -- This means that all \u characters fit into a single byte which simplifies the -- unescaping logic signifigantly -- Datatype where all fields are strict so that we can avoid calling `evaluate` data UnescapeResult = UnescapeResult !ByteString {-# UNPACK #-} !Int -- Low-level unescaping logic. Works on the underlying byte array unescape :: ByteString -> Either StringError UnescapeResult unescape rawBytes = unsafeDupablePerformIO $ try $ unsafeUseAsCStringLen rawBytes $ \(charBuf, fullLen) -> do let input = castPtr charBuf :: Ptr Word8 -- Find the lengths of the input and output (inLen, outLen) <- unescapedLength input fullLen -- If the input has the same length as the output, then there are no escape -- characters, so we can just return the ByteString as is if inLen == outLen then return $! UnescapeResult (ByteString.take outLen rawBytes) inLen else do -- Allocate a buffer for the output outBuf <- mallocForeignPtrArray outLen unsafeWithForeignPtr outBuf $ \output -> do let -- Unescape the input character by character. -- i is the input index and j is the output index go !i !j | j >= outLen = return () | otherwise = do byte <- peekByteOff input i if -- Case 1: we have reached an escape character -- NOTE: no bounds checks are needed because we already checked -- bounds in unescapeLength | byte == _backslash -> do escChr <- peekByteOff input $ i + 1 if -- Case 1.1: We have reached a 4-hex-digit unicode point | escChr == _u -> do c <- parseUnicodeChar =<< peekByteOff (castPtr input) (i + 2) pokeByteOff output j c go (i + 6) (j + 1) -- Case 1.2: Byte is a valid escape char | Just unesc <- unescapeChar escChr -> do pokeByteOff output j unesc go (i + 2) (j + 1) -- Case 1.3: Invalid escape character | otherwise -> throwIO $ StringError $ "parseJSONString: Not a valid escape character: " ++ [chr $ fromIntegral byte] -- Case 2: Not an escape character | otherwise -> do pokeByteOff output j byte go (i + 1) (j + 1) go 0 0 return $! UnescapeResult (PS outBuf 0 outLen) inLen {-# INLINE unescapedLength #-} -- Get the escaped and unescaped lengths of the input unescapedLength :: Ptr Word8 -> Int -> IO (Int, Int) unescapedLength buf len = go 0 0 where -- In this function i is the index in the input and j is the output length go !i !j | i >= len = stringError | otherwise = do byte <- peekByteOff buf i if | byte == _quotedbl -> return (i, j) | byte == _backslash -> if i + 1 < len then do nextByte <- peekByteOff buf (i + 1) if | nextByte == _u -> go (i + 6) (j + 1) -- skip over \uXXXX | otherwise -> go (i + 2) (j + 1) -- skip over \X else stringError | otherwise -> go (i + 1) (j + 1) stringError = throwIO $ StringError "parseJSONString: string does not terminate" {-# INLINE unescapeChar #-} unescapeChar :: Word8 -> Maybe Word8 unescapeChar = \case 0x22 {- " -} -> Just _quotedbl 0x27 {- ' -} -> Just _quotesingle 0x2F {- / -} -> Just _slash 0x5C {- \ -} -> Just _backslash 0x30 {- 0 -} -> Just _0 0x61 {- a -} -> Just $ fromIntegral $ ord '\a' 0x62 {- b -} -> Just $ fromIntegral $ ord '\b' 0x66 {- f -} -> Just $ fromIntegral $ ord '\f' 0x6E {- n -} -> Just _lf -- linefeed 0x72 {- r -} -> Just _cr -- carriage return 0x74 {- t -} -> Just _tab 0x76 {- v -} -> Just _vt -- vertical tab _ -> Nothing {-# INLINE parseUnicodeChar #-} parseUnicodeChar :: Word32 -> IO Word8 parseUnicodeChar !word = do -- Must have the form '\u00XX' (see note above). -- 0x30 is the acsii character '0' unless (word .&. 0x0000FFFF == 0x3030) unicodeError d0 <- hexdigit $ fromIntegral $ (word `shiftR` 16) .&. 0x000000FF d1 <- hexdigit $ fromIntegral $ (word `shiftR` 24) .&. 0x000000FF return $ d0 `shiftL` 4 .|. d1 where unicodeError = throwIO $ StringError "Improperly formatted unicode string" hexdigit :: Word8 -> IO Word8 hexdigit n | n `inRange` (_0, _9) = pure $ n - _0 | n `inRange` (_a, _f) = pure $ n - _a + 10 | n `inRange` (_A, _F) = pure $ n - _A + 10 | otherwise = unicodeError inRange :: Word8 -> (Word8, Word8) -> Bool inRange n (x, y) = n >= x && n <= y newtype StringError = StringError String deriving Show instance Exception StringError