{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE NamedFieldPuns #-}

module Xml
  ( Node (..)
  , Content (..)
  , Attribute (..)
  , decode
  ) where

import Data.Builder.ST (Builder)
import Data.Bytes (Bytes)
import Data.Bytes.Parser (Parser)
import Data.Chunks (Chunks)
import Data.Primitive (SmallArray)
import Data.Text.Short (ShortText)
import Data.Word (Word8)
import GHC.Exts (Char (C#), Char#)

import qualified Data.Builder.ST as Builder
import qualified Data.Bytes as Bytes
import qualified Data.Bytes.Parser as Parser
import qualified Data.Bytes.Parser.Latin as Latin
import qualified Data.Bytes.Parser.Rebindable as R
import qualified Data.Bytes.Parser.Unsafe as Unsafe
import qualified Data.Bytes.Parser.Utf8 as Utf8
import qualified Data.Chunks as Chunks
import qualified Data.Text.Short as TS
import qualified Data.Text.Short.Unsafe as TS

data Node
  = Text !ShortText
  | Element {-# UNPACK #-} !Content
  deriving (Int -> Node -> ShowS
[Node] -> ShowS
Node -> String
(Int -> Node -> ShowS)
-> (Node -> String) -> ([Node] -> ShowS) -> Show Node
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Node -> ShowS
showsPrec :: Int -> Node -> ShowS
$cshow :: Node -> String
show :: Node -> String
$cshowList :: [Node] -> ShowS
showList :: [Node] -> ShowS
Show, Node -> Node -> Bool
(Node -> Node -> Bool) -> (Node -> Node -> Bool) -> Eq Node
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Node -> Node -> Bool
== :: Node -> Node -> Bool
$c/= :: Node -> Node -> Bool
/= :: Node -> Node -> Bool
Eq)

data Content = Content
  { Content -> ShortText
tag :: !ShortText
  , Content -> SmallArray Attribute
attributes :: !(SmallArray Attribute)
  , Content -> SmallArray Node
children :: !(SmallArray Node)
  }
  deriving (Int -> Content -> ShowS
[Content] -> ShowS
Content -> String
(Int -> Content -> ShowS)
-> (Content -> String) -> ([Content] -> ShowS) -> Show Content
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Content -> ShowS
showsPrec :: Int -> Content -> ShowS
$cshow :: Content -> String
show :: Content -> String
$cshowList :: [Content] -> ShowS
showList :: [Content] -> ShowS
Show, Content -> Content -> Bool
(Content -> Content -> Bool)
-> (Content -> Content -> Bool) -> Eq Content
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Content -> Content -> Bool
== :: Content -> Content -> Bool
$c/= :: Content -> Content -> Bool
/= :: Content -> Content -> Bool
Eq)

data Attribute = Attribute
  { Attribute -> ShortText
name :: !ShortText
  , Attribute -> ShortText
value :: !ShortText
  }
  deriving (Int -> Attribute -> ShowS
[Attribute] -> ShowS
Attribute -> String
(Int -> Attribute -> ShowS)
-> (Attribute -> String)
-> ([Attribute] -> ShowS)
-> Show Attribute
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Attribute -> ShowS
showsPrec :: Int -> Attribute -> ShowS
$cshow :: Attribute -> String
show :: Attribute -> String
$cshowList :: [Attribute] -> ShowS
showList :: [Attribute] -> ShowS
Show, Attribute -> Attribute -> Bool
(Attribute -> Attribute -> Bool)
-> (Attribute -> Attribute -> Bool) -> Eq Attribute
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Attribute -> Attribute -> Bool
== :: Attribute -> Attribute -> Bool
$c/= :: Attribute -> Attribute -> Bool
/= :: Attribute -> Attribute -> Bool
Eq)

decode :: Bytes -> Maybe Node
decode :: Bytes -> Maybe Node
decode !Bytes
b = (forall s. Parser () s Node) -> Bytes -> Maybe Node
forall e a. (forall s. Parser e s a) -> Bytes -> Maybe a
Parser.parseBytesMaybe Parser () s Node
forall s. Parser () s Node
elementNodeParser Bytes
b

elementNodeParser :: Parser () s Node
elementNodeParser :: forall s. Parser () s Node
elementNodeParser = do
  () -> Char -> Parser () s ()
forall e s. e -> Char -> Parser e s ()
Latin.char () Char
'<'
  Bytes
btag <- (Word8 -> Bool) -> Parser () s Bytes
forall e s. (Word8 -> Bool) -> Parser e s Bytes
Parser.takeWhile (\Word8
w -> Bool -> Bool
not (Word8 -> Bool
isXmlSpace Word8
w) Bool -> Bool -> Bool
&& Word8
w Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
0x3E Bool -> Bool -> Bool
&& Word8
w Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
0x2F)
  case Bytes -> Int
Bytes.length Bytes
btag of
    Int
0 -> () -> Parser () s ()
forall e s a. e -> Parser e s a
Parser.fail ()
    Int
_ -> () -> Parser () s ()
forall a. a -> Parser () s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
  ShortText
tag <- case ShortByteString -> Maybe ShortText
TS.fromShortByteString (Bytes -> ShortByteString
Bytes.toShortByteStringClone Bytes
btag) of
    Maybe ShortText
Nothing -> () -> Parser () s ShortText
forall e s a. e -> Parser e s a
Parser.fail ()
    Just ShortText
ttag -> ShortText -> Parser () s ShortText
forall a. a -> Parser () s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ShortText
ttag
  -- Note that parserAttributes consumes leading and trailing whitespace.
  Chunks Attribute
attrs <- Builder s Attribute -> Parser () s (Chunks Attribute)
forall s. Builder s Attribute -> Parser () s (Chunks Attribute)
parserAttributes (Builder s Attribute -> Parser () s (Chunks Attribute))
-> Parser () s (Builder s Attribute)
-> Parser () s (Chunks Attribute)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< ST s (Builder s Attribute) -> Parser () s (Builder s Attribute)
forall s a e. ST s a -> Parser e s a
Parser.effect ST s (Builder s Attribute)
forall s a. ST s (Builder s a)
Builder.new
  let !attributes :: SmallArray Attribute
attributes = Chunks Attribute -> SmallArray Attribute
forall a. Chunks a -> SmallArray a
Chunks.concat Chunks Attribute
attrs
  () -> Parser () s Char
forall e s. e -> Parser e s Char
Latin.any () Parser () s Char -> (Char -> Parser () s Node) -> Parser () s Node
forall a b. Parser () s a -> (a -> Parser () s b) -> Parser () s b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
    Char
'>' -> do
      Chunks Node
nodes <- ShortText -> Parser () s (Chunks Node)
forall s. ShortText -> Parser () s (Chunks Node)
childrenParser ShortText
tag
      Node -> Parser () s Node
forall a. a -> Parser () s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Content -> Node
Element Content {ShortText
tag :: ShortText
tag :: ShortText
tag, SmallArray Attribute
attributes :: SmallArray Attribute
attributes :: SmallArray Attribute
attributes, children :: SmallArray Node
children = Chunks Node -> SmallArray Node
forall a. Chunks a -> SmallArray a
Chunks.concat Chunks Node
nodes})
    Char
'/' -> do
      () -> Char -> Parser () s ()
forall e s. e -> Char -> Parser e s ()
Latin.char () Char
'>'
      Node -> Parser () s Node
forall a. a -> Parser () s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Content -> Node
Element Content {ShortText
tag :: ShortText
tag :: ShortText
tag, SmallArray Attribute
attributes :: SmallArray Attribute
attributes :: SmallArray Attribute
attributes, children :: SmallArray Node
children = SmallArray Node
forall a. Monoid a => a
mempty})
    Char
_ -> () -> Parser () s Node
forall e s a. e -> Parser e s a
Parser.fail ()

textNodeParser :: Parser () s Node
textNodeParser :: forall s. Parser () s Node
textNodeParser = do
  Bytes
raw <- (Word8 -> Bool) -> Parser () s Bytes
forall e s. (Word8 -> Bool) -> Parser e s Bytes
Parser.takeWhile (\Word8
w -> Word8
w Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
0x3C)
  case (Word8 -> Bool) -> Bytes -> Bool
Bytes.any (\Word8
w -> Word8
w Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
> Word8
0x7F Bool -> Bool -> Bool
|| Word8
w Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0x26) Bytes
raw of
    Bool
True -> () -> Parser () s Node
forall e s a. e -> Parser e s a
Parser.fail () -- TODO: escape or check UTF-8 encoding here instead
    Bool
False -> Node -> Parser () s Node
forall a. a -> Parser () s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ShortText -> Node
Text (ShortByteString -> ShortText
TS.fromShortByteStringUnsafe (Bytes -> ShortByteString
Bytes.toShortByteStringClone Bytes
raw)))

-- This eats the closing tag as well.
childrenParser ::
  ShortText -> -- opening tag name, looking for a closing tag that matches
  Parser () s (Chunks Node)
childrenParser :: forall s. ShortText -> Parser () s (Chunks Node)
childrenParser !ShortText
tag = do
  Builder s Node
b0 <- ST s (Builder s Node) -> Parser () s (Builder s Node)
forall s a e. ST s a -> Parser e s a
Parser.effect ST s (Builder s Node)
forall s a. ST s (Builder s a)
Builder.new
  ShortText -> Builder s Node -> Parser () s (Chunks Node)
forall s. ShortText -> Builder s Node -> Parser () s (Chunks Node)
childrenParserLoop ShortText
tag Builder s Node
b0

childrenParserLoop ::
  ShortText -> -- opening tag name, looking for a closing tag that matches
  Builder s Node ->
  Parser () s (Chunks Node)
childrenParserLoop :: forall s. ShortText -> Builder s Node -> Parser () s (Chunks Node)
childrenParserLoop !ShortText
tag !Builder s Node
b0 =
  () -> Parser () s Char
forall e s. e -> Parser e s Char
Latin.any () Parser () s Char
-> (Char -> Parser () s (Chunks Node)) -> Parser () s (Chunks Node)
forall a b. Parser () s a -> (a -> Parser () s b) -> Parser () s b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
    Char
'<' ->
      () -> Parser () s Char
forall e s. e -> Parser e s Char
Latin.any () Parser () s Char
-> (Char -> Parser () s (Chunks Node)) -> Parser () s (Chunks Node)
forall a b. Parser () s a -> (a -> Parser () s b) -> Parser () s b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
        Char
'/' -> do
          () -> ShortText -> Parser () s ()
forall e s. e -> ShortText -> Parser e s ()
Utf8.shortText () ShortText
tag
          (Word8 -> Bool) -> Parser () s ()
forall e s. (Word8 -> Bool) -> Parser e s ()
Parser.skipWhile Word8 -> Bool
isXmlSpace
          () -> Char -> Parser () s ()
forall e s. e -> Char -> Parser e s ()
Latin.char () Char
'>'
          ST s (Chunks Node) -> Parser () s (Chunks Node)
forall s a e. ST s a -> Parser e s a
Parser.effect (Builder s Node -> ST s (Chunks Node)
forall s a. Builder s a -> ST s (Chunks a)
Builder.freeze Builder s Node
b0)
        Char
_ -> do
          Int -> Parser () s ()
forall e s. Int -> Parser e s ()
Unsafe.unconsume Int
2
          Node
node <- Parser () s Node
forall s. Parser () s Node
elementNodeParser
          Builder s Node
b1 <- ST s (Builder s Node) -> Parser () s (Builder s Node)
forall s a e. ST s a -> Parser e s a
Parser.effect (Node -> Builder s Node -> ST s (Builder s Node)
forall a s. a -> Builder s a -> ST s (Builder s a)
Builder.push Node
node Builder s Node
b0)
          ShortText -> Builder s Node -> Parser () s (Chunks Node)
forall s. ShortText -> Builder s Node -> Parser () s (Chunks Node)
childrenParserLoop ShortText
tag Builder s Node
b1
    Char
_ -> do
      Int -> Parser () s ()
forall e s. Int -> Parser e s ()
Unsafe.unconsume Int
1
      Node
node <- Parser () s Node
forall s. Parser () s Node
textNodeParser
      Builder s Node
b1 <- ST s (Builder s Node) -> Parser () s (Builder s Node)
forall s a e. ST s a -> Parser e s a
Parser.effect (Node -> Builder s Node -> ST s (Builder s Node)
forall a s. a -> Builder s a -> ST s (Builder s a)
Builder.push Node
node Builder s Node
b0)
      ShortText -> Builder s Node -> Parser () s (Chunks Node)
forall s. ShortText -> Builder s Node -> Parser () s (Chunks Node)
childrenParserLoop ShortText
tag Builder s Node
b1

isXmlSpace :: Word8 -> Bool
isXmlSpace :: Word8 -> Bool
isXmlSpace = \case
  Word8
0x20 -> Bool
True
  Word8
0x09 -> Bool
True
  Word8
0x0D -> Bool
True
  Word8
0x0A -> Bool
True
  Word8
_ -> Bool
False

parserAttributes :: Builder s Attribute -> Parser () s (Chunks Attribute)
parserAttributes :: forall s. Builder s Attribute -> Parser () s (Chunks Attribute)
parserAttributes !Builder s Attribute
b0 = do
  (Word8 -> Bool) -> Parser () s ()
forall e s. (Word8 -> Bool) -> Parser e s ()
Parser.skipWhile Word8 -> Bool
isXmlSpace
  Parser () s Bool
forall s. Parser () s Bool
peekIsNameStartChar Parser () s Bool
-> (Bool -> Parser () s (Chunks Attribute))
-> Parser () s (Chunks Attribute)
forall a b. Parser () s a -> (a -> Parser () s b) -> Parser () s b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
    Bool
True -> do
      Attribute
attr <- Parser () s Attribute
forall s. Parser () s Attribute
parserAttribute
      Builder s Attribute
b1 <- ST s (Builder s Attribute) -> Parser () s (Builder s Attribute)
forall s a e. ST s a -> Parser e s a
Parser.effect (Attribute -> Builder s Attribute -> ST s (Builder s Attribute)
forall a s. a -> Builder s a -> ST s (Builder s a)
Builder.push Attribute
attr Builder s Attribute
b0)
      Builder s Attribute -> Parser () s (Chunks Attribute)
forall s. Builder s Attribute -> Parser () s (Chunks Attribute)
parserAttributes Builder s Attribute
b1
    Bool
False -> do
      (Word8 -> Bool) -> Parser () s ()
forall e s. (Word8 -> Bool) -> Parser e s ()
Parser.skipWhile Word8 -> Bool
isXmlSpace
      ST s (Chunks Attribute) -> Parser () s (Chunks Attribute)
forall s a e. ST s a -> Parser e s a
Parser.effect (Builder s Attribute -> ST s (Chunks Attribute)
forall s a. Builder s a -> ST s (Chunks a)
Builder.freeze Builder s Attribute
b0)

-- From the spec, we have:
--   Attribute ::= Name Eq AttValue
--   Eq        ::= S? '=' S?
--   Name      ::= NameStartChar (NameChar)*
--
-- Precondition A: The first character is a NameStartChar. This parser
-- does not check this.
parserAttribute :: Parser () s Attribute
parserAttribute :: forall s. Parser () s Attribute
parserAttribute = do
  Bytes
bname <- (Word8 -> Bool) -> Parser () s Bytes
forall e s. (Word8 -> Bool) -> Parser e s Bytes
Parser.takeWhile (\Word8
w -> Bool -> Bool
not (Word8 -> Bool
isXmlSpace Word8
w) Bool -> Bool -> Bool
&& Word8
w Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
0x3D)
  -- We may assume that length of bname is at least one because of
  -- precondition A.
  !ShortText
name <- case ShortByteString -> Maybe ShortText
TS.fromShortByteString (Bytes -> ShortByteString
Bytes.toShortByteStringClone Bytes
bname) of
    Maybe ShortText
Nothing -> () -> Parser () s ShortText
forall e s a. e -> Parser e s a
Parser.fail ()
    Just ShortText
tname -> ShortText -> Parser () s ShortText
forall a. a -> Parser () s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ShortText
tname
  (Word8 -> Bool) -> Parser () s ()
forall e s. (Word8 -> Bool) -> Parser e s ()
Parser.skipWhile Word8 -> Bool
isXmlSpace
  () -> Char -> Parser () s ()
forall e s. e -> Char -> Parser e s ()
Latin.char () Char
'='
  (Word8 -> Bool) -> Parser () s ()
forall e s. (Word8 -> Bool) -> Parser e s ()
Parser.skipWhile Word8 -> Bool
isXmlSpace
  !ShortText
value <- Parser () s ShortText
forall s. Parser () s ShortText
parserAttributeValue
  Attribute -> Parser () s Attribute
forall a. a -> Parser () s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Attribute {ShortText
name :: ShortText
name :: ShortText
name, ShortText
value :: ShortText
value :: ShortText
value}

-- TODO: This is woefully incomplete
parserAttributeValue :: Parser () s ShortText
parserAttributeValue :: forall s. Parser () s ShortText
parserAttributeValue = do
  () -> Parser () s Char
forall e s. e -> Parser e s Char
Latin.any () Parser () s Char
-> (Char -> Parser () s ShortText) -> Parser () s ShortText
forall a b. Parser () s a -> (a -> Parser () s b) -> Parser () s b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
    Char
'"' -> do
      Bytes
bval <- (Word8 -> Bool) -> Parser () s Bytes
forall e s. (Word8 -> Bool) -> Parser e s Bytes
Parser.takeWhile (\Word8
w -> Word8
w Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
0x22)
      () -> Char -> Parser () s ()
forall e s. e -> Char -> Parser e s ()
Latin.char () Char
'"'
      case ShortByteString -> Maybe ShortText
TS.fromShortByteString (Bytes -> ShortByteString
Bytes.toShortByteStringClone Bytes
bval) of
        Maybe ShortText
Nothing -> () -> Parser () s ShortText
forall e s a. e -> Parser e s a
Parser.fail ()
        Just ShortText
tval -> ShortText -> Parser () s ShortText
forall a. a -> Parser () s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ShortText
tval
    Char
'\'' -> do
      Bytes
bval <- (Word8 -> Bool) -> Parser () s Bytes
forall e s. (Word8 -> Bool) -> Parser e s Bytes
Parser.takeWhile (\Word8
w -> Word8
w Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
0x27)
      () -> Char -> Parser () s ()
forall e s. e -> Char -> Parser e s ()
Latin.char () Char
'\''
      case ShortByteString -> Maybe ShortText
TS.fromShortByteString (Bytes -> ShortByteString
Bytes.toShortByteStringClone Bytes
bval) of
        Maybe ShortText
Nothing -> () -> Parser () s ShortText
forall e s a. e -> Parser e s a
Parser.fail ()
        Just ShortText
tval -> ShortText -> Parser () s ShortText
forall a. a -> Parser () s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ShortText
tval
    Char
_ -> () -> Parser () s ShortText
forall e s a. e -> Parser e s a
Parser.fail ()

peekIsNameStartChar :: Parser () s Bool
peekIsNameStartChar :: forall s. Parser () s Bool
peekIsNameStartChar =
  Parser () s Int
forall e s. Parser e s Int
Unsafe.cursor Parser () s Int -> (Int -> Parser () s Bool) -> Parser () s Bool
forall e s a b. Parser e s a -> (a -> Parser e s b) -> Parser e s b
forall e s a b.
Bind LiftedRep LiftedRep =>
Parser e s a -> (a -> Parser e s b) -> Parser e s b
R.>>= \Int
pos ->
    () -> Parser () s Char#
forall e s. e -> Parser e s Char#
Utf8.any# () Parser () s Char#
-> (Char# -> Parser () s Bool) -> Parser () s Bool
forall e s (a :: TYPE 'WordRep) b.
Parser e s a -> (a -> Parser e s b) -> Parser e s b
forall e s a b.
Bind LiftedRep LiftedRep =>
Parser e s a -> (a -> Parser e s b) -> Parser e s b
R.>>= \Char#
c ->
      Int -> Parser () s ()
forall e s. Int -> Parser e s ()
Unsafe.jump Int
pos Parser () s () -> (() -> Parser () s Bool) -> Parser () s Bool
forall e s a b. Parser e s a -> (a -> Parser e s b) -> Parser e s b
forall e s a b.
Bind LiftedRep LiftedRep =>
Parser e s a -> (a -> Parser e s b) -> Parser e s b
R.>>= \()
_ ->
        Bool -> Parser () s Bool
forall e s a. a -> Parser e s a
forall e s a. Pure LiftedRep => a -> Parser e s a
R.pure (Char# -> Bool
isNameStartChar Char#
c)

isNameStartChar :: Char# -> Bool
isNameStartChar :: Char# -> Bool
isNameStartChar Char#
c = case Char# -> Char
C# Char#
c of
  Char
':' -> Bool
True
  Char
'_' -> Bool
True
  Char
_ | Char# -> Char
C# Char#
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'A' Bool -> Bool -> Bool
&& Char# -> Char
C# Char#
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'Z' -> Bool
True
  Char
_ | Char# -> Char
C# Char#
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'a' Bool -> Bool -> Bool
&& Char# -> Char
C# Char#
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'z' -> Bool
True
  Char
_ -> Bool
False