{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE LambdaCase #-}

{-# OPTIONS_GHC -O2 #-}
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Redundant id" #-}
{-# HLINT ignore "Redundant bracket" #-}

-- | This is a performance-oriented HTML tokenizer aim at web-crawling
-- applications. It follows the HTML5 parsing specification quite closely,
-- so it behaves reasonable well on ill-formed documents from the open Web.
--
module Text.HTML.Parser
    ( -- * Parsing
      parseTokens
    , parseTokensLazy
    , token
      -- * Types
    , Token(..)
    , TagName, AttrName, AttrValue
    , Attr(..)
      -- * Rendering, text canonicalization
    , renderTokens
    , renderToken
    , renderAttrs
    , renderAttr
    , canonicalizeTokens
    ) where

import Data.Char hiding (isSpace)
import Data.List (unfoldr)
import GHC.Generics
import Control.Applicative
import Data.Monoid
import Control.Monad (guard)
import Control.DeepSeq

import Data.Attoparsec.Text
import qualified Data.Attoparsec.Text.Lazy as AL
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Data.Text.Lazy.Builder (Builder)
import qualified Data.Text.Lazy.Builder as B
import Prelude hiding (take, takeWhile)

import Text.HTML.Parser.Entities (entities)
import qualified Data.Trie as Trie

-- Section numbers refer to W3C HTML 5.2 specification.

-- | A tag name (e.g. @body@)
type TagName   = Text

-- | An attribute name (e.g. @href@)
type AttrName  = Text

-- | The value of an attribute
type AttrValue = Text

-- | An HTML token
data Token
  -- | An opening tag. Attribute ordering is arbitrary. Void elements have a 'TagOpen' but no corresponding 'TagClose'. See 'Text.HTML.Tree.nonClosing'.
  = TagOpen !TagName [Attr]
  -- | A self-closing tag.
  | TagSelfClose !TagName [Attr]
  -- | A closing tag.
  | TagClose !TagName
  -- | The content between tags.
  | ContentText !Text
  -- | A single character of content
  | ContentChar !Char
  -- | Contents of a comment.
  | Comment !Builder
  -- | Doctype
  | Doctype !Text
  deriving (Int -> Token -> ShowS
[Token] -> ShowS
Token -> String
(Int -> Token -> ShowS)
-> (Token -> String) -> ([Token] -> ShowS) -> Show Token
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Token -> ShowS
showsPrec :: Int -> Token -> ShowS
$cshow :: Token -> String
show :: Token -> String
$cshowList :: [Token] -> ShowS
showList :: [Token] -> ShowS
Show, Eq Token
Eq Token =>
(Token -> Token -> Ordering)
-> (Token -> Token -> Bool)
-> (Token -> Token -> Bool)
-> (Token -> Token -> Bool)
-> (Token -> Token -> Bool)
-> (Token -> Token -> Token)
-> (Token -> Token -> Token)
-> Ord Token
Token -> Token -> Bool
Token -> Token -> Ordering
Token -> Token -> Token
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: Token -> Token -> Ordering
compare :: Token -> Token -> Ordering
$c< :: Token -> Token -> Bool
< :: Token -> Token -> Bool
$c<= :: Token -> Token -> Bool
<= :: Token -> Token -> Bool
$c> :: Token -> Token -> Bool
> :: Token -> Token -> Bool
$c>= :: Token -> Token -> Bool
>= :: Token -> Token -> Bool
$cmax :: Token -> Token -> Token
max :: Token -> Token -> Token
$cmin :: Token -> Token -> Token
min :: Token -> Token -> Token
Ord, Token -> Token -> Bool
(Token -> Token -> Bool) -> (Token -> Token -> Bool) -> Eq Token
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Token -> Token -> Bool
== :: Token -> Token -> Bool
$c/= :: Token -> Token -> Bool
/= :: Token -> Token -> Bool
Eq, (forall x. Token -> Rep Token x)
-> (forall x. Rep Token x -> Token) -> Generic Token
forall x. Rep Token x -> Token
forall x. Token -> Rep Token x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cfrom :: forall x. Token -> Rep Token x
from :: forall x. Token -> Rep Token x
$cto :: forall x. Rep Token x -> Token
to :: forall x. Rep Token x -> Token
Generic)

-- | This is a bit of a hack
endOfFileToken :: Token
endOfFileToken :: Token
endOfFileToken = Text -> Token
ContentText Text
""

-- | An attribute of a tag
data Attr = Attr !AttrName !AttrValue
          deriving (Int -> Attr -> ShowS
[Attr] -> ShowS
Attr -> String
(Int -> Attr -> ShowS)
-> (Attr -> String) -> ([Attr] -> ShowS) -> Show Attr
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Attr -> ShowS
showsPrec :: Int -> Attr -> ShowS
$cshow :: Attr -> String
show :: Attr -> String
$cshowList :: [Attr] -> ShowS
showList :: [Attr] -> ShowS
Show, Attr -> Attr -> Bool
(Attr -> Attr -> Bool) -> (Attr -> Attr -> Bool) -> Eq Attr
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Attr -> Attr -> Bool
== :: Attr -> Attr -> Bool
$c/= :: Attr -> Attr -> Bool
/= :: Attr -> Attr -> Bool
Eq, Eq Attr
Eq Attr =>
(Attr -> Attr -> Ordering)
-> (Attr -> Attr -> Bool)
-> (Attr -> Attr -> Bool)
-> (Attr -> Attr -> Bool)
-> (Attr -> Attr -> Bool)
-> (Attr -> Attr -> Attr)
-> (Attr -> Attr -> Attr)
-> Ord Attr
Attr -> Attr -> Bool
Attr -> Attr -> Ordering
Attr -> Attr -> Attr
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: Attr -> Attr -> Ordering
compare :: Attr -> Attr -> Ordering
$c< :: Attr -> Attr -> Bool
< :: Attr -> Attr -> Bool
$c<= :: Attr -> Attr -> Bool
<= :: Attr -> Attr -> Bool
$c> :: Attr -> Attr -> Bool
> :: Attr -> Attr -> Bool
$c>= :: Attr -> Attr -> Bool
>= :: Attr -> Attr -> Bool
$cmax :: Attr -> Attr -> Attr
max :: Attr -> Attr -> Attr
$cmin :: Attr -> Attr -> Attr
min :: Attr -> Attr -> Attr
Ord)

instance NFData Token where
    rnf :: Token -> ()
rnf (Comment Builder
b) = Text -> ()
forall a. NFData a => a -> ()
rnf (Text -> ()) -> Text -> ()
forall a b. (a -> b) -> a -> b
$ Builder -> Text
B.toLazyText Builder
b
    rnf Token
_           = ()

-- | Parse a single 'Token'.
token :: Parser Token
token :: Parser Token
token = Parser Token
dataState -- Start in the data state.

-- | /§8.2.4.1/: Data state
dataState :: Parser Token
dataState :: Parser Token
dataState = do
    Text
content <- (Char -> Bool) -> Parser Text
takeWhile (\Char
c -> Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'<' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'&')
    if Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ Text -> Bool
T.null Text
content
      then Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Token -> Parser Token) -> Token -> Parser Token
forall a b. (a -> b) -> a -> b
$ Text -> Token
ContentText Text
content
      else [Parser Token] -> Parser Token
forall (f :: * -> *) a. Alternative f => [f a] -> f a
choice
        [ Char -> Parser Char
char Char
'<' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Token
tagOpen
        , Parser Token -> Parser Token
forall i a. Parser i a -> Parser i a
try (Parser Token -> Parser Token) -> Parser Token -> Parser Token
forall a b. (a -> b) -> a -> b
$ Char -> Parser Char
char Char
'&' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Token
charRef
        , Char -> Token
ContentChar Char
'&' Token -> Parser Char -> Parser Token
forall a b. a -> Parser Text b -> Parser Text a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Char -> Parser Char
char Char
'&'
        ]

charRef :: Parser Token
charRef :: Parser Token
charRef = Trie Char Token -> Parser Token
go Trie Char Token
entityTrie
  where
    go :: Trie.Trie Char Token -> Parser Token
    go :: Trie Char Token -> Parser Token
go Trie Char Token
trie = do
      Char
c <- Parser Char
anyChar
      case Char
c of
        Char
';' -> Parser Token
-> (Token -> Parser Token) -> Maybe Token -> Parser Token
forall b a. b -> (a -> b) -> Maybe a -> b
maybe Parser Token
forall a. Parser Text a
forall (f :: * -> *) a. Alternative f => f a
empty Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Trie Char Token -> Maybe Token
forall k v. Trie k v -> Maybe v
Trie.terminal Trie Char Token
trie)
        Char
_ -> Trie Char Token -> Parser Token
go (Char -> Trie Char Token -> Trie Char Token
forall k v. Ord k => k -> Trie k v -> Trie k v
Trie.step Char
c Trie Char Token
trie)

entityTrie :: Trie.Trie Char Token
entityTrie :: Trie Char Token
entityTrie = [(String, Token)] -> Trie Char Token
forall k v. Ord k => [([k], v)] -> Trie k v
Trie.fromList
    [ (Text -> String
T.unpack Text
name, Text -> Token
ContentText Text
expansion)
    | (Text
name, Text
expansion) <- [(Text, Text)]
entities
    ]

-- | /§8.2.4.6/: Tag open state
tagOpen :: Parser Token
tagOpen :: Parser Token
tagOpen =
        (Char -> Parser Char
char Char
'!' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Token
markupDeclOpen)
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'/' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Token
endTagOpen)
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'?' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
bogusComment Builder
forall a. Monoid a => a
mempty)
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Token
tagNameOpen
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Token
other
  where
    other :: Parser Token
other = do
        Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Token -> Parser Token) -> Token -> Parser Token
forall a b. (a -> b) -> a -> b
$ Char -> Token
ContentChar Char
'<'

-- | /§8.2.4.7/: End tag open state
endTagOpen :: Parser Token
endTagOpen :: Parser Token
endTagOpen = Parser Token
tagNameClose

-- | Equivalent to @inClass "\x09\x0a\x0c "@
isWhitespace :: Char -> Bool
isWhitespace :: Char -> Bool
isWhitespace Char
'\x09' = Bool
True
isWhitespace Char
'\x0a' = Bool
True
isWhitespace Char
'\x0c' = Bool
True
isWhitespace Char
'\x0d' = Bool
True
isWhitespace Char
' '    = Bool
True
isWhitespace Char
_      = Bool
False

orC :: (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
orC :: (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
orC Char -> Bool
f Char -> Bool
g Char
c = Char -> Bool
f Char
c Bool -> Bool -> Bool
|| Char -> Bool
g Char
c
{-# INLINE orC #-}

isC :: Char -> Char -> Bool
isC :: Char -> Char -> Bool
isC = Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
(==)
{-# INLINE isC #-}

-- | /§8.2.4.8/: Tag name state: the open case
--
-- deviation: no lower-casing, don't handle NULL characters
tagNameOpen :: Parser Token
tagNameOpen :: Parser Token
tagNameOpen = do
    Text
tag <- Parser Text
tagName'
    Parser Token -> Parser Token
forall a. a -> a
id (Parser Token -> Parser Token) -> Parser Token -> Parser Token
forall a b. (a -> b) -> a -> b
$  ((Char -> Bool) -> Parser Char
satisfy Char -> Bool
isWhitespace Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
beforeAttrName Text
tag [])
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'/' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
selfClosingStartTag Text
tag [])
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagOpen Text
tag []))

-- | /§8.2.4.10/: Tag name state: close case
tagNameClose :: Parser Token
tagNameClose :: Parser Token
tagNameClose = do
    Text
tag <- Parser Text
tagName'
    Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> Token
TagClose Text
tag)

-- | /§8.2.4.10/: Tag name state: common code
--
-- deviation: no lower-casing, don't handle NULL characters
tagName' :: Parser Text
tagName' :: Parser Text
tagName' = do
    Char
c <- Parser Char
peekChar'
    Bool -> Parser Text ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Parser Text ()) -> Bool -> Parser Text ()
forall a b. (a -> b) -> a -> b
$ Char -> Bool
isAsciiUpper Char
c Bool -> Bool -> Bool
|| Char -> Bool
isAsciiLower Char
c
    (Char -> Bool) -> Parser Text
takeWhile ((Char -> Bool) -> Parser Text) -> (Char -> Bool) -> Parser Text
forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool
isWhitespace (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'/' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'<' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'>')

-- | /§8.2.4.40/: Self-closing start tag state
selfClosingStartTag :: TagName -> [Attr] -> Parser Token
selfClosingStartTag :: Text -> [Attr] -> Parser Token
selfClosingStartTag Text
tag [Attr]
attrs = do
        (Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagSelfClose Text
tag [Attr]
attrs))
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput Parser Text () -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return Token
endOfFileToken)
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Text -> [Attr] -> Parser Token
beforeAttrName Text
tag [Attr]
attrs

-- | /§8.2.4.32/: Before attribute name state
--
-- deviation: no lower-casing
beforeAttrName :: TagName -> [Attr] -> Parser Token
beforeAttrName :: Text -> [Attr] -> Parser Token
beforeAttrName Text
tag [Attr]
attrs = do
    (Char -> Bool) -> Parser Text ()
skipWhile Char -> Bool
isWhitespace
    Parser Token -> Parser Token
forall a. a -> a
id (Parser Token -> Parser Token) -> Parser Token -> Parser Token
forall a b. (a -> b) -> a -> b
$  (Char -> Parser Char
char Char
'/' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
selfClosingStartTag Text
tag [Attr]
attrs)
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagOpen Text
tag [Attr]
attrs))
      -- <|> (char '\x00' >> attrName tag attrs) -- TODO: NULL
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Text -> [Attr] -> Parser Token
attrName Text
tag [Attr]
attrs

-- | /§8.2.4.33/: Attribute name state
attrName :: TagName -> [Attr] -> Parser Token
attrName :: Text -> [Attr] -> Parser Token
attrName Text
tag [Attr]
attrs = do
    Text
name <- (Char -> Bool) -> Parser Text
takeWhile ((Char -> Bool) -> Parser Text) -> (Char -> Bool) -> Parser Text
forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool
isWhitespace (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'/' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'=' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'>')
    Parser Token -> Parser Token
forall a. a -> a
id (Parser Token -> Parser Token) -> Parser Token -> Parser Token
forall a b. (a -> b) -> a -> b
$  (Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput Parser Text () -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Text -> Parser Token
afterAttrName Text
tag [Attr]
attrs Text
name)
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'=' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Text -> Parser Token
beforeAttrValue Text
tag [Attr]
attrs Text
name)
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> ((Char -> Bool) -> Parser Char
satisfy Char -> Bool
isWhitespace Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Text -> Parser Token
afterAttrName Text
tag [Attr]
attrs Text
name)
      -- N.B. '/' is handled by afterAttrName
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Token -> Parser Token
forall i a. Parser i a -> Parser i a
try (do Maybe Char
mc <- Parser (Maybe Char)
peekChar
                  case Maybe Char
mc of
                    Just Char
c | Char -> Bool
notNameChar Char
c ->  Text -> [Attr] -> Text -> Parser Token
afterAttrName Text
tag [Attr]
attrs Text
name
                    Maybe Char
_ -> Parser Token
forall a. Parser Text a
forall (f :: * -> *) a. Alternative f => f a
empty)
      -- <|> -- TODO: NULL
  where notNameChar :: Char -> Bool
notNameChar = Char -> Bool
isWhitespace (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'/' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'>'

-- | /§8.2.4.34/: After attribute name state
afterAttrName :: TagName -> [Attr] -> AttrName -> Parser Token
afterAttrName :: Text -> [Attr] -> Text -> Parser Token
afterAttrName Text
tag [Attr]
attrs Text
name = do
    (Char -> Bool) -> Parser Text ()
skipWhile Char -> Bool
isWhitespace
    Parser Token -> Parser Token
forall a. a -> a
id (Parser Token -> Parser Token) -> Parser Token -> Parser Token
forall a b. (a -> b) -> a -> b
$  (Char -> Parser Char
char Char
'/' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
selfClosingStartTag Text
tag (Text -> Text -> Attr
Attr Text
name Text
T.empty Attr -> [Attr] -> [Attr]
forall a. a -> [a] -> [a]
: [Attr]
attrs))
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'=' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Text -> Parser Token
beforeAttrValue Text
tag [Attr]
attrs Text
name)
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagOpen Text
tag (Text -> Text -> Attr
Attr Text
name Text
T.empty Attr -> [Attr] -> [Attr]
forall a. a -> [a] -> [a]
: [Attr]
attrs)))
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput Parser Text () -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return Token
endOfFileToken)
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Text -> [Attr] -> Parser Token
attrName Text
tag (Text -> Text -> Attr
Attr Text
name Text
T.empty Attr -> [Attr] -> [Attr]
forall a. a -> [a] -> [a]
: [Attr]
attrs)  -- not exactly sure this is right

-- | /§8.2.4.35/: Before attribute value state
beforeAttrValue :: TagName -> [Attr] -> AttrName -> Parser Token
beforeAttrValue :: Text -> [Attr] -> Text -> Parser Token
beforeAttrValue Text
tag [Attr]
attrs Text
name = do
    (Char -> Bool) -> Parser Text ()
skipWhile Char -> Bool
isWhitespace
    Parser Token -> Parser Token
forall a. a -> a
id (Parser Token -> Parser Token) -> Parser Token -> Parser Token
forall a b. (a -> b) -> a -> b
$  (Char -> Parser Char
char Char
'"' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Text -> Parser Token
attrValueDQuoted Text
tag [Attr]
attrs Text
name)
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'\'' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Text -> Parser Token
attrValueSQuoted Text
tag [Attr]
attrs Text
name)
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagOpen Text
tag (Text -> Text -> Attr
Attr Text
name Text
T.empty Attr -> [Attr] -> [Attr]
forall a. a -> [a] -> [a]
: [Attr]
attrs)))
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Text -> [Attr] -> Text -> Parser Token
attrValueUnquoted Text
tag [Attr]
attrs Text
name

-- | /§8.2.4.36/: Attribute value (double-quoted) state
attrValueDQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueDQuoted :: Text -> [Attr] -> Text -> Parser Token
attrValueDQuoted Text
tag [Attr]
attrs Text
name = do
    Text
value <- (Char -> Bool) -> Parser Text
takeWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'"')
    Char
_ <- Char -> Parser Char
char Char
'"'
    Text -> [Attr] -> Text -> Text -> Parser Token
afterAttrValueQuoted Text
tag [Attr]
attrs Text
name Text
value

-- | /§8.2.4.37/: Attribute value (single-quoted) state
attrValueSQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueSQuoted :: Text -> [Attr] -> Text -> Parser Token
attrValueSQuoted Text
tag [Attr]
attrs Text
name = do
    Text
value <- (Char -> Bool) -> Parser Text
takeWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\'')
    Char
_ <- Char -> Parser Char
char Char
'\''
    Text -> [Attr] -> Text -> Text -> Parser Token
afterAttrValueQuoted Text
tag [Attr]
attrs Text
name Text
value

-- | /§8.2.4.38/: Attribute value (unquoted) state
attrValueUnquoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueUnquoted :: Text -> [Attr] -> Text -> Parser Token
attrValueUnquoted Text
tag [Attr]
attrs Text
name = do
    Text
value <- (Char -> Bool) -> Parser Text
takeTill ((Char -> Bool) -> Parser Text) -> (Char -> Bool) -> Parser Text
forall a b. (a -> b) -> a -> b
$ Char -> Bool
isWhitespace (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'>'
    Parser Token -> Parser Token
forall a. a -> a
id (Parser Token -> Parser Token) -> Parser Token -> Parser Token
forall a b. (a -> b) -> a -> b
$  ((Char -> Bool) -> Parser Char
satisfy Char -> Bool
isWhitespace Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
beforeAttrName Text
tag (Text -> Text -> Attr
Attr Text
name Text
value Attr -> [Attr] -> [Attr]
forall a. a -> [a] -> [a]
: [Attr]
attrs))
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagOpen Text
tag (Text -> Text -> Attr
Attr Text
name Text
value Attr -> [Attr] -> [Attr]
forall a. a -> [a] -> [a]
: [Attr]
attrs)))
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput Parser Text () -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return Token
endOfFileToken)

-- | /§8.2.4.39/: After attribute value (quoted) state
afterAttrValueQuoted :: TagName -> [Attr] -> AttrName -> AttrValue -> Parser Token
afterAttrValueQuoted :: Text -> [Attr] -> Text -> Text -> Parser Token
afterAttrValueQuoted Text
tag [Attr]
attrs Text
name Text
value =
          ((Char -> Bool) -> Parser Char
satisfy Char -> Bool
isWhitespace Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
beforeAttrName Text
tag [Attr]
attrs')
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'/' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Text -> [Attr] -> Parser Token
selfClosingStartTag Text
tag [Attr]
attrs')
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [Attr] -> Token
TagOpen Text
tag [Attr]
attrs'))
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput Parser Text () -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return Token
endOfFileToken)
  where attrs' :: [Attr]
attrs' = Text -> Text -> Attr
Attr Text
name Text
value Attr -> [Attr] -> [Attr]
forall a. a -> [a] -> [a]
: [Attr]
attrs

-- | /§8.2.4.41/: Bogus comment state
bogusComment :: Builder -> Parser Token
bogusComment :: Builder -> Parser Token
bogusComment Builder
content = do
        (Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput Parser Text () -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'\x00' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
bogusComment (Builder
content Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
"\xfffd"))
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Parser Char
anyChar Parser Char -> (Char -> Parser Token) -> Parser Token
forall a b. Parser Text a -> (a -> Parser Text b) -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \Char
c -> Builder -> Parser Token
bogusComment (Builder
content Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.singleton Char
c))

-- | /§8.2.4.42/: Markup declaration open state
markupDeclOpen :: Parser Token
markupDeclOpen :: Parser Token
markupDeclOpen =
        Parser Token -> Parser Token
forall i a. Parser i a -> Parser i a
try Parser Token
comment_
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Token -> Parser Token
forall i a. Parser i a -> Parser i a
try Parser Token
docType
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Builder -> Parser Token
bogusComment Builder
forall a. Monoid a => a
mempty
  where
    comment_ :: Parser Token
comment_ = Char -> Parser Char
char Char
'-' Parser Char -> Parser Char -> Parser Char
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Char -> Parser Char
char Char
'-' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Token
commentStart
    docType :: Parser Token
docType = do
        -- switching this to asciiCI slowed things down by a factor of two
        Text
s <- Int -> Parser Text
take Int
7
        Bool -> Parser Text ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Parser Text ()) -> Bool -> Parser Text ()
forall a b. (a -> b) -> a -> b
$ Text -> Text
T.toLower Text
s Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text
"doctype"
        Parser Token
doctype

-- | /§8.2.4.43/: Comment start state
commentStart :: Parser Token
commentStart :: Parser Token
commentStart = do
          (Char -> Parser Char
char Char
'-' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Token
commentStartDash)
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
forall a. Monoid a => a
mempty))
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Builder -> Parser Token
comment Builder
forall a. Monoid a => a
mempty

-- | /§8.2.4.44/: Comment start dash state
commentStartDash :: Parser Token
commentStartDash :: Parser Token
commentStartDash =
          (Char -> Parser Char
char Char
'-' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentEnd Builder
forall a. Monoid a => a
mempty)
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
forall a. Monoid a => a
mempty))
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput Parser Text () -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
forall a. Monoid a => a
mempty))
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Builder -> Parser Token
comment (Char -> Builder
B.singleton Char
'-'))

-- | /§8.2.4.45/: Comment state
comment :: Builder -> Parser Token
comment :: Builder -> Parser Token
comment Builder
content0 = do
    Builder
content <- Text -> Builder
B.fromText (Text -> Builder) -> Parser Text -> Parser Text Builder
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Char -> Bool) -> Parser Text
takeWhile (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char -> Bool
isC Char
'-' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'\x00' (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
`orC` Char -> Char -> Bool
isC Char
'<'))
    Parser Token -> Parser Token
forall a. a -> a
id (Parser Token -> Parser Token) -> Parser Token -> Parser Token
forall a b. (a -> b) -> a -> b
$  (Char -> Parser Char
char Char
'<' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentLessThan (Builder
content0 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
content Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
"<"))
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'-' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentEndDash (Builder
content0 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
content))
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'\x00' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
comment (Builder
content0 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
content Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.singleton Char
'\xfffd'))
      Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput Parser Text () -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment (Builder -> Token) -> Builder -> Token
forall a b. (a -> b) -> a -> b
$ Builder
content0 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
content))

-- | /§8.2.46/: Comment less-than sign state
commentLessThan :: Builder -> Parser Token
commentLessThan :: Builder -> Parser Token
commentLessThan Builder
content =
        (Char -> Parser Char
char Char
'!' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentLessThanBang (Builder
content Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
"!"))
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'<' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentLessThan (Builder
content Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
"<"))
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Builder -> Parser Token
comment Builder
content

-- | /§8.2.47/: Comment less-than sign bang state
commentLessThanBang :: Builder -> Parser Token
commentLessThanBang :: Builder -> Parser Token
commentLessThanBang Builder
content =
        (Char -> Parser Char
char Char
'-' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentLessThanBangDash Builder
content)
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Builder -> Parser Token
comment Builder
content

-- | /§8.2.48/: Comment less-than sign bang dash state
commentLessThanBangDash :: Builder -> Parser Token
commentLessThanBangDash :: Builder -> Parser Token
commentLessThanBangDash Builder
content =
        (Char -> Parser Char
char Char
'-' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentLessThanBangDashDash Builder
content)
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Builder -> Parser Token
commentEndDash Builder
content

-- | /§8.2.49/: Comment less-than sign bang dash dash state
commentLessThanBangDashDash :: Builder -> Parser Token
commentLessThanBangDashDash :: Builder -> Parser Token
commentLessThanBangDashDash Builder
content =
        (Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
comment Builder
content)
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput Parser Text () -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
comment Builder
content)
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Builder -> Parser Token
commentEnd Builder
content

-- | /§8.2.4.50/: Comment end dash state
commentEndDash :: Builder -> Parser Token
commentEndDash :: Builder -> Parser Token
commentEndDash Builder
content = do
        (Char -> Parser Char
char Char
'-' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentEnd Builder
content)
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput Parser Text () -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Builder -> Parser Token
comment (Builder
content Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
"-"))

-- | /§8.2.4.51/: Comment end state
commentEnd :: Builder -> Parser Token
commentEnd :: Builder -> Parser Token
commentEnd Builder
content = do
        (Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'!' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentEndBang Builder
content)
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'-' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentEnd (Builder
content Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
"-"))
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput Parser Text () -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Builder -> Parser Token
comment (Builder
content Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
"--"))

-- | /§8.2.4.52/: Comment end bang state
commentEndBang :: Builder -> Parser Token
commentEndBang :: Builder -> Parser Token
commentEndBang Builder
content = do
        (Char -> Parser Char
char Char
'-' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Builder -> Parser Token
commentEndDash (Builder
content Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
"--!"))
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char
char Char
'>' Parser Char -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput Parser Text () -> Parser Token -> Parser Token
forall a b. Parser Text a -> Parser Text b -> Parser Text b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Builder -> Token
Comment Builder
content))
    Parser Token -> Parser Token -> Parser Token
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Builder -> Parser Token
comment (Builder
content Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
"--!"))

-- | /§8.2.4.53/: DOCTYPE state
-- FIXME
doctype :: Parser Token
doctype :: Parser Token
doctype = do
    Text
content <- (Char -> Bool) -> Parser Text
takeTill (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'>')
    Char
_ <- Char -> Parser Char
char Char
'>'
    Token -> Parser Token
forall a. a -> Parser Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Token -> Parser Token) -> Token -> Parser Token
forall a b. (a -> b) -> a -> b
$ Text -> Token
Doctype Text
content

-- | Parse a lazy list of tokens from strict 'Text'.
parseTokens :: Text -> [Token]
parseTokens :: Text -> [Token]
parseTokens = (Text -> Maybe (Token, Text)) -> Text -> [Token]
forall b a. (b -> Maybe (a, b)) -> b -> [a]
unfoldr Text -> Maybe (Token, Text)
f
  where
    f :: Text -> Maybe (Token, Text)
    f :: Text -> Maybe (Token, Text)
f Text
t
      | Text -> Bool
T.null Text
t = Maybe (Token, Text)
forall a. Maybe a
Nothing
      | Bool
otherwise =
        case Parser Token -> Text -> Result Token
forall a. Parser a -> Text -> Result a
parse Parser Token
token Text
t of
            Done Text
rest Token
tok -> (Token, Text) -> Maybe (Token, Text)
forall a. a -> Maybe a
Just (Token
tok, Text
rest)
            Partial Text -> Result Token
cont  ->
                case Text -> Result Token
cont Text
forall a. Monoid a => a
mempty of
                  Done Text
rest Token
tok -> (Token, Text) -> Maybe (Token, Text)
forall a. a -> Maybe a
Just (Token
tok, Text
rest)
                  Result Token
_             -> Maybe (Token, Text)
forall a. Maybe a
Nothing
            Result Token
_             -> Maybe (Token, Text)
forall a. Maybe a
Nothing

-- | Parse a lazy list of tokens from lazy 'TL.Text'.
parseTokensLazy :: TL.Text -> [Token]
parseTokensLazy :: Text -> [Token]
parseTokensLazy = (Text -> Maybe (Token, Text)) -> Text -> [Token]
forall b a. (b -> Maybe (a, b)) -> b -> [a]
unfoldr Text -> Maybe (Token, Text)
f
  where
    f :: TL.Text -> Maybe (Token, TL.Text)
    f :: Text -> Maybe (Token, Text)
f Text
t
      | Text -> Bool
TL.null Text
t = Maybe (Token, Text)
forall a. Maybe a
Nothing
      | Bool
otherwise =
        case Parser Token -> Text -> Result Token
forall a. Parser a -> Text -> Result a
AL.parse Parser Token
token Text
t of
            AL.Done Text
rest Token
tok -> (Token, Text) -> Maybe (Token, Text)
forall a. a -> Maybe a
Just (Token
tok, Text
rest)
            Result Token
_                -> Maybe (Token, Text)
forall a. Maybe a
Nothing

-- | See 'renderToken'.
renderTokens :: [Token] -> TL.Text
renderTokens :: [Token] -> Text
renderTokens = [Text] -> Text
forall a. Monoid a => [a] -> a
mconcat ([Text] -> Text) -> ([Token] -> [Text]) -> [Token] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Token -> Text) -> [Token] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Token -> Text
renderToken

-- | (Somewhat) canonical string representation of 'Token'.
renderToken :: Token -> TL.Text
renderToken :: Token -> Text
renderToken = Text -> Text
TL.fromStrict (Text -> Text) -> (Token -> Text) -> Token -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Text] -> Text
forall a. Monoid a => [a] -> a
mconcat ([Text] -> Text) -> (Token -> [Text]) -> Token -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. \case
    (TagOpen Text
n [])         -> [Text
"<", Text
n, Text
">"]
    (TagOpen Text
n [Attr]
attrs)      -> [Text
"<", Text
n, Text
" ", [Attr] -> Text
renderAttrs [Attr]
attrs, Text
">"]
    (TagSelfClose Text
n [Attr]
attrs) -> [Text
"<", Text
n, Text
" ", [Attr] -> Text
renderAttrs [Attr]
attrs, Text
" />"]
    (TagClose Text
n)           -> [Text
"</", Text
n, Text
">"]
    (ContentChar Char
c)        -> [Char -> Text
T.singleton Char
c]
    (ContentText Text
t)        -> [Text
t]
    (Comment Builder
builder)      -> [Text
"<!--", Text -> Text
TL.toStrict (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$ Builder -> Text
B.toLazyText Builder
builder, Text
"-->"]
    (Doctype Text
t)            -> [Text
"<!DOCTYPE", Text
t, Text
">"]

-- | See 'renderAttr'.
renderAttrs :: [Attr] -> Text
renderAttrs :: [Attr] -> Text
renderAttrs = [Text] -> Text
T.unwords ([Text] -> Text) -> ([Attr] -> [Text]) -> [Attr] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Attr -> Text) -> [Attr] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Attr -> Text
renderAttr ([Attr] -> [Text]) -> ([Attr] -> [Attr]) -> [Attr] -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Attr] -> [Attr]
forall a. [a] -> [a]
reverse

-- | Does not escape quotation in attribute values!
renderAttr :: Attr -> Text
renderAttr :: Attr -> Text
renderAttr (Attr Text
k Text
v) = [Text] -> Text
forall a. Monoid a => [a] -> a
mconcat [Text
k, Text
"=\"", Text
v, Text
"\""]

-- | Meld neighoring 'ContentChar' and 'ContentText'
-- constructors together and drops empty text elements.
canonicalizeTokens :: [Token] -> [Token]
canonicalizeTokens :: [Token] -> [Token]
canonicalizeTokens = (Token -> Bool) -> [Token] -> [Token]
forall a. (a -> Bool) -> [a] -> [a]
filter (Token -> Token -> Bool
forall a. Eq a => a -> a -> Bool
/= Text -> Token
ContentText Text
"") ([Token] -> [Token]) -> ([Token] -> [Token]) -> [Token] -> [Token]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Token] -> [Token]
meldTextTokens

meldTextTokens :: [Token] -> [Token]
meldTextTokens :: [Token] -> [Token]
meldTextTokens = [Token] -> [Token]
concatTexts ([Token] -> [Token]) -> ([Token] -> [Token]) -> [Token] -> [Token]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Token -> Token) -> [Token] -> [Token]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Token -> Token
charToText
  where
    charToText :: Token -> Token
charToText (ContentChar Char
c) = Text -> Token
ContentText (Char -> Text
T.singleton Char
c)
    charToText Token
t = Token
t

    concatTexts :: [Token] -> [Token]
concatTexts = \case
      (ContentText Text
t : ContentText Text
t' : [Token]
ts) -> [Token] -> [Token]
concatTexts ([Token] -> [Token]) -> [Token] -> [Token]
forall a b. (a -> b) -> a -> b
$ Text -> Token
ContentText (Text
t Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
t') Token -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: [Token]
ts
      (Token
t : [Token]
ts) -> Token
t Token -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: [Token] -> [Token]
concatTexts [Token]
ts
      [] -> []