{-# LANGUAGE OverloadedStrings #-}
{- |
   Module      : Text.Pandoc.Readers.Pptx.Parse
   Copyright   : © 2025 Anton Antic
   License     : GNU GPL, version 2 or above

   Maintainer  : Anton Antic <anton@everworker.ai>
   Stability   : alpha
   Portability : portable

Parsing of PPTX archive to intermediate representation.
-}
module Text.Pandoc.Readers.Pptx.Parse
  ( Pptx(..)
  , PresentationDoc(..)
  , PptxSlide(..)
  , SlideId(..)
  , archiveToPptx
  ) where

import Codec.Archive.Zip (Archive, Entry, findEntryByPath, fromEntry)
import qualified Data.ByteString.Lazy as B
import Data.List (find)
import Data.Maybe (mapMaybe)
import qualified Data.Text as T
import qualified Data.Text.Lazy.Encoding as TL
import Data.Text (Text)
import System.FilePath (splitFileName)
import Text.Pandoc.Readers.OOXML.Shared
import Text.Pandoc.XML.Light
import Text.Read (readMaybe)

-- | Slide identifier
newtype SlideId = SlideId Int deriving (Int -> SlideId -> ShowS
[SlideId] -> ShowS
SlideId -> String
(Int -> SlideId -> ShowS)
-> (SlideId -> String) -> ([SlideId] -> ShowS) -> Show SlideId
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> SlideId -> ShowS
showsPrec :: Int -> SlideId -> ShowS
$cshow :: SlideId -> String
show :: SlideId -> String
$cshowList :: [SlideId] -> ShowS
showList :: [SlideId] -> ShowS
Show, SlideId -> SlideId -> Bool
(SlideId -> SlideId -> Bool)
-> (SlideId -> SlideId -> Bool) -> Eq SlideId
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: SlideId -> SlideId -> Bool
== :: SlideId -> SlideId -> Bool
$c/= :: SlideId -> SlideId -> Bool
/= :: SlideId -> SlideId -> Bool
Eq, Eq SlideId
Eq SlideId =>
(SlideId -> SlideId -> Ordering)
-> (SlideId -> SlideId -> Bool)
-> (SlideId -> SlideId -> Bool)
-> (SlideId -> SlideId -> Bool)
-> (SlideId -> SlideId -> Bool)
-> (SlideId -> SlideId -> SlideId)
-> (SlideId -> SlideId -> SlideId)
-> Ord SlideId
SlideId -> SlideId -> Bool
SlideId -> SlideId -> Ordering
SlideId -> SlideId -> SlideId
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: SlideId -> SlideId -> Ordering
compare :: SlideId -> SlideId -> Ordering
$c< :: SlideId -> SlideId -> Bool
< :: SlideId -> SlideId -> Bool
$c<= :: SlideId -> SlideId -> Bool
<= :: SlideId -> SlideId -> Bool
$c> :: SlideId -> SlideId -> Bool
> :: SlideId -> SlideId -> Bool
$c>= :: SlideId -> SlideId -> Bool
>= :: SlideId -> SlideId -> Bool
$cmax :: SlideId -> SlideId -> SlideId
max :: SlideId -> SlideId -> SlideId
$cmin :: SlideId -> SlideId -> SlideId
min :: SlideId -> SlideId -> SlideId
Ord)

-- | Complete PPTX document (intermediate representation)
data Pptx = Pptx
  { Pptx -> PresentationDoc
pptxPresentation :: PresentationDoc
  , Pptx -> [PptxSlide]
pptxSlides       :: [PptxSlide]
  , Pptx -> Archive
pptxArchive      :: Archive
  } deriving (Int -> Pptx -> ShowS
[Pptx] -> ShowS
Pptx -> String
(Int -> Pptx -> ShowS)
-> (Pptx -> String) -> ([Pptx] -> ShowS) -> Show Pptx
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Pptx -> ShowS
showsPrec :: Int -> Pptx -> ShowS
$cshow :: Pptx -> String
show :: Pptx -> String
$cshowList :: [Pptx] -> ShowS
showList :: [Pptx] -> ShowS
Show)

-- | Individual slide data
data PptxSlide = PptxSlide
  { PptxSlide -> SlideId
slideId      :: SlideId
  , PptxSlide -> String
slidePath    :: FilePath
  , PptxSlide -> Element
slideElement :: Element     -- The parsed p:sld element
  , PptxSlide -> [(Text, Text)]
slideRels    :: [(Text, Text)]  -- Slide relationships
  } deriving (Int -> PptxSlide -> ShowS
[PptxSlide] -> ShowS
PptxSlide -> String
(Int -> PptxSlide -> ShowS)
-> (PptxSlide -> String)
-> ([PptxSlide] -> ShowS)
-> Show PptxSlide
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> PptxSlide -> ShowS
showsPrec :: Int -> PptxSlide -> ShowS
$cshow :: PptxSlide -> String
show :: PptxSlide -> String
$cshowList :: [PptxSlide] -> ShowS
showList :: [PptxSlide] -> ShowS
Show)

-- | Presentation-level information from presentation.xml
data PresentationDoc = PresentationDoc
  { PresentationDoc -> NameSpaces
presNameSpaces   :: NameSpaces
  , PresentationDoc -> (Integer, Integer)
presSlideSize    :: (Integer, Integer)  -- (width, height) in pixels
  , PresentationDoc -> [(SlideId, Text)]
presSlideIds     :: [(SlideId, Text)]   -- (slideId, relationshipId)
  } deriving (Int -> PresentationDoc -> ShowS
[PresentationDoc] -> ShowS
PresentationDoc -> String
(Int -> PresentationDoc -> ShowS)
-> (PresentationDoc -> String)
-> ([PresentationDoc] -> ShowS)
-> Show PresentationDoc
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> PresentationDoc -> ShowS
showsPrec :: Int -> PresentationDoc -> ShowS
$cshow :: PresentationDoc -> String
show :: PresentationDoc -> String
$cshowList :: [PresentationDoc] -> ShowS
showList :: [PresentationDoc] -> ShowS
Show)

-- | Parse PPTX archive to intermediate representation
archiveToPptx :: Archive -> Either Text Pptx
archiveToPptx :: Archive -> Either Text Pptx
archiveToPptx Archive
archive = do
  -- Find and parse presentation.xml
  String
presPath <- Archive -> Either Text String
getPresentationXmlPath Archive
archive
  Element
presElem <- Archive -> String -> Either Text Element
loadXMLFromArchive Archive
archive String
presPath
  PresentationDoc
presDoc <- Element -> Either Text PresentationDoc
elemToPresentation Element
presElem

  -- Load presentation relationships to resolve slide paths
  String
presRelsPath <- Archive -> String -> Either Text String
getPresentationRelsPath Archive
archive String
presPath
  [(Text, Text)]
presRels <- Archive -> String -> Either Text [(Text, Text)]
loadRelationships Archive
archive String
presRelsPath

  -- Parse each slide
  [PptxSlide]
slides <- ((SlideId, Text) -> Either Text PptxSlide)
-> [(SlideId, Text)] -> Either Text [PptxSlide]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> [a] -> m [b]
mapM (Archive
-> [(Text, Text)] -> (SlideId, Text) -> Either Text PptxSlide
parseSlide Archive
archive [(Text, Text)]
presRels) (PresentationDoc -> [(SlideId, Text)]
presSlideIds PresentationDoc
presDoc)

  Pptx -> Either Text Pptx
forall a. a -> Either Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Pptx -> Either Text Pptx) -> Pptx -> Either Text Pptx
forall a b. (a -> b) -> a -> b
$ PresentationDoc -> [PptxSlide] -> Archive -> Pptx
Pptx PresentationDoc
presDoc [PptxSlide]
slides Archive
archive

-- | Find presentation.xml via root relationships
getPresentationXmlPath :: Archive -> Either Text FilePath
getPresentationXmlPath :: Archive -> Either Text String
getPresentationXmlPath Archive
archive = do
  -- Load _rels/.rels
  Entry
relsEntry <- Text -> Maybe Entry -> Either Text Entry
forall a. Text -> Maybe a -> Either Text a
maybeToEither Text
"Missing _rels/.rels" (Maybe Entry -> Either Text Entry)
-> Maybe Entry -> Either Text Entry
forall a b. (a -> b) -> a -> b
$
               String -> Archive -> Maybe Entry
findEntryByPath String
"_rels/.rels" Archive
archive

  Element
relsElem <- Entry -> Either Text Element
parseXMLFromEntry Entry
relsEntry

  -- The Relationships element has a default namespace, but Relationship children don't use prefix
  -- We need to look at all children regardless of namespace
  let relElems :: [Element]
relElems = [Content] -> [Element]
onlyElems ([Content] -> [Element]) -> [Content] -> [Element]
forall a b. (a -> b) -> a -> b
$ Element -> [Content]
elContent Element
relsElem

  -- Look for relationship containing "officeDocument" in Type attribute
  case (Element -> Bool) -> [Element] -> Maybe Element
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find Element -> Bool
isOfficeDocRel [Element]
relElems of
    Maybe Element
Nothing -> Text -> Either Text String
forall a b. a -> Either a b
Left (Text -> Either Text String) -> Text -> Either Text String
forall a b. (a -> b) -> a -> b
$ Text
"No presentation.xml relationship found. Found " Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<>
                     String -> Text
T.pack (Int -> String
forall a. Show a => a -> String
show ([Element] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Element]
relElems)) Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
" relationships."
    Just Element
rel -> do
      Text
target <- Text -> Maybe Text -> Either Text Text
forall a. Text -> Maybe a -> Either Text a
maybeToEither Text
"Missing Target attribute" (Maybe Text -> Either Text Text) -> Maybe Text -> Either Text Text
forall a b. (a -> b) -> a -> b
$
                QName -> Element -> Maybe Text
findAttr (Text -> QName
unqual Text
"Target") Element
rel
      String -> Either Text String
forall a. a -> Either Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (String -> Either Text String) -> String -> Either Text String
forall a b. (a -> b) -> a -> b
$ Text -> String
T.unpack Text
target  -- Convert Text to FilePath

  where
    isOfficeDocRel :: Element -> Bool
isOfficeDocRel Element
el =
      case QName -> Element -> Maybe Text
findAttr (Text -> QName
unqual Text
"Type") Element
el of
        -- Must end with "/officeDocument" to avoid matching "/extended-properties"
        Just Text
relType -> Text
"/officeDocument" Text -> Text -> Bool
`T.isSuffixOf` Text
relType
        Maybe Text
Nothing -> Bool
False

-- | Load and parse XML from archive entry
loadXMLFromArchive :: Archive -> FilePath -> Either Text Element
loadXMLFromArchive :: Archive -> String -> Either Text Element
loadXMLFromArchive Archive
archive String
path = do
  Entry
entry <- Text -> Maybe Entry -> Either Text Entry
forall a. Text -> Maybe a -> Either Text a
maybeToEither (Text
"Entry not found: " Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> String -> Text
T.pack String
path) (Maybe Entry -> Either Text Entry)
-> Maybe Entry -> Either Text Entry
forall a b. (a -> b) -> a -> b
$
           String -> Archive -> Maybe Entry
findEntryByPath String
path Archive
archive

  let xmlBytes :: ByteString
xmlBytes = Entry -> ByteString
fromEntry Entry
entry
  ByteString -> Either Text Element
parseXMLFromBS ByteString
xmlBytes

-- | Parse XML from ByteString
parseXMLFromBS :: B.ByteString -> Either Text Element
parseXMLFromBS :: ByteString -> Either Text Element
parseXMLFromBS = Text -> Either Text Element
parseXMLElement (Text -> Either Text Element)
-> (ByteString -> Text) -> ByteString -> Either Text Element
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Text
TL.decodeUtf8

-- | Parse XML from Entry
parseXMLFromEntry :: Entry -> Either Text Element
parseXMLFromEntry :: Entry -> Either Text Element
parseXMLFromEntry = ByteString -> Either Text Element
parseXMLFromBS (ByteString -> Either Text Element)
-> (Entry -> ByteString) -> Entry -> Either Text Element
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Entry -> ByteString
fromEntry

-- | Parse presentation.xml element to PresentationDoc
elemToPresentation :: Element -> Either Text PresentationDoc
elemToPresentation :: Element -> Either Text PresentationDoc
elemToPresentation Element
presElem = do
  let ns :: NameSpaces
ns = Element -> NameSpaces
elemToNameSpaces Element
presElem

  -- Extract slide size (with defaults)
  let sizeElem :: Maybe Element
sizeElem = NameSpaces -> Text -> Text -> Element -> Maybe Element
findChildByName NameSpaces
ns Text
"p" Text
"sldSz" Element
presElem
      (Integer
widthEMU, Integer
heightEMU) = case Maybe Element
sizeElem of
        Just Element
el ->
          let cx :: Integer
cx = Text -> Element -> Integer
readAttrInt Text
"cx" Element
el
              cy :: Integer
cy = Text -> Element -> Integer
readAttrInt Text
"cy" Element
el
           in (Integer
cx, Integer
cy)
        Maybe Element
Nothing -> (Integer
9144000, Integer
6858000)  -- Default 10" x 7.5"

  -- Convert EMUs to pixels (approximate for metadata)
  let width :: Integer
width = Integer
widthEMU Integer -> Integer -> Integer
forall a. Integral a => a -> a -> a
`div` Integer
emusPerInch
      height :: Integer
height = Integer
heightEMU Integer -> Integer -> Integer
forall a. Integral a => a -> a -> a
`div` Integer
emusPerInch

  -- Extract slide ID list (optional - some presentations may have no slides)
  let sldIdLstElem :: Maybe Element
sldIdLstElem = NameSpaces -> Text -> Text -> Element -> Maybe Element
findChildByName NameSpaces
ns Text
"p" Text
"sldIdLst" Element
presElem

  [(SlideId, Text)]
slideRefs <- case Maybe Element
sldIdLstElem of
    Maybe Element
Nothing -> [(SlideId, Text)] -> Either Text [(SlideId, Text)]
forall a. a -> Either Text a
forall (m :: * -> *) a. Monad m => a -> m a
return []  -- No slides is valid for templates/masters-only presentations
    Just Element
el -> do
      let sldIdElems :: [Element]
sldIdElems = QName -> Element -> [Element]
findChildren (NameSpaces -> Text -> Text -> QName
elemName NameSpaces
ns Text
"p" Text
"sldId") Element
el
      ((Int, Element) -> Either Text (SlideId, Text))
-> [(Int, Element)] -> Either Text [(SlideId, Text)]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> [a] -> m [b]
mapM (NameSpaces -> (Int, Element) -> Either Text (SlideId, Text)
extractSlideRef NameSpaces
ns) ([Int] -> [Element] -> [(Int, Element)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Int
1..] [Element]
sldIdElems)

  PresentationDoc -> Either Text PresentationDoc
forall a. a -> Either Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (PresentationDoc -> Either Text PresentationDoc)
-> PresentationDoc -> Either Text PresentationDoc
forall a b. (a -> b) -> a -> b
$ PresentationDoc
    { presNameSpaces :: NameSpaces
presNameSpaces = NameSpaces
ns
    , presSlideSize :: (Integer, Integer)
presSlideSize = (Integer
width, Integer
height)
    , presSlideIds :: [(SlideId, Text)]
presSlideIds = [(SlideId, Text)]
slideRefs
    }

-- | Extract slide ID and relationship ID from p:sldId element
extractSlideRef :: NameSpaces -> (Int, Element) -> Either Text (SlideId, Text)
extractSlideRef :: NameSpaces -> (Int, Element) -> Either Text (SlideId, Text)
extractSlideRef NameSpaces
ns (Int
idx, Element
sldIdElem) = do
  Text
relId <- Text -> Maybe Text -> Either Text Text
forall a. Text -> Maybe a -> Either Text a
maybeToEither (Text
"Missing r:id in slide " Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> String -> Text
T.pack (Int -> String
forall a. Show a => a -> String
show Int
idx)) (Maybe Text -> Either Text Text) -> Maybe Text -> Either Text Text
forall a b. (a -> b) -> a -> b
$
           NameSpaces -> Text -> Text -> Element -> Maybe Text
findAttrByName NameSpaces
ns Text
"r" Text
"id" Element
sldIdElem

  (SlideId, Text) -> Either Text (SlideId, Text)
forall a. a -> Either Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (Int -> SlideId
SlideId Int
idx, Text
relId)

-- | Safe read attribute as Integer (with default of 0)
readAttrInt :: Text -> Element -> Integer
readAttrInt :: Text -> Element -> Integer
readAttrInt Text
attrName Element
el =
  case QName -> Element -> Maybe Text
findAttr (Text -> QName
unqual Text
attrName) Element
el of
    Just Text
str -> case String -> Maybe Integer
forall a. Read a => String -> Maybe a
readMaybe (Text -> String
T.unpack Text
str) of
      Just Integer
n -> Integer
n
      Maybe Integer
Nothing -> Integer
0
    Maybe Text
Nothing -> Integer
0

-- | Get presentation relationships path
getPresentationRelsPath :: Archive -> FilePath -> Either Text FilePath
getPresentationRelsPath :: Archive -> String -> Either Text String
getPresentationRelsPath Archive
_archive String
presPath =
  -- ppt/presentation.xml → ppt/_rels/presentation.xml.rels
  let (String
dir, String
file) = String -> (String, String)
splitFileName String
presPath
      relsPath :: String
relsPath = String
dir String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
"/_rels/" String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
file String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
".rels"
   in String -> Either Text String
forall a b. b -> Either a b
Right String
relsPath

-- | Load relationships from .rels file
loadRelationships :: Archive -> FilePath -> Either Text [(Text, Text)]
loadRelationships :: Archive -> String -> Either Text [(Text, Text)]
loadRelationships Archive
archive String
relsPath =
  case String -> Archive -> Maybe Entry
findEntryByPath String
relsPath Archive
archive of
    Maybe Entry
Nothing -> [(Text, Text)] -> Either Text [(Text, Text)]
forall a b. b -> Either a b
Right []  -- No relationships is OK
    Just Entry
entry -> do
      Element
relsElem <- Entry -> Either Text Element
parseXMLFromEntry Entry
entry
      let relElems :: [Element]
relElems = [Content] -> [Element]
onlyElems ([Content] -> [Element]) -> [Content] -> [Element]
forall a b. (a -> b) -> a -> b
$ Element -> [Content]
elContent Element
relsElem
      [(Text, Text)] -> Either Text [(Text, Text)]
forall a. a -> Either Text a
forall (m :: * -> *) a. Monad m => a -> m a
return ([(Text, Text)] -> Either Text [(Text, Text)])
-> [(Text, Text)] -> Either Text [(Text, Text)]
forall a b. (a -> b) -> a -> b
$ (Element -> Maybe (Text, Text)) -> [Element] -> [(Text, Text)]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe Element -> Maybe (Text, Text)
extractRelationship [Element]
relElems
  where
    extractRelationship :: Element -> Maybe (Text, Text)
extractRelationship Element
el = do
      Text
relId <- QName -> Element -> Maybe Text
findAttr (Text -> QName
unqual Text
"Id") Element
el
      Text
target <- QName -> Element -> Maybe Text
findAttr (Text -> QName
unqual Text
"Target") Element
el
      (Text, Text) -> Maybe (Text, Text)
forall a. a -> Maybe a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text
relId, Text
target)

-- | Parse a single slide
parseSlide :: Archive -> [(Text, Text)] -> (SlideId, Text) -> Either Text PptxSlide
parseSlide :: Archive
-> [(Text, Text)] -> (SlideId, Text) -> Either Text PptxSlide
parseSlide Archive
archive [(Text, Text)]
rels (SlideId
sid, Text
relId) = do
  -- Resolve relationship to get slide path
  Text
target <- Text -> Maybe Text -> Either Text Text
forall a. Text -> Maybe a -> Either Text a
maybeToEither (Text
"Relationship not found: " Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
relId) (Maybe Text -> Either Text Text) -> Maybe Text -> Either Text Text
forall a b. (a -> b) -> a -> b
$
            Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
relId [(Text, Text)]
rels

  -- Resolve relative path: ppt/slides/slide1.xml
  let slidePath' :: String
slidePath' = String
"ppt/" String -> ShowS
forall a. Semigroup a => a -> a -> a
<> Text -> String
T.unpack Text
target

  -- Load and parse slide XML
  Element
slideElem <- Archive -> String -> Either Text Element
loadXMLFromArchive Archive
archive String
slidePath'

  -- Load slide-specific relationships
  String
slideRelsPath <- Archive -> String -> Either Text String
getPresentationRelsPath Archive
archive String
slidePath'
  [(Text, Text)]
slideRels' <- Archive -> String -> Either Text [(Text, Text)]
loadRelationships Archive
archive String
slideRelsPath

  PptxSlide -> Either Text PptxSlide
forall a. a -> Either Text a
forall (m :: * -> *) a. Monad m => a -> m a
return (PptxSlide -> Either Text PptxSlide)
-> PptxSlide -> Either Text PptxSlide
forall a b. (a -> b) -> a -> b
$ SlideId -> String -> Element -> [(Text, Text)] -> PptxSlide
PptxSlide SlideId
sid String
slidePath' Element
slideElem [(Text, Text)]
slideRels'

-- | Helper: Maybe a -> Either Text a
maybeToEither :: Text -> Maybe a -> Either Text a
maybeToEither :: forall a. Text -> Maybe a -> Either Text a
maybeToEither Text
err Maybe a
Nothing = Text -> Either Text a
forall a b. a -> Either a b
Left Text
err
maybeToEither Text
_ (Just a
x) = a -> Either Text a
forall a b. b -> Either a b
Right a
x