{-# LANGUAGE OverloadedStrings #-}

{- |
Module      : Langchain.DocumentLoader.PdfLoader
Description : A PDF loader that extracts documents from PDF files.
Copyright   : (C) 2025 Tushar Adhatrao
License     : MIT
Maintainer  : Tushar Adhatrao <tusharadhatrao@gmail.com>
Stability   : experimental

This module provides a loader for PDF files by implementing the
'BaseLoader' interface from "Langchain.DocumentLoader.Core". It uses
the 'Pdf.Document' library to open a PDF and extract its content, turning
each page into a 'Document'. Additionally, it provides a method to load the
raw content of the file and split it using a recursive character splitter.
-}
module Langchain.DocumentLoader.PdfLoader
  ( PdfLoader (..)
  ) where

import Data.Aeson
import Data.Map (fromList)
import Data.Text (pack)
import Langchain.DocumentLoader.Core
import Langchain.TextSplitter.Character
import Pdf.Document hiding (Document)
import System.Directory (doesFileExist)

-- TODO: Need some error handling for this function

{- |
An internal function
Reads a PDF file and extracts a list of 'Document's, one per page.

This function opens the PDF file at the specified 'FilePath' and uses
the Pdf.Document library to extract the text from each page. Each page's
content is wrapped in a 'Document' along with metadata indicating the page number.

Note: This function currently has minimal error handling. Improvements may be
required to properly handle various PDF parsing errors.

@param fPath The file path to the PDF file.
@return An IO action yielding a list of 'Document's extracted from the PDF.
-}
readPdf :: FilePath -> IO [Document]
readPdf :: FilePath -> IO [Document]
readPdf FilePath
fPath = do
  FilePath -> (Pdf -> IO [Document]) -> IO [Document]
forall a. FilePath -> (Pdf -> IO a) -> IO a
withPdfFile FilePath
fPath ((Pdf -> IO [Document]) -> IO [Document])
-> (Pdf -> IO [Document]) -> IO [Document]
forall a b. (a -> b) -> a -> b
$ \Pdf
pdf -> do
    Document
doc <- Pdf -> IO Document
document Pdf
pdf
    Catalog
catalog <- Document -> IO Catalog
documentCatalog Document
doc
    PageNode
rootNode <- Catalog -> IO PageNode
catalogPageNode Catalog
catalog
    Int
count <- PageNode -> IO Int
pageNodeNKids PageNode
rootNode
    [Text]
textList <- [IO Text] -> IO [Text]
forall (t :: * -> *) (m :: * -> *) a.
(Traversable t, Monad m) =>
t (m a) -> m (t a)
forall (m :: * -> *) a. Monad m => [m a] -> m [a]
sequence [Page -> IO Text
pageExtractText (Page -> IO Text) -> IO Page -> IO Text
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< PageNode -> Int -> IO Page
pageNodePageByNum PageNode
rootNode Int
i | Int
i <- [Int
0 .. Int
count Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1]]
    [Document] -> IO [Document]
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure
      ([Document] -> IO [Document]) -> [Document] -> IO [Document]
forall a b. (a -> b) -> a -> b
$ ((Text, Int) -> Document) -> [(Text, Int)] -> [Document]
forall a b. (a -> b) -> [a] -> [b]
map
        ( \(Text
content, Int
pageNum) ->
            Document
              { pageContent :: Text
pageContent = Text
content
              , metadata :: Map Text Value
metadata = [(Text, Value)] -> Map Text Value
forall k a. Ord k => [(k, a)] -> Map k a
fromList [(Text
"page number", Scientific -> Value
Number (Scientific -> Value) -> Scientific -> Value
forall a b. (a -> b) -> a -> b
$ Int -> Scientific
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
pageNum)]
              }
        )
      ([(Text, Int)] -> [Document]) -> [(Text, Int)] -> [Document]
forall a b. (a -> b) -> a -> b
$ [Text] -> [Int] -> [(Text, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Text]
textList [Int
1 .. Int
count]

{- |
A loader for PDF files.

The 'PdfLoader' data type encapsulates a 'FilePath' pointing to a PDF document.
It implements the 'BaseLoader' interface to provide methods for loading and
splitting PDF content.
-}
data PdfLoader = PdfLoader FilePath

instance BaseLoader PdfLoader where
  -- \|
  --  Loads all pages from the PDF file specified by the 'PdfLoader'.
  --
  --  This function first checks whether the file exists. If it does, it uses
  --  'readPdf' to extract the content of each page as a separate 'Document'. If
  --  the file is not found, an appropriate error message is returned.
  --
  --  @param loader A 'PdfLoader' containing the file path to the PDF.
  --  @return An IO action yielding either an error message or a list of 'Document's.
  --
  load :: PdfLoader -> IO (Either FilePath [Document])
load (PdfLoader FilePath
path) = do
    Bool
exists <- FilePath -> IO Bool
doesFileExist FilePath
path
    if Bool
exists
      then do
        [Document]
content <- FilePath -> IO [Document]
readPdf FilePath
path
        Either FilePath [Document] -> IO (Either FilePath [Document])
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Either FilePath [Document] -> IO (Either FilePath [Document]))
-> Either FilePath [Document] -> IO (Either FilePath [Document])
forall a b. (a -> b) -> a -> b
$ [Document] -> Either FilePath [Document]
forall a b. b -> Either a b
Right [Document]
content
      else
        Either FilePath [Document] -> IO (Either FilePath [Document])
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Either FilePath [Document] -> IO (Either FilePath [Document]))
-> Either FilePath [Document] -> IO (Either FilePath [Document])
forall a b. (a -> b) -> a -> b
$ FilePath -> Either FilePath [Document]
forall a b. a -> Either a b
Left (FilePath -> Either FilePath [Document])
-> FilePath -> Either FilePath [Document]
forall a b. (a -> b) -> a -> b
$ FilePath
"File not found: " FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ FilePath
path

  -- \|
  --  Loads the raw content of the PDF file and splits it using a recursive character splitter.
  --
  --  This method reads the entire file as text (without parsing its PDF structure) and applies
  --  'splitText' with default recursive character options to divide the text into chunks.
  --  This approach is useful when only a simple text split is required rather than structured
  --  page extraction.
  --
  --  @param loader A 'PdfLoader' containing the file path to the PDF.
  --  @return An IO action yielding either an error message or a list of text chunks.
  --
  loadAndSplit :: PdfLoader -> IO (Either FilePath [Text])
loadAndSplit (PdfLoader FilePath
path) = do
    Bool
exists <- FilePath -> IO Bool
doesFileExist FilePath
path
    if Bool
exists
      then do
        FilePath
content <- FilePath -> IO FilePath
readFile FilePath
path
        Either FilePath [Text] -> IO (Either FilePath [Text])
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Either FilePath [Text] -> IO (Either FilePath [Text]))
-> Either FilePath [Text] -> IO (Either FilePath [Text])
forall a b. (a -> b) -> a -> b
$ [Text] -> Either FilePath [Text]
forall a b. b -> Either a b
Right ([Text] -> Either FilePath [Text])
-> [Text] -> Either FilePath [Text]
forall a b. (a -> b) -> a -> b
$ CharacterSplitterOps -> Text -> [Text]
splitText CharacterSplitterOps
defaultCharacterSplitterOps (FilePath -> Text
pack FilePath
content)
      else
        Either FilePath [Text] -> IO (Either FilePath [Text])
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Either FilePath [Text] -> IO (Either FilePath [Text]))
-> Either FilePath [Text] -> IO (Either FilePath [Text])
forall a b. (a -> b) -> a -> b
$ FilePath -> Either FilePath [Text]
forall a b. a -> Either a b
Left (FilePath -> Either FilePath [Text])
-> FilePath -> Either FilePath [Text]
forall a b. (a -> b) -> a -> b
$ FilePath
"File not found: " FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ FilePath
path