{-# LANGUAGE OverloadedStrings #-}
module Langchain.DocumentLoader.PdfLoader
( PdfLoader (..)
) where
import Data.Aeson
import Data.Map (fromList)
import Data.Text (pack)
import Langchain.DocumentLoader.Core
import Langchain.TextSplitter.Character
import Pdf.Document hiding (Document)
import System.Directory (doesFileExist)
readPdf :: FilePath -> IO [Document]
readPdf :: FilePath -> IO [Document]
readPdf FilePath
fPath = do
FilePath -> (Pdf -> IO [Document]) -> IO [Document]
forall a. FilePath -> (Pdf -> IO a) -> IO a
withPdfFile FilePath
fPath ((Pdf -> IO [Document]) -> IO [Document])
-> (Pdf -> IO [Document]) -> IO [Document]
forall a b. (a -> b) -> a -> b
$ \Pdf
pdf -> do
Document
doc <- Pdf -> IO Document
document Pdf
pdf
Catalog
catalog <- Document -> IO Catalog
documentCatalog Document
doc
PageNode
rootNode <- Catalog -> IO PageNode
catalogPageNode Catalog
catalog
Int
count <- PageNode -> IO Int
pageNodeNKids PageNode
rootNode
[Text]
textList <- [IO Text] -> IO [Text]
forall (t :: * -> *) (m :: * -> *) a.
(Traversable t, Monad m) =>
t (m a) -> m (t a)
forall (m :: * -> *) a. Monad m => [m a] -> m [a]
sequence [Page -> IO Text
pageExtractText (Page -> IO Text) -> IO Page -> IO Text
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< PageNode -> Int -> IO Page
pageNodePageByNum PageNode
rootNode Int
i | Int
i <- [Int
0 .. Int
count Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1]]
[Document] -> IO [Document]
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure
([Document] -> IO [Document]) -> [Document] -> IO [Document]
forall a b. (a -> b) -> a -> b
$ ((Text, Int) -> Document) -> [(Text, Int)] -> [Document]
forall a b. (a -> b) -> [a] -> [b]
map
( \(Text
content, Int
pageNum) ->
Document
{ pageContent :: Text
pageContent = Text
content
, metadata :: Map Text Value
metadata = [(Text, Value)] -> Map Text Value
forall k a. Ord k => [(k, a)] -> Map k a
fromList [(Text
"page number", Scientific -> Value
Number (Scientific -> Value) -> Scientific -> Value
forall a b. (a -> b) -> a -> b
$ Int -> Scientific
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
pageNum)]
}
)
([(Text, Int)] -> [Document]) -> [(Text, Int)] -> [Document]
forall a b. (a -> b) -> a -> b
$ [Text] -> [Int] -> [(Text, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Text]
textList [Int
1 .. Int
count]
data PdfLoader = PdfLoader FilePath
instance BaseLoader PdfLoader where
load :: PdfLoader -> IO (Either FilePath [Document])
load (PdfLoader FilePath
path) = do
Bool
exists <- FilePath -> IO Bool
doesFileExist FilePath
path
if Bool
exists
then do
[Document]
content <- FilePath -> IO [Document]
readPdf FilePath
path
Either FilePath [Document] -> IO (Either FilePath [Document])
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Either FilePath [Document] -> IO (Either FilePath [Document]))
-> Either FilePath [Document] -> IO (Either FilePath [Document])
forall a b. (a -> b) -> a -> b
$ [Document] -> Either FilePath [Document]
forall a b. b -> Either a b
Right [Document]
content
else
Either FilePath [Document] -> IO (Either FilePath [Document])
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Either FilePath [Document] -> IO (Either FilePath [Document]))
-> Either FilePath [Document] -> IO (Either FilePath [Document])
forall a b. (a -> b) -> a -> b
$ FilePath -> Either FilePath [Document]
forall a b. a -> Either a b
Left (FilePath -> Either FilePath [Document])
-> FilePath -> Either FilePath [Document]
forall a b. (a -> b) -> a -> b
$ FilePath
"File not found: " FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ FilePath
path
loadAndSplit :: PdfLoader -> IO (Either FilePath [Text])
loadAndSplit (PdfLoader FilePath
path) = do
Bool
exists <- FilePath -> IO Bool
doesFileExist FilePath
path
if Bool
exists
then do
FilePath
content <- FilePath -> IO FilePath
readFile FilePath
path
Either FilePath [Text] -> IO (Either FilePath [Text])
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Either FilePath [Text] -> IO (Either FilePath [Text]))
-> Either FilePath [Text] -> IO (Either FilePath [Text])
forall a b. (a -> b) -> a -> b
$ [Text] -> Either FilePath [Text]
forall a b. b -> Either a b
Right ([Text] -> Either FilePath [Text])
-> [Text] -> Either FilePath [Text]
forall a b. (a -> b) -> a -> b
$ CharacterSplitterOps -> Text -> [Text]
splitText CharacterSplitterOps
defaultCharacterSplitterOps (FilePath -> Text
pack FilePath
content)
else
Either FilePath [Text] -> IO (Either FilePath [Text])
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Either FilePath [Text] -> IO (Either FilePath [Text]))
-> Either FilePath [Text] -> IO (Either FilePath [Text])
forall a b. (a -> b) -> a -> b
$ FilePath -> Either FilePath [Text]
forall a b. a -> Either a b
Left (FilePath -> Either FilePath [Text])
-> FilePath -> Either FilePath [Text]
forall a b. (a -> b) -> a -> b
$ FilePath
"File not found: " FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ FilePath
path