{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeFamilies #-}

{- |
Module      : Langchain.Tool.WebScraper
Description : Tool for scrapping text content from URL
Copyright   : (c) 2025 Tushar Adhatrao
License     : MIT
Maintainer  : Tushar Adhatrao <tusharadhatrao@gmail.com>
Stability   : experimental
-}
module Langchain.Tool.WebScraper (WebScraper (..), WebPageInfo (..), fetchAndScrape) where

import Control.Exception (SomeException, try)
import Data.Aeson (ToJSON)
import qualified Data.ByteString.Lazy as LBS
import Data.Maybe (listToMaybe)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import GHC.Generics (Generic)
import Langchain.Tool.Core
import Network.HTTP.Simple
import Text.HTML.Scalpel

-- | Represents a web scraper tool that extracts content from web pages
data WebScraper = WebScraper
  deriving (Int -> WebScraper -> ShowS
[WebScraper] -> ShowS
WebScraper -> String
(Int -> WebScraper -> ShowS)
-> (WebScraper -> String)
-> ([WebScraper] -> ShowS)
-> Show WebScraper
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> WebScraper -> ShowS
showsPrec :: Int -> WebScraper -> ShowS
$cshow :: WebScraper -> String
show :: WebScraper -> String
$cshowList :: [WebScraper] -> ShowS
showList :: [WebScraper] -> ShowS
Show)

-- | Stores the extracted webpage information
data WebPageInfo = WebPageInfo
  { WebPageInfo -> Maybe Text
pageTitle :: Maybe Text
  , WebPageInfo -> [Text]
pageHeadings :: [Text]
  , WebPageInfo -> [(Text, Text)]
pageLinks :: [(Text, Text)] -- (Link text, URL)
  , WebPageInfo -> Text
pageText :: Text
  }
  deriving (Int -> WebPageInfo -> ShowS
[WebPageInfo] -> ShowS
WebPageInfo -> String
(Int -> WebPageInfo -> ShowS)
-> (WebPageInfo -> String)
-> ([WebPageInfo] -> ShowS)
-> Show WebPageInfo
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> WebPageInfo -> ShowS
showsPrec :: Int -> WebPageInfo -> ShowS
$cshow :: WebPageInfo -> String
show :: WebPageInfo -> String
$cshowList :: [WebPageInfo] -> ShowS
showList :: [WebPageInfo] -> ShowS
Show, (forall x. WebPageInfo -> Rep WebPageInfo x)
-> (forall x. Rep WebPageInfo x -> WebPageInfo)
-> Generic WebPageInfo
forall x. Rep WebPageInfo x -> WebPageInfo
forall x. WebPageInfo -> Rep WebPageInfo x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cfrom :: forall x. WebPageInfo -> Rep WebPageInfo x
from :: forall x. WebPageInfo -> Rep WebPageInfo x
$cto :: forall x. Rep WebPageInfo x -> WebPageInfo
to :: forall x. Rep WebPageInfo x -> WebPageInfo
Generic)

-- Make WebPageInfo serializable to JSON
instance ToJSON WebPageInfo

-- | Input type for the WebScraper - just a URL
type ScraperInput = Text

-- | Implement the Tool typeclass for WebScraper
instance Tool WebScraper where
  type Input WebScraper = ScraperInput
  type Output WebScraper = Text

  toolName :: WebScraper -> Text
toolName WebScraper
_ = Text
"web_scraper"

  toolDescription :: WebScraper -> Text
toolDescription WebScraper
_ =
    Text
"Scrapes content from a webpage. Provide a valid URL, and it will extract the title,"
      Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"headings, links, and text content."

  runTool :: WebScraper -> Input WebScraper -> IO (Output WebScraper)
runTool WebScraper
_ Input WebScraper
url = do
    Either String WebPageInfo
result <- Text -> IO (Either String WebPageInfo)
fetchAndScrape Text
Input WebScraper
url
    case Either String WebPageInfo
result of
      Left String
err -> Text -> IO Text
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Text -> IO Text) -> Text -> IO Text
forall a b. (a -> b) -> a -> b
$ Text
"Error scraping webpage: " Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> String -> Text
T.pack (ShowS
forall a. Show a => a -> String
show String
err)
      Right WebPageInfo
info -> Text -> IO Text
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Text -> IO Text) -> Text -> IO Text
forall a b. (a -> b) -> a -> b
$ String -> Text
T.pack (WebPageInfo -> String
forall a. Show a => a -> String
show WebPageInfo
info)

-- | Fetch HTML content from a URL and extract webpage information
fetchAndScrape :: Text -> IO (Either String WebPageInfo)
fetchAndScrape :: Text -> IO (Either String WebPageInfo)
fetchAndScrape Text
url = do
  Request
request_ <- String -> IO Request
forall (m :: * -> *). MonadThrow m => String -> m Request
parseRequest (Text -> String
T.unpack Text
url)
  Either SomeException (Response ByteString)
eResp <- IO (Response ByteString)
-> IO (Either SomeException (Response ByteString))
forall e a. Exception e => IO a -> IO (Either e a)
try (IO (Response ByteString)
 -> IO (Either SomeException (Response ByteString)))
-> IO (Response ByteString)
-> IO (Either SomeException (Response ByteString))
forall a b. (a -> b) -> a -> b
$ Request -> IO (Response ByteString)
forall (m :: * -> *).
MonadIO m =>
Request -> m (Response ByteString)
httpLBS Request
request_ :: IO (Either SomeException (Response LBS.ByteString))
  case Either SomeException (Response ByteString)
eResp of
    Left SomeException
err -> Either String WebPageInfo -> IO (Either String WebPageInfo)
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String WebPageInfo -> IO (Either String WebPageInfo))
-> Either String WebPageInfo -> IO (Either String WebPageInfo)
forall a b. (a -> b) -> a -> b
$ String -> Either String WebPageInfo
forall a b. a -> Either a b
Left (SomeException -> String
forall a. Show a => a -> String
show SomeException
err)
    Right Response ByteString
r -> do
      let rBody :: ByteString
rBody = (Response ByteString -> ByteString
forall a. Response a -> a
getResponseBody Response ByteString
r)
      let htmlContent :: Text
htmlContent = ByteString -> Text
TE.decodeUtf8 (ByteString -> Text) -> ByteString -> Text
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
LBS.toStrict ByteString
rBody
      let scraped :: Maybe WebPageInfo
scraped = Text -> Scraper Text WebPageInfo -> Maybe WebPageInfo
forall str a. StringLike str => str -> Scraper str a -> Maybe a
scrapeStringLike Text
htmlContent Scraper Text WebPageInfo
scrapeWebPageInfo
      case Maybe WebPageInfo
scraped of
        Maybe WebPageInfo
Nothing -> Either String WebPageInfo -> IO (Either String WebPageInfo)
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String WebPageInfo -> IO (Either String WebPageInfo))
-> Either String WebPageInfo -> IO (Either String WebPageInfo)
forall a b. (a -> b) -> a -> b
$ String -> Either String WebPageInfo
forall a b. a -> Either a b
Left String
"Failed to parse HTML content"
        Just WebPageInfo
info -> Either String WebPageInfo -> IO (Either String WebPageInfo)
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String WebPageInfo -> IO (Either String WebPageInfo))
-> Either String WebPageInfo -> IO (Either String WebPageInfo)
forall a b. (a -> b) -> a -> b
$ WebPageInfo -> Either String WebPageInfo
forall a b. b -> Either a b
Right WebPageInfo
info

-- | Define the Scalpel scraper for extracting webpage information
scrapeWebPageInfo :: Scraper Text WebPageInfo
scrapeWebPageInfo :: Scraper Text WebPageInfo
scrapeWebPageInfo = do
  Maybe Text
title <- Scraper Text (Maybe Text)
scrapeTitle
  [Text]
headings <- Scraper Text [Text]
scrapeHeadings
  [(Text, Text)]
links <- Scraper Text [(Text, Text)]
scrapeLinks
  Text
t <- Scraper Text Text
scrapeText
  WebPageInfo -> Scraper Text WebPageInfo
forall a. a -> ScraperT Text Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return (WebPageInfo -> Scraper Text WebPageInfo)
-> WebPageInfo -> Scraper Text WebPageInfo
forall a b. (a -> b) -> a -> b
$ Maybe Text -> [Text] -> [(Text, Text)] -> Text -> WebPageInfo
WebPageInfo Maybe Text
title [Text]
headings [(Text, Text)]
links Text
t

-- | Scrape the page title
scrapeTitle :: Scraper Text (Maybe Text)
scrapeTitle :: Scraper Text (Maybe Text)
scrapeTitle = ([Text] -> Maybe Text)
-> Scraper Text [Text] -> Scraper Text (Maybe Text)
forall a b.
(a -> b) -> ScraperT Text Identity a -> ScraperT Text Identity b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap [Text] -> Maybe Text
forall a. [a] -> Maybe a
listToMaybe (Scraper Text [Text] -> Scraper Text (Maybe Text))
-> Scraper Text [Text] -> Scraper Text (Maybe Text)
forall a b. (a -> b) -> a -> b
$ Selector -> Scraper Text [Text]
forall str (m :: * -> *).
(StringLike str, Monad m) =>
Selector -> ScraperT str m [str]
texts Selector
"title"

-- | Scrape all headings (h1-h6)
scrapeHeadings :: Scraper Text [Text]
scrapeHeadings :: Scraper Text [Text]
scrapeHeadings = do
  [Text]
h1s <- Selector -> Scraper Text [Text]
forall str (m :: * -> *).
(StringLike str, Monad m) =>
Selector -> ScraperT str m [str]
texts Selector
"h1"
  [Text]
h2s <- Selector -> Scraper Text [Text]
forall str (m :: * -> *).
(StringLike str, Monad m) =>
Selector -> ScraperT str m [str]
texts Selector
"h2"
  [Text]
h3s <- Selector -> Scraper Text [Text]
forall str (m :: * -> *).
(StringLike str, Monad m) =>
Selector -> ScraperT str m [str]
texts Selector
"h3"
  [Text]
h4s <- Selector -> Scraper Text [Text]
forall str (m :: * -> *).
(StringLike str, Monad m) =>
Selector -> ScraperT str m [str]
texts Selector
"h4"
  [Text]
h5s <- Selector -> Scraper Text [Text]
forall str (m :: * -> *).
(StringLike str, Monad m) =>
Selector -> ScraperT str m [str]
texts Selector
"h5"
  [Text]
h6s <- Selector -> Scraper Text [Text]
forall str (m :: * -> *).
(StringLike str, Monad m) =>
Selector -> ScraperT str m [str]
texts Selector
"h6"
  [Text] -> Scraper Text [Text]
forall a. a -> ScraperT Text Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return ([Text] -> Scraper Text [Text]) -> [Text] -> Scraper Text [Text]
forall a b. (a -> b) -> a -> b
$ [[Text]] -> [Text]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [[Text]
h1s, [Text]
h2s, [Text]
h3s, [Text]
h4s, [Text]
h5s, [Text]
h6s]

-- | Scrape all links with their URLs
scrapeLinks :: Scraper Text [(Text, Text)]
scrapeLinks :: Scraper Text [(Text, Text)]
scrapeLinks = Selector
-> ScraperT Text Identity (Text, Text)
-> Scraper Text [(Text, Text)]
forall str (m :: * -> *) a.
(StringLike str, Monad m) =>
Selector -> ScraperT str m a -> ScraperT str m [a]
chroots Selector
"a" (ScraperT Text Identity (Text, Text)
 -> Scraper Text [(Text, Text)])
-> ScraperT Text Identity (Text, Text)
-> Scraper Text [(Text, Text)]
forall a b. (a -> b) -> a -> b
$ do
  Text
linkText <- Selector -> Scraper Text Text
forall str (m :: * -> *).
(StringLike str, Monad m) =>
Selector -> ScraperT str m str
text Selector
"a"
  Text
linkHref <- String -> Selector -> Scraper Text Text
forall str (m :: * -> *).
(Show str, StringLike str, Monad m) =>
String -> Selector -> ScraperT str m str
attr String
"href" Selector
"a"
  (Text, Text) -> ScraperT Text Identity (Text, Text)
forall a. a -> ScraperT Text Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text
linkText, Text
linkHref)

-- | Scrape main text content (from p, div, span elements)
scrapeText :: Scraper Text Text
scrapeText :: Scraper Text Text
scrapeText = do
  [Text]
paragraphs <- Selector -> Scraper Text [Text]
forall str (m :: * -> *).
(StringLike str, Monad m) =>
Selector -> ScraperT str m [str]
texts Selector
"p"
  [Text]
divs <- Selector -> Scraper Text [Text]
forall str (m :: * -> *).
(StringLike str, Monad m) =>
Selector -> ScraperT str m [str]
texts Selector
"div"
  [Text]
spans <- Selector -> Scraper Text [Text]
forall str (m :: * -> *).
(StringLike str, Monad m) =>
Selector -> ScraperT str m [str]
texts Selector
"span"
  [Text]
listElems <- Selector -> Scraper Text [Text]
forall str (m :: * -> *).
(StringLike str, Monad m) =>
Selector -> ScraperT str m [str]
texts Selector
"li"
  Text -> Scraper Text Text
forall a. a -> ScraperT Text Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> Scraper Text Text) -> Text -> Scraper Text Text
forall a b. (a -> b) -> a -> b
$ Text -> [Text] -> Text
T.intercalate Text
"\n\n" ([Text] -> Text) -> [Text] -> Text
forall a b. (a -> b) -> a -> b
$ (Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (Text -> Bool) -> Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Bool
T.null) ([Text] -> [Text]) -> [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ [[Text]] -> [Text]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [[Text]
paragraphs, [Text]
divs, [Text]
spans, [Text]
listElems]