{-# LANGUAGE TupleSections #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings #-}


{-|
Description: In a way, this is the most central component of the entire library;

DOM -> Link >>= request --> DOM -> Link ...
    ^^ 
    this may be infinitely complicated by stuff such as JS 

The recursive nature of scraping is the central data structure of a URL 

Which makes me think that there may be more to consider at some point with the modern-uri package
And doing stuff such as building site trees 


-}

module Scrappy.Links where


-- import Scrappy.Elem.Types (Elem'(..), ElemHead, innerText')
-- import Scrappy.Elem.ElemHeadParse (hrefParser)
-- import Find (findSomeHTMLNaive)

-- import qualified Network.URI as URI 

import Control.Monad (join)
import qualified Network.URI as NURI
-- TODO(galen): Replace with Network.URI and deprecate Text.URI
import Text.URI (URI, uriQuery, mkURI, uriPath, unRText, emptyURI, uriScheme, uriAuthority, RTextLabel(..))
import Control.Lens ((^.))
import qualified Text.URI.Lens as UL
import Text.Parsec (ParsecT, Stream )
import Data.Functor.Classes (eq1)
import Data.Map (Map)
import Data.Either (fromRight, isRight)
import Data.Maybe (catMaybes, fromJust, fromMaybe)
import Data.List (isSuffixOf, isInfixOf, isPrefixOf)
import qualified Data.List.NonEmpty as NE (length, last)
import Data.Text (Text, pack, unpack, splitOn
                 )
import Data.Char (toLower)

import Data.Aeson.TH (defaultOptions, deriveJSON)

type PageNumber = Int

-- |
type BaseUrl = Link
type Url = String 



type HrefURI = String 

-- TODO(galen): make this a Link 
type CurrentUrl = Url  

type DOI = String -- Change to URI if this works 

-- linkToURI :: Link -> URI
-- linkToURI = undefined

-- evalLink :: Link -> String
-- evalLink = linkToText
--   where
--     linkToText x = case x of
--       OuterPage x' -> x'
--       SearchFormURL y -> y
--       ListingPage _ _ _ _ -> undefined
--       PageHasPdf r -> r
--       Sourcery _ _ -> undefined



type Src = Url
type RelativeUrl = Url 


fixRelativeUrl :: BaseUrl -> Url -> Url
fixRelativeUrl :: Link -> Url -> Url
fixRelativeUrl (Link Url
bUrl) Url
url
  | Url
url Url -> Url -> Bool
forall a. Eq a => a -> a -> Bool
== Url
"" = Url
bUrl 
  | Url
url Url -> Url -> Bool
forall a. Eq a => a -> a -> Bool
== Url
"/" = Url
bUrl 
  | Url -> Url -> Bool
forall a. Eq a => [a] -> [a] -> Bool
isInfixOf Url
bUrl Url
url = Url
url
  | Url -> Char
forall a. HasCallStack => [a] -> a
last Url
bUrl Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'/' Bool -> Bool -> Bool
&& (Url -> Url -> Bool
forall a. Eq a => [a] -> [a] -> Bool
isPrefixOf Url
"/" Url
url) = Url
bUrl Url -> Url -> Url
forall a. Semigroup a => a -> a -> a
<> (Url -> Url
forall a. HasCallStack => [a] -> [a]
tail Url
url) -- both
  | Url -> Char
forall a. HasCallStack => [a] -> a
last Url
bUrl Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'/' Bool -> Bool -> Bool
&& (Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ Url -> Url -> Bool
forall a. Eq a => [a] -> [a] -> Bool
isPrefixOf Url
"/" Url
url) = Url
bUrl Url -> Url -> Url
forall a. Semigroup a => a -> a -> a
<> Url
url  -- a 
  | Url -> Char
forall a. HasCallStack => [a] -> a
last Url
bUrl Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'/' Bool -> Bool -> Bool
&& (Url -> Url -> Bool
forall a. Eq a => [a] -> [a] -> Bool
isPrefixOf Url
"/" Url
url) = Url
bUrl Url -> Url -> Url
forall a. Semigroup a => a -> a -> a
<> Url
url -- b 
  | Url -> Char
forall a. HasCallStack => [a] -> a
last Url
bUrl Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'/' Bool -> Bool -> Bool
&& (Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ Url -> Url -> Bool
forall a. Eq a => [a] -> [a] -> Bool
isPrefixOf Url
"/" Url
url) = Url
bUrl Url -> Url -> Url
forall a. Semigroup a => a -> a -> a
<> Url
"/" Url -> Url -> Url
forall a. Semigroup a => a -> a -> a
<> Url
url -- neither 

   --- || ((last bUrl /= '/') && (isPrefixOf "/" url)) = bUrl <> url

-- fixRelativeURI :: UURI -> URI.URI -> URI.URI
-- fixRelativeURI base relative = undefined
--   -- confirm that it truly is relative
  -- 

-- | Could set last url in state 
getHtmlStateful :: Url -> {- StateT SiteDetails -} String
getHtmlStateful :: Url -> Url
getHtmlStateful = Url -> Url
forall a. HasCallStack => a
undefined

-- Whatever man
type LastUrl = Link
type Href = String 


fixSameSiteURL :: LastUrl -> Href -> Maybe Url
fixSameSiteURL :: Link -> Url -> Maybe Url
fixSameSiteURL Link
lastUrl Url
href = Maybe Url
forall a. HasCallStack => a
undefined


-- | Generic algorithm for determining full path given last url 
fixURL :: LastUrl -> Href -> Url
fixURL :: Link -> Url -> Url
fixURL Link
previous Url
href = 
  -- checkIfSchemeInHref
  let
    base :: Link
base = if Url -> Url -> Bool
forall a. Eq a => [a] -> [a] -> Bool
isPrefixOf Url
"/" Url
href then Maybe Link -> Link
forall a. HasCallStack => Maybe a -> a
fromJust (Maybe Link -> Link) -> Maybe Link -> Link
forall a b. (a -> b) -> a -> b
$ Link -> Maybe Link
deriveBaseUrl Link
previous else Link
previous
    hrefURI :: Maybe URI
hrefURI = Text -> Maybe URI
forall (m :: * -> *). MonadThrow m => Text -> m URI
mkURI (Text -> Maybe URI) -> (Url -> Text) -> Url -> Maybe URI
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Url -> Text
pack (Url -> Maybe URI) -> Url -> Maybe URI
forall a b. (a -> b) -> a -> b
$ Url
href
  in
    case Maybe (Maybe (RText 'Scheme)) -> Maybe (RText 'Scheme)
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join (Maybe (Maybe (RText 'Scheme)) -> Maybe (RText 'Scheme))
-> Maybe (Maybe (RText 'Scheme)) -> Maybe (RText 'Scheme)
forall a b. (a -> b) -> a -> b
$ URI -> Maybe (RText 'Scheme)
uriScheme (URI -> Maybe (RText 'Scheme))
-> Maybe URI -> Maybe (Maybe (RText 'Scheme))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Maybe URI
hrefURI of
      -- We could easily check here if authority is the same 
      Just RText 'Scheme
_ -> Url
href 
      Maybe (RText 'Scheme)
Nothing -> Link -> Url -> Url
fixRelativeUrl Link
base Url
href 
        -- checkIfRelativeToLast -- doesnt start with /
        -- case isPrefixOf "/" href of
        --   True -> fixRelativeUrl (deriveBaseUrl previous) href
        --   False -> fixRelativeUrl previous href 
 
-- fixURL :: LastUrl -> Href -> Maybe Url
-- fixURL prev href = do

-- -- | The fromJust should never be called if Links are used properly
-- deriveBaseUrl :: Link -> BaseUrl
-- deriveBaseUrl (Link url) = Link $ mkBaseUrl $ fromJust $ mkURI . pack $ url 

-- -- | I think this is good (might also bee good lens practice tho to simplify)
-- mkBaseUrl :: URI -> String
-- mkBaseUrl uri =
--   (unpack $ ((unRText . fromJust) $  uri ^. UL.uriScheme))
--   <> ("://")
--   <> (unpack (unRText $ (fromRight undefined (uri ^. UL.uriAuthority)) ^. UL.authHost))
  
------------------------------

-- | the fromJust should never be called if Links are used properly
deriveBaseUrl :: Link -> Maybe BaseUrl
deriveBaseUrl :: Link -> Maybe Link
deriveBaseUrl (Link Url
url) = URI -> Maybe Link
mkBaseUrl (URI -> Maybe Link) -> Maybe URI -> Maybe Link
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< (Text -> Maybe URI
forall (m :: * -> *). MonadThrow m => Text -> m URI
mkURI (Text -> Maybe URI) -> (Url -> Text) -> Url -> Maybe URI
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Url -> Text
pack (Url -> Maybe URI) -> Url -> Maybe URI
forall a b. (a -> b) -> a -> b
$ Url
url)


-- | I think this is good (might also bee good lens practice tho to simplify)
mkBaseUrl :: URI -> Maybe Link 
mkBaseUrl :: URI -> Maybe Link
mkBaseUrl URI
uri = do
  Text
scheme <- (RText 'Scheme -> Text) -> Maybe (RText 'Scheme) -> Maybe Text
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap RText 'Scheme -> Text
forall (l :: RTextLabel). RText l -> Text
unRText (Maybe (RText 'Scheme) -> Maybe Text)
-> Maybe (RText 'Scheme) -> Maybe Text
forall a b. (a -> b) -> a -> b
$ URI
uri URI
-> Getting (Maybe (RText 'Scheme)) URI (Maybe (RText 'Scheme))
-> Maybe (RText 'Scheme)
forall s a. s -> Getting a s a -> a
^. Getting (Maybe (RText 'Scheme)) URI (Maybe (RText 'Scheme))
Lens' URI (Maybe (RText 'Scheme))
UL.uriScheme
  Text
host <- case URI
uri URI
-> Getting (Either Bool Authority) URI (Either Bool Authority)
-> Either Bool Authority
forall s a. s -> Getting a s a -> a
^. Getting (Either Bool Authority) URI (Either Bool Authority)
Lens' URI (Either Bool Authority)
UL.uriAuthority of
    Right Authority
author -> Text -> Maybe Text
forall a. a -> Maybe a
Just (Text -> Maybe Text) -> Text -> Maybe Text
forall a b. (a -> b) -> a -> b
$ RText 'Host -> Text
forall (l :: RTextLabel). RText l -> Text
unRText (RText 'Host -> Text) -> RText 'Host -> Text
forall a b. (a -> b) -> a -> b
$ Authority
author Authority
-> Getting (RText 'Host) Authority (RText 'Host) -> RText 'Host
forall s a. s -> Getting a s a -> a
^. Getting (RText 'Host) Authority (RText 'Host)
Lens' Authority (RText 'Host)
UL.authHost 
    Left Bool
_ -> Maybe Text
forall a. Maybe a
Nothing  
  Link -> Maybe Link
forall a. a -> Maybe a
Just (Link -> Maybe Link) -> (Url -> Link) -> Url -> Maybe Link
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Url -> Link
Link (Url -> Maybe Link) -> Url -> Maybe Link
forall a b. (a -> b) -> a -> b
$ (Text -> Url
unpack Text
scheme) Url -> Url -> Url
forall a. Semigroup a => a -> a -> a
<> (Url
"://") Url -> Url -> Url
forall a. Semigroup a => a -> a -> a
<> (Text -> Url
unpack Text
host)
  
  -- (unpack $ ((unRText . fromJust) $  uri ^. UL.uriScheme))
  -- <> ("://")
  -- <> (unpack (unRText $ (fromRight undefined (uri ^. UL.uriAuthority)) ^. UL.authHost))


                    
  -- if yes 
  --   then weNeedTheLastUrl
  --   else deriveBaseUrl 
  -- deriveBaseUrl

-- for the scraper, instead of keeping the baseURL we should store the current URL
-- which can still be used to derive the

-- baseURL :: SiteDetails (-> CurrentUrl ->) -> BaseUrl 
                               
class IsLink a where
  renderLink :: a -> Url 




-- lets view Link as meant to contain Informationally derived meaning from internet ; that contains how to get
-- Keeps state for generic streaming 
-- data Link = OuterPage String
--           | SearchFormURL  String
--           | ListingPage [GeneratedLink] PageNumber PageKey String
--           | PageHasPdf String
--           --  PdfLink String
--           | Sourcery (PdfLink) ReferenceSys

-- pageKey=param




getFileName :: Link -> Maybe String
getFileName :: Link -> Maybe Url
getFileName = Link -> Maybe Url
getLastPath




doiParser :: ParsecT s u m DOI 
doiParser :: forall s u (m :: * -> *). ParsecT s u m Url
doiParser = ParsecT s u m Url
forall a. HasCallStack => a
undefined
  -- baseURL is doi.org
  
  -- isDOI :: Url -> Bool 

data ReferenceSys = RefSys [String] [String]

type GeneratedLink = String


-- type PdfLink = String   
-- | Name and Namespace are really same shit; might just converge
-- | Refer to literally "name" attribute
type Namespace = Text

-- | This is an operationally focused type where
-- | a certain namespace is found to have n num of Options
type Option = Text 


-- | More for show / reasoning rn .. non-optimal
data QParams = Opt (Map Namespace [Option]) | SimpleKV (Text, Text)



-- SiteTree can be modelled as a stream ; just depends on how we apply it -- if lazily
-- | Inter site urls and whether they have been checked for some pattern
type SiteTree = [(Bool, Text)]


-- | This wouldnt need to be exported as our interfaces would implement it under the hood
-- | and return a Link'
data DOMLink = Href' Href
             | Src Url
             | PlainLink Url 
--               | LastUrl' String
          

-- scrapeSameSiteLinks :: ParsecT s u m Link
-- scrapeSameSiteLinks = undefined

-- scrapeLinks :: ParsecT s u m Link
-- scrapeLinks = undefined


newtype Link = Link Url deriving (Link -> Link -> Bool
(Link -> Link -> Bool) -> (Link -> Link -> Bool) -> Eq Link
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Link -> Link -> Bool
== :: Link -> Link -> Bool
$c/= :: Link -> Link -> Bool
/= :: Link -> Link -> Bool
Eq, Int -> Link -> Url -> Url
[Link] -> Url -> Url
Link -> Url
(Int -> Link -> Url -> Url)
-> (Link -> Url) -> ([Link] -> Url -> Url) -> Show Link
forall a.
(Int -> a -> Url -> Url)
-> (a -> Url) -> ([a] -> Url -> Url) -> Show a
$cshowsPrec :: Int -> Link -> Url -> Url
showsPrec :: Int -> Link -> Url -> Url
$cshow :: Link -> Url
show :: Link -> Url
$cshowList :: [Link] -> Url -> Url
showList :: [Link] -> Url -> Url
Show, ReadPrec [Link]
ReadPrec Link
Int -> ReadS Link
ReadS [Link]
(Int -> ReadS Link)
-> ReadS [Link] -> ReadPrec Link -> ReadPrec [Link] -> Read Link
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
$creadsPrec :: Int -> ReadS Link
readsPrec :: Int -> ReadS Link
$creadList :: ReadS [Link]
readList :: ReadS [Link]
$creadPrec :: ReadPrec Link
readPrec :: ReadPrec Link
$creadListPrec :: ReadPrec [Link]
readListPrec :: ReadPrec [Link]
Read, Eq Link
Eq Link =>
(Link -> Link -> Ordering)
-> (Link -> Link -> Bool)
-> (Link -> Link -> Bool)
-> (Link -> Link -> Bool)
-> (Link -> Link -> Bool)
-> (Link -> Link -> Link)
-> (Link -> Link -> Link)
-> Ord Link
Link -> Link -> Bool
Link -> Link -> Ordering
Link -> Link -> Link
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: Link -> Link -> Ordering
compare :: Link -> Link -> Ordering
$c< :: Link -> Link -> Bool
< :: Link -> Link -> Bool
$c<= :: Link -> Link -> Bool
<= :: Link -> Link -> Bool
$c> :: Link -> Link -> Bool
> :: Link -> Link -> Bool
$c>= :: Link -> Link -> Bool
>= :: Link -> Link -> Bool
$cmax :: Link -> Link -> Link
max :: Link -> Link -> Link
$cmin :: Link -> Link -> Link
min :: Link -> Link -> Link
Ord)

-- | This is a general interface for extracting a raw link
-- | from scraping according to specs about the scraper itself
-- | IE if it is 100% same site
parseLink :: Bool -> Link -> Url -> Maybe Link
parseLink :: Bool -> Link -> Url -> Maybe Link
parseLink Bool
onlySameSite Link
lastLink Url
newLink = 
  case Url -> Bool
hasNoURIScheme Url
newLink of
    Bool
True -> Link -> Maybe Link
forall a. a -> Maybe a
Just (Link -> Maybe Link) -> (Url -> Link) -> Url -> Maybe Link
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Url -> Link
Link (Url -> Maybe Link) -> Url -> Maybe Link
forall a b. (a -> b) -> a -> b
$ Link -> Url -> Url
fixRelativeUrl (Maybe Link -> Link
forall a. HasCallStack => Maybe a -> a
fromJust (Maybe Link -> Link) -> Maybe Link -> Link
forall a b. (a -> b) -> a -> b
$ Link -> Maybe Link
deriveBaseUrl Link
lastLink) Url
newLink
    Bool
False -> case Url -> Bool
isHTTP Url
newLink of
      Bool
False -> Maybe Link
forall a. Maybe a
Nothing 
      Bool
True -> case Bool
onlySameSite of
        Bool
False ->  Link -> Maybe Link
forall a. a -> Maybe a
Just (Link -> Maybe Link) -> (Url -> Link) -> Url -> Maybe Link
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Url -> Link
Link (Url -> Maybe Link) -> Url -> Maybe Link
forall a b. (a -> b) -> a -> b
$ Url
newLink
        Bool
True -> case Url -> Link -> Bool
sameAuthority Url
newLink Link
lastLink of
          Bool
False -> Maybe Link
forall a. Maybe a
Nothing
          Bool
True -> Link -> Maybe Link
forall a. a -> Maybe a
Just (Link -> Maybe Link) -> (Url -> Link) -> Url -> Maybe Link
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Url -> Link
Link (Url -> Maybe Link) -> Url -> Maybe Link
forall a b. (a -> b) -> a -> b
$ Url
newLink
  where
    hasNoURIScheme :: Url -> Bool
hasNoURIScheme Url
url = (Maybe (Maybe (RText 'Scheme)) -> Maybe (RText 'Scheme)
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join (Maybe (Maybe (RText 'Scheme)) -> Maybe (RText 'Scheme))
-> Maybe (Maybe (RText 'Scheme)) -> Maybe (RText 'Scheme)
forall a b. (a -> b) -> a -> b
$ (URI -> Maybe (RText 'Scheme))
-> Maybe URI -> Maybe (Maybe (RText 'Scheme))
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap URI -> Maybe (RText 'Scheme)
uriScheme (Maybe URI -> Maybe (Maybe (RText 'Scheme)))
-> Maybe URI -> Maybe (Maybe (RText 'Scheme))
forall a b. (a -> b) -> a -> b
$ Text -> Maybe URI
forall (m :: * -> *). MonadThrow m => Text -> m URI
mkURI (Text -> Maybe URI) -> (Url -> Text) -> Url -> Maybe URI
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Url -> Text
pack (Url -> Maybe URI) -> Url -> Maybe URI
forall a b. (a -> b) -> a -> b
$ Url
url) Maybe (RText 'Scheme) -> Maybe (RText 'Scheme) -> Bool
forall a. Eq a => a -> a -> Bool
== Maybe (RText 'Scheme)
forall a. Maybe a
Nothing
    isHTTP :: Url -> Bool
isHTTP Url
url = Url -> [Url] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
elem (Url -> Maybe Url -> Url
forall a. a -> Maybe a -> a
fromMaybe Url
"" ((URI -> Url) -> Maybe URI -> Maybe Url
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap URI -> Url
NURI.uriScheme (Maybe URI -> Maybe Url) -> Maybe URI -> Maybe Url
forall a b. (a -> b) -> a -> b
$ Url -> Maybe URI
NURI.parseURI Url
url)) [Url
"https:", Url
"http:"]

sameAuthority :: Url -> Link -> Bool
sameAuthority :: Url -> Link -> Bool
sameAuthority Url
href (Link Url
linky) =
  let
    getMainAuthority :: Url -> Text
getMainAuthority = [Text] -> Text
forall a. HasCallStack => [a] -> a
last ([Text] -> Text) -> (Url -> [Text]) -> Url -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. HasCallStack => Text -> Text -> [Text]
Text -> Text -> [Text]
splitOn Text
"." (Text -> [Text]) -> (Url -> Text) -> Url -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Url -> Text
pack
    getRegName :: Url -> Maybe Text
getRegName Url
l = (URIAuth -> Text) -> Maybe URIAuth -> Maybe Text
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Url -> Text
getMainAuthority (Url -> Text) -> (URIAuth -> Url) -> URIAuth -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. URIAuth -> Url
NURI.uriRegName) (Maybe URIAuth -> Maybe Text) -> Maybe URIAuth -> Maybe Text
forall a b. (a -> b) -> a -> b
$ URI -> Maybe URIAuth
NURI.uriAuthority (URI -> Maybe URIAuth) -> Maybe URI -> Maybe URIAuth
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< Url -> Maybe URI
NURI.parseURI Url
l
  in case Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
(==) (Text -> Text -> Bool) -> Maybe Text -> Maybe (Text -> Bool)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Url -> Maybe Text
getRegName Url
href) Maybe (Text -> Bool) -> Maybe Text -> Maybe Bool
forall a b. Maybe (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> (Url -> Maybe Text
getRegName Url
linky) of
    Maybe Bool
Nothing -> Bool
False 
    Just Bool
b -> Bool
b


type HostName = String 
-- MOVE TO SCRAPPY
getHostName :: Link -> Maybe HostName
getHostName :: Link -> Maybe Url
getHostName (Link Url
url) = do 
  URI
uri <- Text -> Maybe URI
forall (m :: * -> *). MonadThrow m => Text -> m URI
mkURI (Text -> Maybe URI) -> Text -> Maybe URI
forall a b. (a -> b) -> a -> b
$ Url -> Text
pack Url
url
  case (Authority -> Url) -> Either Bool Authority -> Either Bool Url
forall a b. (a -> b) -> Either Bool a -> Either Bool b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Text -> Url
unpack (Text -> Url) -> (Authority -> Text) -> Authority -> Url
forall b c a. (b -> c) -> (a -> b) -> a -> c
. RText 'Host -> Text
forall (l :: RTextLabel). RText l -> Text
unRText (RText 'Host -> Text)
-> (Authority -> RText 'Host) -> Authority -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Authority
-> Getting (RText 'Host) Authority (RText 'Host) -> RText 'Host
forall s a. s -> Getting a s a -> a
^. Getting (RText 'Host) Authority (RText 'Host)
Lens' Authority (RText 'Host)
UL.authHost)) (URI
uri URI
-> Getting (Either Bool Authority) URI (Either Bool Authority)
-> Either Bool Authority
forall s a. s -> Getting a s a -> a
^. Getting (Either Bool Authority) URI (Either Bool Authority)
Lens' URI (Either Bool Authority)
UL.uriAuthority) of
    Right Url
hn -> Url -> Maybe Url
forall a. a -> Maybe a
Just Url
hn
    Either Bool Url
_ -> Maybe Url
forall a. Maybe a
Nothing 
    
-- | Only exported interface 
instance IsLink Link where
  renderLink :: Link -> Url
renderLink (Link Url
url) = Url
url 


--getHtmlST :: sv -> Link -> m (sv, Html) 


-- -- | In reality, this is 4 helper functions 
-- link :: (Maybe LastUrl) -> ScraperT Link
-- link onlyThisSite = do
--   link' <- parseOpeningTag linkStuff
--   validateLink onlyThisSite link' 


-- doesnt have a scheme:
--   NoScheme -> must be same site and relative;-> Just $ relative to current or base URL ? 
--   HasScheme -> if mustBeSS && isSSite then Just url else Nothing 


-- -- validateLink is gonna be an interface that may use fixURL and sees if its the same site 
-- -- | All 4 scrapers would use validateLink 
-- validateLink :: Bool -> LastUrl -> DOMLink -> Link
-- validateLink ots lastUrl iLink = case ots of
--   True -> ""
--   False -> "" 


-- Note following ideas

-- data Source = Source (Citations, Html)



-- findAdvancedSearchLinks :: ParsecT s u m [String]
-- findAdvancedSearchLinks = undefined


-- | Core function of module, filters for any links which point to other pages on the current site
-- | and have not been found over the course of scraping the site yet 
-- | filters out urls like https://othersite.com and "#"
maybeUsefulNewUrl :: Link -> [(Link, a)] -> Link -> Maybe Link
maybeUsefulNewUrl :: forall a. Link -> [(Link, a)] -> Link -> Maybe Link
maybeUsefulNewUrl Link
baseUrl [(Link, a)]
tree Link
url = Link -> Link -> Maybe Link
maybeUsefulUrl Link
baseUrl Link
url Maybe Link -> (Link -> Maybe Link) -> Maybe Link
forall a b. Maybe a -> (a -> Maybe b) -> Maybe b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= [(Link, a)] -> Link -> Maybe Link
forall a. [(Link, a)] -> Link -> Maybe Link
maybeNewUrl [(Link, a)]
tree 




urlIsNew :: [(a, Url)] -> HrefURI -> Bool
urlIsNew :: forall a. [(a, Url)] -> Url -> Bool
urlIsNew [] Url
uri = Bool
True
urlIsNew ((a, Url)
branch:[(a, Url)]
tree) Url
uri
  | Maybe (Maybe (Bool, NonEmpty (RText 'PathPiece)))
-> Maybe (Maybe (Bool, NonEmpty (RText 'PathPiece))) -> Bool
forall (f :: * -> *) a. (Eq1 f, Eq a) => f a -> f a -> Bool
eq1 ((URI -> Maybe (Bool, NonEmpty (RText 'PathPiece)))
-> Maybe URI -> Maybe (Maybe (Bool, NonEmpty (RText 'PathPiece)))
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap URI -> Maybe (Bool, NonEmpty (RText 'PathPiece))
uriPath (Url -> Maybe URI
mkURI' (Url
uri))) ((URI -> Maybe (Bool, NonEmpty (RText 'PathPiece)))
-> Maybe URI -> Maybe (Maybe (Bool, NonEmpty (RText 'PathPiece)))
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap URI -> Maybe (Bool, NonEmpty (RText 'PathPiece))
uriPath (Url -> Maybe URI
mkURI' ((a, Url) -> Url
forall a b. (a, b) -> b
snd (a, Url)
branch))) = Bool
False
  | Bool
otherwise = [(a, Url)] -> Url -> Bool
forall a. [(a, Url)] -> Url -> Bool
urlIsNew [(a, Url)]
tree Url
uri
  where
    mkURI' :: String -> Maybe URI
    mkURI' :: Url -> Maybe URI
mkURI' Url
url = Text -> Maybe URI
forall (m :: * -> *). MonadThrow m => Text -> m URI
mkURI (Url -> Text
pack Url
url)



maybeNewUrl :: [(Link, a)] -> Link -> Maybe Link
maybeNewUrl :: forall a. [(Link, a)] -> Link -> Maybe Link
maybeNewUrl [] Link
uri = Link -> Maybe Link
forall a. a -> Maybe a
Just Link
uri
maybeNewUrl ((Link, a)
branch:[(Link, a)]
tree) Link
uri =
  if Maybe (Maybe (Bool, NonEmpty (RText 'PathPiece)))
-> Maybe (Maybe (Bool, NonEmpty (RText 'PathPiece))) -> Bool
forall (f :: * -> *) a. (Eq1 f, Eq a) => f a -> f a -> Bool
eq1 ((URI -> Maybe (Bool, NonEmpty (RText 'PathPiece)))
-> Maybe URI -> Maybe (Maybe (Bool, NonEmpty (RText 'PathPiece)))
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap URI -> Maybe (Bool, NonEmpty (RText 'PathPiece))
uriPath (Url -> Maybe URI
mkURI' (Link -> Url
forall a. IsLink a => a -> Url
renderLink Link
uri))) ((URI -> Maybe (Bool, NonEmpty (RText 'PathPiece)))
-> Maybe URI -> Maybe (Maybe (Bool, NonEmpty (RText 'PathPiece)))
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap URI -> Maybe (Bool, NonEmpty (RText 'PathPiece))
uriPath (Url -> Maybe URI
mkURI' (Url -> Maybe URI) -> ((Link, a) -> Url) -> (Link, a) -> Maybe URI
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Link -> Url
forall a. IsLink a => a -> Url
renderLink (Link -> Url) -> ((Link, a) -> Link) -> (Link, a) -> Url
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Link, a) -> Link
forall a b. (a, b) -> a
fst ((Link, a) -> Maybe URI) -> (Link, a) -> Maybe URI
forall a b. (a -> b) -> a -> b
$ (Link, a)
branch))
  then Maybe Link
forall a. Maybe a
Nothing
  else [(Link, a)] -> Link -> Maybe Link
forall a. [(Link, a)] -> Link -> Maybe Link
maybeNewUrl [(Link, a)]
tree Link
uri
  -- eq1 (fmap uriPath (mkURI' (pack uri))) (fmap uriPath (mkURI' (fst branch))) = False
  -- otherwise = urlIsNew tree uri
  where
    mkURI' :: String -> Maybe URI
    mkURI' :: Url -> Maybe URI
mkURI' Url
url = Text -> Maybe URI
forall (m :: * -> *). MonadThrow m => Text -> m URI
mkURI (Url -> Text
pack Url
url)
  



-- | Filters javascript refs, inner page DOM refs, urls with query strings and those that
-- | do not contain the base url of the host site
maybeUsefulUrl :: Link -> Link -> Maybe Link
maybeUsefulUrl :: Link -> Link -> Maybe Link
maybeUsefulUrl (Link Url
baseUrl) Link
url = do
  Link -> Maybe Link
noJSorShit Link
url
  Link -> Maybe Url
numberOfQueryParamsIsZero Link
url
  if Url -> Url -> Bool
forall a. Eq a => [a] -> [a] -> Bool
isInfixOf Url
baseUrl (Link -> Url
forall a. IsLink a => a -> Url
renderLink Link
url) then Link -> Maybe Link
forall a. a -> Maybe a
forall (m :: * -> *) a. Monad m => a -> m a
return Link
url else Maybe Link
forall a. Maybe a
Nothing
  Link -> Maybe Link
allowableEndings Link
url

  where
    noJSorShit :: Link -> Maybe Link
    noJSorShit :: Link -> Maybe Link
noJSorShit Link
link =
      if (Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ Bool -> [Bool] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
elem Bool
True (Link -> [Url] -> [Bool]
urlContains Link
link [Url
"javascript", Url
"about", Url
"help", Url
"#"]))
      then Link -> Maybe Link
forall a. a -> Maybe a
Just Link
url
      else Maybe Link
forall a. Maybe a
Nothing

    urlContains :: Link -> [String] -> [Bool]
    urlContains :: Link -> [Url] -> [Bool]
urlContains (Link Url
url) [Url]
icases = (Url -> Bool) -> [Url] -> [Bool]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (((Url -> Url -> Bool) -> Url -> Url -> Bool
forall a b c. (a -> b -> c) -> b -> a -> c
flip Url -> Url -> Bool
forall a. Eq a => [a] -> [a] -> Bool
isInfixOf) ((Char -> Char) -> Url -> Url
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Char -> Char
toLower Url
url)) [Url]
icases
  
    allowableEndings :: Link -> Maybe Link
allowableEndings Link
url =
      let lastPath :: Url
lastPath = Url -> Maybe Url -> Url
forall a. a -> Maybe a -> a
fromMaybe Url
"" (Maybe Url -> Url) -> Maybe Url -> Url
forall a b. (a -> b) -> a -> b
$ Link -> Maybe Url
getLastPath Link
url
      in
        if (Char -> Url -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
elem Char
'.' Url
lastPath)
        then Url -> Link -> Maybe Link
forall {a}. Url -> a -> Maybe a
allowableFile Url
lastPath Link
url -- must be of allowable
        else Link -> Maybe Link
forall a. a -> Maybe a
Just Link
url
  
    allowableFile :: Url -> a -> Maybe a
allowableFile Url
endPath a
url =

      if Bool -> [Bool] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
elem Bool
True ([Bool] -> Bool) -> [Bool] -> Bool
forall a b. (a -> b) -> a -> b
$ (Url -> Bool) -> [Url] -> [Bool]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\Url
x -> Url -> Url -> Bool
forall a. Eq a => [a] -> [a] -> Bool
isSuffixOf Url
x ((Char -> Char) -> Url -> Url
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Char -> Char
toLower Url
endPath)) [Url]
allowed
      then a -> Maybe a
forall a. a -> Maybe a
Just a
url
      else Maybe a
forall a. Maybe a
Nothing
      where allowed :: [Url]
allowed = [Url
".aspx", Url
".html", Url
".pdf", Url
".php"]

    
-- getLastPath :: Url -> String
-- getLastPath url = unpack (unRText (NE.last (snd (fromJust (fromJust (fmap uriPath (mkURI (pack url))))))))

getLastPath :: Link -> Maybe String
getLastPath :: Link -> Maybe Url
getLastPath (Link Url
url) = do 
  URI
x <- Text -> Maybe URI
forall (m :: * -> *). MonadThrow m => Text -> m URI
mkURI (Text -> Maybe URI) -> Text -> Maybe URI
forall a b. (a -> b) -> a -> b
$ Url -> Text
pack Url
url 
  (Bool, NonEmpty (RText 'PathPiece))
x' <- URI -> Maybe (Bool, NonEmpty (RText 'PathPiece))
uriPath URI
x
  Url -> Maybe Url
forall a. a -> Maybe a
Just (Url -> Maybe Url)
-> ((Bool, NonEmpty (RText 'PathPiece)) -> Url)
-> (Bool, NonEmpty (RText 'PathPiece))
-> Maybe Url
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Url
unpack (Text -> Url)
-> ((Bool, NonEmpty (RText 'PathPiece)) -> Text)
-> (Bool, NonEmpty (RText 'PathPiece))
-> Url
forall b c a. (b -> c) -> (a -> b) -> a -> c
. RText 'PathPiece -> Text
forall (l :: RTextLabel). RText l -> Text
unRText (RText 'PathPiece -> Text)
-> ((Bool, NonEmpty (RText 'PathPiece)) -> RText 'PathPiece)
-> (Bool, NonEmpty (RText 'PathPiece))
-> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. NonEmpty (RText 'PathPiece) -> RText 'PathPiece
forall a. NonEmpty a -> a
NE.last (NonEmpty (RText 'PathPiece) -> RText 'PathPiece)
-> ((Bool, NonEmpty (RText 'PathPiece))
    -> NonEmpty (RText 'PathPiece))
-> (Bool, NonEmpty (RText 'PathPiece))
-> RText 'PathPiece
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Bool, NonEmpty (RText 'PathPiece)) -> NonEmpty (RText 'PathPiece)
forall a b. (a, b) -> b
snd ((Bool, NonEmpty (RText 'PathPiece)) -> Maybe Url)
-> (Bool, NonEmpty (RText 'PathPiece)) -> Maybe Url
forall a b. (a -> b) -> a -> b
$ (Bool, NonEmpty (RText 'PathPiece))
x'

-- | Input is meant to be right from 
usefulNewUrls :: Link -> [(Link, a)] -> [Link] -> [Maybe Link]
usefulNewUrls :: forall a. Link -> [(Link, a)] -> [Link] -> [Maybe Link]
usefulNewUrls Link
_ [(Link, a)]
_ [] = []
usefulNewUrls Link
baseUrl [(Link, a)]
tree (Link
link:[Link]
links) = (Link -> [(Link, a)] -> Link -> Maybe Link
forall a. Link -> [(Link, a)] -> Link -> Maybe Link
maybeUsefulNewUrl Link
baseUrl [(Link, a)]
tree Link
link) Maybe Link -> [Maybe Link] -> [Maybe Link]
forall a. a -> [a] -> [a]
: Link -> [(Link, a)] -> [Link] -> [Maybe Link]
forall a. Link -> [(Link, a)] -> [Link] -> [Maybe Link]
usefulNewUrls Link
baseUrl [(Link, a)]
tree [Link]
links

usefulUrls :: Link -> [Link] -> [Maybe Link]
usefulUrls :: Link -> [Link] -> [Maybe Link]
usefulUrls Link
baseUrl (Link
link:[Link]
links) = Link -> Link -> Maybe Link
maybeUsefulUrl Link
baseUrl Link
link Maybe Link -> [Maybe Link] -> [Maybe Link]
forall a. a -> [a] -> [a]
: Link -> [Link] -> [Maybe Link]
usefulUrls Link
baseUrl [Link]
links 

numberOfQueryParamsIsZero :: Link -> Maybe String
numberOfQueryParamsIsZero :: Link -> Maybe Url
numberOfQueryParamsIsZero (Link Url
uri) = do
  URI
x <- Text -> Maybe URI
forall (m :: * -> *). MonadThrow m => Text -> m URI
mkURI (Url -> Text
pack Url
uri)
  if [QueryParam] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length (URI -> [QueryParam]
uriQuery URI
x) Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0
  then Url -> Maybe Url
forall a. a -> Maybe a
Just Url
uri
  else Maybe Url
forall a. Maybe a
Nothing


deriveJSON defaultOptions ''Link