scrappy-core-0.1.0.1: html pattern matching library and high-level interface concurrent requests lib for webscraping

Safe Haskell	None
Language	Haskell2010

Scrappy.Links

Description

DOM -> Link >>= request --> DOM -> Link ... Note that this cycle may be complicated without bound by things such as JavaScript.

Scraping is recursive by nature, and the URL is the central data structure of that recursion.

This suggests that there may be more to consider at some point, such as using the modern-uri package and building site trees.

Synopsis

type PageNumber = Int
type BaseUrl = Link
type Url = String
type HrefURI = String
type CurrentUrl = Url
type DOI = String
type Src = Url
type RelativeUrl = Url
fixRelativeUrl :: BaseUrl -> Url -> Url
getHtmlStateful :: Url -> String
type LastUrl = Link
type Href = String
fixSameSiteURL :: LastUrl -> Href -> Maybe Url
fixURL :: LastUrl -> Href -> Url
deriveBaseUrl :: Link -> Maybe BaseUrl
mkBaseUrl :: URI -> Maybe Link
class IsLink a where
- renderLink :: a -> Url
getFileName :: Link -> Maybe String
doiParser :: forall s u (m :: Type -> Type). ParsecT s u m DOI
data ReferenceSys = RefSys [String] [String]
type GeneratedLink = String
type Namespace = Text
type Option = Text
data QParams
- = Opt (Map Namespace [Option])
- | SimpleKV (Text, Text)
type SiteTree = [(Bool, Text)]
data DOMLink
- = Href' Href
- | Src Url
- | PlainLink Url
newtype Link = Link Url
parseLink :: Bool -> Link -> Url -> Maybe Link
sameAuthority :: Url -> Link -> Bool
type HostName = String
getHostName :: Link -> Maybe HostName
maybeUsefulNewUrl :: Link -> [(Link, a)] -> Link -> Maybe Link
urlIsNew :: [(a, Url)] -> HrefURI -> Bool
maybeNewUrl :: [(Link, a)] -> Link -> Maybe Link
maybeUsefulUrl :: Link -> Link -> Maybe Link
getLastPath :: Link -> Maybe String
usefulNewUrls :: Link -> [(Link, a)] -> [Link] -> [Maybe Link]
usefulUrls :: Link -> [Link] -> [Maybe Link]
numberOfQueryParamsIsZero :: Link -> Maybe String

Documentation

type PageNumber = Int Source #

type BaseUrl = Link Source #

type Url = String Source #

type HrefURI = String Source #

type CurrentUrl = Url Source #

type DOI = String Source #

type Src = Url Source #

type RelativeUrl = Url Source #

fixRelativeUrl :: BaseUrl -> Url -> Url Source #

getHtmlStateful :: Url -> String Source #

Could set last url in state

type LastUrl = Link Source #

type Href = String Source #

fixSameSiteURL :: LastUrl -> Href -> Maybe Url Source #

fixURL :: LastUrl -> Href -> Url Source #

Generic algorithm for determining full path given last url

deriveBaseUrl :: Link -> Maybe BaseUrl Source #

The fromJust should never be called if Links are used properly.

mkBaseUrl :: URI -> Maybe Link Source #

I think this is good (though it might also be good lens practice to simplify it).

class IsLink a where Source #

Methods

renderLink :: a -> Url Source #

Instances

Instances details

IsLink Link Source #	Only exported interface
Instance details Defined in Scrappy.Links Methods renderLink :: Link -> Url Source #

getFileName :: Link -> Maybe String Source #

doiParser :: forall s u (m :: Type -> Type). ParsecT s u m DOI Source #

data ReferenceSys Source #

Constructors

RefSys [String] [String]

type GeneratedLink = String Source #

type Namespace = Text Source #

Name and Namespace are really the same thing and might just converge. Refers literally to the "name" attribute.

type Option = Text Source #

This is an operationally focused type, where a certain namespace is found to have n Options.

data QParams Source #

More for show / reasoning right now; non-optimal.

Constructors

Opt (Map Namespace [Option])
SimpleKV (Text, Text)

type SiteTree = [(Bool, Text)] Source #

Inter site urls and whether they have been checked for some pattern

data DOMLink Source #

This wouldn't need to be exported, as our interfaces would implement it under the hood and return a Link'

Constructors

Href' Href
Src Url
PlainLink Url

newtype Link Source #

Constructors

Instances

Instances details

FromJSON Link Source #
Instance details Defined in Scrappy.Links Methods parseJSON :: Value -> Parser Link # parseJSONList :: Value -> Parser [Link] # omittedField :: Maybe Link #
ToJSON Link Source #
Instance details Defined in Scrappy.Links Methods toJSON :: Link -> Value # toEncoding :: Link -> Encoding # toJSONList :: [Link] -> Value # toEncodingList :: [Link] -> Encoding # omitField :: Link -> Bool #
Read Link Source #
Instance details Defined in Scrappy.Links Methods readsPrec :: Int -> ReadS Link # readList :: ReadS [Link] # readPrec :: ReadPrec Link # readListPrec :: ReadPrec [Link] #
Show Link Source #
Instance details Defined in Scrappy.Links Methods showsPrec :: Int -> Link -> ShowS # show :: Link -> String # showList :: [Link] -> ShowS #
Eq Link Source #
Instance details Defined in Scrappy.Links Methods (==) :: Link -> Link -> Bool # (/=) :: Link -> Link -> Bool #
Ord Link Source #
Instance details Defined in Scrappy.Links Methods compare :: Link -> Link -> Ordering # (<) :: Link -> Link -> Bool # (<=) :: Link -> Link -> Bool # (>) :: Link -> Link -> Bool # (>=) :: Link -> Link -> Bool # max :: Link -> Link -> Link # min :: Link -> Link -> Link #
IsLink Link Source #	Only exported interface
Instance details Defined in Scrappy.Links Methods renderLink :: Link -> Url Source #

parseLink :: Bool -> Link -> Url -> Maybe Link Source #

This is a general interface for extracting a raw link from scraping, according to specs about the scraper itself, i.e. if it is 100% same site.

sameAuthority :: Url -> Link -> Bool Source #

type HostName = String Source #

getHostName :: Link -> Maybe HostName Source #

maybeUsefulNewUrl :: Link -> [(Link, a)] -> Link -> Maybe Link Source #

Core function of the module: filters for any links which point to other pages on the current site and have not yet been found over the course of scraping the site. Filters out URLs like https://othersite.com and "#".

urlIsNew :: [(a, Url)] -> HrefURI -> Bool Source #

maybeNewUrl :: [(Link, a)] -> Link -> Maybe Link Source #

maybeUsefulUrl :: Link -> Link -> Maybe Link Source #

Filters out JavaScript refs, inner-page DOM refs, URLs with query strings, and those that do not contain the base URL of the host site.

getLastPath :: Link -> Maybe String Source #

usefulNewUrls :: Link -> [(Link, a)] -> [Link] -> [Maybe Link] Source #

Input is meant to be right from

usefulUrls :: Link -> [Link] -> [Maybe Link] Source #

numberOfQueryParamsIsZero :: Link -> Maybe String Source #