| Safe Haskell | None |
|---|---|
| Language | Haskell2010 |
Scrappy.Links
Description
DOM -> Link >>= request --> DOM -> Link ... ^^ this may be infinitely complicated by stuff such as JS
The recursive nature of scraping makes the URL the central data structure,
which makes me think that there may be more to consider at some point with the modern-uri package, and with doing things such as building site trees.
Synopsis
- type PageNumber = Int
- type BaseUrl = Link
- type Url = String
- type HrefURI = String
- type CurrentUrl = Url
- type DOI = String
- type Src = Url
- type RelativeUrl = Url
- fixRelativeUrl :: BaseUrl -> Url -> Url
- getHtmlStateful :: Url -> String
- type LastUrl = Link
- type Href = String
- fixSameSiteURL :: LastUrl -> Href -> Maybe Url
- fixURL :: LastUrl -> Href -> Url
- deriveBaseUrl :: Link -> Maybe BaseUrl
- mkBaseUrl :: URI -> Maybe Link
- class IsLink a where
- renderLink :: a -> Url
- getFileName :: Link -> Maybe String
- doiParser :: forall s u (m :: Type -> Type). ParsecT s u m DOI
- data ReferenceSys = RefSys [String] [String]
- type GeneratedLink = String
- type Namespace = Text
- type Option = Text
- data QParams
- type SiteTree = [(Bool, Text)]
- data DOMLink
- newtype Link = Link Url
- parseLink :: Bool -> Link -> Url -> Maybe Link
- sameAuthority :: Url -> Link -> Bool
- type HostName = String
- getHostName :: Link -> Maybe HostName
- maybeUsefulNewUrl :: Link -> [(Link, a)] -> Link -> Maybe Link
- urlIsNew :: [(a, Url)] -> HrefURI -> Bool
- maybeNewUrl :: [(Link, a)] -> Link -> Maybe Link
- maybeUsefulUrl :: Link -> Link -> Maybe Link
- getLastPath :: Link -> Maybe String
- usefulNewUrls :: Link -> [(Link, a)] -> [Link] -> [Maybe Link]
- usefulUrls :: Link -> [Link] -> [Maybe Link]
- numberOfQueryParamsIsZero :: Link -> Maybe String
Documentation
type PageNumber = Int Source #
type CurrentUrl = Url Source #
type RelativeUrl = Url Source #
getHtmlStateful :: Url -> String Source #
Could set last url in state
deriveBaseUrl :: Link -> Maybe BaseUrl Source #
the fromJust should never be called if Links are used properly
mkBaseUrl :: URI -> Maybe Link Source #
I think this is good (though simplifying it might also be good lens practice)
Methods
renderLink :: a -> Url Source #
Instances
| IsLink Link Source # | Only exported interface |
Defined in Scrappy.Links Methods renderLink :: Link -> Url Source # | |
type GeneratedLink = String Source #
type Namespace = Text Source #
Name and Namespace are really the same thing and might just be merged. Refers literally to the "name" attribute.
This is an operationally focused type where a certain namespace is found to have n Options.
This is more for show / reasoning right now; it is non-optimal.
type SiteTree = [(Bool, Text)] Source #
Inter site urls and whether they have been checked for some pattern
This wouldn't need to be exported, as our interfaces would implement it under the hood and return a Link.
parseLink :: Bool -> Link -> Url -> Maybe Link Source #
This is a general interface for extracting a raw link from scraping, according to specs about the scraper itself, i.e. whether it is 100% same-site.
maybeUsefulNewUrl :: Link -> [(Link, a)] -> Link -> Maybe Link Source #
Core function of the module: filters for any links which point to other pages on the current site and have not yet been found over the course of scraping the site. Filters out URLs like https://othersite.com and "#".
maybeUsefulUrl :: Link -> Link -> Maybe Link Source #
Filters out JavaScript refs, inner-page DOM refs, URLs with query strings, and those that do not contain the base URL of the host site.