| Safe Haskell | None |
|---|---|
| Language | Haskell2010 |
Scrappy.Elem.TreeElemParser
Synopsis
- skipManyTill :: Alternative m => m a -> m end -> m end
- manyTill_ :: forall s u (m :: Type -> Type) a end. ParsecT s u m a -> ParsecT s u m end -> ParsecT s u m ([a], end)
- data Many a
- treeLookupIdx :: TreeIndex -> Forest a -> a
- treeElemParser :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe [Elem] -> Maybe (ParsecT s u m a) -> [(String, Maybe String)] -> ParsecT s u m (TreeHTML a)
- selfClosing :: [String]
- treeElemParser' :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe [Elem] -> Maybe (ParsecT s u m a) -> [(String, Maybe String)] -> ParsecT s u m (TreeHTML a)
- innerTreeElemParser :: forall a s (m :: Type -> Type) u. (ShowHTML a, Stream s m Char) => Elem -> Maybe (ParsecT s u m a) -> ParsecT s u m (String, [a], [Tree ElemHead])
- type SubTree a = [Tree a]
- treeElemParserSpecific :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> Elem -> [(String, String)] -> SubTree ElemHead -> ParsecT s u m (TreeHTML a)
- validateGPR :: forall s u (m :: Type -> Type) a. [Many (Tree ElemHead)] -> ParsecT s u m [HTMLMatcher TreeHTML a]
- htmlGenParserRepeat' :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => String -> Maybe (ParsecT s u m a) -> [Many (Tree ElemHead)] -> ParsecT s u m [HTMLMatcher TreeHTML a]
- htmlGenParserRepeat :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => String -> Maybe (ParsecT s u m a) -> [Many (Tree ElemHead)] -> ParsecT s u m [HTMLMatcher TreeHTML a]
- specificChar :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m Char
- treeElemParserSpecificContinuous :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> [Many (Tree ElemHead)] -> ParsecT s u m ([Many (Tree ElemHead)], TreeHTML a)
- treeElemParserContains :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> [Many (Tree ElemHead)] -> ParsecT s u m ([Many (Tree ElemHead)], TreeHTML a)
- htmlGenParserContains :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => String -> Maybe (ParsecT s u m a) -> [Many (Tree ElemHead)] -> ParsecT s u m [HTMLMatcher TreeHTML a]
- specificChar' :: forall s (m :: Type -> Type) u. Stream s m Char => Elem -> ParsecT s u m Char
- innerParserContains :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> Elem -> SubTree ElemHead -> ParsecT s u m ([a], String, [Tree ElemHead])
- similarTreeH :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> TreeHTML a -> ParsecT s u m (TreeHTML a)
- htmlGroupSimilar :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe [Elem] -> Maybe (ParsecT s u m a) -> [(String, Maybe String)] -> ParsecT s u m (GroupHtml TreeHTML a)
- takeTill :: (a -> Bool) -> [a] -> [a]
- tryElHeads :: forall s u (m :: Type -> Type). (Elem, Attrs) -> [Many (Tree ElemHead)] -> ParsecT s u m ([Tree ElemHead], [Many (Tree ElemHead)])
- tryElHeads' :: (Elem, Attrs) -> [Many (Tree ElemHead)] -> Either String ([Tree ElemHead], [Many (Tree ElemHead)])
- innerParserSpecific :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> Elem -> SubTree ElemHead -> ParsecT s u m ([a], String, [Tree ElemHead])
- groupify :: Eq a => [Tree a] -> [Many (Tree a)] -> [Many (Tree a)]
- htmlGroup :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe [Elem] -> Maybe (ParsecT s u m a) -> [(String, Maybe String)] -> ParsecT s u m (GroupHtml TreeHTML a)
- table :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m (GroupHtml TreeHTML String)
- multiTreeElemHeadParser :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => ParsecT s u m a -> Many (Tree ElemHead) -> ParsecT s u m [HTMLMatcher TreeHTML a]
- fromMany :: Many a -> a
- stylingTags :: [Elem]
- stylingElem :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String
- sameTreeH :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> TreeHTML a -> ParsecT s u m (TreeHTML a)
- htmlGenParserFlex :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> ParsecT s u m (HTMLMatcher TreeHTML a)
- htmlGenParser :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => ParsecT s u m a -> ParsecT s u m (TreeHTML a) -> ParsecT s u m (HTMLMatcher TreeHTML a)
- specificForest :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => [Tree ElemHead] -> ParsecT s u m a -> ParsecT s u m [HTMLMatcher TreeHTML a]
- nodeToTreeElemExpr :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Tree ElemHead -> ParsecT s u m a -> ParsecT s u m (TreeHTML a)
- innerElemParser2 :: forall a s (m :: Type -> Type) u. (ShowHTML a, Stream s m Char) => String -> Maybe (ParsecT s u m a) -> ParsecT s u m [HTMLMatcher TreeHTML a]
- treeElemParserAnyInside :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> ParsecT s u m (TreeHTML a)
- anyHtmlGroup :: forall a s (m :: Type -> Type) u. (ShowHTML a, Stream s m Char) => ParsecT s u m (GroupHtml TreeHTML a)
- findAllSpaceMutExGroups :: forall a s (m :: Type -> Type) u. (ShowHTML a, Stream s m Char) => ParsecT s u m (Maybe [GroupHtml TreeHTML a])
- findAllMutExGroups' :: a
Documentation
skipManyTill :: Alternative m => m a -> m end -> m end Source #
manyTill_ :: forall s u (m :: Type -> Type) a end. ParsecT s u m a -> ParsecT s u m end -> ParsecT s u m ([a], end) Source #
Note for research on parsers/scrapers + AI research: if a scraper provides only a slow method for processing (Picture -> *), we might be able to solve this issue either with Quantum (same as current AI) or with attentive methodologies that "gamble" on what to pay attention to first, which could further depend on whether the input is video-like (frame -> frame) or a single picture.
treeLookupIdx :: TreeIndex -> Forest a -> a Source #
treeElemParser :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe [Elem] -> Maybe (ParsecT s u m a) -> [(String, Maybe String)] -> ParsecT s u m (TreeHTML a) Source #
Like elemParser, this matches on an HTML element, but it also represents the innerHTML as a Tree ElemHead so that we can match this structure in elements further down in the DOM. See groupHtml and treeElemParserSpecific.
selfClosing :: [String] Source #
treeElemParser' :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe [Elem] -> Maybe (ParsecT s u m a) -> [(String, Maybe String)] -> ParsecT s u m (TreeHTML a) Source #
Used by treeElemParser. This is not a public interface; use treeElemParser instead.
innerTreeElemParser :: forall a s (m :: Type -> Type) u. (ShowHTML a, Stream s m Char) => Elem -> Maybe (ParsecT s u m a) -> ParsecT s u m (String, [a], [Tree ElemHead]) Source #
The real difference between (htmlGroup _ _ _) and specificRepetitiveForest is whether we accept the next piece as a new discovery to match on, or whether we are still in the process of matching what we just found.
type SubTree a = [Tree a] Source #
NOTE: In future, create a function that simplifies all numbers that will be compared down to their number of digits, e.g. 1426674 -> 1234567 and 1834324 -> 1234567, so that (==) -> True.
Ideal case is that we can do (Many a) structuring even if interspersed with text, which would solve issues like many search terms being highlighted; we then wouldn't need to know what the search term is. AND!! we have already seen that it could for example be: aclass="hiddenText"Hockey/b/a
treeElemParserSpecific :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> Elem -> [(String, String)] -> SubTree ElemHead -> ParsecT s u m (TreeHTML a) Source #
Note: unlike other element parsers, it does not call itself; instead, innerParserSpecific does the looping.
validateGPR :: forall s u (m :: Type -> Type) a. [Many (Tree ElemHead)] -> ParsecT s u m [HTMLMatcher TreeHTML a] Source #
htmlGenParserRepeat' :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => String -> Maybe (ParsecT s u m a) -> [Many (Tree ElemHead)] -> ParsecT s u m [HTMLMatcher TreeHTML a] Source #
Uses HTMLMatcher to collect cases of html while parsing inside of a certain element
htmlGenParserRepeat :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => String -> Maybe (ParsecT s u m a) -> [Many (Tree ElemHead)] -> ParsecT s u m [HTMLMatcher TreeHTML a] Source #
BUT! We need to check off our list of demanded elements so that, when we parse the end tag, we can see whether the elements (ordered, and parsed only in order) were all found before the end tag. For htmlGenParserRepeat, it can just change the passed state of [Many (Tree ElemHead)].
specificChar :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m Char Source #
NEW IDEA!!
treeElemParserSpecificContinuous :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> [Many (Tree ElemHead)] -> ParsecT s u m ([Many (Tree ElemHead)], TreeHTML a) Source #
treeElemParserSpecific is an interface to this (via innerParserSpecific). This inner function uses the Many datatype to differentiate between whether we should expect to parse a single element with the given specs or allow for multiple of the given element specs in a row.
treeElemParserContains :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> [Many (Tree ElemHead)] -> ParsecT s u m ([Many (Tree ElemHead)], TreeHTML a) Source #
This is largely a subfunction of htmlGenParserContains. It accepts any element, and if the element is in the order of our checklist-of-elems, we give the tail of elems back; if the tail reaches [] before we hit the end tag, then we are successful.
htmlGenParserContains :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => String -> Maybe (ParsecT s u m a) -> [Many (Tree ElemHead)] -> ParsecT s u m [HTMLMatcher TreeHTML a] Source #
specificChar' :: forall s (m :: Type -> Type) u. Stream s m Char => Elem -> ParsecT s u m Char Source #
innerParserContains :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> Elem -> SubTree ElemHead -> ParsecT s u m ([a], String, [Tree ElemHead]) Source #
similarTreeH :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> TreeHTML a -> ParsecT s u m (TreeHTML a) Source #
Very similar to treeElemParserSpecific, except that it allows new nodes in the HTML DOM tree to exist at random, as long as, when we resume parsing, we still find all of the branches we found in the TreeHTML a that is given as an argument to this function.
htmlGroupSimilar :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe [Elem] -> Maybe (ParsecT s u m a) -> [(String, Maybe String)] -> ParsecT s u m (GroupHtml TreeHTML a) Source #
Returns an entire group of highly similar elements based on their specifications, such as their innerTrees, the element tag, and attributes. This can be used to autonomously determine the structure of, and find, search result items after you've submitted a form. htmlGroupSimilar :: (Stream s m Char, ShowHTML a) => Maybe [Elem] -> Maybe (ParsecT s u m a) -> [(String, Maybe String)] -> ParsecT s u m (GroupHtml TreeHTML a) htmlGroupSimilar elemOpts matchh attrsSubset = -- Not sure about the order yet tho fmap mkGH $ try (treeElemParser elemOpts matchh attrsSubset >>= (\treeH -> fmap (treeH :) (some (try $ similarTreeH matchh treeH))))
tryElHeads :: forall s u (m :: Type -> Type). (Elem, Attrs) -> [Many (Tree ElemHead)] -> ParsecT s u m ([Tree ElemHead], [Many (Tree ElemHead)]) Source #
yields how many are still worth trying
tryElHeads' :: (Elem, Attrs) -> [Many (Tree ElemHead)] -> Either String ([Tree ElemHead], [Many (Tree ElemHead)]) Source #
innerParserSpecific :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> Elem -> SubTree ElemHead -> ParsecT s u m ([a], String, [Tree ElemHead]) Source #
groupify :: Eq a => [Tree a] -> [Many (Tree a)] -> [Many (Tree a)] Source #
IS THIS IN THE RIGHT ORDER OR DOES IT NEED TO BE REVERSED? | Creates a simplified set of instructions for parsing a very specific Tree structure
htmlGroup :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe [Elem] -> Maybe (ParsecT s u m a) -> [(String, Maybe String)] -> ParsecT s u m (GroupHtml TreeHTML a) Source #
Returns a minimum of 2 --> it is almost as if there should be a function same :: a -> [a], to be applied to some doc/String
| note: not sure if this exists but here's where we could handle iterating names of attributes
| Can generalize to ElementRep e
table :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m (GroupHtml TreeHTML String) Source #
Html table group
multiTreeElemHeadParser :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => ParsecT s u m a -> Many (Tree ElemHead) -> ParsecT s u m [HTMLMatcher TreeHTML a] Source #
Deprecated: use specificContinuous style functions
Build [Many (Tree ElemHead)] | Write parser that tries each case OR parses openingTag and then decides what case it fits | For set (Previous, Next) if Next is True then delete Previous parser | Next becomes Previous in future equation(s)
Fail onto plain IText (of parent element that the parser is currently in)
Inner parser of treeElemParserSpecific
specificRepetitiveForest :: (Stream s m Char, ShowHTML a)
=> [Many (Tree ElemHead)]
-> ParsecT s u m a
-> ParsecT s u m [HTMLMatcher TreeHTML a]
specificRepetitiveForest [] _ = return []
specificRepetitiveForest (mElHead1:mElHead2:manyElHeads) match = do
-- | ysA could be == []
htmlGenParserRepeat match (manyElHeads) -- NEW
ysA {-maybeNext-} <- multiTreeElemHeadParser match mElHead2
ysB <- case ysA of
Just a -> specificRepetitiveForest (mElHead2:manyElHeads) match
Nothing ->
-- What happens to the rest of the Many ElHeads in this case?
htmlGenParserRepeat match (multiTreeElemHeadParser match mElHead1)
htmlGenParserRepeat match (manyElHeads) -- NEW
return (ysA <> ysB)
let
-- funcP :: (ShowHTML a, Stream s m Char) => ParsecT s u m [TreeHTML a]
funcP = multiTreeElemHeadParser match mElHead1
y <- htmlGenParserRepeat match funcP -- this literally just allows for matching on multiple elems too
-- Only applies to "specific" functions, prev: any case
ys <- case y of
-- Discard last parsed pattern and go to next element formula on success of y
((Element _):xs') -> specificRepetitiveForest manyElHeads match
_ -> specificRepetitiveForest (manyElHead:manyElHeads) match
-- return all results
This is all I actually need; no need for recursion here, since that's already done in the top-level function.
fromMany :: Many a -> a Source #
Is able to repeat / execute any pattern that returns multiple elements of same type |(see manyTreeElemHeadParser) htmlGenParserRepeat :: (Stream s m Char, ShowHTML a) => ParsecT s u m a -> [Many (Tree ElemHead)] -> ParsecT s u m [TreeHTML a] -- Can just apply multiTreeElemHeadParser (if i should) inside -> ParsecT s u m [HTMLMatcher TreeHTML a]
HTMLGenParserRepeat is in this use case always going to be exact ie these 3 elems then the end tag | ... and maybe some text in between there
OF the cases we can do this: | parse and repeat function/recurse | or find end tag >> return [] which ends list
stylingTags :: [Elem] Source #
Note that multiTreeElemHeadParser is still not handled, all I need to do is auto delete if only one | actual function of multiTreeElemHeadParser will not be used but broken up
stylingElem :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String Source #
Just gives the inners
sameTreeH :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> TreeHTML a -> ParsecT s u m (TreeHTML a) Source #
Interface to find same element
htmlGenParserFlex :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> ParsecT s u m (HTMLMatcher TreeHTML a) Source #
htmlGenParser :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => ParsecT s u m a -> ParsecT s u m (TreeHTML a) -> ParsecT s u m (HTMLMatcher TreeHTML a) Source #
specificForest :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => [Tree ElemHead] -> ParsecT s u m a -> ParsecT s u m [HTMLMatcher TreeHTML a] Source #
Deprecated: you likely need specificRepetitiveForest
Library function for when you want an exact match: if there are 3 of ElemHead A, then it looks for 3 of ElemHead A. accumMaybe' :: [HTMLMatcher] -> ParsecT s u m a
nodeToTreeElemExpr :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Tree ElemHead -> ParsecT s u m a -> ParsecT s u m (TreeHTML a) Source #
innerElemParser2 :: forall a s (m :: Type -> Type) u. (ShowHTML a, Stream s m Char) => String -> Maybe (ParsecT s u m a) -> ParsecT s u m [HTMLMatcher TreeHTML a] Source #
Used by treeElemParser'
treeElemParserAnyInside :: forall s (m :: Type -> Type) a u. (Stream s m Char, ShowHTML a) => Maybe (ParsecT s u m a) -> ParsecT s u m (TreeHTML a) Source #
anyHtmlGroup :: forall a s (m :: Type -> Type) u. (ShowHTML a, Stream s m Char) => ParsecT s u m (GroupHtml TreeHTML a) Source #
findAllSpaceMutExGroups :: forall a s (m :: Type -> Type) u. (ShowHTML a, Stream s m Char) => ParsecT s u m (Maybe [GroupHtml TreeHTML a]) Source #
findAllMutExGroups' :: a Source #