| Safe Haskell | None |
|---|---|
| Language | Haskell2010 |
Scrappy.Elem.ITextElemParser
Description
This will eventually be a beautiful interface between NLP and scrappy
Synopsis
- emptyTree :: forall a s (m :: Type -> Type) u. (ShowHTML a, Stream s m Char) => Maybe [Elem] -> Maybe (ParsecT s u m a) -> [(String, Maybe String)] -> ParsecT s u m (TreeHTML a)
- preface :: forall s (m :: Type -> Type) u pre a. Stream s m Char => ParsecT s u m pre -> ParsecT s u m a -> ParsecT s u m a
- class Zero a where
- consumeZero :: a -> b -> b
- class Singleton a where
- consumeSingleton :: a -> b
- class Multiple a where
- consumeMultiple :: a -> b
- class (Zero a, Singleton a, Multiple a) => Existential a where
- consumeExists :: a -> b
- emptyTreeGroup :: forall a s (m :: Type -> Type) u. (ShowHTML a, Stream s m Char) => Maybe [Elem] -> Maybe (ParsecT s u m a) -> [(String, Maybe String)] -> ParsecT s u m [TreeHTML a]
- elemAny :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m (Elem' String)
- type ResearchResult = String
- data Paragraph = Paragraph {
- unParagraph :: [Sentence]
- data Sentence = Sentence {
- unSentence :: [WrittenWord]
- data WrittenWord = WW {}
- punctuation :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m Char
- writtenWord :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m WrittenWord
- wordSeparator :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String
- comma :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String
- colon :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String
- semiColon :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String
- word' :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String
- capitalizedWord :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String
- number :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String
- sentence :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m Sentence
- sentenceWhere :: forall s (m :: Type -> Type) u. Stream s m Char => ([WrittenWord] -> Bool) -> ParsecT s u m Sentence
- sentenceTail :: forall s (m :: Type -> Type) u. Stream s m Char => Bool -> ParsecT s u m [WrittenWord]
- styleTags :: [String]
- negParseOpeningTag :: forall s (m :: Type -> Type) u. Stream s m Char => [Elem] -> ParsecT s u m (Elem, Attrs)
- textChunk :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String
- openOrCloseTag :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m ()
- anyEndTag :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m Char
- anyThingbut :: forall s (m :: Type -> Type) u. Stream s m Char => [String] -> ParsecT s u m String
- textChunkIf :: forall s (m :: Type -> Type) u. Stream s m Char => (String -> Bool) -> ParsecT s u m String
- plainText :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String
- styleElem :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m (Elem' String)
- type Html = String
- removeStyleTags :: Html -> Html
- catEithers :: [Either e a] -> [a]
- divideUp :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String -> ParsecT s u m [Either String String]
- onlyPlainText :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String
- data AccumITextElem a = ACT [String]
- textOnlyFoldr :: HTMLMatcher AccumITextElem String -> (String, [String]) -> (String, [String])
Documentation
emptyTree :: forall a s (m :: Type -> Type) u. (ShowHTML a, Stream s m Char) => Maybe [Elem] -> Maybe (ParsecT s u m a) -> [(String, Maybe String)] -> ParsecT s u m (TreeHTML a) Source #
Comment is to crash nix as a reminder to move this somewhere sensible: data OpenStruct a = OpenStruct (Parser a); type CloseStruct a = OpenStruct a -> ClosePiece a (could even be _ -> f, when the Close struct is independent of Open, and I don't think this would affect speed); data ClosePiece a = ClosePiece (Parser a)
Paired with maybeUsefulNewUrls, this would allow us to scrape an entire site for a singular pattern, and just by virtue of basic Haskell types, there's zero reason we can't have some simple type: data Scrapeable = Case1 A | Case2 B ... fanExistential :: Url -> (Url -> Bool) -> MaybeT m a -> MaybeT m [a] fanExistential url = do html <- getHtmlST sv url links <- flip successesM html $ hoistMaybe $ scrape (hrefParser' cond) fanExistential links
preface :: forall s (m :: Type -> Type) u pre a. Stream s m Char => ParsecT s u m pre -> ParsecT s u m a -> ParsecT s u m a Source #
Returns a minimum of 2 --> almost as if same should be a function; same :: a -> [a], to be applied to some doc/String
Note: not sure if this exists, but here's where we could handle iterating over the names of attributes
Can generalize to ElementRep e
Methods
consumeZero :: a -> b -> b Source #
class (Zero a, Singleton a, Multiple a) => Existential a where Source #
Methods
consumeExists :: a -> b Source #
emptyTreeGroup :: forall a s (m :: Type -> Type) u. (ShowHTML a, Stream s m Char) => Maybe [Elem] -> Maybe (ParsecT s u m a) -> [(String, Maybe String)] -> ParsecT s u m [TreeHTML a] Source #
Only matches if there are no innerTrees. This doesn't behave exactly like a "group" function because it allows matching on one element, but this will also never be empty.
type ResearchResult = String Source #
TODO(galen): these should build off each other
Constructors
| Paragraph | |
Fields
| |
Constructors
| Sentence | |
Fields
| |
data WrittenWord Source #
Instances
| Monoid WrittenWord Source # | Technically this shouldn't exist ever |
Defined in Scrappy.Elem.ITextElemParser Methods mempty :: WrittenWord # mappend :: WrittenWord -> WrittenWord -> WrittenWord # mconcat :: [WrittenWord] -> WrittenWord # | |
| Semigroup WrittenWord Source # | |
Defined in Scrappy.Elem.ITextElemParser Methods (<>) :: WrittenWord -> WrittenWord -> WrittenWord # sconcat :: NonEmpty WrittenWord -> WrittenWord # stimes :: Integral b => b -> WrittenWord -> WrittenWord # | |
| Show WrittenWord Source # | |
Defined in Scrappy.Elem.ITextElemParser Methods showsPrec :: Int -> WrittenWord -> ShowS # show :: WrittenWord -> String # showList :: [WrittenWord] -> ShowS # | |
writtenWord :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m WrittenWord Source #
Word also means bits, but I mean written words specifically. This can definitely be expanded upon to increase its reach while maintaining validity.
sentenceWhere :: forall s (m :: Type -> Type) u. Stream s m Char => ([WrittenWord] -> Bool) -> ParsecT s u m Sentence Source #
sentenceTail :: forall s (m :: Type -> Type) u. Stream s m Char => Bool -> ParsecT s u m [WrittenWord] Source #
for research: new concept: reliable generalizations of thinking
styleTags :: [String] Source #
To my understanding this should not affect how we parse; the only sure given is that the result of our low-level read is really just words, and so the parsers should focus on setting up the next parser.
This is built in a way that allows the idea of a sentence to be as internally valid as possible; the sentence controls the period. mkParagraph :: [Sentence] -> Paragraph mkParagraph ss = Paragraph . mkParagraph' $ ss where mkParagraph' :: [Sentence] -> String mkParagraph' ((Sentence s):[]) = s <> ('n':[]) mkParagraph' ((Sentence s):ss) = s <> " " <> (mkParagraph' ss)
Note: will need a more complex accumulator for the case where an elem has two distinct text segments broken up by an element (rare case).
negParseOpeningTag :: forall s (m :: Type -> Type) u. Stream s m Char => [Elem] -> ParsecT s u m (Elem, Attrs) Source #
Will only match elements not specified
openOrCloseTag :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m () Source #
This will match any element open or closing tag that is not a style tag
anyThingbut :: forall s (m :: Type -> Type) u. Stream s m Char => [String] -> ParsecT s u m String Source #
Despite the fun name, this is just for textChunk use
textChunkIf :: forall s (m :: Type -> Type) u. Stream s m Char => (String -> Bool) -> ParsecT s u m String Source #
styleElem :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m (Elem' String) Source #
removeStyleTags :: Html -> Html Source #
catEithers :: [Either e a] -> [a] Source #
divideUp :: forall s (m :: Type -> Type) u. Stream s m Char => ParsecT s u m String -> ParsecT s u m [Either String String] Source #
data AccumITextElem a Source #
textOnlyFoldr :: HTMLMatcher AccumITextElem String -> (String, [String]) -> (String, [String]) Source #