From 4c4f2dcb8ff5c2dfd7758f6dc4d2324f9b344103 Mon Sep 17 00:00:00 2001 From: Gregor Kleen Date: Tue, 12 Jan 2016 23:41:01 +0100 Subject: thermoprint-4 --- provider/posts/thermoprint-4.md | 90 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 79 insertions(+), 11 deletions(-) diff --git a/provider/posts/thermoprint-4.md b/provider/posts/thermoprint-4.md index 6b2a022..2606c06 100644 --- a/provider/posts/thermoprint-4.md +++ b/provider/posts/thermoprint-4.md @@ -17,32 +17,100 @@ our solution (as illustrated by the following mockup): data DomTree = Element Text (Map Text Text) [DomTree] | Content Text deriving (Show, Eq) - -type Errors = () -bbcode :: Text -> Either Errors DomTree +bbcode :: Text -> Maybe DomTree -- ^ Parse BBCode ~~~ Writing a parser capable of dealing with `Text` directly from scratch would be unnecessarily abstruse, we’ll be using the [attoparsec](https://hackage.haskell.org/package/attoparsec/docs/Data-Attoparsec-Text.html) family of parser -combinators instead (the exact usage of attoparsec is out of scope -- we'll just mockup the types). +combinators instead. + +We reproduce an incomplete version of the lexer below (it’s missing tag attributes and self-closing tags). + +We introduce `escapedText`, a helper function for extracting text until we reach one of a set of delimiting characters +(exclusive). +While doing this we also parse any delimiting character iff it's prefixed with an escape character (we use `\\`) -- the +escape character itself needs only be escaped if encountered directly before one of the delimiting characters. ~~~ {.haskell} -data Token = BBOpen Text - | BBClose Text - | BBStr Text +data Token = BBOpen Text -- ^ "[open]" + | BBClose Text -- ^ "[/close]" + | BBStr Text -- ^ "text" -tokenize :: Parser [Token] +token :: Parser [Token] +token = BBClose <$ "[/" <*> escapedText' [']'] <* "]" + <|> BBOpen <$ "[" <*> escapedText' [']'] <* "]" + <|> BBStr <$> escapedText ['['] + +escapedText' :: [Char] -> Parser Text +escapedText' = option "" . escapedText -type Error' = () +escapedText :: [Char] -> Parser Text +escapedText [] = takeText -- No delimiting characters -- parse all remaining input +escapedText cs = recurse $ choice [ takeWhile1 (not . special) -- a series of characters we don't treat as special + , escapeSeq -- an escaped delimiter + , escapeChar' -- the escape character + ] + where + escapeChar = '\\' + special = inClass $ escapeChar : cs + escapeChar' = string $ T.singleton escapeChar + escapeSeq = escapeChar' *> (T.singleton <$> satisfy special) -- escape character followed by a special character (which encludes the escape character) + recurse p = mappend <$> p <*> escapedText' cs -- parse a prefix and optionally append another chunk of escapedText -runTokenizer :: Text -> Either Error' [Token] +runTokenizer :: Text -> Maybe [Token] +runTokenizer = either (const Nothing) Just . parseOnly (many token <* endOfInput) ~~~ We have now reduced the Problem to `[Token] -> DomTree`. We quickly see that the structure of the problem is that of a [fold](https://hackage.haskell.org/package/base/docs/Data-Foldable.html). -Having realised this we now require an object of type `DomTree -> Token -> DomTree` to recursively build up our target +Having realised this we require a function of type `Token -> DomTree -> DomTree` to recursively build up our target structure. + +In general we’ll want to not only keep track of the `DomTree` during recursion but also maintain a reference to the +position at which we’ll be inserting new tokens. +This kind of problem is well understood and solved idiomatically by using a +[zipper](https://en.wikipedia.org/wiki/Zipper_(data_structure)) +([a cursory introduction](http://learnyouahaskell.com/zippers)). + +Writing zippers tends to be tedious. We’ll therefore introduce an +[additional intermediate structure](https://hackage.haskell.org/package/containers/docs/Data-Tree.html) for which an +[implementation](https://hackage.haskell.org/package/rosezipper) is available readily. +The morphism from this new structure (`Forest BBLabel`) to our `DomTree` will be almost trivial. + +~~~ {.haskell} +import Data.Tree.Zipper (TreePos, Empty, Full) +import qualified Data.Tree.Zipper as Z + +data BBLabel = BBTag Text + | BBPlain Text + +rose :: [BBToken] -> Maybe (Forest BBLabel) +rose = Z.toForest <$> foldM (flip rose') (Z.fromForest []) + +rose' :: BBToken -> TreePos Empty BBLabel -> Maybe (TreePos Empty BBLabel) +rose' (BBStr t) = return . Z.nextSpace . Z.insert (Node (BBPlain t) []) -- insert a node with no children and move one step to the right in the forest we’re currently viewing +rose' (BBOpen t) = return . Z.children . Z.insert (Node (BBTag t) []) -- insert the node and move into position to insert it's first child +rose' (BBClose t) = close t -- haskell complains if multiple equations for the same function have a differing number of arguments, therefore: 'close' + where + close :: Text -> TreePos Empty BBLabel -> Maybe (TreePos Empty BBLabel) + close tag pos = do + pos' <- Z.parent pos -- fail if we're trying to close a tag that does not have a parent (this indicates imbalanced tags) + let + pTag = (\(BBTag t) -> t) $ Z.label pos' -- yes, this will fail unceremoniously if the parent is not a tag, this poses no problem since we're constructing the structure ourselves. The proof that this failure mode does not occur is left as an exercise for the reader. + guard (pTag == tag) -- The structure shows that this mode of failure (opening tags content does not match the closing tags) is not logically required -- it only serves as a *notification* to the user + return $ Z.nextSpace pos' -- move one level up and to point at the next sibling of the parent +~~~ + +All that is left to do now is present our final morphism: + +~~~ {.haskell} +dom :: Forest BBLabel -> [DomTree] +dom = map dom' + where + dom' (Node (BBPlain t) []) = Content t + dom' (Node (BBTag t) ts = Element t $ map dom' ts +~~~ -- cgit v1.2.3