1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
|
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
-- | An implementation of BBcode parsing 'Text' to a syntax tree
module Text.BBCode
( bbcode
, BBCodeError(..)
, TreeError(..)
, DomForest
, DomTree(..)
-- , dom
-- , BBLabel
-- , rose
-- , matches
) where
import Data.Text (Text)
import GHC.Generics (Generic)
import Control.Exception (Exception)
import Data.Typeable (Typeable)
import Control.Monad (unless, join, foldM, (<=<))
import Data.Function (on)
import Control.Applicative
import Text.BBCode.Lexer (BBToken(..), token)
import Data.Attoparsec.Text (parseOnly, endOfInput)
import Data.Tree
import Data.Tree.Zipper (TreePos, Empty, Full)
import qualified Data.Tree.Zipper as Z
import Data.Map (Map)
import qualified Data.Map as Map
import Data.CaseInsensitive (CI)
import qualified Data.CaseInsensitive as CI
import Data.Bifunctor (Bifunctor(first))
import Data.Maybe (catMaybes)
-- | Our target structure -- a rose tree with an explicit terminal constructor
data DomTree = Element (CI Text) (Map (CI Text) Text) [DomTree]
| Content Text
deriving (Show, Eq)
-- | List of paragraphs, which are comprised of lists of 'DomTree's
type DomForest = [[DomTree]]
dom :: Forest BBLabel -> DomForest
-- ^ Parse semantically constrained rose tree to syntactically constrained version
--
-- Silently drops children of semantically terminal nodes ('BBPlain')
--
-- We already ensured that paragraphs occur nowhere but at toplevel
dom = map (\(Node BBPar ts) -> map dom' ts) . ensureTopLevelPar
where
ensureTopLevelPar xs@((Node BBPar _):_) = xs
ensureTopLevelPar xs = pure $ Node BBPar xs
dom' (Node (BBPlain t) _) = Content t
dom' (Node (BBTag t attrs) ts) = Element (CI.mk t) (Map.mapKeys CI.mk attrs) $ map dom' ts
-- | Errors encountered during parsing
data BBCodeError = LexerError String -- ^ Error while parsing input to stream of tokens
| TreeError TreeError -- ^ Error while parsing stream of tokens to syntax tree
deriving (Show, Eq, Generic, Typeable)
instance Exception BBCodeError
bbcode :: Text -> Either BBCodeError DomForest
-- ^ Parse BBCode
bbcode t = fmap dom $ first LexerError (parseOnly (many token <* endOfInput) t) >>= first TreeError . rose
-- | Errors in input encountered during parsing of lexed token-stream
data TreeError = MismatchedTags Text Text -- ^ Closing tags label does not match opening tags
| ImbalancedTags Text -- ^ We found an extraneous closing tag
| UnclosedTags [Text] -- ^ We found opened tags that were not closed
| ParagraphWithinTag -- ^ We found a paragraph-break within a tag
deriving (Show, Eq, Generic, Typeable)
instance Exception TreeError
-- | The label of our rose-tree nodes carries the tag name and a map of attributes
data BBLabel = BBTag Text (Map Text Text)
| BBPar
| BBPlain Text
deriving (Show, Eq)
matches :: Text -> Text -> Bool
-- ^ @`matches` "open" "close"@ should be 'True' iff @[/close]@ is a valid closing tag for @[open]@
--
-- @ (==) `on` 'CI.mk' @
matches = (==) `on` CI.mk
rose :: [BBToken] -> Either TreeError (Forest BBLabel)
-- ^ Assuming that both tags and content have the same type (we use 'BBLabel') bbcode is a flat representation of a rose tree
--
-- We use @'Tree' 'BBLabel'@ only as another intermediate structure because it carries no guarantee that the data is semantically valid -- a 'BBPlain'-value semantically has no children.
--
-- The use of 'Tree' was still deemed desirable because the morphism to a more sensible structure is straightforward and 'Data.Tree.Zipper' provides all the tools needed to implement 'rose' in a sensible fashion
rose = fmap Z.toForest . checkClosure <=< foldM (flip rose') (Z.fromForest [])
where
rose' (BBStr t) = return . Z.nextSpace . Z.insert (Node (BBPlain t) [])
rose' BBNewPar = parBreak -- for more pointless
rose' (BBOpen t attrs) = return . Z.children . Z.insert (Node (BBTag t $ Map.fromList attrs) [])
rose' (BBContained t attrs) = return . Z.nextSpace . Z.insert (Node (BBTag t $ Map.fromList attrs) [])
rose' (BBClose t) = close t -- for more pointless
close :: Text -> TreePos Empty BBLabel -> Either TreeError (TreePos Empty BBLabel)
close tag pos = do
pos' <- maybe (Left $ ImbalancedTags tag) Right $ Z.parent pos >>= traversePars
let
pTag = (\(BBTag t _) -> t) $ Z.label pos'
unless (pTag `matches` tag) . Left $ MismatchedTags pTag tag -- The structure shows that this mode of failure is not logically required -- it's just nice to have
return $ Z.nextSpace pos'
where
traversePars pos
| isPar . Z.tree $ pos = Z.parent pos >>= traversePars
| otherwise = return pos
parBreak :: TreePos Empty BBLabel -> Either TreeError (TreePos Empty BBLabel)
parBreak z = let siblings = reverse $ Z.before z
siblingsAsPars
| all isPar siblings = Right z
| Z.isRoot z = Right . Z.fromForest $ [Node BBPar siblings]
| (Just p) <- Z.parent z
, BBPar <- Z.label p = Right . Z.nextSpace $ p
| otherwise = Left ParagraphWithinTag
in Z.children . Z.insert (Node BBPar []) . Z.last <$> siblingsAsPars
isPar (Node BBPar _) = True
isPar _ = False
checkClosure :: TreePos Empty BBLabel -> Either TreeError (TreePos Empty BBLabel)
-- This failure mode isn't required either
checkClosure z
| null parentTags = Right z
| otherwise = Left . UnclosedTags $ parentTags
where
parentTags = catMaybes . map (getTag . (\(_, p, _) -> p)) $ Z.parents z
getTag (BBTag t _) = Just t
getTag _ = Nothing
|