diff options
Diffstat (limited to 'bbcode/src/Text/BBCode')
-rw-r--r-- | bbcode/src/Text/BBCode/Lexer.hs | 74 |
1 files changed, 74 insertions, 0 deletions
diff --git a/bbcode/src/Text/BBCode/Lexer.hs b/bbcode/src/Text/BBCode/Lexer.hs new file mode 100644 index 0000000..d2aa2bc --- /dev/null +++ b/bbcode/src/Text/BBCode/Lexer.hs | |||
@@ -0,0 +1,74 @@ | |||
1 | {-# LANGUAGE OverloadedStrings #-} | ||
2 | {-# LANGUAGE DeriveGeneric, DeriveAnyClass #-} | ||
3 | |||
4 | -- | A parser to transform 'Text' into a stream of 'BBToken's | ||
5 | module Text.BBCode.Lexer | ||
6 | ( BBToken(..) | ||
7 | , token | ||
8 | , escapedText | ||
9 | , escapedText' | ||
10 | ) where | ||
11 | |||
12 | import Data.Attoparsec.Text | ||
13 | |||
14 | import Data.Text (Text) | ||
15 | import qualified Data.Text as T (singleton, head, last, tail, null) | ||
16 | |||
17 | import Control.Applicative | ||
18 | |||
19 | import Test.QuickCheck (Arbitrary(..), CoArbitrary, genericShrink) | ||
20 | import Test.QuickCheck.Gen (oneof, suchThat) | ||
21 | import Test.QuickCheck.Instances | ||
22 | import GHC.Generics (Generic) | ||
23 | |||
24 | -- | Our lexicographical unit | ||
25 | data BBToken = BBOpen Text -- ^ Tag open | ||
26 | | BBClose Text -- ^ Tag close | ||
27 | | BBStr Text -- ^ Content of a tag | ||
28 | deriving (Generic, Eq, Show, CoArbitrary) | ||
29 | |||
30 | -- | This instance does not produce: | ||
31 | -- | ||
32 | -- * opening and closing tags whose 'Text' ends in @\\@ | ||
33 | -- * empty 'BBStr's | ||
34 | instance Arbitrary BBToken where | ||
35 | shrink = genericShrink | ||
36 | arbitrary = oneof [ BBOpen <$> tagText | ||
37 | , BBClose <$> tagText | ||
38 | , BBStr <$> nonEmpty | ||
39 | ] | ||
40 | where | ||
41 | tagText = arbitrary `suchThat` (not . lastIsEscape) | ||
42 | lastIsEscape t | ||
43 | | T.null t = False | ||
44 | | T.last t == '\\' = True | ||
45 | | otherwise = False | ||
46 | nonEmpty = (arbitrary `suchThat` (not . T.null)) | ||
47 | |||
48 | token :: Parser BBToken | ||
49 | -- ^ Tokenizer | ||
50 | token = BBClose <$> (string "[/" *> escapedText' [']'] <* string "]") | ||
51 | <|> BBOpen <$> (string "[" *> escapedText' [']'] <* string "]") | ||
52 | <|> BBStr <$> escapedText ['['] | ||
53 | |||
54 | escapedText :: [Char] -> Parser Text | ||
55 | -- ^ @escapedText cs@ consumes 'Text' up to (not including) the first occurence of a character from @cs@ that is not escaped using @\\@ | ||
56 | -- | ||
57 | -- Always consumes at least one character | ||
58 | -- | ||
59 | -- @\\@ needs to be escaped (prefixed with @\\@) iff it precedes a character from @cs@ | ||
60 | escapedText [] = takeText | ||
61 | escapedText cs = recurse $ choice [ takeWhile1 (not . special) | ||
62 | , escapeSeq | ||
63 | , escapeChar' | ||
64 | ] | ||
65 | where | ||
66 | escapeChar = '\\' | ||
67 | special = inClass $ escapeChar : cs | ||
68 | escapeChar' = string $ T.singleton escapeChar | ||
69 | escapeSeq = escapeChar' >> (T.singleton <$> satisfy special) -- s/\\[:cs]/\1/ | ||
70 | recurse p = mappend <$> p <*> escapedText' cs | ||
71 | |||
72 | escapedText' :: [Char] -> Parser Text | ||
73 | -- ^ @'option' "" $ 'escapedText' cs@ | ||
74 | escapedText' cs = option "" $ escapedText cs | ||