diff options
| author | Gregor Kleen <gkleen@yggdrasil.li> | 2016-01-12 00:02:10 +0000 |
|---|---|---|
| committer | Gregor Kleen <gkleen@yggdrasil.li> | 2016-01-12 00:02:10 +0000 |
| commit | c9554b25d4ad99ceec1bef7bd60b1df82ef5ce8a (patch) | |
| tree | 2701c86c9d75b2cbfae81b92c07949fc1c86fc07 /bbcode/src/Text/BBCode | |
| parent | 650feae1e8c267981f224e1de31ff4729a526afd (diff) | |
| download | thermoprint-c9554b25d4ad99ceec1bef7bd60b1df82ef5ce8a.tar thermoprint-c9554b25d4ad99ceec1bef7bd60b1df82ef5ce8a.tar.gz thermoprint-c9554b25d4ad99ceec1bef7bd60b1df82ef5ce8a.tar.bz2 thermoprint-c9554b25d4ad99ceec1bef7bd60b1df82ef5ce8a.tar.xz thermoprint-c9554b25d4ad99ceec1bef7bd60b1df82ef5ce8a.zip | |
BBCode lexer
Diffstat (limited to 'bbcode/src/Text/BBCode')
| -rw-r--r-- | bbcode/src/Text/BBCode/Lexer.hs | 74 |
1 files changed, 74 insertions, 0 deletions
diff --git a/bbcode/src/Text/BBCode/Lexer.hs b/bbcode/src/Text/BBCode/Lexer.hs new file mode 100644 index 0000000..d2aa2bc --- /dev/null +++ b/bbcode/src/Text/BBCode/Lexer.hs | |||
| @@ -0,0 +1,74 @@ | |||
| 1 | {-# LANGUAGE OverloadedStrings #-} | ||
| 2 | {-# LANGUAGE DeriveGeneric, DeriveAnyClass #-} | ||
| 3 | |||
| 4 | -- | A parser to transform 'Text' into a stream of 'BBToken's | ||
| 5 | module Text.BBCode.Lexer | ||
| 6 | ( BBToken(..) | ||
| 7 | , token | ||
| 8 | , escapedText | ||
| 9 | , escapedText' | ||
| 10 | ) where | ||
| 11 | |||
| 12 | import Data.Attoparsec.Text | ||
| 13 | |||
| 14 | import Data.Text (Text) | ||
| 15 | import qualified Data.Text as T (singleton, head, last, tail, null) | ||
| 16 | |||
| 17 | import Control.Applicative | ||
| 18 | |||
| 19 | import Test.QuickCheck (Arbitrary(..), CoArbitrary, genericShrink) | ||
| 20 | import Test.QuickCheck.Gen (oneof, suchThat) | ||
| 21 | import Test.QuickCheck.Instances | ||
| 22 | import GHC.Generics (Generic) | ||
| 23 | |||
-- | A single lexical unit of BBCode input
data BBToken
  = BBOpen Text  -- ^ Opening tag; carries the text between @[@ and @]@
  | BBClose Text -- ^ Closing tag; carries the text between @[/@ and @]@
  | BBStr Text   -- ^ Plain text found outside of tag brackets
  deriving (Generic, Eq, Show, CoArbitrary)
| 29 | |||
-- | Neither 'arbitrary' nor 'shrink' of this instance produces:
--
--   * opening and closing tags whose 'Text' ends in @\\@
--   * empty 'BBStr's
instance Arbitrary BBToken where
  -- 'genericShrink' shrinks the wrapped 'Text' without regard for the
  -- invariants listed above, so candidates violating them are discarded.
  shrink = filter wellFormed . genericShrink
    where
      wellFormed (BBOpen  t) = not $ endsInEscape t
      wellFormed (BBClose t) = not $ endsInEscape t
      wellFormed (BBStr   t) = not $ T.null t
      endsInEscape t = not (T.null t) && T.last t == '\\'
  arbitrary = oneof [ BBOpen  <$> tagText
                    , BBClose <$> tagText
                    , BBStr   <$> nonEmpty
                    ]
    where
      -- Tag names must not end in the escape character.
      tagText = arbitrary `suchThat` (not . endsInEscape)
      -- 'T.last' is partial, hence the emptiness check first.
      endsInEscape t = not (T.null t) && T.last t == '\\'
      nonEmpty = arbitrary `suchThat` (not . T.null)
| 47 | |||
token :: Parser BBToken
-- ^ Parse exactly one 'BBToken'.
--
-- Alternatives are tried in order: a closing tag (@[/…]@), an opening tag
-- (@[…]@), and finally a run of text up to the next unescaped @[@.
token = closing <|> opening <|> content
  where
    closing = BBClose <$> bracketed "[/"
    opening = BBOpen <$> bracketed "["
    content = BBStr <$> escapedText ['[']
    -- Tag body (possibly empty) between the given opener and a closing @]@
    bracketed opener = string opener *> escapedText' [']'] <* string "]"
| 53 | |||
escapedText :: [Char] -> Parser Text
-- ^ @escapedText cs@ consumes 'Text' up to (not including) the first
-- occurrence of a character from @cs@ that is not escaped using @\\@
--
-- For non-empty @cs@ this always consumes at least one character and fails
-- otherwise. For empty @cs@ the entire remaining input is returned verbatim
-- (which may be empty) and no unescaping takes place.
--
-- @\\@ needs to be escaped (prefixed with @\\@) iff it precedes a character
-- from @cs@. An escape sequence is replaced by the character it escapes; a
-- @\\@ that does not start an escape sequence is kept literally.
escapedText [] = takeText
escapedText cs = recurse $ choice [ takeWhile1 (not . special)
                                  , escapeSeq
                                  , escapeChar'
                                  ]
  where
    escapeChar = '\\'
    -- Plain membership test: 'inClass' would interpret a @-@ inside @cs@ as
    -- range notation instead of as a literal character.
    special c = c `elem` (escapeChar : cs)
    escapeChar' = string $ T.singleton escapeChar
    escapeSeq = escapeChar' >> (T.singleton <$> satisfy special) -- s/\\[:cs]/\1/
    -- Parse one chunk with @p@, then as many further chunks as possible.
    recurse p = mappend <$> p <*> escapedText' cs
| 71 | |||
escapedText' :: [Char] -> Parser Text
-- ^ Like 'escapedText', but yields @""@ instead of failing when
-- 'escapedText' does not succeed
escapedText' cs = escapedText cs <|> pure ""
