From c9554b25d4ad99ceec1bef7bd60b1df82ef5ce8a Mon Sep 17 00:00:00 2001 From: Gregor Kleen Date: Tue, 12 Jan 2016 00:02:10 +0000 Subject: BBCode lexer --- bbcode/src/Text/BBCode.hs | 10 ++++++ bbcode/src/Text/BBCode/Lexer.hs | 74 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 bbcode/src/Text/BBCode/Lexer.hs (limited to 'bbcode/src') diff --git a/bbcode/src/Text/BBCode.hs b/bbcode/src/Text/BBCode.hs index 455decb..7a328a8 100644 --- a/bbcode/src/Text/BBCode.hs +++ b/bbcode/src/Text/BBCode.hs @@ -1,4 +1,14 @@ +{-# LANGUAGE OverloadedStrings #-} + module Text.BBCode ( ) where +import Data.Attoparsec.Text + +import Data.Text (Text) +import qualified Data.Text as T (singleton, head, tail) + +import Control.Applicative + +import Text.BBCode.Lexer (BBToken(..), token) diff --git a/bbcode/src/Text/BBCode/Lexer.hs b/bbcode/src/Text/BBCode/Lexer.hs new file mode 100644 index 0000000..d2aa2bc --- /dev/null +++ b/bbcode/src/Text/BBCode/Lexer.hs @@ -0,0 +1,74 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE DeriveGeneric, DeriveAnyClass #-} + +-- | A parser to transform 'Text' into a stream of 'BBToken's +module Text.BBCode.Lexer + ( BBToken(..) + , token + , escapedText + , escapedText' + ) where + +import Data.Attoparsec.Text + +import Data.Text (Text) +import qualified Data.Text as T (singleton, head, last, tail, null) + +import Control.Applicative + +import Test.QuickCheck (Arbitrary(..), CoArbitrary, genericShrink) +import Test.QuickCheck.Gen (oneof, suchThat) +import Test.QuickCheck.Instances +import GHC.Generics (Generic) + +-- | Our lexicographical unit +data BBToken = BBOpen Text -- ^ Tag open + | BBClose Text -- ^ Tag close + | BBStr Text -- ^ Content of a tag + deriving (Generic, Eq, Show, CoArbitrary) + +-- | This instance does not produce: +-- +-- * opening and closing tags whose 'Text' ends in @\\@ +-- * empty 'BBStr's +instance Arbitrary BBToken where + shrink = genericShrink + arbitrary = oneof [ BBOpen <$> tagText + , BBClose <$> tagText + , BBStr <$> nonEmpty + ] + where + tagText = arbitrary `suchThat` (not . lastIsEscape) + lastIsEscape t + | T.null t = False + | T.last t == '\\' = True + | otherwise = False + nonEmpty = (arbitrary `suchThat` (not . T.null)) + +token :: Parser BBToken +-- ^ Tokenizer +token = BBClose <$> (string "[/" *> escapedText' [']'] <* string "]") + <|> BBOpen <$> (string "[" *> escapedText' [']'] <* string "]") + <|> BBStr <$> escapedText ['['] + +escapedText :: [Char] -> Parser Text +-- ^ @escapedText cs@ consumes 'Text' up to (not including) the first occurence of a character from @cs@ that is not escaped using @\\@ +-- +-- Always consumes at least one character +-- +-- @\\@ needs to be escaped (prefixed with @\\@) iff it precedes a character from @cs@ +escapedText [] = takeText +escapedText cs = recurse $ choice [ takeWhile1 (not . special) + , escapeSeq + , escapeChar' + ] + where + escapeChar = '\\' + special = inClass $ escapeChar : cs + escapeChar' = string $ T.singleton escapeChar + escapeSeq = escapeChar' >> (T.singleton <$> satisfy special) -- s/\\[:cs]/\1/ + recurse p = mappend <$> p <*> escapedText' cs + +escapedText' :: [Char] -> Parser Text +-- ^ @'option' "" $ 'escapedText' cs@ +escapedText' cs = option "" $ escapedText cs -- cgit v1.2.3