From c9554b25d4ad99ceec1bef7bd60b1df82ef5ce8a Mon Sep 17 00:00:00 2001 From: Gregor Kleen Date: Tue, 12 Jan 2016 00:02:10 +0000 Subject: BBCode lexer --- bbcode/src/Text/BBCode.hs | 10 +++++ bbcode/src/Text/BBCode/Lexer.hs | 74 ++++++++++++++++++++++++++++++++++++ bbcode/test/Spec.hs | 1 + bbcode/test/Text/BBCode/LexerSpec.hs | 68 +++++++++++++++++++++++++++++++++ bbcode/thermoprint-bbcode.cabal | 21 +++++++++- bbcode/thermoprint-bbcode.nix | 15 ++++++++ 6 files changed, 187 insertions(+), 2 deletions(-) create mode 100644 bbcode/src/Text/BBCode/Lexer.hs create mode 100644 bbcode/test/Spec.hs create mode 100644 bbcode/test/Text/BBCode/LexerSpec.hs create mode 100644 bbcode/thermoprint-bbcode.nix (limited to 'bbcode') diff --git a/bbcode/src/Text/BBCode.hs b/bbcode/src/Text/BBCode.hs index 455decb..7a328a8 100644 --- a/bbcode/src/Text/BBCode.hs +++ b/bbcode/src/Text/BBCode.hs @@ -1,4 +1,14 @@ +{-# LANGUAGE OverloadedStrings #-} + module Text.BBCode ( ) where +import Data.Attoparsec.Text + +import Data.Text (Text) +import qualified Data.Text as T (singleton, head, tail) + +import Control.Applicative + +import Text.BBCode.Lexer (BBToken(..), token) diff --git a/bbcode/src/Text/BBCode/Lexer.hs b/bbcode/src/Text/BBCode/Lexer.hs new file mode 100644 index 0000000..d2aa2bc --- /dev/null +++ b/bbcode/src/Text/BBCode/Lexer.hs @@ -0,0 +1,74 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE DeriveGeneric, DeriveAnyClass #-} + +-- | A parser to transform 'Text' into a stream of 'BBToken's +module Text.BBCode.Lexer + ( BBToken(..) + , token + , escapedText + , escapedText' + ) where + +import Data.Attoparsec.Text + +import Data.Text (Text) +import qualified Data.Text as T (singleton, head, last, tail, null) + +import Control.Applicative + +import Test.QuickCheck (Arbitrary(..), CoArbitrary, genericShrink) +import Test.QuickCheck.Gen (oneof, suchThat) +import Test.QuickCheck.Instances +import GHC.Generics (Generic) + +-- | Our lexicographical unit +data BBToken = BBOpen Text -- ^ Tag open + | BBClose Text -- ^ Tag close + | BBStr Text -- ^ Content of a tag + deriving (Generic, Eq, Show, CoArbitrary) + +-- | This instance does not produce: +-- +-- * opening and closing tags whose 'Text' ends in @\\@ +-- * empty 'BBStr's +instance Arbitrary BBToken where + shrink = genericShrink + arbitrary = oneof [ BBOpen <$> tagText + , BBClose <$> tagText + , BBStr <$> nonEmpty + ] + where + tagText = arbitrary `suchThat` (not . lastIsEscape) + lastIsEscape t + | T.null t = False + | T.last t == '\\' = True + | otherwise = False + nonEmpty = (arbitrary `suchThat` (not . T.null)) + +token :: Parser BBToken +-- ^ Tokenizer +token = BBClose <$> (string "[/" *> escapedText' [']'] <* string "]") + <|> BBOpen <$> (string "[" *> escapedText' [']'] <* string "]") + <|> BBStr <$> escapedText ['['] + +escapedText :: [Char] -> Parser Text +-- ^ @escapedText cs@ consumes 'Text' up to (not including) the first occurence of a character from @cs@ that is not escaped using @\\@ +-- +-- Always consumes at least one character +-- +-- @\\@ needs to be escaped (prefixed with @\\@) iff it precedes a character from @cs@ +escapedText [] = takeText +escapedText cs = recurse $ choice [ takeWhile1 (not . special) + , escapeSeq + , escapeChar' + ] + where + escapeChar = '\\' + special = inClass $ escapeChar : cs + escapeChar' = string $ T.singleton escapeChar + escapeSeq = escapeChar' >> (T.singleton <$> satisfy special) -- s/\\[:cs]/\1/ + recurse p = mappend <$> p <*> escapedText' cs + +escapedText' :: [Char] -> Parser Text +-- ^ @'option' "" $ 'escapedText' cs@ +escapedText' cs = option "" $ escapedText cs diff --git a/bbcode/test/Spec.hs b/bbcode/test/Spec.hs new file mode 100644 index 0000000..a824f8c --- /dev/null +++ b/bbcode/test/Spec.hs @@ -0,0 +1 @@ +{-# OPTIONS_GHC -F -pgmF hspec-discover #-} diff --git a/bbcode/test/Text/BBCode/LexerSpec.hs b/bbcode/test/Text/BBCode/LexerSpec.hs new file mode 100644 index 0000000..8b95874 --- /dev/null +++ b/bbcode/test/Text/BBCode/LexerSpec.hs @@ -0,0 +1,68 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE FlexibleInstances #-} + +module Text.BBCode.LexerSpec (spec) where + +import Test.Hspec +import Test.Hspec.QuickCheck (prop) +import Test.QuickCheck (Property, Discard(..), property) +import Text.BBCode.Lexer + +import Data.Text (Text) +import qualified Data.Text as T (singleton, replace, last, null) + +import Data.Monoid ((<>), mconcat, Endo(..)) + +import Data.Attoparsec.Text (parseOnly, endOfInput) + +import Control.Applicative +import Control.Monad (zipWithM_) + +coToken :: BBToken -> Text +-- ^ Inverse of `token` +coToken (BBOpen t) = "[" <> escape [']'] t <> "]" +coToken (BBClose t) = "[/" <> escape [']'] t <> "]" +coToken (BBStr t) = escape ['['] t + +escape :: [Char] -> Text -> Text +-- ^ Inverse of `escapedText` +escape xs = endos [ T.replace c' ("\\" <> c') . T.replace ("\\" <> c') ("\\\\" <> c') | c <- xs, let c' = T.singleton c ] . escapeLast + where + endos = appEndo . mconcat . map Endo + escapeLast t + | T.null t = "" + | T.last t == '\\' = t <> "\\" + | otherwise = t + +tokenize :: Text -> Either String [BBToken] +-- ^ Run 'many token' against input +tokenize = parseOnly (many token <* endOfInput) + +spec :: Spec +spec = do + prop "prop_token" prop_token + zipWithM_ example [1..] examples + where + example n (s, ts) = let str = "Example " <> show n + in specify str $ (tokenize s == Right ts) + +prop_token :: Text -> Property +-- ^ prop> (mconcat . map coToken) <$> tokenize x == Right x +-- +-- Where 'x' is restricted such that `tokenize` succeeds +prop_token x = discardLeft (((== x) . mconcat . map coToken) <$> tokenize x) + where + discardLeft = either (const $ property Discard) property + +examples :: [(Text, [BBToken])] +examples = [ ("[t]test[/t]" + , [BBOpen "t", BBStr "test", BBClose "t"]) + , ("[t]te\\st[/t]" + , [BBOpen "t", BBStr "te\\st", BBClose "t"]) + , ("[t]te\\[st[/t]" + , [BBOpen "t", BBStr "te[st", BBClose "t"]) + , ("[\\t]test[/t]" + , [BBOpen "\\t", BBStr "test", BBClose "t"]) + , ("[t]test[/t\\]]" + , [BBOpen "t", BBStr "test", BBClose "t]"]) + ] diff --git a/bbcode/thermoprint-bbcode.cabal b/bbcode/thermoprint-bbcode.cabal index 537cfe7..ea00969 100644 --- a/bbcode/thermoprint-bbcode.cabal +++ b/bbcode/thermoprint-bbcode.cabal @@ -17,9 +17,26 @@ build-type: Simple cabal-version: >=1.10 library - -- exposed-modules: + exposed-modules: Text.BBCode + , Text.BBCode.Lexer -- other-modules: -- other-extensions: build-depends: base >=4.8 && <4.9 + , attoparsec >=0.13.0 && <1 + , text >=1.2.1 && <2 + , QuickCheck >=2.8.1 && <3 + , quickcheck-instances >=0.3.11 && <1 hs-source-dirs: src - default-language: Haskell2010 \ No newline at end of file + default-language: Haskell2010 + +Test-Suite tests + type: exitcode-stdio-1.0 + hs-source-dirs: test + main-is: Spec.hs + ghc-options: -threaded -with-rtsopts=-N + build-depends: base >=4.8.1 && <5 + , thermoprint-bbcode -any + , hspec >=2.2.1 && <3 + , QuickCheck >=2.8.1 && <3 + , attoparsec >=0.13.0 && <1 + , text >=1.2.1 && <2 \ No newline at end of file diff --git a/bbcode/thermoprint-bbcode.nix b/bbcode/thermoprint-bbcode.nix new file mode 100644 index 0000000..5521b10 --- /dev/null +++ b/bbcode/thermoprint-bbcode.nix @@ -0,0 +1,15 @@ +{ mkDerivation, attoparsec, base, hspec, QuickCheck +, quickcheck-instances, stdenv, text +}: +mkDerivation { + pname = "thermoprint-bbcode"; + version = "0.0.0"; + src = ./.; + libraryHaskellDepends = [ + attoparsec base QuickCheck quickcheck-instances text + ]; + testHaskellDepends = [ attoparsec base hspec QuickCheck text ]; + homepage = "http://dirty-haskell.org/tags/thermoprint.html"; + description = "A parser for a subset of bbcode compatible with thermoprint-spec"; + license = stdenv.lib.licenses.publicDomain; +} -- cgit v1.2.3