From a9b3b2b2902b0a79437f87cda8cd18f426359621 Mon Sep 17 00:00:00 2001 From: Gregor Kleen Date: Mon, 11 Jan 2016 04:58:05 +0000 Subject: thermoprint-2 --- provider/posts/thermoprint-2.lhs | 262 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 provider/posts/thermoprint-2.lhs (limited to 'provider/posts') diff --git a/provider/posts/thermoprint-2.lhs b/provider/posts/thermoprint-2.lhs new file mode 100644 index 0000000..056f1ef --- /dev/null +++ b/provider/posts/thermoprint-2.lhs @@ -0,0 +1,262 @@ +--- +title: On the design of a structured document format compatible with character oriented printers +published: 2016-01-11 +tags: Thermoprint +--- + +This post is an annotated version of the file `spec/src/Thermoprint/Printout.hs` as of commit [f6dc3d1](git://git.yggdrasil.li/thermoprint#f6dc3d1). + +> {-# LANGUAGE DeriveGeneric, DeriveAnyClass #-} +> {-# LANGUAGE OverloadedStrings #-} +> {-# OPTIONS_HADDOCK show-extensions #-} + +Motivation +---------- + +We want our codebase to be compatible with as many different models of printers as we are willing to implement. +It is therefore desirable to maintain a structured document format which we can transform into a printer-specific representation of the payload to be printed with minimal effort. + +In this post we present one such format. + +Contents +-------- + +> -- | This module contains the definition of the structure -- 'Printout' -- used to represent the content of a job +> module Thermoprint.Printout +> ( Printout(..) +> , Paragraph(..) +> , Chunk(..) +> , Block(..) +> , Line( HSpace +> , SpaceSep +> ) +> , text, cotext +> , prop_text +> ) where + +Preliminaries +------------- + +> import Data.Sequence (Seq, (|>), (<|)) + +A Sequence represents the same structure as the linked lists common in haskell but supports $O(1)$ `snoc`, which is desirable since we intend to iteratively build up the structure when parsing input formats. + +> import Data.Text.Lazy (Text) +> +> import Data.ByteString.Lazy (ByteString) + +The entire structure will be lazy by default but an instance of `NFData`, thus the lazy variants of `Text` and `ByteString`. + +> import GHC.Generics (Generic) + +We will use derived instances of `Generic` to get handed suitable instances of rather complicated classes such as `Arbitrary` and `FromJSON` + +> import Control.DeepSeq (NFData) + +Instances of `NFData` allow us to strictly evaluate our document structure when needed + +> import Data.Aeson (FromJSON(..), ToJSON(..), Value(..)) +> import qualified Data.Aeson as JSON (encode, decode) +> import Data.Aeson.Types (typeMismatch) + +We will encode the document as a [json](https://en.wikipedia.org/wiki/JSON) object during transport + +> import Test.QuickCheck.Arbitrary (Arbitrary(..), CoArbitrary, genericShrink) +> import Test.QuickCheck.Modifiers (NonNegative(..)) +> import Test.QuickCheck.Gen (oneof, suchThat, scale) +> import Test.QuickCheck.Instances +> import Test.QuickCheck (forAll, Property) + +We will use [QuickCheck](https://hackage.haskell.org/package/QuickCheck) for automatic test generation. + +> import qualified Data.Text.Lazy as TL (split, null, pack, filter, intercalate, map) +> import qualified Data.Text as T (pack) +> import Data.Char (isSpace) +> +> import Data.Monoid (Monoid(..), (<>)) +> +> import Data.List (dropWhile, dropWhileEnd, groupBy, genericLength, genericReplicate) +> +> import Data.Sequence as Seq (fromList, null, singleton) +> +> import Data.Function (on) +> +> import Data.Foldable (toList, fold) + +We will need to do some parsing and pretty-printing to implement `text` and `cotext`, respectively. + +> import Data.Encoding (encodeLazyByteStringExplicit, decodeLazyByteString) +> import Data.Encoding.UTF8 +> import qualified Data.ByteString.Base64.Lazy as Base64 (encode, decode) + +Since we want end users to be able to include direct instructions the printer in the form of a lazy [`ByteString`](https://hackage.haskell.org/package/bytestring) we need some way to encode `ByteString`s in JSON. +We chose [base64](https://hackage.haskell.org/package/base64-bytestring). + +> import Prelude hiding (fold) +> +> +> -- | A 'Printout' is a sequence of visually seperated 'Paragraph's +> type Printout = Seq Paragraph + +"visually seperated" will most likely end up meaning "seperated by a single blank line" + +> -- | A 'Paragraph' is a non-seperated sequence of 'Chunk's +> type Paragraph = Seq Chunk +> +> -- | We introduce both 'Chunk' and 'Paragraph' mainly to allow 'Raw'. +> -- +> -- Were we to disallow 'Raw', 'Block' would be identical to 'Paragraph' +> data Chunk = Cooked Block -- ^ text semantically structured to be rendered in accordance with the display format of printer +> | Raw ByteString -- ^ direct instructions to the printer +> deriving (Generic, NFData, Show, CoArbitrary) +> +> instance FromJSON Chunk where +> parseJSON s@(String _) = Raw <$> ((either fail return . decodeBase64) =<< parseJSON s) +> where +> decodeBase64 :: String -> Either String ByteString +> decodeBase64 s = (either (Left . show) Right . encodeLazyByteStringExplicit UTF8Strict $ s) >>= Base64.decode +> parseJSON o@(Object _) = Cooked <$> parseJSON o +> parseJSON v = typeMismatch "Chunk" v +> +> instance ToJSON Chunk where +> toJSON (Raw bs) = String . T.pack . decodeLazyByteString UTF8Strict . Base64.encode $ bs +> toJSON (Cooked block) = toJSON block + +We provide custom instances of `FromJSON Chunk` and `ToJSON Chunk` so that we might reduce the sice of the resulting JSON somewhat (this is an opportune target since disambiguaty is simple) + +> -- | 'Block' is the entry point for our structured document format +> data Block = Line Line -- ^ a single 'Line' of text +> | VSpace Integer -- ^ vertical space of height equivalent to 'Integer' lines +> | NewlSep (Seq Block) -- ^ A sequence of 'Block's seperated by newlines +> deriving (Generic, NFData, Show, CoArbitrary, FromJSON, ToJSON) +> +> {- | A 'Line' is one of: +> +> * a single word +> * horizontal space equivalent to the width of 'Integer' `em`. +> * a sequence of words seperated by spaces +> +> We don't export all constructors and instead encourage the use of 'text'. +> -} +> data Line = Word Text +> | HSpace Integer +> | SpaceSep (Seq Line) +> deriving (Generic, NFData, Show, CoArbitrary, FromJSON, ToJSON) +> +> instance Monoid Block where +> mempty = NewlSep mempty +> x@(NewlSep xs) `mappend` y@(NewlSep ys) +> | Seq.null xs = y +> | Seq.null ys = x +> | otherwise = NewlSep (xs <> ys) +> (NewlSep xs) `mappend` y +> | Seq.null xs = y +> | otherwise = NewlSep (xs |> y) +> x `mappend` (NewlSep ys) +> | Seq.null ys = x +> | otherwise = NewlSep (x <| ys) +> x `mappend` y = NewlSep $ Seq.fromList [x, y] +> +> instance Monoid Line where +> mempty = SpaceSep mempty +> x@(SpaceSep xs) `mappend` y@(SpaceSep ys) +> | Seq.null xs = y +> | Seq.null ys = x +> | otherwise = SpaceSep (xs <> ys) +> (SpaceSep xs) `mappend` y +> | Seq.null xs = y +> | otherwise = SpaceSep (xs |> y) +> x `mappend` (SpaceSep ys) +> | Seq.null ys = x +> | otherwise = SpaceSep (x <| ys) +> x `mappend` y = SpaceSep $ Seq.fromList [x, y] + +The Monoid instances for `Block` and `Line` are somewhat unwieldy since we want to guarantee minimum overhead by reducing expressions such as `SpaceSep (fromList [x])` to `x` whenever possible. + +The same effect would have been possible by introducing the monoid structure *one level higher* -- we could have introduced constructors such as `Line :: Seq Word -> Block`. +This was deemed undesirable since we would not have been able to implement instances such as `Monoid Line` which allow the use of more generic functions during parsing. + +> text :: Text -> Either Block Line +> -- ^ Smart constructor for 'Line'/'Block' which maps word and line boundaries (as determined by 'isSpace' and '(== '\n')' respectively) to the structure of 'Block' and 'Line'. +> -- +> -- Since we are unwilling to duplicate the list of chars from 'isSpace' we cannot reasonably determine a width for the various whitespace 'Char's. +> -- Thus they are all weighted equally as having width 1 `em`. +> text t = case splitLines t of +> [] -> Right mempty +> [Line x] -> Right x +> xs -> Left $ mconcat xs +> where +> splitLines :: Text -> [Block] +> splitLines t = map toBlock +> . groupBy ((==) `on` TL.null) +> $ TL.split (== '\n') t +> splitWords :: Text -> [Line] +> splitWords t = map toLine +> . groupBy ((==) `on` TL.null) +> $ TL.split isSpace t +> toBlock [] = mempty +> toBlock xs@(x:_) +> | TL.null x = VSpace $ genericLength xs - 1 +> | otherwise = mconcat . map (Line . mconcat . splitWords) $ xs +> toLine [] = mempty +> toLine xs@(x:_) +> | TL.null x = HSpace $ genericLength xs - 1 +> | otherwise = mconcat . map Word $ xs +> list :: b -> (a -> [a] -> b) -> [a] -> b +> list c _ [] = c +> list _ f (x:xs) = f x xs + +Implementations using `TL.lines` and `TL.words` were tested. +We chose to use `TL.split`-based solutions instead because the more specific splitting functions provided by [text](https://hackage.haskell.org/package/text) drop information concerning the exact amount of whitespace. + +> cotext :: Block -> Text +> -- ^ inverse of +> -- @ +> -- either id Line . `text` +> -- @ +> cotext (VSpace n) = TL.pack . genericReplicate n $ '\n' +> cotext (NewlSep xs) = TL.intercalate "\n" . map cotext . toList $ xs +> cotext (Line x) = cotext' x +> where +> cotext' (Word x) = x +> cotext' (HSpace n) = TL.pack . genericReplicate n $ ' ' +> cotext' (SpaceSep xs) = TL.intercalate " " . map cotext' . toList $ xs + +We provide cotext for testing `text` and to enable determining semantic equality of `Printout`s at a later date + +> prop_text :: Text -> Bool +> -- ^ prop> (`cotext` . either id Line . `text` $ x) == x +> -- +> -- Where 'x' is restricted to those `TL.Text` which do not contain whitespace besides ' ' and '\n'. +> prop_text x = (cotext . either id Line . text $ x') == x' +> where +> x' = TL.map normSpace x +> normSpace c +> | isSpace c +> , c `elem` keep = c +> | isSpace c = ' ' -- We have to do this because all whitespace gets interpreted as width 1 +> | otherwise = c +> keep = [' ', '\n'] +> +> -- | We don't test 'Raw' 'Chunk's +> instance Arbitrary Chunk where +> shrink = genericShrink +> arbitrary = Cooked <$> arbitrary +> +> instance Arbitrary Block where +> shrink = genericShrink +> arbitrary = oneof [ Line <$> arbitrary +> , VSpace . getNonNegative <$> arbitrary +> , NewlSep <$> scale' arbitrary +> ] +> +> instance Arbitrary Line where +> shrink = genericShrink +> arbitrary = oneof [ Word . TL.filter (not . isSpace) <$> arbitrary -- 'isSpace '\n' == True' +> , HSpace . getNonNegative <$> arbitrary +> , SpaceSep <$> scale' arbitrary +> ] +> +> scale' = scale (round . sqrt . fromInteger . toInteger) + +Failing to properly scale the tested structures was shown to use more than 8GiB of RAM during testing -- cgit v1.2.3