{-# LANGUAGE Safe #-}
{-# LANGUAGE ViewPatterns #-}
{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
{-|
Module      : Text.Gigaparsec.Errors.TokenExtractors
Description : This module contains implementations of token extractors that can be used in the
              @ErrorBuilder@ to decide how to extract unexpected tokens from the residual input left
              over from a parse error.
License     : BSD-3-Clause
Maintainer  : Jamie Willis, Gigaparsec Maintainers
Stability   : stable

This module contains implementations of token extractors that can be used in the
"Text.Gigaparsec.Errors.ErrorBuilder" to decide how to extract unexpected tokens from the residual
input left over from a parse error.

These are common strategies, and something here is likely to be what is needed. They are all careful
to handle unprintable characters and whitespace in a sensible way, and account for unicode codepoints
that are wider than a single 16-bit character.

@since 0.2.5.0
-}
module Text.Gigaparsec.Errors.TokenExtractors (
    Token(..), TokenExtractor,
    tillNextWhitespace,
    singleChar,
    matchParserDemand,
    lexToken, lexTokenWithSelect
  ) where

import Text.Gigaparsec (
    Parsec, Result(Success), parse,
    atomic, lookAhead, notFollowedBy, some, (<+>), (<~>), mapMaybeS
  )
import Text.Gigaparsec.Char (item)
import Text.Gigaparsec.Combinator (option)
import Text.Gigaparsec.Position (offset)

import Data.Char (generalCategory, ord, GeneralCategory(Format, Surrogate, PrivateUse, NotAssigned, Control))
import Data.Char qualified as Char (isSpace)
import Data.List.NonEmpty (NonEmpty((:|)), nonEmpty)
import Data.List.NonEmpty qualified as NonEmpty (toList)
import Data.Void (Void)
import Data.Foldable (maximumBy)
import Numeric (showHex)
import Data.Function (on)
import Data.Maybe (catMaybes)

{-|
Type alias for token extractors, matches the shape of
'Text.Gigaparsec.Errors.ErrorBuilder.unexpectedToken'.

@since 0.2.5.0
-}
type TokenExtractor :: *
type TokenExtractor = NonEmpty Char -- ^ the remaining input, @cs@, at point of failure.
                    -> Word         -- ^ the input the parser tried to read when it failed
                                    --   (this is __not__ guaranteed to be smaller than the length of
                                    --    @cs@, but is __guaranteed to be greater than 0__).
                    -> Bool         -- ^ was this error generated as part of \"lexing\", or in a wider parser (see 'Text.Gigaparsec.Errors.Combinator.markAsToken').
                    -> Token        -- ^ a token extracted from @cs@ that will be used as part of the unexpected message.

{-|
This type represents an extracted token returned by 'Text.Gigaparsec.Errors.ErrorBuilder.unexpectedToken'
in 'Text.Gigaparsec.Errors.ErrorBuilder.ErrorBuilder'.

There is deliberately no analogue for @EndOfInput@ because we guarantee that non-empty
residual input is provided to token extraction.

@since 0.2.5.0
-}
type Token :: *
data Token = Raw                   -- ^ This is a token that is directly extracted from the residual input itself.
              !String              -- ^ the input extracted.
           | Named                 -- ^ This is a token that has been given a name, and is treated like a labelled item.
              !String              -- ^ the description of the token.
              {-# UNPACK #-} !Word -- ^ the amount of residual input this token ate.

{-# INLINABLE tillNextWhitespace #-}
{-|
This extractor provides an implementation for 'Text.Gigaparsec.ErrorBuilder.unexpectedToken':
it will construct a token that extends to the next available whitespace in the remaining input.
It can be configured to constrict this token to the minimum of the next whitespace or whatever the
parser demanded.

In the case of unprintable characters or whitespace, this extractor will favour reporting a more
meaningful name.

@since 0.2.5.0
-}
tillNextWhitespace :: Bool           -- ^ should the extractor cap the token to the amount of input the parser demanded?
                   -> (Char -> Bool) -- ^ what counts as a space character
                   -> TokenExtractor
tillNextWhitespace :: Bool -> (Char -> Bool) -> TokenExtractor
tillNextWhitespace Bool
_ Char -> Bool
_ (NonEmpty Char -> Maybe Token
whitespaceOrUnprintable -> Just Token
tok) Word
_ Bool
_ = Token
tok
tillNextWhitespace Bool
trimToDemand Char -> Bool
isSpace (Char
c :| [Char]
cs) Word
parserDemanded Bool
_
  | Bool
trimToDemand = [Char] -> Token
Raw (Int -> [Char] -> [Char]
forall a. Int -> [a] -> [a]
take (Word -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
parserDemanded) ([Char] -> [Char]
tillSpace (Char
cChar -> [Char] -> [Char]
forall a. a -> [a] -> [a]
:[Char]
cs)))
  | Bool
otherwise    = [Char] -> Token
Raw ([Char] -> [Char]
tillSpace (Char
cChar -> [Char] -> [Char]
forall a. a -> [a] -> [a]
:[Char]
cs))
  where tillSpace :: [Char] -> [Char]
tillSpace = (Char -> Bool) -> [Char] -> [Char]
forall a. (a -> Bool) -> [a] -> [a]
takeWhile (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace)

{-|
This extractor provides an implementation for 'Text.Gigaparsec.ErrorBuilder.unexpectedToken':
it will unconditionally report the first character in the remaining input as the problematic token.

In the case of unprintable characters or whitespace, this extractor will favour reporting
a more meaningful name.

@since 0.2.5.0
-}
{-# INLINABLE singleChar #-}
singleChar :: TokenExtractor
singleChar :: TokenExtractor
singleChar (NonEmpty Char -> Maybe Token
whitespaceOrUnprintable -> Just Token
tok) Word
_ Bool
_ = Token
tok
singleChar (Char
c :| [Char]
_) Word
_ Bool
_ = [Char] -> Token
Raw [Char
c]

{-|
This extractor provides an implementation for 'Text.Gigaparsec.ErrorBuilder.unexpectedToken':
it will make a token as wide as the amount of input the parser tried to consume when it failed.

In the case of unprintable characters or whitespace, this extractor will favour reporting a more
meaningful name.

@since 0.2.5.0
-}
{-# INLINABLE matchParserDemand #-}
matchParserDemand :: TokenExtractor
matchParserDemand :: TokenExtractor
matchParserDemand (NonEmpty Char -> Maybe Token
whitespaceOrUnprintable -> Just Token
tok) Word
_ Bool
_ = Token
tok
matchParserDemand (Char
c :| [Char]
cs) Word
parserDemanded Bool
_ = [Char] -> Token
Raw (Int -> [Char] -> [Char]
forall a. Int -> [a] -> [a]
take (Word -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
parserDemanded) (Char
cChar -> [Char] -> [Char]
forall a. a -> [a] -> [a]
:[Char]
cs))

{-# INLINABLE whitespaceOrUnprintable #-}
whitespaceOrUnprintable :: NonEmpty Char -> Maybe Token
whitespaceOrUnprintable :: NonEmpty Char -> Maybe Token
whitespaceOrUnprintable (Char
'\n' :| [Char]
_) = Token -> Maybe Token
forall a. a -> Maybe a
Just (Token -> Maybe Token) -> Token -> Maybe Token
forall a b. (a -> b) -> a -> b
$ [Char] -> Word -> Token
Named [Char]
"newline" Word
1
whitespaceOrUnprintable (Char
'\r' :| [Char]
_) = Token -> Maybe Token
forall a. a -> Maybe a
Just (Token -> Maybe Token) -> Token -> Maybe Token
forall a b. (a -> b) -> a -> b
$ [Char] -> Word -> Token
Named [Char]
"carriage return" Word
1
whitespaceOrUnprintable (Char
'\t' :| [Char]
_) = Token -> Maybe Token
forall a. a -> Maybe a
Just (Token -> Maybe Token) -> Token -> Maybe Token
forall a b. (a -> b) -> a -> b
$ [Char] -> Word -> Token
Named [Char]
"tab" Word
1
whitespaceOrUnprintable (Char
' ' :| [Char]
_) = Token -> Maybe Token
forall a. a -> Maybe a
Just (Token -> Maybe Token) -> Token -> Maybe Token
forall a b. (a -> b) -> a -> b
$ [Char] -> Word -> Token
Named [Char]
"space" Word
1
whitespaceOrUnprintable (Char
c :| [Char]
_)
  | Char -> Bool
Char.isSpace Char
c = Token -> Maybe Token
forall a. a -> Maybe a
Just (Token -> Maybe Token) -> Token -> Maybe Token
forall a b. (a -> b) -> a -> b
$ [Char] -> Word -> Token
Named [Char]
"whitespace character" Word
1
  | Bool
otherwise      = case Char -> GeneralCategory
generalCategory Char
c of
    GeneralCategory
Format -> Maybe Token
unprintable
    GeneralCategory
Surrogate -> Maybe Token
unprintable
    GeneralCategory
PrivateUse -> Maybe Token
unprintable
    GeneralCategory
NotAssigned -> Maybe Token
unprintable
    GeneralCategory
Control -> Maybe Token
unprintable
    GeneralCategory
_ -> Maybe Token
forall a. Maybe a
Nothing
  where unprintable :: Maybe Token
unprintable = Token -> Maybe Token
forall a. a -> Maybe a
Just (Token -> Maybe Token) -> Token -> Maybe Token
forall a b. (a -> b) -> a -> b
$ [Char] -> Word -> Token
Named ([Char]
"non-printable character (\\x" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ Int -> [Char] -> [Char]
forall a. Integral a => a -> [Char] -> [Char]
showHex (Char -> Int
ord Char
c) [Char]
")") Word
1

{-|
This extractor provides an implementation for 'Text.Gigaparsec.ErrorBuilder.unexpectedToken':
it will try and parse the residual input to identify a valid lexical token to report.

When parsing a grammar that as a dedicated lexical distinction, it is nice to be able to report
problematic tokens relevant to that grammar as opposed to generic input lifted straight from the
input stream. The easiest way of doing this would be having a pre-lexing pass and parsing based
on tokens, but this is deliberately not how Parsley is designed. Instead, this extractor can
try and parse the remaining input to try and identify a token on demand.

If the @lexicalError@ flag of the @unexpectedToken@ function is not set, which would indicate a
problem within a token reported by a classical lexer and not the parser, the extractor will
try to parse each of the provided @tokens@ in turn: whichever is the longest matched of these
tokens will be reported as the problematic one, where an earlier token arbitrates ties
('lexTokenWithSelect' can alter which is chosen). For best effect, these tokens should not consume
whitespace (which would otherwise be included at the end of the token!): this means that, if
using the @Lexer@, the functionality in __@nonlexeme@__ should be used. If one of the
givens tokens cannot be parsed, the input until the /next/ valid parsable token (or end of input)
is returned as a @Raw@.

If @lexicalError@ is true, then the given token extractor will be used instead to extract a
default token.

@since 0.2.5.0
-}
{-# INLINABLE lexToken #-}
lexToken :: [Parsec String] -- ^ The tokens that should be recognised by this extractor: each parser should return the
                            -- intended name of the token exactly as it should appear in the "Named" token.
                            --
                            -- This /should/ include a whitespace parser for "unexpected whitespace". However, with the
                            -- exception of the whitespace parser, these tokens should not consume trailing (and
                            -- certainly not leading) whitespace: if using definitions from "Text.Gigaparsec.Token.Lexer"
                            -- functionality, the @nonlexeme@ versions of the tokens should be used.
         -> TokenExtractor  -- ^ If the parser failed during the parsing of a token, this
                            -- function extracts the problematic item from the remaining input.
         -> TokenExtractor
lexToken :: [Parsec [Char]] -> TokenExtractor -> TokenExtractor
lexToken = (NonEmpty ([Char], Word) -> ([Char], Word))
-> [Parsec [Char]] -> TokenExtractor -> TokenExtractor
lexTokenWithSelect ((([Char], Word) -> ([Char], Word) -> Ordering)
-> NonEmpty ([Char], Word) -> ([Char], Word)
forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
maximumBy (Word -> Word -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Word -> Word -> Ordering)
-> (([Char], Word) -> Word)
-> ([Char], Word)
-> ([Char], Word)
-> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` ([Char], Word) -> Word
forall a b. (a, b) -> b
snd))

{-|
This extractor provides an implementation for 'Text.Gigaparsec.ErrorBuilder.unexpectedToken':
it will try and parse the residual input to identify a valid lexical token to report.

When parsing a grammar that as a dedicated lexical distinction, it is nice to be able to report
problematic tokens relevant to that grammar as opposed to generic input lifted straight from the
input stream. The easiest way of doing this would be having a pre-lexing pass and parsing based
on tokens, but this is deliberately not how Parsley is designed. Instead, this extractor can
try and parse the remaining input to try and identify a token on demand.

If the @lexicalError@ flag of the @unexpectedToken@ function is not set, which would indicate a
problem within a token reported by a classical lexer and not the parser, the extractor will
try to parse each of the provided @tokens@ in turn: the given function is used to select which
is returned.
For best effect, these tokens should not consume
whitespace (which would otherwise be included at the end of the token!): this means that, if
using the @Lexer@, the functionality in __@nonlexeme@__ should be used. If one of the
givens tokens cannot be parsed, the input until the /next/ valid parsable token (or end of input)
is returned as a @Raw@.

If @lexicalError@ is true, then the given token extractor will be used instead to extract a
default token.

@since 0.2.5.0
-}
{-# INLINABLE lexTokenWithSelect #-}
lexTokenWithSelect :: (NonEmpty (String, Word) -> (String, Word))
                   -- ^ If the extractor is successful in identifying tokens that can be parsed from
                   -- the residual input, this function will select /one/ of them to report back.
                   -> [Parsec String]
                   -- ^ The tokens that should be recognised by this extractor: each parser should return the
                   -- intended name of the token exactly as it should appear in the "Named" token.
                   --
                   -- This /should/ include a whitespace parser for "unexpected whitespace". However, with the
                   -- exception of the whitespace parser, these tokens should not consume trailing (and
                   -- certainly not leading) whitespace: if using definitions from "Text.Gigaparsec.Token.Lexer"
                   -- functionality, the @nonlexeme@ versions of the tokens should be used.
                   -> TokenExtractor
                   -- ^ If the parser failed during the parsing of a token, this
                   -- function extracts the problematic item from the remaining input.
                   -> TokenExtractor
lexTokenWithSelect :: (NonEmpty ([Char], Word) -> ([Char], Word))
-> [Parsec [Char]] -> TokenExtractor -> TokenExtractor
lexTokenWithSelect NonEmpty ([Char], Word) -> ([Char], Word)
selectToken [Parsec [Char]]
tokens TokenExtractor
extractItem NonEmpty Char
cs Word
parserDemanded Bool
lexical
  | Bool
lexical = TokenExtractor
extractItem NonEmpty Char
cs Word
parserDemanded Bool
True
  | Bool
otherwise = NonEmpty Char -> Token
extractToken NonEmpty Char
cs
  where
    extractToken :: NonEmpty Char -> Token
    extractToken :: NonEmpty Char -> Token
extractToken NonEmpty Char
inp =
      let Success Either (NonEmpty ([Char], Word)) [Char]
rawOrToks = forall err a.
ErrorBuilder err =>
Parsec a -> [Char] -> Result err a
parse @Void Parsec (Either (NonEmpty ([Char], Word)) [Char])
parser (NonEmpty Char -> [Char]
forall a. NonEmpty a -> [a]
NonEmpty.toList NonEmpty Char
inp)
      in (NonEmpty ([Char], Word) -> Token)
-> ([Char] -> Token)
-> Either (NonEmpty ([Char], Word)) [Char]
-> Token
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (([Char] -> Word -> Token) -> ([Char], Word) -> Token
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry [Char] -> Word -> Token
Named (([Char], Word) -> Token)
-> (NonEmpty ([Char], Word) -> ([Char], Word))
-> NonEmpty ([Char], Word)
-> Token
forall b c a. (b -> c) -> (a -> b) -> a -> c
. NonEmpty ([Char], Word) -> ([Char], Word)
selectToken) [Char] -> Token
Raw Either (NonEmpty ([Char], Word)) [Char]
rawOrToks

    parser :: Parsec (Either (NonEmpty (String, Word)) String)
    parser :: Parsec (Either (NonEmpty ([Char], Word)) [Char])
parser =
      let toks :: Parsec (NonEmpty ([Char], Word))
toks = ([Maybe ([Char], Word)] -> Maybe (NonEmpty ([Char], Word)))
-> Parsec [Maybe ([Char], Word)]
-> Parsec (NonEmpty ([Char], Word))
forall a b. (a -> Maybe b) -> Parsec a -> Parsec b
mapMaybeS ([([Char], Word)] -> Maybe (NonEmpty ([Char], Word))
forall a. [a] -> Maybe (NonEmpty a)
nonEmpty ([([Char], Word)] -> Maybe (NonEmpty ([Char], Word)))
-> ([Maybe ([Char], Word)] -> [([Char], Word)])
-> [Maybe ([Char], Word)]
-> Maybe (NonEmpty ([Char], Word))
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Maybe ([Char], Word)] -> [([Char], Word)]
forall a. [Maybe a] -> [a]
catMaybes)
                           ((Parsec [Char] -> Parsec (Maybe ([Char], Word)))
-> [Parsec [Char]] -> Parsec [Maybe ([Char], Word)]
forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> [a] -> f [b]
traverse (\Parsec [Char]
t -> Parsec ([Char], Word) -> Parsec (Maybe ([Char], Word))
forall a. Parsec a -> Parsec (Maybe a)
option (Parsec ([Char], Word) -> Parsec ([Char], Word)
forall a. Parsec a -> Parsec a
lookAhead (Parsec [Char] -> Parsec [Char]
forall a. Parsec a -> Parsec a
atomic Parsec [Char]
t Parsec [Char] -> Parsec Word -> Parsec ([Char], Word)
forall a b. Parsec a -> Parsec b -> Parsec (a, b)
<~> Parsec Word
offset))) [Parsec [Char]]
tokens)
          rawTok :: Parsec [Char]
rawTok = Parsec Char -> Parsec [Char]
forall a. Parsec a -> Parsec [a]
some ((Parsec [Char] -> Parsec ()) -> [Parsec [Char]] -> Parsec [()]
forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> [a] -> f [b]
traverse Parsec [Char] -> Parsec ()
forall a. Parsec a -> Parsec ()
notFollowedBy [Parsec [Char]]
tokens Parsec [()] -> Parsec Char -> Parsec Char
forall a b. Parsec a -> Parsec b -> Parsec b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parsec Char
item)
      in Parsec (NonEmpty ([Char], Word))
toks Parsec (NonEmpty ([Char], Word))
-> Parsec [Char]
-> Parsec (Either (NonEmpty ([Char], Word)) [Char])
forall a b. Parsec a -> Parsec b -> Parsec (Either a b)
<+> Parsec [Char]
rawTok