{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE PatternSynonyms #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE ViewPatterns #-}

module GHC.Parser.String (
  StringLexError (..),
  lexString,
  lexMultilineString,

  -- * Unicode smart quote helpers
  isDoubleSmartQuote,
  isSingleSmartQuote,
) where

import GHC.Prelude hiding (getChar)

import Control.Arrow ((>>>))
import Control.Monad (when)
import Data.Char (chr, ord)
import qualified Data.Foldable1 as Foldable1
import qualified Data.List.NonEmpty as NonEmpty
import Data.Maybe (listToMaybe, mapMaybe)
import GHC.Data.StringBuffer (StringBuffer)
import qualified GHC.Data.StringBuffer as StringBuffer
import GHC.Parser.CharClass (
  hexDigit,
  is_decdigit,
  is_hexdigit,
  is_octdigit,
  is_space,
  octDecDigit,
 )
import GHC.Parser.Errors.Types (LexErr (..))
import GHC.Utils.Panic (panic)

type BufPos = Int

data StringLexError = StringLexError LexErr BufPos

lexString :: Int -> StringBuffer -> Either StringLexError String
lexString = lexStringWith processChars processChars
  where
    processChars :: HasChar c => [c] -> Either (c, LexErr) [c]
    processChars = collapseGaps >>> resolveEscapes

-- -----------------------------------------------------------------------------
-- Lexing interface

{- Note [Lexing strings]
~~~~~~~~~~~~~~~~~~~~~

After verifying that a string is lexically valid with Alex, we still need to do
some post-processing of the string, namely:
1. Collapse string gaps
2. Resolve escape characters

The problem: 'lexemeToString' is more performant than manually reading
characters from the StringBuffer. However, that completely erases the position
of each character, which we need in order to report the correct position for
error messages (e.g. when resolving escape characters).

So we do two passes. The first pass is optimistic; just convert to a plain
String and process it. If this results in an error, we do a second pass, this
time where each character is annotated with its position. Now, the error has
all the information it needs.

Ideally, lexStringWith would take a single (forall c. HasChar c => ...)
function, but to help the specializer, we pass it in twice to concretize it
for the two types we actually use.
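As a small illustration (not an exhaustive description of either step), the
lexically-valid string body

  it\
     \s \x41

is processed to "its A": the backslash-delimited whitespace is a string gap
and is collapsed (step 1), and the numeric escape \x41 then resolves to 'A'
(step 2). Only if processing fails do we pay for the second,
position-annotated pass.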
-}

-- | See Note [Lexing strings]
lexStringWith ::
  ([Char] -> Either (Char, LexErr) [Char]) ->
  ([CharPos] -> Either (CharPos, LexErr) [CharPos]) ->
  Int ->
  StringBuffer ->
  Either StringLexError String
lexStringWith processChars processCharsPos len buf =
  case processChars $ bufferChars buf len of
    Right s -> Right s
    Left _ ->
      case processCharsPos $ bufferLocatedChars buf len of
        Right _ -> panic "expected lex error on second pass"
        Left ((_, pos), e) -> Left $ StringLexError e pos

class HasChar c where
  getChar :: c -> Char
  setChar :: Char -> c -> c

instance HasChar Char where
  getChar = id
  setChar = const

instance HasChar (Char, x) where
  getChar = fst
  setChar c (_, x) = (c, x)

pattern Char :: HasChar c => Char -> c
pattern Char c <- (getChar -> c)
{-# COMPLETE Char #-}

bufferChars :: StringBuffer -> Int -> [Char]
bufferChars = StringBuffer.lexemeToString

type CharPos = (Char, BufPos)

bufferLocatedChars :: StringBuffer -> Int -> [CharPos]
bufferLocatedChars initialBuf len = go initialBuf
  where
    go buf
      | atEnd buf = []
      | otherwise =
          let (c, buf') = StringBuffer.nextChar buf
           in (c, StringBuffer.cur buf) : go buf'

    atEnd buf = StringBuffer.byteDiff initialBuf buf >= len

-- -----------------------------------------------------------------------------
-- Lexing phases

collapseGaps :: HasChar c => [c] -> [c]
collapseGaps = go
  where
    go = \case
      c1@(Char '\\') : c2@(Char c) : cs
        | is_space c -> go $ dropGap cs
        | otherwise -> c1 : c2 : go cs
      c : cs -> c : go cs
      [] -> []

    dropGap = \case
      Char '\\' : cs -> cs
      _ : cs -> dropGap cs
      [] -> panic "gap unexpectedly ended"

resolveEscapes :: HasChar c => [c] -> Either (c, LexErr) [c]
resolveEscapes = go dlistEmpty
  where
    go !acc = \case
      [] -> pure $ dlistToList acc
      Char '\\' : Char '&' : cs -> go acc cs
      backslash@(Char '\\') : cs ->
        case resolveEscapeChar cs of
          Right (esc, cs') -> go (acc `dlistSnoc` setChar esc backslash) cs'
          Left (c, e) -> Left (c, e)
      c : cs -> go (acc `dlistSnoc` c) cs
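-- An illustrative sketch of the behavior above (not a doctest): applied to the
-- characters of the literal body 8\x41\&9 (i.e. the text between the quotes),
-- resolveEscapes yields the characters of "8A9". The resolved character is
-- written over the backslash's slot via setChar, so a position-annotated
-- character keeps the backslash's position, and the empty escape \& merely
-- separates the numeric escape from the following digit.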
-- -----------------------------------------------------------------------------
-- Escape characters

-- | Resolve an escape character, after having just lexed a backslash.
-- Assumes the escape character is valid.
resolveEscapeChar :: HasChar c => [c] -> Either (c, LexErr) (Char, [c])
resolveEscapeChar = \case
  Char 'a' : cs -> pure ('\a', cs)
  Char 'b' : cs -> pure ('\b', cs)
  Char 'f' : cs -> pure ('\f', cs)
  Char 'n' : cs -> pure ('\n', cs)
  Char 'r' : cs -> pure ('\r', cs)
  Char 't' : cs -> pure ('\t', cs)
  Char 'v' : cs -> pure ('\v', cs)
  Char '\\' : cs -> pure ('\\', cs)
  Char '"' : cs -> pure ('\"', cs)
  Char '\'' : cs -> pure ('\'', cs)
  -- escape codes
  Char 'x' : cs -> parseNum is_hexdigit 16 hexDigit cs
  Char 'o' : cs -> parseNum is_octdigit 8 octDecDigit cs
  cs@(Char c : _) | is_decdigit c -> parseNum is_decdigit 10 octDecDigit cs
  -- control characters (e.g. '\^M')
  Char '^' : Char c : cs -> pure (chr $ ord c - ord '@', cs)
  -- long form escapes (e.g. '\NUL')
  cs | Just (esc, cs') <- parseLongEscape cs -> pure (esc, cs')
  -- shouldn't happen
  Char c : _ -> panic $ "found unexpected escape character: " ++ show c
  [] -> panic "escape character unexpectedly ended"
  where
    parseNum isDigit base toDigit =
      let go x = \case
            ch@(Char c) : cs | isDigit c -> do
              let x' = x * base + toDigit c
              when (x' > 0x10ffff) $ Left (ch, LexNumEscapeRange)
              go x' cs
            cs -> pure (chr x, cs)
       in go 0

    parseLongEscape :: HasChar c => [c] -> Maybe (Char, [c])
    parseLongEscape cs = listToMaybe (mapMaybe tryParse longEscapeCodes)
      where
        tryParse (code, esc) =
          case splitAt (length code) cs of
            (pre, cs') | map getChar pre == code -> Just (esc, cs')
            _ -> Nothing

        longEscapeCodes =
          [ ("NUL", '\NUL')
          , ("SOH", '\SOH')
          , ("STX", '\STX')
          , ("ETX", '\ETX')
          , ("EOT", '\EOT')
          , ("ENQ", '\ENQ')
          , ("ACK", '\ACK')
          , ("BEL", '\BEL')
          , ("BS" , '\BS' )
          , ("HT" , '\HT' )
          , ("LF" , '\LF' )
          , ("VT" , '\VT' )
          , ("FF" , '\FF' )
          , ("CR" , '\CR' )
          , ("SO" , '\SO' )
          , ("SI" , '\SI' )
          , ("DLE", '\DLE')
          , ("DC1", '\DC1')
          , ("DC2", '\DC2')
          , ("DC3", '\DC3')
          , ("DC4", '\DC4')
          , ("NAK", '\NAK')
          , ("SYN", '\SYN')
          , ("ETB", '\ETB')
          , ("CAN", '\CAN')
          , ("EM" , '\EM' )
          , ("SUB", '\SUB')
          , ("ESC", '\ESC')
          , ("FS" , '\FS' )
          , ("GS" , '\GS' )
          , ("RS" , '\RS' )
          , ("US" , '\US' )
          , ("SP" , '\SP' )
          , ("DEL", '\DEL')
          ]
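-- A few illustrative cases for resolveEscapeChar, shown on plain Chars
-- (the c ~ Char instance); these are sketches, not doctests:
--
--   resolveEscapeChar "n rest" == Right ('\n', " rest")
--   resolveEscapeChar "x48!"   == Right ('H', "!")
--   resolveEscapeChar "NULend" == Right ('\NUL', "end")
--
-- A numeric escape denoting a code point above 0x10ffff is rejected with
-- LexNumEscapeRange.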
-- -----------------------------------------------------------------------------
-- Unicode Smart Quote detection (#21843)

isDoubleSmartQuote :: Char -> Bool
isDoubleSmartQuote = \case
  '“' -> True
  '”' -> True
  _ -> False

isSingleSmartQuote :: Char -> Bool
isSingleSmartQuote = \case
  '‘' -> True
  '’' -> True
  _ -> False

-- -----------------------------------------------------------------------------
-- Multiline strings

-- | See Note [Multiline string literals]
--
-- Assumes the string is lexically valid. Skips the steps about splitting
-- and rejoining lines, and instead manually finds newline characters,
-- for performance.
lexMultilineString :: Int -> StringBuffer -> Either StringLexError String
lexMultilineString = lexStringWith processChars processChars
  where
    processChars :: HasChar c => [c] -> Either (c, LexErr) [c]
    processChars =
          collapseGaps             -- Step 1
      >>> normalizeEOL
      >>> expandLeadingTabs        -- Step 3
      >>> rmCommonWhitespacePrefix -- Step 4
      >>> collapseOnlyWsLines      -- Step 5
      >>> rmFirstNewline           -- Step 7a
      >>> rmLastNewline            -- Step 7b
      >>> resolveEscapes           -- Step 8

    -- Normalize line endings to LF. The spec dictates that lines should be
    -- split on newline characters and rejoined with `\n`. But because we
    -- aren't actually splitting/rejoining, we'll manually normalize here.
    normalizeEOL :: HasChar c => [c] -> [c]
    normalizeEOL =
      let go = \case
            Char '\r' : c@(Char '\n') : cs -> c : go cs
            c@(Char '\r') : cs -> setChar '\n' c : go cs
            c@(Char '\f') : cs -> setChar '\n' c : go cs
            c : cs -> c : go cs
            [] -> []
       in go

    -- expands all tabs, since the lexer will verify that tabs can only appear
    -- as leading indentation
    expandLeadingTabs :: HasChar c => [c] -> [c]
    expandLeadingTabs =
      let go !col = \case
            c@(Char '\t') : cs ->
              let fill = 8 - (col `mod` 8)
               in replicate fill (setChar ' ' c) ++ go (col + fill) cs
            c : cs -> c : go (if getChar c == '\n' then 0 else col + 1) cs
            [] -> []
       in go 0

    rmCommonWhitespacePrefix :: HasChar c => [c] -> [c]
    rmCommonWhitespacePrefix cs0 =
      let commonWSPrefix = getCommonWsPrefix (map getChar cs0)

          go = \case
            c@(Char '\n') : cs -> c : go (dropLine commonWSPrefix cs)
            c : cs -> c : go cs
            [] -> []

          -- drop x characters from the string, or up to a newline, whichever
          -- comes first
          dropLine !x = \case
            cs | x <= 0 -> cs
            cs@(Char '\n' : _) -> cs
            _ : cs -> dropLine (x - 1) cs
            [] -> []
       in go cs0

    collapseOnlyWsLines :: HasChar c => [c] -> [c]
    collapseOnlyWsLines =
      let go = \case
            c@(Char '\n') : cs | Just cs' <- checkAllWs cs -> c : go cs'
            c : cs -> c : go cs
            [] -> []

          checkAllWs = \case
            -- got all the way to a newline or the end of the string, return
            cs@(Char '\n' : _) -> Just cs
            cs@[] -> Just cs
            -- found whitespace, continue
            Char c : cs | is_space c -> checkAllWs cs
            -- anything else, stop
            _ -> Nothing
       in go

    rmFirstNewline :: HasChar c => [c] -> [c]
    rmFirstNewline = \case
      Char '\n' : cs -> cs
      cs -> cs

    rmLastNewline :: HasChar c => [c] -> [c]
    rmLastNewline =
      let go = \case
            [] -> []
            [Char '\n'] -> []
            c : cs -> c : go cs
       in go

-- | See step 4 in Note [Multiline string literals]
--
-- Assumes tabs have already been expanded.
getCommonWsPrefix :: String -> Int
getCommonWsPrefix s =
  case NonEmpty.nonEmpty includedLines of
    Nothing -> 0
    Just ls -> Foldable1.minimum $ NonEmpty.map (length . takeWhile is_space) ls
  where
    includedLines =
        filter (not . all is_space) -- ignore whitespace-only lines
      . drop 1                      -- ignore first line in calculation
      $ lines s

{- Note [Multiline string literals]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Multiline string literals were added following the acceptance of the proposal:
https://github.com/ghc-proposals/ghc-proposals/pull/569

Multiline string literals are syntax sugar for normal string literals, with an
extra post-processing step. This all happens in the Lexer; that is,
HsMultilineString will contain the post-processed string. This matches the
behavior of HsString, which contains the normalized string
(see Note [Literal source text]).

The canonical steps for post-processing a multiline string are:
1. Collapse string gaps
2. Split the string by newlines
3. Convert leading tabs into spaces
    * In each line, any tabs preceding non-whitespace characters are replaced
      with spaces up to the next tab stop
4. Remove common whitespace prefix in every line except the first (see below)
5. If a line contains only whitespace, remove all of the whitespace
6. Join the string back with `\n` delimiters
7a. If the first character of the string is a newline, remove it
7b. If the last character of the string is a newline, remove it
8. Interpret escaped characters
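As an illustrative walkthrough (not an example from the proposal): consider a
multiline literal whose lines, after step 2, are

  ["", "      a", "        b", "    "]

i.e. an empty first line, "a" indented by 6, "b" indented by 8, and the 4
spaces of indentation before the closing delimiter. The common whitespace
prefix (defined below) is 6, since the first line and the whitespace-only
last line are ignored. Steps 4 through 7b then leave "a", a newline, and
"  b", so the literal is equivalent to the ordinary literal "a\n  b".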
The common whitespace prefix can be informally defined as "The longest prefix
of whitespace shared by all lines in the string, excluding the first line and
any whitespace-only lines".

It's more precisely defined with the following algorithm:

1. Take a list representing the lines in the string
2. Ignore the following elements in the list:
    * The first line (we want to ignore everything before the first newline)
    * Empty lines
    * Lines with only whitespace characters
3. Calculate the longest prefix of whitespace shared by all lines in the
   remaining list
-}

-- -----------------------------------------------------------------------------
-- DList

newtype DList a = DList ([a] -> [a])

dlistEmpty :: DList a
dlistEmpty = DList id

dlistToList :: DList a -> [a]
dlistToList (DList f) = f []

dlistSnoc :: DList a -> a -> DList a
dlistSnoc (DList f) x = DList (f . (x :))
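-- DList is a difference list: the list is represented as a function that
-- prepends its elements to whatever tail it is given, so 'dlistSnoc' is O(1)
-- and 'dlistToList' materializes the result in a single pass. For example
-- (illustrative):
--
--   dlistToList (dlistEmpty `dlistSnoc` 'a' `dlistSnoc` 'b') == "ab"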