{-# OPTIONS  #-}
-----------------------------------------------------------------------------
-- |
-- Module      : Language.Python.Common.LexerUtils 
-- Copyright   : (c) 2009 Bernie Pope 
-- License     : BSD-style
-- Maintainer  : bjpop@csse.unimelb.edu.au
-- Stability   : experimental
-- Portability : ghc
--
-- Various utilities to support the Python lexer. 
-----------------------------------------------------------------------------

module Language.Python.Common.LexerUtils where

import Control.Monad (liftM)
import Control.Monad.Error.Class (throwError)
import Data.List (foldl')
import Data.Word (Word8)
import Language.Python.Common.Token as Token
import Language.Python.Common.ParserMonad hiding (location)
import Language.Python.Common.SrcLocation 
import Codec.Binary.UTF8.String as UTF8 (encode)

type Byte = Word8

-- Beginning of. BOF = beginning of file, BOL = beginning of line
data BO = BOF | BOL

-- Functions for building tokens 

type StartCode = Int
type Action = SrcSpan -> Int -> String -> P Token 

lineJoin :: Action
lineJoin :: Action
lineJoin span :: SrcSpan
span _len :: Int
_len _str :: String
_str = 
   Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return (Token -> StateT ParseState (Either ParseError) Token)
-> Token -> StateT ParseState (Either ParseError) Token
forall a b. (a -> b) -> a -> b
$ SrcSpan -> Token
LineJoinToken (SrcSpan -> Token) -> SrcSpan -> Token
forall a b. (a -> b) -> a -> b
$ SrcSpan -> SrcSpan
spanStartPoint SrcSpan
span

endOfLine :: P Token -> Action
endOfLine :: StateT ParseState (Either ParseError) Token -> Action
endOfLine lexToken :: StateT ParseState (Either ParseError) Token
lexToken span :: SrcSpan
span _len :: Int
_len _str :: String
_str = do
   SrcSpan -> P ()
setLastEOL (SrcSpan -> P ()) -> SrcSpan -> P ()
forall a b. (a -> b) -> a -> b
$ SrcSpan -> SrcSpan
spanStartPoint SrcSpan
span
   StateT ParseState (Either ParseError) Token
lexToken

bolEndOfLine :: P Token -> Int -> Action 
bolEndOfLine :: StateT ParseState (Either ParseError) Token -> Int -> Action
bolEndOfLine lexToken :: StateT ParseState (Either ParseError) Token
lexToken bol :: Int
bol span :: SrcSpan
span len :: Int
len inp :: String
inp = do
   Int -> P ()
pushStartCode Int
bol 
   StateT ParseState (Either ParseError) Token -> Action
endOfLine StateT ParseState (Either ParseError) Token
lexToken SrcSpan
span Int
len String
inp

dedentation :: P Token -> Action
dedentation :: StateT ParseState (Either ParseError) Token -> Action
dedentation lexToken :: StateT ParseState (Either ParseError) Token
lexToken span :: SrcSpan
span _len :: Int
_len _str :: String
_str = do
   Int
topIndent <- P Int
getIndent
   -- case compare (endCol span) topIndent of
   case Int -> Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (SrcSpan -> Int
startCol SrcSpan
span) Int
topIndent of
      EQ -> do P ()
popStartCode
               StateT ParseState (Either ParseError) Token
lexToken 
      LT -> do P ()
popIndent
               Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return Token
dedentToken 
      GT -> SrcSpan -> String -> StateT ParseState (Either ParseError) Token
forall a b. Span a => a -> String -> P b
spanError SrcSpan
span "indentation error"

indentation :: P Token -> Int -> BO -> Action 
-- Check if we are at the EOF. If yes, we may need to generate a newline,
-- in case we came here from BOL (but not BOF).
indentation :: StateT ParseState (Either ParseError) Token -> Int -> BO -> Action
indentation lexToken :: StateT ParseState (Either ParseError) Token
lexToken _dedentCode :: Int
_dedentCode bo :: BO
bo _loc :: SrcSpan
_loc _len :: Int
_len [] = do
   P ()
popStartCode
   case BO
bo of
      BOF -> StateT ParseState (Either ParseError) Token
lexToken
      BOL -> StateT ParseState (Either ParseError) Token
newlineToken
indentation lexToken :: StateT ParseState (Either ParseError) Token
lexToken dedentCode :: Int
dedentCode bo :: BO
bo span :: SrcSpan
span _len :: Int
_len _str :: String
_str = do
   P ()
popStartCode
   Int
parenDepth <- P Int
getParenStackDepth
   if Int
parenDepth Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> 0
      then StateT ParseState (Either ParseError) Token
lexToken
      else do 
         Int
topIndent <- P Int
getIndent
         -- case compare (endCol span) topIndent of
         case Int -> Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (SrcSpan -> Int
startCol SrcSpan
span) Int
topIndent of
            EQ -> case BO
bo of
                     BOF -> StateT ParseState (Either ParseError) Token
lexToken
                     BOL -> StateT ParseState (Either ParseError) Token
newlineToken   
            LT -> do Int -> P ()
pushStartCode Int
dedentCode 
                     StateT ParseState (Either ParseError) Token
newlineToken 
            -- GT -> do pushIndent (endCol span)
            GT -> do Int -> P ()
pushIndent (SrcSpan -> Int
startCol SrcSpan
span)
                     Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return Token
indentToken
   where
   indentToken :: Token
indentToken = SrcSpan -> Token
IndentToken SrcSpan
span

symbolToken :: (SrcSpan -> Token) -> Action 
symbolToken :: (SrcSpan -> Token) -> Action
symbolToken mkToken :: SrcSpan -> Token
mkToken location :: SrcSpan
location _ _ = Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return (SrcSpan -> Token
mkToken SrcSpan
location)

token :: (SrcSpan -> String -> a -> Token) -> (String -> a) -> Action 
token :: (SrcSpan -> String -> a -> Token) -> (String -> a) -> Action
token mkToken :: SrcSpan -> String -> a -> Token
mkToken read :: String -> a
read location :: SrcSpan
location len :: Int
len str :: String
str 
   = Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return (Token -> StateT ParseState (Either ParseError) Token)
-> Token -> StateT ParseState (Either ParseError) Token
forall a b. (a -> b) -> a -> b
$ SrcSpan -> String -> a -> Token
mkToken SrcSpan
location String
literal (String -> a
read String
literal) 
   where
   literal :: String
literal = Int -> String -> String
forall a. Int -> [a] -> [a]
take Int
len String
str

-- special tokens for the end of file and end of line
endOfFileToken :: Token
endOfFileToken :: Token
endOfFileToken = SrcSpan -> Token
EOFToken SrcSpan
SpanEmpty
dedentToken :: Token
dedentToken = SrcSpan -> Token
DedentToken SrcSpan
SpanEmpty 

newlineToken :: P Token
newlineToken :: StateT ParseState (Either ParseError) Token
newlineToken = do
   SrcSpan
loc <- P SrcSpan
getLastEOL
   Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return (Token -> StateT ParseState (Either ParseError) Token)
-> Token -> StateT ParseState (Either ParseError) Token
forall a b. (a -> b) -> a -> b
$ SrcSpan -> Token
NewlineToken SrcSpan
loc

-- Test if we are at the end of the line or file
atEOLorEOF :: a -> AlexInput -> Int -> AlexInput -> Bool
atEOLorEOF :: a -> AlexInput -> Int -> AlexInput -> Bool
atEOLorEOF _user :: a
_user _inputBeforeToken :: AlexInput
_inputBeforeToken _tokenLength :: Int
_tokenLength (_loc :: SrcLocation
_loc, _bs :: [Byte]
_bs, inputAfterToken :: String
inputAfterToken) 
   = String -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null String
inputAfterToken Bool -> Bool -> Bool
|| Char
nextChar Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '\n' Bool -> Bool -> Bool
|| Char
nextChar Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '\r'
   where
   nextChar :: Char
nextChar = String -> Char
forall a. [a] -> a
head String
inputAfterToken 

notEOF :: a -> AlexInput -> Int -> AlexInput -> Bool
notEOF :: a -> AlexInput -> Int -> AlexInput -> Bool
notEOF _user :: a
_user _inputBeforeToken :: AlexInput
_inputBeforeToken _tokenLength :: Int
_tokenLength (_loc :: SrcLocation
_loc, _bs :: [Byte]
_bs, inputAfterToken :: String
inputAfterToken) 
   = Bool -> Bool
not (String -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null String
inputAfterToken)

readBinary :: String -> Integer
readBinary :: String -> Integer
readBinary 
   = String -> Integer
toBinary (String -> Integer) -> (String -> String) -> String -> Integer
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> String -> String
forall a. Int -> [a] -> [a]
drop 2 
   where
   toBinary :: String -> Integer
toBinary = (Integer -> Char -> Integer) -> Integer -> String -> Integer
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' Integer -> Char -> Integer
forall a. Num a => a -> Char -> a
acc 0
   acc :: a -> Char -> a
acc b :: a
b '0' = 2 a -> a -> a
forall a. Num a => a -> a -> a
* a
b
   acc b :: a
b '1' = 2 a -> a -> a
forall a. Num a => a -> a -> a
* a
b a -> a -> a
forall a. Num a => a -> a -> a
+ 1

readFloat :: String -> Double
readFloat :: String -> Double
readFloat str :: String
str@('.':cs :: String
cs) = String -> Double
forall a. Read a => String -> a
read ('0'Char -> String -> String
forall a. a -> [a] -> [a]
:String -> String
readFloatRest String
str)
readFloat str :: String
str = String -> Double
forall a. Read a => String -> a
read (String -> String
readFloatRest String
str)
readFloatRest :: String -> String
readFloatRest :: String -> String
readFloatRest [] = []
readFloatRest ['.'] = ".0"
readFloatRest (c :: Char
c:cs :: String
cs) = Char
c Char -> String -> String
forall a. a -> [a] -> [a]
: String -> String
readFloatRest String
cs

mkString :: (SrcSpan -> String -> Token) -> Action
mkString :: (SrcSpan -> String -> Token) -> Action
mkString toToken :: SrcSpan -> String -> Token
toToken loc :: SrcSpan
loc len :: Int
len str :: String
str = do
   Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return (Token -> StateT ParseState (Either ParseError) Token)
-> Token -> StateT ParseState (Either ParseError) Token
forall a b. (a -> b) -> a -> b
$ SrcSpan -> String -> Token
toToken SrcSpan
loc (Int -> String -> String
forall a. Int -> [a] -> [a]
take Int
len String
str)

stringToken :: SrcSpan -> String -> Token
stringToken :: SrcSpan -> String -> Token
stringToken = SrcSpan -> String -> Token
StringToken

rawStringToken :: SrcSpan -> String -> Token
rawStringToken :: SrcSpan -> String -> Token
rawStringToken = SrcSpan -> String -> Token
StringToken

byteStringToken :: SrcSpan -> String -> Token
byteStringToken :: SrcSpan -> String -> Token
byteStringToken = SrcSpan -> String -> Token
ByteStringToken

formatStringToken :: SrcSpan -> String -> Token
formatStringToken :: SrcSpan -> String -> Token
formatStringToken = SrcSpan -> String -> Token
StringToken

formatRawStringToken :: SrcSpan -> String -> Token
formatRawStringToken :: SrcSpan -> String -> Token
formatRawStringToken = SrcSpan -> String -> Token
StringToken

unicodeStringToken :: SrcSpan -> String -> Token
unicodeStringToken :: SrcSpan -> String -> Token
unicodeStringToken = SrcSpan -> String -> Token
UnicodeStringToken

rawByteStringToken :: SrcSpan -> String -> Token
rawByteStringToken :: SrcSpan -> String -> Token
rawByteStringToken = SrcSpan -> String -> Token
ByteStringToken

openParen :: (SrcSpan -> Token) -> Action
openParen :: (SrcSpan -> Token) -> Action
openParen mkToken :: SrcSpan -> Token
mkToken loc :: SrcSpan
loc _len :: Int
_len _str :: String
_str = do
   let token :: Token
token = SrcSpan -> Token
mkToken SrcSpan
loc
   Token -> P ()
pushParen Token
token 
   Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return Token
token 

closeParen :: (SrcSpan -> Token) -> Action
closeParen :: (SrcSpan -> Token) -> Action
closeParen mkToken :: SrcSpan -> Token
mkToken loc :: SrcSpan
loc _len :: Int
_len _str :: String
_str = do
  let token :: Token
token = SrcSpan -> Token
mkToken SrcSpan
loc
  Maybe Token
topParen <- P (Maybe Token)
getParen
  case Maybe Token
topParen of
     Nothing -> SrcSpan -> String -> StateT ParseState (Either ParseError) Token
forall a b. Span a => a -> String -> P b
spanError SrcSpan
loc String
err1 
     Just open :: Token
open -> if Token -> Token -> Bool
matchParen Token
open Token
token 
                    then P ()
popParen P ()
-> StateT ParseState (Either ParseError) Token
-> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return Token
token
                    else SrcSpan -> String -> StateT ParseState (Either ParseError) Token
forall a b. Span a => a -> String -> P b
spanError SrcSpan
loc String
err2
   where
   -- XXX fix these error messages
   err1 :: String
err1 = "Lexical error ! unmatched closing paren"
   err2 :: String
err2 = "Lexical error ! unmatched closing paren"

matchParen :: Token -> Token -> Bool
matchParen :: Token -> Token -> Bool
matchParen (LeftRoundBracketToken {}) (RightRoundBracketToken {}) = Bool
True
matchParen (LeftBraceToken {}) (RightBraceToken {}) = Bool
True
matchParen (LeftSquareBracketToken {}) (RightSquareBracketToken {}) = Bool
True
matchParen _ _ = Bool
False

-- -----------------------------------------------------------------------------
-- Functionality required by Alex 

type AlexInput = (SrcLocation,  -- current src location
                 [Byte],        -- byte buffer for next character
                 String)        -- input string

alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar _ = String -> Char
forall a. HasCallStack => String -> a
error "alexInputPrevChar not used"

-- byte buffer should be empty here
alexGetChar :: AlexInput -> Maybe (Char, AlexInput)
alexGetChar :: AlexInput -> Maybe (Char, AlexInput)
alexGetChar (loc :: SrcLocation
loc, [], input :: String
input) 
   | String -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null String
input  = Maybe (Char, AlexInput)
forall a. Maybe a
Nothing
   | Bool
otherwise = SrcLocation -> Maybe (Char, AlexInput) -> Maybe (Char, AlexInput)
forall a b. a -> b -> b
seq SrcLocation
nextLoc ((Char, AlexInput) -> Maybe (Char, AlexInput)
forall a. a -> Maybe a
Just (Char
nextChar, (SrcLocation
nextLoc, [], String
rest)))
   where
   nextChar :: Char
nextChar = String -> Char
forall a. [a] -> a
head String
input
   rest :: String
rest = String -> String
forall a. [a] -> [a]
tail String
input 
   nextLoc :: SrcLocation
nextLoc = Char -> SrcLocation -> SrcLocation
moveChar Char
nextChar SrcLocation
loc
alexGetChar (loc :: SrcLocation
loc, _:_, _) = String -> Maybe (Char, AlexInput)
forall a. HasCallStack => String -> a
error "alexGetChar called with non-empty byte buffer"

-- mapFst :: (a -> b) -> (a, c) -> (b, c)
-- mapFst f (a, c) = (f a, c)

alexGetByte :: AlexInput -> Maybe (Byte, AlexInput)
-- alexGetByte = fmap (mapFst (fromIntegral . ord)) . alexGetChar
alexGetByte :: AlexInput -> Maybe (Byte, AlexInput)
alexGetByte (loc :: SrcLocation
loc, b :: Byte
b:bs :: [Byte]
bs, input :: String
input) = (Byte, AlexInput) -> Maybe (Byte, AlexInput)
forall a. a -> Maybe a
Just (Byte
b, (SrcLocation
loc, [Byte]
bs, String
input))
alexGetByte (loc :: SrcLocation
loc, [], []) = Maybe (Byte, AlexInput)
forall a. Maybe a
Nothing
alexGetByte (loc :: SrcLocation
loc, [], nextChar :: Char
nextChar:rest :: String
rest) =
   SrcLocation -> Maybe (Byte, AlexInput) -> Maybe (Byte, AlexInput)
forall a b. a -> b -> b
seq SrcLocation
nextLoc ((Byte, AlexInput) -> Maybe (Byte, AlexInput)
forall a. a -> Maybe a
Just (Byte
byte, (SrcLocation
nextLoc, [Byte]
restBytes, String
rest)))
   where
   nextLoc :: SrcLocation
nextLoc = Char -> SrcLocation -> SrcLocation
moveChar Char
nextChar SrcLocation
loc
   byte :: Byte
byte:restBytes :: [Byte]
restBytes = String -> [Byte]
UTF8.encode [Char
nextChar]

moveChar :: Char -> SrcLocation -> SrcLocation 
moveChar :: Char -> SrcLocation -> SrcLocation
moveChar '\n' = Int -> SrcLocation -> SrcLocation
incLine 1 
moveChar '\t' = SrcLocation -> SrcLocation
incTab 
moveChar '\r' = SrcLocation -> SrcLocation
forall a. a -> a
id 
moveChar _    = Int -> SrcLocation -> SrcLocation
incColumn 1 

lexicalError :: P a
lexicalError :: P a
lexicalError = do
  SrcLocation
location <- P SrcLocation
getLocation
  Char
c <- (String -> Char)
-> StateT ParseState (Either ParseError) String
-> StateT ParseState (Either ParseError) Char
forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
liftM String -> Char
forall a. [a] -> a
head StateT ParseState (Either ParseError) String
getInput
  ErrorType (StateT ParseState (Either ParseError)) -> P a
forall (m :: * -> *) a. MonadError m => ErrorType m -> m a
throwError (ErrorType (StateT ParseState (Either ParseError)) -> P a)
-> ErrorType (StateT ParseState (Either ParseError)) -> P a
forall a b. (a -> b) -> a -> b
$ Char -> SrcLocation -> ParseError
UnexpectedChar Char
c SrcLocation
location

readOctNoO :: String -> Integer
readOctNoO :: String -> Integer
readOctNoO (zero :: Char
zero:rest :: String
rest) = String -> Integer
forall a. Read a => String -> a
read (Char
zeroChar -> String -> String
forall a. a -> [a] -> [a]
:'O'Char -> String -> String
forall a. a -> [a] -> [a]
:String
rest)