with Ada.Text_IO;

with Ada.Finalization;
with Ada.Unchecked_Deallocation;

package Lexer is

  --====================================================================
  -- Author    Christoph Grein
  -- Version   6.1
  -- Date      30 June 2011
  --====================================================================
  -- A LL(1) grammar parser for a stream of characters representing an
  -- Ada program.
  --
  -- Notes
  -- =====
  --
  -- Reserved words and identifiers are separate lexical items
  -- (RM 2.2(1)).
  -- The reserved words Access, Delta, Digits, Range are also used as
  -- attribute designators (RM 4.1.4(3,5)).
  --
  -- An operator occurring as an operator symbol (RM 6.1(9), e.g.
  -- "=" (A, B)) is always returned as a string literal ("="), never as
  -- the corresponding token (Equal_A).
  -- Such a string literal can easily be identified as an operator in
  -- the following cases:
  -- - In a selected component (RM 4.1.3(2)), it is preceded by a dot.
  -- - In a subprogram specification (RM 6.1(4)) or a function call
  --   (RM 6.4(3), it is followed by an opening parenthesis.
  -- - In a formal_subprogram_declaration as a subprogram default
  --   (RM 12.6(2)), it is preceded by the reserved word is.
  -- Only the occurence in a generic instantiation as an explicit
  -- generic actual parameter (RM 12.3(5)) necessitates a compilation-
  -- like analysis to discriminate between a simple string and an
  -- operator symbol, which is of course out of the capabilities of
  -- lexical parsing.
  --
  -- Usage
  -- =====
  --
  -- Read the file into a string and initialize the lexer with a pointer
  -- to this string; the size of the horizontal tabulation character
  -- (system dependent) has also to be given. Reserved words are only
  -- recognized as such if the appropriate Ada generation is given, else
  -- they are returned as identifiers.
  -- Procedure Free may be used to deallocate the space occupied by the
  -- input string. (This should only be done when the lexer has finished
  -- and also you do no longer need it because Lexer does not keep a
  -- copy.) Free comes in two versions: via the string pointer used to
  -- initialize Lexer or via the internal status. Use whichever is more
  -- convenient.
  --
  -- Get_Token reads the next token from the input string. End_Error is
  -- raised when the end of input has been reached. For each token, the
  -- start and end position in the input stream is stored. Furthermore,
  -- while the input is processed, lines and columns are counted. The
  -- end of a line may be given by any of CR (Mac), LF (Unix), or
  -- CR-LF (DOS, MS-Windows).
  --
  -- Reset resets the parser to the given token's position in the input
  -- string. This token and the ones following will be returned on the
  -- next calls to Get_Token. Reset_Error is raised when called with a
  -- token that does not match the one in the input string.
  -- (Status is limited because it makes no sense to memorize a previous
  -- state. Reset has to be used instead.)
  --====================================================================
  -- History
  -- Author Version   Date    Reason for change
  --  C.G.    0.0  25.05.1998 PDL
  --  C.G.    0.1  31.05.1998 Added bad token
  --  C.G.    1.0  17.06.1998 Final design (added reason of bad token)
  --  C.G.    2.0  01.07.1998 Make Token controlled to prevent storage
  --                          leak
  --  C.G.    2.1  06.07.1998 Size of Tab
  --  C.G.    3.0  28.07.1998 Also lex Java
  --  C.G.    3.1  05.08.1998 Added Documentation_Tag
  --  C.G.    3.2  22.09.1998 Replace
  --                            Ampersand_AJ    by Concatenate_A, And_J
  --                            Vertical_Bar_AJ by Alternative_A, Or_J
  --  C.G.    3.3  03.10.1998 Added functions Image and Tag_Pos for
  --                          documentation tags, Is_Operator
  --  C.G.    3.4  29.01.1999 Added function Pos
  --  C.G.    3.5  13.08.1999 Further comments on Java
  --  C.G.    3.6  30.01.2001 Comments on reserved word attributes
  --  C.G.    4.0  22.02.2005 Ada 2005; Java removed
  --  C.G.    5.0  20.08.2005 Can now work concurrently on several files
  --  C.G.    5.1  21.08.2005 Status simplified
  --  C.G.    6.0  06.10.2010 Ada 2012 new keyword Some
  --  C.G.    6.1  30.06.2011 Ada Generations
  --====================================================================

  type Token_Name is
    (-- Reserved words RM 2.9 (2)  Ada 95  Ada 2005  Ada 2012
     Abort_A, Abs_A, Abstract_A, Accept_A, Access_A, Aliased_A, All_A, And_A, Array_A, At_A,
     Begin_A, Body_A,
     Case_A, Constant_A,
     Declare_A, Delay_A, Delta_A, Digits_A, Do_A,
     Else_A, Elsif_A, End_A, Entry_A, Exception_A, Exit_A,
     For_A, Function_A,
     Generic_A, Goto_A,
     If_A, In_A, Interface_A, Is_A,
     Limited_A, Loop_A,
     Mod_A,
     New_A, Not_A, Null_A,
     Of_A, Or_A, Others_A, Out_A, Overriding_A,
     Package_A, Pragma_A, Private_A, Procedure_A, Protected_A,
     Raise_A, Range_A, Record_A, Rem_A, Renames_A, Requeue_A, Return_A, Reverse_A,
     Select_A, Separate_A, Some_A, Subtype_A, Synchronized_A,
     Tagged_A, Task_A, Terminate_A, Then_A, Type_A,
     Until_A, Use_A,
     When_A, While_A, With_A,
     Xor_A,
     -- Delimiters RM 2.2 (9)
     -- & ' ( ) * + , - . / : ; < = > |
     -- Compound delimiters RM 2.2 (14)
     -- => .. ** := /= >= <= << >> <>
     Colon_A, Comma_A, Dot_A, Semicolon_A, Tick_A,         -- : , . ; '
     Left_Parenthesis_A, Right_Parenthesis_A,              -- ( )
     Concatenate_A,                                        -- &
     Alternative_A,                                        -- |
     Equal_A, Not_Equal_A,                                 -- = /=
     Less_A, Less_Equal_A, Greater_Equal_A, Greater_A,     -- < <= >= >
     Plus_A, Minus_A, Times_A, Divide_A,                   -- + - * /
     Arrow_A, Assignment_A, Double_Dot_A, Exponentiate_A,  -- => := .. **
     Left_Label_Bracket_A, Right_Label_Bracket_A, Box_A,   -- << >> <>
     -- Literals (RM 2.4 .. 2.6)
     Integer_A,              -- 1, 1E+10
     Based_Integer_A,        -- 13#C#E+10
     Real_A,                 -- 1.0E+10
     Based_Real_A,           -- 13#C.B#E+5
     Character_A, String_A,  -- 'a' "xyz"
     -- Other tokens
     Identifier_A,
     Comment_A,              -- -- to end of line
     -- Bad token (syntax error)
     Bad_Token_A);

  subtype Reserved_Word is Token_Name range Abort_A   .. Xor_A;
  subtype Delimiter     is Token_Name range Colon_A   .. Box_A;
  subtype Literal       is Token_Name range Integer_A .. String_A;
  subtype Number        is Literal    range Integer_A .. Based_Real_A;
  subtype Whole_Number  is Number     range Integer_A .. Based_Integer_A;
  subtype Real_Number   is Number     range Real_A    .. Based_Real_A;

  type Token is private;

  function Name  (Item: Token) return Token_Name;
  function Image (Item: Token) return String;
  pragma Inline (Name);

  -- Operators (not in operator symbol form) cannot be made a subtype of
  -- Token_Name. Thus a function is specified.
  -- RM 4.5(2)    and or xor
  --       (3)    = /= < <= > >=
  --       (4,5)  + - &
  --       (6)    * / mod rem
  --       (7)    ** abs not

  function Is_Operator (Item: Token) return Boolean;

  -- Return True when used as an attribute (only possible for identifiers
  -- and a few reserved words).

  function Is_Attribute (Item: Token) return Boolean;

  -- Only for based numbers.

  subtype Number_Base is Ada.Text_IO.Number_Base;

  function Base (Item: Token) return Number_Base;
  pragma Inline (Base);

  -- Only for bad tokens (if there are errors, it depends on the context
  -- when the bad token's end is assumed and which error is reported).
  -- Illegal_Literal is used for any error in identifiers or numbers that
  -- is not covered by more explicit reports.

  type Token_Error is (Non_Language_Character, -- e.g. {
                       Illegal_Literal,        -- e.g. non-graphic character
                       Illegal_Underline,      -- for identifiers and numbers
                       Illegal_Base,           -- for numbers
                       Missing_Base_Quote,
                       Illegal_Extended_Digit,
                       Missing_String_Quote);  -- string terminated by EOL

  function Error (Item: Token) return Token_Error;
  pragma Inline (Error);

  -- Token position within string and file

  function First (Item: Token) return Positive;  -- range in the
  function Last  (Item: Token) return Positive;  --   input string
  function Line  (Item: Token) return Positive;  -- line and
  function Col   (Item: Token) return Positive;  --   column in the file
  pragma  Inline (First, Last, Line, Col);

  -- Input string ---------------------------------------------------------

  type String_Pointer is access all String;

  type Ada_Generation is (Ada_83, Ada_95, Ada_2005, Ada_2012);

  type Status is limited private;

  procedure Initialize (State: out Status; Input: in String_Pointer; Size_of_Tab: in Positive;
                        Generation: in Ada_Generation := Ada_Generation'Last);

  procedure Free is new Ada.Unchecked_Deallocation (String, String_Pointer);
  procedure Free (State: in Status);

  -- Current position within Input

  function Pos  (State: Status) return Positive;  -- position in the input string
  function Line (State: Status) return Positive;  -- current line and
  function Col  (State: Status) return Positive;  --   column in the file
  pragma Inline (Pos, Line, Col);

  function End_of_Input (State: Status) return Boolean;
  pragma Inline (End_of_Input);

  -- Token parser ---------------------------------------------------------

  function Get_Token (State: Status) return Token;

  procedure Reset (State: in Status; to_Token: in  Token);

  End_Error, Reset_Error: exception;

private

  type Token_Core (Name: Token_Name := Comment_A) is record
    -- location of token in input string and file
    Line , Col,
    First, Last: Positive;
    case Name is
      when Access_A | Delta_A | Digits_A | Range_A =>
        Attribute: Boolean;
      when Number'First .. Bad_Token_A =>
        Image: String_Pointer;  -- illegal token if null
        case Name is
          when Based_Real_A    |
               Based_Integer_A => Base : Number_Base;
          when Bad_Token_A     => Error: Token_Error;
          when Identifier_A    => Attr : Boolean;  -- like Attribute above
          when others          => null;
        end case;
      when others =>
        null;
    end case;
  end record;

  type Token is new Ada.Finalization.Controlled with record
    Core: Token_Core;
  end record;

  procedure Adjust   (Object: in out Token);
  procedure Finalize (Object: in out Token);

  -- Lexer's state

  type Error_Occurrence (Occurred: Boolean := False) is record  -- for bad token
    case Occurred is
      when False => null;
      when True  => Name: Token_Error;
    end case;
  end record;

  package Stacks is
    -- A simple stack remembering the last two column numbers (needed to
    -- go back across EOL and HT) - (we go back at most two characters).
    type Object is private;
    procedure Push (O: in out Object; Col: in  Positive);
    procedure Pop  (O: in out Object; Col: out Positive);
  private
    type Any_Name is array (Boolean) of Positive;
    type Object is record
      Store: Any_Name;
      Top  : Boolean := Any_Name'First;
    end record;
  end Stacks;

  type Rosen_Trick (Outer: access Status) is limited null record;
  type Status is record
    Self            : Rosen_Trick (Outer => Status'Access);
    Input_Location  : Positive;        -- the next position to read
    Input_String    : String_Pointer;  --   in this string
    Current_Line    ,                  -- the next position in
    Current_Col     : Positive;        --   the file
    Tab_Size        : Positive;
    Last_Token      : Token;           -- (attribute vs. character)
    Next_Char       : Character;       -- the character just read and to be processed
    Last_Error      : Error_Occurrence;
    Stack           : Stacks.Object;
  end record;

end Lexer;