/*--------------------------------------------------------------------------*\
| expr_tokenizer.pro
|
| This file contains Prolog predicates that "tokenize" an input
| line into a list of atoms representing the tokens in the line.
| It is intended to show how one might tokenize input for a simple
| expression parser that reads integers and basic math symbols,
| including parentheses.  It also shows how to use the tokenizing
| predicate by using a modified version of "echo_input".
|
| All of the predicates below work on a string that is represented
| as a Prolog list of ASCII character codes.  This is the format
| returned by the read_chars/1 input predicate defined in the
| echo_input.pro sample program.  It is also one of the common
| ways of representing character strings in Prolog so that they can
| be manipulated easily by user-defined predicates.  You can use the
| string_to_list/2 predicate to convert back and forth between a
| "..." string literal and a list of ASCII codes.  The atom_codes/2
| predicate performs the same task for atom names and lists of ASCII
| codes.
|
| See the end of this file for sample usage.
\*--------------------------------------------------------------------------*/

/*--------------------------------------------------------------------------*\
| tokenize_line/2
| This predicate takes a list of ASCII codes (such as those
| produced by read_chars/1 in the echo_input.pro sample, or
| string_to_list/2) and returns a list of atoms representing the
| words comprising the input.
\*--------------------------------------------------------------------------*/
tokenize_line([], []).
tokenize_line([Space | Rest], Tokens) :-
    member(Space, [9, 32]),                    /* skip tabs and spaces */
    tokenize_line(Rest, Tokens).
tokenize_line(String, [First_Token | Rest]) :-
    tokenize_word(String, First_Token, Untokenized),
    tokenize_line(Untokenized, Rest).
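/*--------------------------------------------------------------------------*\
| Example (illustrative): tokenizing a small expression.  The query
| below assumes a system that provides string_to_list/2 (atom_codes/2
| works similarly); the exact conversion of digit sequences to numbers
| depends on the system's number_chars/number_codes behavior.
|
|     ?- string_to_list("10 + 2 * x", Codes),
|        tokenize_line(Codes, Tokens).
|     Tokens = [10, plus, 2, asterisk, invalid_token]
\*--------------------------------------------------------------------------*/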
/*--------------------------------------------------------------------------*\
| tokenize_word/3
| This predicate takes a list of ASCII codes and splits it into two
| parts: a list of ASCII codes representing the first word in the
| list (which is converted into an atom), and a list of ASCII codes
| representing the unprocessed input.  This predicate assumes that
| the first item in its input list of ASCII codes is the first
| character of the word to tokenize.
|
| The atoms returned for each token are:
|     +                plus
|     -                minus
|     *                asterisk
|     /                slash
|     (                left_paren
|     )                right_paren
|     end of line      end_of_line
|     [0-9]+           number
|     anything else    invalid_token
\*--------------------------------------------------------------------------*/
tokenize_word([], end_of_line, []).
tokenize_word([43 | Rest], plus, Rest).           /* + */
tokenize_word([45 | Rest], minus, Rest).          /* - */
tokenize_word([42 | Rest], asterisk, Rest).       /* * */
tokenize_word([47 | Rest], slash, Rest).          /* / */
tokenize_word([40 | Rest], left_paren, Rest).     /* ( */
tokenize_word([41 | Rest], right_paren, Rest).    /* ) */
tokenize_word([Digit | Rest], Num, Untokenized) :-
    Digit >= 48,                                  /* '0' */
    Digit =< 57,                                  /* '9' */
    tokenize_number([Digit | Rest], Num_List, Untokenized),
    number_codes(Num, Num_List).
tokenize_word([_ | Rest], invalid_token, Rest).

/*--------------------------------------------------------------------------*\
| tokenize_number/3
| This predicate takes a list of ASCII codes and splits it into two
| parts: a list of ASCII codes representing a series of digits, and
| a list of ASCII codes representing the unprocessed input.
\*--------------------------------------------------------------------------*/
tokenize_number([], [], []).
tokenize_number([Nondigit | Rest], [], [Nondigit | Rest]) :-
    (   Nondigit < 48
    ;   Nondigit > 57
    ).
tokenize_number([Digit | Rest], [Digit | Rest_of_Number], Untokenized) :-
    Digit >= 48,
    Digit =< 57,
    tokenize_number(Rest, Rest_of_Number, Untokenized).
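/*--------------------------------------------------------------------------*\
| Example queries (illustrative), showing how tokenize_word/3 peels
| off one token and how tokenize_number/3 collects a run of digits.
| Here 40, 49, 50, and 43 are the ASCII codes for '(', '1', '2',
| and '+':
|
|     ?- tokenize_word([40, 49, 50], Token, Rest).
|     Token = left_paren, Rest = [49, 50]       ("(12" -> "(" and "12")
|
|     ?- tokenize_number([49, 50, 43], Digits, Rest).
|     Digits = [49, 50], Rest = [43]            ("12+" -> "12" and "+")
\*--------------------------------------------------------------------------*/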
/*--------------------------------------------------------------------------*\
| The code below is borrowed from echo_input.pro--see that file for
| details.
\*--------------------------------------------------------------------------*/
echo_input :-
    repeat,
    \+ echo_line.

echo_line :-
    read_chars(Data),
    tokenize_line(Data, Words),    /* Here is where we use the tokenizer */
    write(Words),
    nl.

read_chars(Cs) :-
    get0(C),
    read_more_chars(C, Cs).

read_more_chars(End, []) :-
    member(End, [10, 13]),
    !.
read_more_chars(-1, _) :-
    !,
    fail.
read_more_chars(C, [C | Cs]) :-
    read_chars(Cs).

/*--------------------------------------------------------------------------*\
| The following directive tests our tokenizer.
\*--------------------------------------------------------------------------*/
?- write('Testing tokenize_line ... '),
   write('please type lines followed by .'), nl,
   write('Type Ctrl-D twice at the start of a line to stop.'), nl,
   set_input(user_input),
   echo_input.
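
/*--------------------------------------------------------------------------*\
| A sample session (illustrative; prompts and output formatting vary
| between Prolog systems).  Each typed line is echoed back as its
| token list:
|
|     1 + 2 * (34 - 5)
|     [1,plus,2,asterisk,left_paren,34,minus,5,right_paren]
\*--------------------------------------------------------------------------*/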