Here I discuss the process of implementing and debugging a tokenizer based
on the SML (Standard ML) parser tutorial from Scott Smith
(http://www.cs.jhu.edu/~scott/pl/lectures/parsing.html) in ATS, without
using linear types. This was done in a bit of a hurry, so certainly there
may be some bugs due to little testing. Once complete, I will include a
more polished tutorial in its entirety. Comments on style are also welcome.
The fact that I got a segmentation fault when writing simple code probably
indicates 1) I need practice… and 2) that dealing with strings is
probably a good case for using linear types.
First, we use Hongwei Xi’s implementation of the string stream mentioned in
the tutorial
(https://groups.google.com/forum/?fromgroups=#!topic/ats-lang-users/oFuXRr4K8ts).
After our first round of implementation, and implementing (potentially
redundant) functions like toCaps, etc, we arrive at the following code.
However, I want to stress that the implementation of string_caps is not
good because it creates references that need to be cleared by the GC very
frequently (but it works), as noted in the ATS Book section on references.
For my original try which I did late at night (possibly only part of the
problem), see here: http://pastebin.com/M0ckExtX
(* ********************************* Begin CODE
******************************** *)
staload "prelude/SATS/string.sats"
staload "prelude/SATS/printf.sats"
staload “libc/SATS/stdio.sats”
#define NUL '\000’
exception UnforseenLexeme of ()
typedef strstream = '{
peek = () - char,
next = () - void,
prev = () - void
//cose() - if using ptrs
}
datatype GRTOK =
| TKgene of string
| TKand of ()
| TKor of ()
| TKlpar of ()
| TKrpar of ()
| TKEND of ()
extern
fun toCaps(c:char): char
extern
fun strstream_make (str: string): strstream
extern
fun getToken (ss: strstream): GRTOK
fn string_of_string1 {n:nat} (str : string n): string = str
//Get the minimum of two sizes, assert the minimum via type
fun {m,n:nat} minim (x:size_t m, y:size_t n): size_t(min(m,n)) = if x < y
then x else y
(*Hongwei’s implementation of a string stream: *)
implement
strstream_make (str:string) =
let
val [n:int] str = string1_of_string (str)
val n = string_length (str)
val pos = ref<sizeLte(n)> (size1_of_int1(0))
val fpeek = lam () : char =
let
val i = !pos
in if i < n then str[i] else NUL
end // end of [lam]
val fnext = lam () : void =
let
val i = !pos
in if i < n then !pos := i + 1
end // end of [lam]
val fprev = lam () : void =
let
val i = !pos
in if i > 0 then !pos := i - 1
end // end of [lam]
in '{ peek= fpeek, next= fnext, prev= fprev}
end // end of [strstream_make]
//Need an alternative as this requires a lot of GC:
fun string_caps(str: string): string =
let
val strstr = strstream_make(str)
fun loop(): string =
let
val x = strstr.peek()
val x = toCaps(x)
in
if x <> NUL then
let
val _ = strstr.next()
val xs = loop()
val x = string_of_string1(tostring(x))
in
x + xs
end
else string1_of_string("")
end
in
loop()
end
(* Assumes ASCII *)
fun print_tks(ss: strstream): void =
let
fun loop(): void =
let
val tk = getToken(ss)
in case+ tk of
| TKEND() => print(“END\n”)
| TKand() => (print ("AND "); loop())
| TKor() => (print("OR “); loop())
| TKlpar() => (print(”( “); loop())
| TKrpar() => (print(”) “); loop())
| TKgene(g) => (print(g+” "); loop())
end
in
loop()
end
implement
toCaps©: char = if (c >= ‘a’ && c <= ‘z’) then
char_of_int(int_of_char©-32) else c
fn isAnd(str: string): bool = if (str = “AND”) || (str = “&”)|| str = "&&"
then true else false
fn isOr(str: string): bool = if (str = “OR”) || str = “|” || str = "||"
then true else false
fn isGeneCh(c:char): bool = (c >= ‘0’ && c <= ‘9’) || (c >= ‘A’ && c <= ‘Z’)
|| (c >= ‘a’ && c <= ‘z’) || c = ‘_’ || c = ‘.’ || c = ‘-’
//AND, OR, and, or, &, |
fn isBoolCh(c:char): bool = c = ‘a’ || c = ‘A’ || c = ‘n’ || c = ‘N’ || c =
‘d’ || c = ‘D’
|| c = ‘o’ || c = ‘O’ || c = ‘r’ || c = ‘R’ || c = ‘|’ || c = ‘&’
fn isBool1stCh(c:char): bool = c = ‘a’ || c = ‘A’|| c = ‘o’ || c = ‘O’ || c
= ‘|’ || c = ‘&’
fn stopChar(c:char): bool = c = ‘\000’
//This should probably be changed, but we assume these are the above are
the only
//language letters.
fn isWhiteSpace(c:char): bool = not (stopChar© orelse isGeneCh© orelse
isBoolCh© orelse (c = ‘(’) orelse (c = ‘)’) )
implement
getToken (ss: strstream): GRTOK =
let
val c = ss.peek()
val _ = ss.next(); //need to be fun1?
fun whileCharTst(chtst: char - bool): string =
if chtst© then (ss.next(); tostring© + whileCharTst(chtst)) else
""
in
if c = ‘\000’ then TKEND()
else if c = ‘(’ then TKlpar()
else if c = ‘)’ then TKrpar()
//genes and booleans could share letters … need to fix
else if isBool1stCh© then
let
val logword = whileCharTst(isBoolCh)
in //BEWARE of potential problems of function evaluation in situ…
if isAnd(string_caps(logword)) then TKand ()
else if isOr(string_caps(logword)) then TKor ()
else TKgene(logword) //Assume gene for now, e.g. could be "AND&|OR"
end
else if isGeneCh© then TKgene(whileCharTst(isGeneCh)) //Build a
string
else if isWhiteSpace© then
let
val _ = whileCharTst(isWhiteSpace)
in
getToken(ss)
end
else $raise UnforseenLexeme ()
end
(* ********************************* End CODE
******************************** *)
To test, do the following:
val teststr = "Gene1 AND (Gene2) or Gene3 or Gene4 and Gene5"
val teststrstr = strstream_make(teststr)
val _ = print_tks(teststrstr)
//This will result in a segmentation fault.
Now, looking at gdb (debugger) output we see the following:
Starting program: /media/RAID5/share/ATS_learning/toCNF asdfs
Program received signal SIGSEGV, Segmentation fault.
_int_malloc (av=0x7ffff7dd3720, bytes=2) at malloc.c:3883
3883 malloc.c: No such file or directory.
This is not terribly instructive to me at least, but it seems to indicate
there is some problem with memory allocationdue to the seg fault occurring
in a call to malloc.
After appropriately placing several println! statements, we may begin to
notice that, among other things, the value ‘c’ in getToken() is not being
updated by calls to whileCharTst(). This is because c is being redefined in
a “let clause” which has local scope; there may be other scope-related
issues to the original formulation. This is the key problem, and once we do
this and a few other fixes, we arrive at the following function for
getToken():
(*
)
( getToken() version 2
*)
implement
getToken (ss: strstream): GRTOK =
let
val c = ss.peek()
val _ = ss.next(); //need to be fun1?
fun whileCharTst(chtst: char - bool): string =
let
val cw = ss.peek()
in
if chtst(cw) then
let
val _ = ss.next()
//val _ = println!(cw)
in
tostring(cw) + whileCharTst(chtst)
end
else (ss.next(); “”) //Need to theck next() stop condition
end
in
if c = ‘\000’ then TKEND()
else if c = ‘(’ then TKlpar()
else if c = ‘)’ then (print(" rpar \n"); TKrpar())
//genes and booleans could share letters … need to fix
else if isBool1stCh© then
let
val logword = tostring© + whileCharTst(isBoolCh)
in //BEWARE of potential problems of function evaluation in situ…
if isAnd(string_caps(logword)) then TKand ()
else if isOr(string_caps(logword)) then TKor ()
else TKgene(logword) //Assume gene for now, e.g. could be "AND&|OR"
end
else if isGeneCh© then TKgene(tostring© + whileCharTst(isGeneCh))
//Build a string
else if isWhiteSpace© then
let
val _ = whileCharTst(isWhiteSpace)
in
getToken(ss)
end
else $raise UnforseenLexeme ()
end
(*
*)
Now that the segmentation fault is fixed, we see that there is still a bug
in the output:
The recaptured tokens don’t seem to be parsing the right parentheses
(TKrpar()):
Gene1 AND ( Gene2 r Gene3 OR Gene4 AND Gene5 END
Somehow we are getting an “r” where the “)” should be. Putting a space
between “Gene2” and the
following “)”, i.e.:
val teststr = "Gene1 AND (Gene2 ) or Gene3 or Gene4 and Gene5"
yields the following:
Gene1 AND ( Gene2 rpar
) r Gene3 OR Gene4 AND Gene5 END
So we are now finding the “)”, but we’re still getting the mystery “r”,
which at least isn’t so mysterious
anymore since we are missing the following “or”, and note that the original
"or" is lower case, unlike what
we get from the print_tks() function. Something in the tokenizer needs to
be fixed.
Using a “)” directly next to the “or”:
val teststr = "Gene1 AND (Gene2 )or Gene3 or Gene4 and Gene5"
we now correctly get:
Gene1 AND ( Gene2 rpar
) OR Gene3 OR Gene4 AND Gene5 END
So something is definitely happening with the whitespace consumption, but
why only here? At some point along the way I’d introduced a bug by
including an extra ss.next() in whileCharTst(). Removing it seems to fix
the problem. Now we have:
implement
getToken (ss: strstream): GRTOK =
let
val c = ss.peek()
val _ = ss.next(); //need to be fun1?
fun whileCharTst(chtst: char - bool): string =
let
val cw = ss.peek()
in
if chtst(cw) then
let
val _ = ss.next()
in
tostring(cw) + whileCharTst(chtst)
end
else “” //Need to theck next() stop condition
end
in
if c = ‘\000’ then TKEND()
else if c = ‘(’ then TKlpar()
else if c = ‘)’ then TKrpar()
//genes and booleans could share letters … need to fix
else if isBool1stCh© then
let
val logword = tostring© + whileCharTst(isBoolCh)
in
if isAnd(string_caps(logword)) then TKand ()
else if isOr(string_caps(logword)) then TKor ()
else TKgene(logword) //Assume gene for now, e.g. could be "AND&|OR"
end
else if isGeneCh© then TKgene(tostring© + whileCharTst(isGeneCh))
//Build a string
else if isWhiteSpace© then
let
val _ = whileCharTst(isWhiteSpace)
in
getToken(ss)
end
else $raise UnforseenLexeme ()
end