src/compiler/SexprLexer.cxx
author Jonathan S. Shapiro <shap@eros-os.com>
Mon Jul 21 22:14:23 2008 -0400 (2 years ago)
changeset 310 0fa3cbf66e6f
parent 297 8389f9101088
child 315 84bbaed7d606
permissions -rw-r--r--
Update .hgignore to ignore src/man/Makefile
     1 /**************************************************************************
     2  *
     3  * Copyright (C) 2008, Johns Hopkins University.
     4  * All rights reserved.
     5  *
     6  * Redistribution and use in source and binary forms, with or
     7  * without modification, are permitted provided that the following
     8  * conditions are met:
     9  *
    10  *   - Redistributions of source code must contain the above 
    11  *     copyright notice, this list of conditions, and the following
    12  *     disclaimer. 
    13  *
    14  *   - Redistributions in binary form must reproduce the above
    15  *     copyright notice, this list of conditions, and the following
    16  *     disclaimer in the documentation and/or other materials 
    17  *     provided with the distribution.
    18  *
    19  *   - Neither the names of the copyright holders nor the names of any
    20  *     of any contributors may be used to endorse or promote products
    21  *     derived from this software without specific prior written
    22  *     permission. 
    23  *
    24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    25  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    27  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    28  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    29  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    30  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    31  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    32  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    33  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    34  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    35  *
    36  **************************************************************************/
    37 
    38 #include <assert.h>
    39 #include <string>
    40 
    41 #include <unicode/uchar.h>
    42 
    43 #include <sherpa/utf8.hxx>
    44 #include <sherpa/LexLoc.hxx>
    45 #include <sherpa/utf8.hxx>
    46 
    47 #include "BUILD/BitcParser.hxx"
    48 
    49 using namespace sherpa;
    50 
    51 #include "SexprLexer.hxx"
    52 
    53 static bool
    54 valid_char_printable(uint32_t ucs4)
    55 {
    56   if (strchr("!#$%&`()*+-.,/:;<>=?@_|~^[]'", ucs4))
    57     return true;
    58   return false;
    59 }
    60 
    61 static bool
    62 valid_ident_punct(uint32_t ucs4)
    63 {
    64   if (strchr("!$%&*+-/<>=?@_~", ucs4))
    65     return true;
    66   return false;
    67 }
    68 
    69 static bool
    70 valid_ident_start(uint32_t ucs4)
    71 {
    72   return (u_hasBinaryProperty(ucs4,UCHAR_XID_START) || 
    73 	  valid_ident_punct(ucs4));
    74 }
    75 
    76 static bool
    77 valid_ident_continue(uint32_t ucs4)
    78 {
    79   return (u_hasBinaryProperty(ucs4,UCHAR_XID_CONTINUE) ||
    80 	  valid_ident_punct(ucs4));
    81 }
    82 
    83 static bool
    84 valid_ifident_start(uint32_t ucs4)
    85 {
    86   return (isalpha(ucs4) || ucs4 == '_');
    87   //  return (u_hasBinaryProperty(ucs4,UCHAR_XID_START));
    88 }
    89 
    90 static bool
    91 valid_ifident_continue(uint32_t ucs4)
    92 {
    93   return (isalpha(ucs4) || isdigit(ucs4) || ucs4 == '_' || ucs4 == '-');
    94   //  return (u_hasBinaryProperty(ucs4,UCHAR_XID_CONTINUE) ||
    95   //valid_ifident_punct(ucs4));
    96 }
    97 
    98 static bool
    99 valid_charpoint(uint32_t ucs4)
   100 {
   101   if (valid_char_printable(ucs4))
   102     return true;
   103 
   104   return u_isgraph(ucs4);
   105 }
   106 
   107 static bool
   108 valid_charpunct(uint32_t ucs4)
   109 {
   110   if (strchr("!\"#$%&'()*+,-./:;{}<=>?@[\\]^_`|~", ucs4))
   111     return true;
   112   return false;
   113 }
   114 
   115 static unsigned
   116 validate_string(const char *s)
   117 {
   118   const char *spos = s;
   119   uint32_t c;
   120 
   121   while (*spos) {
   122     const char *snext;
   123     c = sherpa::utf8_decode(spos, &snext); //&OK
   124 
   125     if (c == ' ') {		/* spaces are explicitly legal */
   126       spos = snext;
   127     }
   128     else if (c == '\\') {	/* escaped characters are legal */
   129       const char *escStart = spos;
   130       spos++;
   131       switch (*spos) {
   132       case 'n':
   133       case 't':
   134       case 'r':
   135       case 'b':
   136       case 's':
   137       case 'f':
   138       case '"':
   139       case '\\':
   140 	{
   141 	  spos++;
   142 	  break;
   143 	}
   144       case '{':
   145 	{
   146 	  if (*++spos != 'U')
   147 	    return (spos - s);
   148 	  if (*++spos != '+')
   149 	    return (spos - s);
   150 	  while (*++spos != '}')
   151 	    if (!isxdigit(*spos))
   152 	      return (spos - s);
   153 	}
   154 	spos++;
   155 	break;
   156       default:
   157 	// Bad escape sequence
   158 	return (escStart - s);
   159       }
   160     }
   161     else if (u_isgraph(c)) {
   162       spos = snext;
   163     }
   164     else return (spos - s);
   165   }
   166 
   167   return 0;
   168 }
   169 
   170 struct KeyWord {
   171   const char *nm;
   172   int  tokValue;
   173 } keywords[] = {
   174   { "and",              tk_AND },
   175   { "apply",            tk_APPLY },
   176   { "array",            tk_ARRAY },
   177   { "array-length",     tk_ARRAY_LENGTH },
   178   { "array-nth",        tk_ARRAY_NTH },
   179   { "as",               tk_AS },
   180   { "assert",           tk_Reserved },
   181   { "begin",            tk_BEGIN },
   182   { "bitc-version",     tk_BITC_VERSION },
   183   { "bitfield",         tk_BITFIELD },
   184   { "block",            tk_Reserved },
   185   { "bool",             tk_BOOL },
   186   { "break",            tk_Reserved },
   187   { "by-ref",           tk_BY_REF },
   188   { "case",             tk_CASE },
   189   { "catch",            tk_CATCH },
   190   { "char",             tk_CHAR },
   191   { "check",            tk_Reserved },
   192   { "coindset",         tk_Reserved },
   193   { "cond",             tk_COND },
   194   { "constrain",        tk_Reserved },
   195   { "continue",         tk_Reserved },
   196   { "declare",          tk_DECLARE },
   197   { "defequiv",         tk_Reserved },
   198   { "defexception",     tk_DEFEXCEPTION },
   199   { "define",           tk_DEFINE },
   200   { "definstance",      tk_DEFINSTANCE },
   201   { "definvariant",     tk_Reserved },
   202   { "defobject",        tk_Reserved },
   203   { "defrefine",        tk_Reserved },
   204   { "defrepr",          tk_DEFREPR },
   205   { "defstruct",        tk_DEFSTRUCT },
   206   { "deftheory",        tk_Reserved },
   207   { "defthm",           tk_DEFTHM },
   208   { "deftypeclass",     tk_DEFTYPECLASS },
   209   { "defunion",         tk_DEFUNION },
   210   { "defvariant",       tk_Reserved },
   211   { "deref",            tk_DEREF },
   212   { "disable",          tk_Reserved },
   213   { "do",               tk_DO },
   214   { "do*",              tk_Reserved },
   215   { "double",           tk_DOUBLE },
   216   { "dup",              tk_DUP },
   217   { "enable",           tk_Reserved },
   218   { "exception",        tk_EXCEPTION },
   219   { "external",         tk_EXTERNAL },
   220   { "fill",             tk_FILL },
   221   { "float",            tk_FLOAT },
   222   { "fn",               tk_FN },
   223   { "forall",           tk_FORALL },
   224   { "hide",             tk_HIDE },
   225   { "if",               tk_IF },
   226   { "import",           tk_IMPORT },
   227   { "import!",          tk_Reserved },
   228   { "indset",           tk_Reserved },
   229   { "inner-ref",        tk_INNER_REF },
   230   { "int16",            tk_INT16 },
   231   { "int32",            tk_INT32 },
   232   { "int64",            tk_INT64 },
   233   { "int8",             tk_INT8 },
   234   { "interface",        tk_INTERFACE },
   235   { "label",            tk_Reserved },
   236   { "lambda",           tk_LAMBDA },
   237   { "let",              tk_LET },
   238   { "let*",             tk_Reserved },
   239   { "letrec",           tk_LETREC },
   240   { "literal",          tk_Reserved },
   241   { "location",         tk_Reserved },
   242   //  { "make-vector",      tk_MAKE_VECTOR },
   243   { "make-vector",      tk_MAKE_VECTORL },
   244   { "member",           tk_MEMBER },   /* REDUNDANT */
   245   { "method",           tk_METHOD },
   246   { "module",           tk_MODULE },
   247   { "mutable",          tk_MUTABLE },
   248   { "namespace",        tk_Reserved },
   249   { "not",              tk_NOT },
   250   { "nth",              tk_Reserved },
   251   { "opaque",           tk_OPAQUE },
   252   { "or",               tk_OR },
   253   { "otherwise",        tk_OTHERWISE },
   254   { "proclaim",         tk_PROCLAIM },
   255   { "provide",          tk_PROVIDE },
   256   { "provide!",         tk_Reserved },
   257   { "quad",             tk_QUAD },
   258   { "read-only",        tk_Reserved },
   259   { "ref",              tk_REF },
   260   { "require",          tk_Reserved },
   261   { "reserved",         tk_RESERVED },
   262   { "return",           tk_Reserved },
   263   { "return-from",      tk_Reserved },
   264   { "sensory",          tk_Reserved },
   265   { "set!",             tk_SET },
   266   { "size-of",          tk_Reserved },
   267   { "string",           tk_STRING },
   268   { "super",            tk_Reserved },
   269   { "suspend",          tk_SUSPEND },  
   270   { "switch",           tk_SWITCH },
   271   { "tag",              tk_TAG },
   272   { "the",              tk_THE },
   273   { "throw",            tk_THROW },
   274   { "try",              tk_TRY },
   275   { "tycon",            tk_Reserved },
   276   { "tyfn",             tk_TYFN },
   277   { "uint16",           tk_UINT16 },
   278   { "uint32",           tk_UINT32 },
   279   { "uint64",           tk_UINT64 },
   280   { "uint8",            tk_UINT8 },
   281   { "using",            tk_Reserved },
   282   { "val",              tk_VAL },
   283   { "value-at",         tk_Reserved },
   284   { "vector",           tk_VECTOR },
   285   { "vector-length",    tk_VECTOR_LENGTH },
   286   { "vector-nth",       tk_VECTOR_NTH },
   287   { "when",             tk_WHEN },
   288   { "where",            tk_WHERE },
   289   { "word",             tk_WORD }
   290   
   291   //{ "immutable",        tk_IMMUTABLE },
   292   //{ "module",           tk_MODULE },
   293   //{ "mutual-recursion", tk_MUTUAL_RECURSION },
   294   //{ "nat",              tk_NAT },
   295   //{ "package",          tk_PACKAGE }, 
   296   //{ "restricted-ref",   tk_RESTRICTEDREF },
   297   //{ "sequence-length",  tk_SEQUENCELENGTH },
   298   //{ "sequence-ref",     tk_SEQUENCEREF }, /* REDUNDANT */
   299 
   300 };
   301 
   302 static int
   303 kwstrcmp(const void *vKey, const void *vCandidate)
   304 {
   305   const char *key = ((const KeyWord *) vKey)->nm;
   306   const char *candidate = ((const KeyWord *) vCandidate)->nm;
   307 
   308   return strcmp(key, candidate);
   309 }
   310 
   311 int
   312 SexprLexer::kwCheck(const char *s)
   313 {
   314   if (ifIdentMode) {
   315     if (!valid_ifident_start(*s))
   316       return tk_Reserved;
   317 
   318     for (++s; *s; s++)
   319       if (!valid_ifident_continue(*s))
   320 	return tk_Reserved;
   321 
   322     return tk_Ident;
   323   }
   324 
   325   KeyWord key = { s, 0 };
   326   KeyWord *entry = 
   327     (KeyWord *)bsearch(&key, keywords, // &OK
   328 		       sizeof(keywords)/sizeof(keywords[0]), 
   329 		       sizeof(keywords[0]), kwstrcmp);
   330 
   331   // If it is in the token table, return the indicated token type:
   332   if (entry)
   333     return entry->tokValue;
   334 
   335   // Otherwise, check for various reserved words:
   336 
   337   // Things starting with "__":
   338   if (s[0] == '_' && s[1] == '_') {
   339     if(!isRuntimeUoc)
   340       return tk_Reserved;
   341   }
   342 
   343   // Things starting with "def":
   344   if (s[0] == 'd' && s[1] == 'e' && s[2] == 'f')
   345     return tk_Reserved;
   346 
   347   // Things starting with "#":
   348   if (s[0] == '#')
   349     return tk_Reserved;
   350 
   351   return tk_Ident;
   352 }
   353 
   354 void
   355 SexprLexer::ReportParseError()
   356 {
   357   errStream << here
   358 	    << ": syntax error (via yyerror)" << '\n';
   359   num_errors++;
   360 }
   361  
   362 void
   363 SexprLexer::ReportParseError(const LexLoc& where, std::string msg)
   364 {
   365   errStream << where
   366 	    << ": "
   367 	    << msg << std::endl;
   368 
   369   num_errors++;
   370 }
   371 
   372 void
   373 SexprLexer::ReportParseWarning(const LexLoc& where, std::string msg)
   374 {
   375   errStream << where
   376 	    << ": "
   377 	    << msg << std::endl;
   378 }
   379 
   380 SexprLexer::SexprLexer(std::ostream& _err, std::istream& _in, 
   381 		       const std::string& origin,
   382 		       bool commandLineInput)
   383   :here(origin, 1, 0), inStream(_in), errStream(_err)
   384 {
   385   inStream.unsetf(std::ios_base::skipws);
   386 
   387   num_errors = 0;
   388   isRuntimeUoc = false;
   389   ifIdentMode = false;
   390   isCommandLineInput = commandLineInput;
   391   debug = false;
   392   putbackChar = -1;
   393   nModules = 0;
   394 }
   395 
   396 long 
   397 SexprLexer::digitValue(ucs4_t ucs4)
   398 {
   399   long l = -1;
   400 
   401   if (ucs4 >= '0' && ucs4 <= '9')
   402     l = ucs4 - '0';
   403   if (ucs4 >= 'a' && ucs4 <= 'f')
   404     l = ucs4 - 'a' + 10;
   405   if (ucs4 >= 'A' && ucs4 <= 'F')
   406     l = ucs4 - 'A' + 10;
   407 
   408   if (l > radix)
   409     l = -1;
   410   return l;
   411 }
   412 
   413 ucs4_t
   414 SexprLexer::getChar()
   415 {
   416   char utf[8];
   417   unsigned char c;
   418 
   419   long ucs4 = putbackChar;
   420 
   421   if (putbackChar != -1) {
   422     putbackChar = -1;
   423     utf8_encode(ucs4, utf);
   424     goto checkDigit;
   425   }
   426 
   427   memset(utf, 0, 8);
   428 
   429   utf[0] = inStream.get();
   430   c = utf[0];
   431   if (utf[0] == EOF)
   432     return EOF;
   433 
   434   if (c <= 127)
   435     goto done;
   436 
   437   utf[1] = inStream.get();
   438   if (utf[1] == EOF)
   439     return EOF;
   440 
   441   if (c <= 223)
   442     goto done;
   443 
   444   utf[2] = inStream.get();
   445   if (utf[2] == EOF)
   446     return EOF;
   447 
   448   if (c <= 239)
   449     goto done;
   450 
   451   utf[3] = inStream.get();
   452   if (utf[3] == EOF)
   453     return EOF;
   454 
   455   if (c <= 247)
   456     goto done;
   457  
   458   utf[4] = inStream.get();
   459   if (utf[4] == EOF)
   460     return EOF;
   461 
   462   if (c <= 251)
   463     goto done;
   464 
   465   utf[5] = inStream.get();
   466   if (utf[5] == EOF)
   467     return EOF;
   468 
   469  done:
   470   ucs4 = utf8_decode(utf, 0);
   471  checkDigit:
   472   thisToken += utf;
   473 
   474   if (digitValue(c) < radix)
   475     nDigits++;
   476 
   477   return ucs4;
   478 }
   479 
   480 void
   481 SexprLexer::ungetChar(ucs4_t c)
   482 {
   483   char utf[8];
   484   assert(putbackChar == -1);
   485   putbackChar = c;
   486 
   487   unsigned len = utf8_encode(c, utf);
   488   thisToken.erase( thisToken.length() - len);
   489 }
   490 
   491 static bool
   492 isCharDelimiter(ucs4_t c)
   493 {
   494   switch (c) {
   495   case ' ':
   496   case '\t':
   497   case '\n':
   498   case '\r':
   499   case ')':
   500     return true;
   501   default:
   502     return false;
   503   }
   504 }
   505 
   506 int
   507 SexprLexer::lex(ParseType *lvalp)
   508 {
   509   ucs4_t c;
   510 
   511  startOver:
   512   thisToken.erase();
   513   nDigits = 0;			// until otherwise proven
   514   radix = 10;			// until otherwise proven
   515 
   516   c = getChar();
   517 
   518   switch (c) {
   519   case ';':			// Comments
   520     do {
   521       c = getChar();
   522     } while (c != '\n' && c != '\r');
   523     ungetChar(c);
   524     // FALL THROUGH 
   525 
   526   case ' ':			// White space
   527   case '\t':
   528   case '\n':
   529   case '\r':
   530     here.updateWith(thisToken);
   531     goto startOver;
   532 
   533 
   534   case '.':			// Single character tokens
   535   case ',':
   536   case '[':
   537   case ']':
   538   case '(':
   539   case ')':
   540   case ':':
   541   case '^':
   542     lvalp->tok = LToken(here, thisToken);
   543     here.updateWith(thisToken);
   544     return c;
   545 
   546 
   547   case '"':			// String literal
   548     {
   549       do {
   550 	c = getChar();
   551 
   552 	if (c == '\\') {
   553 	  (void) getChar();	// just ignore it -- will validate later
   554 	}
   555       }	while (c != '"');
   556       
   557       unsigned badpos = validate_string(thisToken.c_str());
   558 
   559       if (badpos) {
   560 	LexLoc badHere = here;
   561 	badHere.offset += badpos;
   562 	errStream << badHere.asString()
   563 		  << ": Illegal (non-printing) character in string '"
   564 		  << thisToken << "'\n";
   565 	num_errors++;
   566       }
   567 
   568       LexLoc tokStart = here;
   569       here.updateWith(thisToken);
   570       lvalp->tok = LToken(here, thisToken.substr(1, thisToken.size()-2));
   571       return tk_String;
   572     }
   573 
   574 
   575   case '#':			// character and boolean literals
   576     {
   577       c = getChar();
   578       switch(c) {
   579       case 't':
   580 	lvalp->tok = LToken(here, thisToken);
   581 	here.updateWith(thisToken);
   582 	return tk_TRUE;
   583       case 'f':
   584 	lvalp->tok = LToken(here, thisToken);
   585 	here.updateWith(thisToken);
   586 	return tk_FALSE;
   587       case '\\':
   588 	{
   589 	  c = getChar();
   590 
   591 	  if (valid_charpunct(c)) {
   592 	    lvalp->tok = LToken(here, thisToken);
   593 	    here.updateWith(thisToken);
   594 	    return tk_Char;
   595 	  }
   596 	  else if (c == 'U') {
   597 	    c = getChar();
   598 	    if (c == '+') {
   599 	      radix = 16;
   600 	      do {
   601 		c = getChar();
   602 	      } while(digitValue(c) >= 0);
   603 	      
   604 	      if (!isCharDelimiter(c)) {
   605 		ungetChar(c);
   606 		return EOF;
   607 	      }
   608 	    }
   609 	    ungetChar(c);
   610 
   611 	    lvalp->tok = LToken(here, thisToken);
   612 	    here.updateWith(thisToken);
   613 	    return tk_Char;
   614 	  }
   615 	  else if (valid_charpoint(c)) {
   616 	    // Collect more characters in case this is a named character
   617 	    // literal.
   618 	    do {
   619 	      c = getChar();
   620 	    } while (isalpha(c));
   621 	    ungetChar(c);
   622 
   623 	    if (thisToken.size() == 3 || // '#' '\' CHAR
   624 		thisToken == "#\\space" ||
   625 		thisToken == "#\\linefeed" ||
   626 		thisToken == "#\\return" ||
   627 		thisToken == "#\\tab" ||
   628 		thisToken == "#\\backspace" ||
   629 		thisToken == "#\\lbrace" ||
   630 		thisToken == "#\\rbrace") {
   631 	      lvalp->tok = LToken(here, thisToken);
   632 	      here.updateWith(thisToken);
   633 	      return tk_Char;
   634 	    }
   635 	  }
   636 
   637 	  // FIX: this is bad input
   638 	  return EOF;
   639 	}
   640       default:
   641 	// FIX: this is bad input
   642 	return EOF;
   643       }
   644     }
   645 
   646   case '\'':			// Type variables
   647     {
   648       c = getChar();
   649       if (!valid_ident_start(c)) {
   650 	// FIX: this is bad input
   651 	ungetChar(c);
   652 	return EOF;
   653       }
   654 
   655       do {
   656 	c = getChar();
   657       } while (valid_ident_continue(c));
   658       ungetChar(c);
   659 
   660       here.updateWith(thisToken);
   661       lvalp->tok = LToken(here, thisToken);
   662       return tk_TypeVar;
   663     }
   664 
   665     // Hyphen requires special handling. If it is followed by a digit
   666     // then it is the beginning of a numeric literal, else it is part
   667     // of an identifier.
   668   case '-':
   669     c = getChar();
   670     if (digitValue(c) < 0) {
   671       ungetChar(c);
   672       goto identifier;
   673     }
   674 
   675     /* ELSE fall through to digit processing */
   676   case '0':
   677   case '1':
   678   case '2':
   679   case '3':
   680   case '4':
   681   case '5':
   682   case '6':
   683   case '7':
   684   case '8':
   685   case '9':
   686     {
   687       do {
   688 	c = getChar();
   689       } while (digitValue(c) >= 0);
   690 
   691       /* At this point we could either discover a radix marker 'r', or
   692 	 a decimal point (indicating a floating poing literal). If it
   693 	 is a radix marker, change the radix value here so that we
   694 	 match the succeeding digits in the correct radix. */
   695       if (c == 'r') {
   696 	radix = strtol(thisToken.c_str(), 0, 10);
   697 	if (radix < 0) radix = -radix; // leading sign not part of radix
   698 
   699 	long count = 0;
   700 	do {
   701 	  c = getChar();
   702 	  count++;
   703 	} while (digitValue(c) >= 0);
   704 	count--;
   705 	/* FIX: if count is 0, number is malformed */
   706       }
   707 
   708       /* We are either done with the literal, in which case it is an
   709 	 integer literal, or we are about to see a decimal point, in
   710 	 which case it is a floating point literal */
   711       if (c != '.') {
   712 	ungetChar(c);
   713 	lvalp->tok = LToken(here, thisToken);
   714 	here.updateWith(thisToken);
   715 	return tk_Int;
   716       }
   717 
   718       /* We have seen a decimal point, so from here on it must be a
   719 	 floating point literal. */
   720       {
   721 	long count = 0;
   722 	do {
   723 	  c = getChar();
   724 	  count++;
   725 	} while (digitValue(c) >= 0);
   726 	count--;
   727 	/* FIX: if count is 0, number is malformed */
   728       }
   729 
   730       /* We are either done with this token or we are looking at a '^'
   731 	 indicating start of an exponent. */
   732       if (c != '^') {
   733 	ungetChar(c);
   734 	lvalp->tok = LToken(here, thisToken);
   735 	here.updateWith(thisToken);
   736 	return tk_Float;
   737       }
   738 
   739       /* Need to collect the exponent. Revert to radix 10 until
   740 	 otherwise proven. */
   741       radix = 10;
   742       c = getChar();
   743 
   744       if (c != '-' && digitValue(c) < 0) {
   745 	// FIX: Malformed token
   746       }
   747 
   748       do {
   749 	c = getChar();
   750       } while (digitValue(c) >= 0);
   751 
   752       /* Check for radix marker on exponent */
   753       if (c == 'r') {
   754 	radix = strtol(thisToken.c_str(), 0, 10);
   755 	if (radix < 0) radix = -radix; // leading sign not part of radix
   756 
   757 	long count = 0;
   758 	do {
   759 	  c = getChar();
   760 	  count++;
   761 	} while (digitValue(c) >= 0);
   762 	count--;
   763 	/* FIX: if count is 0, number is malformed */
   764       }
   765 
   766       ungetChar(c);
   767       lvalp->tok = LToken(here, thisToken);
   768       here.updateWith(thisToken);
   769       return tk_Float;
   770     }
   771 
   772   case EOF:
   773     return EOF;
   774 
   775   default:
   776     if (valid_ident_start(c))
   777       goto identifier;
   778 
   779     // FIX: Malformed token
   780     return EOF;
   781   }
   782 
   783  identifier:
   784   do {
   785     c = getChar();
   786   } while (valid_ident_continue(c));
   787   ungetChar(c);
   788   lvalp->tok = LToken(here, thisToken);
   789   here.updateWith(thisToken);
   790   return kwCheck(thisToken.c_str());
   791 }