1 /**************************************************************************
3 * Copyright (C) 2008, Johns Hopkins University.
6 * Redistribution and use in source and binary forms, with or
7 * without modification, are permitted provided that the following
10 * - Redistributions of source code must contain the above
11 * copyright notice, this list of conditions, and the following
14 * - Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions, and the following
16 * disclaimer in the documentation and/or other materials
17 * provided with the distribution.
19 * - Neither the names of the copyright holders nor the names of any
20 * of any contributors may be used to endorse or promote products
21 * derived from this software without specific prior written
24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
28 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
29 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
30 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
31 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
32 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
33 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
34 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 **************************************************************************/
41 #include <unicode/uchar.h>
43 #include <sherpa/utf8.hxx>
44 #include <sherpa/LexLoc.hxx>
45 #include <sherpa/utf8.hxx>
47 #include "BUILD/BitcParser.hxx"
49 using namespace sherpa;
51 #include "SexprLexer.hxx"
54 valid_char_printable(uint32_t ucs4)
56 if (strchr("!#$%&`()*+-.,/:;<>=?@_|~^[]'", ucs4))
62 valid_ident_punct(uint32_t ucs4)
64 if (strchr("!$%&*+-/<>=?@_~", ucs4))
70 valid_ident_start(uint32_t ucs4)
72 return (u_hasBinaryProperty(ucs4,UCHAR_XID_START) ||
73 valid_ident_punct(ucs4));
77 valid_ident_continue(uint32_t ucs4)
79 return (u_hasBinaryProperty(ucs4,UCHAR_XID_CONTINUE) ||
80 valid_ident_punct(ucs4));
84 valid_ifident_start(uint32_t ucs4)
86 return (isalpha(ucs4) || ucs4 == '_');
87 // return (u_hasBinaryProperty(ucs4,UCHAR_XID_START));
91 valid_ifident_continue(uint32_t ucs4)
93 return (isalpha(ucs4) || isdigit(ucs4) || ucs4 == '_' || ucs4 == '-');
94 // return (u_hasBinaryProperty(ucs4,UCHAR_XID_CONTINUE) ||
95 //valid_ifident_punct(ucs4));
99 valid_charpoint(uint32_t ucs4)
101 if (valid_char_printable(ucs4))
104 return u_isgraph(ucs4);
108 valid_charpunct(uint32_t ucs4)
110 if (strchr("!\"#$%&'()*+,-./:;{}<=>?@[\\]^_`|~", ucs4))
116 validate_string(const char *s)
118 const char *spos = s;
123 c = sherpa::utf8_decode(spos, &snext); //&OK
125 if (c == ' ') { /* spaces are explicitly legal */
128 else if (c == '\\') { /* escaped characters are legal */
129 const char *escStart = spos;
150 while (*++spos != '}')
151 if (!isxdigit(*spos))
157 // Bad escape sequence
158 return (escStart - s);
161 else if (u_isgraph(c)) {
164 else return (spos - s);
175 { "apply", tk_APPLY },
176 { "array", tk_ARRAY },
177 { "array-length", tk_ARRAY_LENGTH },
178 { "array-nth", tk_ARRAY_NTH },
180 { "assert", tk_Reserved },
181 { "begin", tk_BEGIN },
182 { "bitc-version", tk_BITC_VERSION },
183 { "bitfield", tk_BITFIELD },
184 { "block", tk_Reserved },
186 { "break", tk_Reserved },
187 { "by-ref", tk_BY_REF },
189 { "catch", tk_CATCH },
191 { "check", tk_Reserved },
192 { "coindset", tk_Reserved },
194 { "constrain", tk_Reserved },
195 { "continue", tk_Reserved },
196 { "declare", tk_DECLARE },
197 { "defequiv", tk_Reserved },
198 { "defexception", tk_DEFEXCEPTION },
199 { "define", tk_DEFINE },
200 { "definstance", tk_DEFINSTANCE },
201 { "definvariant", tk_Reserved },
202 { "defobject", tk_Reserved },
203 { "defrefine", tk_Reserved },
204 { "defrepr", tk_DEFREPR },
205 { "defstruct", tk_DEFSTRUCT },
206 { "deftheory", tk_Reserved },
207 { "defthm", tk_DEFTHM },
208 { "deftypeclass", tk_DEFTYPECLASS },
209 { "defunion", tk_DEFUNION },
210 { "defvariant", tk_Reserved },
211 { "deref", tk_DEREF },
212 { "disable", tk_Reserved },
214 { "do*", tk_Reserved },
215 { "double", tk_DOUBLE },
217 { "enable", tk_Reserved },
218 { "exception", tk_EXCEPTION },
219 { "external", tk_EXTERNAL },
221 { "float", tk_FLOAT },
223 { "forall", tk_FORALL },
226 { "import", tk_IMPORT },
227 { "import!", tk_Reserved },
228 { "indset", tk_Reserved },
229 { "inner-ref", tk_INNER_REF },
230 { "int16", tk_INT16 },
231 { "int32", tk_INT32 },
232 { "int64", tk_INT64 },
234 { "interface", tk_INTERFACE },
235 { "label", tk_Reserved },
236 { "lambda", tk_LAMBDA },
238 { "let*", tk_Reserved },
239 { "letrec", tk_LETREC },
240 { "literal", tk_Reserved },
241 { "location", tk_Reserved },
242 // { "make-vector", tk_MAKE_VECTOR },
243 { "make-vector", tk_MAKE_VECTORL },
244 { "member", tk_MEMBER }, /* REDUNDANT */
245 { "method", tk_METHOD },
246 { "module", tk_MODULE },
247 { "mutable", tk_MUTABLE },
248 { "namespace", tk_Reserved },
250 { "nth", tk_Reserved },
251 { "opaque", tk_OPAQUE },
253 { "otherwise", tk_OTHERWISE },
254 { "proclaim", tk_PROCLAIM },
255 { "provide", tk_PROVIDE },
256 { "provide!", tk_Reserved },
258 { "read-only", tk_Reserved },
260 { "require", tk_Reserved },
261 { "reserved", tk_RESERVED },
262 { "return", tk_Reserved },
263 { "return-from", tk_Reserved },
264 { "sensory", tk_Reserved },
266 { "size-of", tk_Reserved },
267 { "string", tk_STRING },
268 { "super", tk_Reserved },
269 { "suspend", tk_SUSPEND },
270 { "switch", tk_SWITCH },
273 { "throw", tk_THROW },
275 { "tycon", tk_Reserved },
277 { "uint16", tk_UINT16 },
278 { "uint32", tk_UINT32 },
279 { "uint64", tk_UINT64 },
280 { "uint8", tk_UINT8 },
281 { "using", tk_Reserved },
283 { "value-at", tk_Reserved },
284 { "vector", tk_VECTOR },
285 { "vector-length", tk_VECTOR_LENGTH },
286 { "vector-nth", tk_VECTOR_NTH },
288 { "where", tk_WHERE },
291 //{ "immutable", tk_IMMUTABLE },
292 //{ "module", tk_MODULE },
293 //{ "mutual-recursion", tk_MUTUAL_RECURSION },
295 //{ "package", tk_PACKAGE },
296 //{ "restricted-ref", tk_RESTRICTEDREF },
297 //{ "sequence-length", tk_SEQUENCELENGTH },
298 //{ "sequence-ref", tk_SEQUENCEREF }, /* REDUNDANT */
303 kwstrcmp(const void *vKey, const void *vCandidate)
305 const char *key = ((const KeyWord *) vKey)->nm;
306 const char *candidate = ((const KeyWord *) vCandidate)->nm;
308 return strcmp(key, candidate);
312 SexprLexer::kwCheck(const char *s)
315 if (!valid_ifident_start(*s))
319 if (!valid_ifident_continue(*s))
325 KeyWord key = { s, 0 };
327 (KeyWord *)bsearch(&key, keywords, // &OK
328 sizeof(keywords)/sizeof(keywords[0]),
329 sizeof(keywords[0]), kwstrcmp);
331 // If it is in the token table, return the indicated token type:
333 return entry->tokValue;
335 // Otherwise, check for various reserved words:
337 // Things starting with "__":
338 if (s[0] == '_' && s[1] == '_') {
343 // Things starting with "def":
344 if (s[0] == 'd' && s[1] == 'e' && s[2] == 'f')
347 // Things starting with "#":
355 SexprLexer::ReportParseError()
358 << ": syntax error (via yyerror)" << '\n';
363 SexprLexer::ReportParseError(const LexLoc& where, std::string msg)
373 SexprLexer::ReportParseWarning(const LexLoc& where, std::string msg)
380 SexprLexer::SexprLexer(std::ostream& _err, std::istream& _in,
381 const std::string& origin,
382 bool commandLineInput)
383 :here(origin, 1, 0), inStream(_in), errStream(_err)
385 inStream.unsetf(std::ios_base::skipws);
388 isRuntimeUoc = false;
390 isCommandLineInput = commandLineInput;
397 SexprLexer::digitValue(ucs4_t ucs4)
401 if (ucs4 >= '0' && ucs4 <= '9')
403 if (ucs4 >= 'a' && ucs4 <= 'f')
405 if (ucs4 >= 'A' && ucs4 <= 'F')
414 SexprLexer::getChar()
419 long ucs4 = putbackChar;
421 if (putbackChar != -1) {
423 utf8_encode(ucs4, utf);
429 utf[0] = inStream.get();
437 utf[1] = inStream.get();
444 utf[2] = inStream.get();
451 utf[3] = inStream.get();
458 utf[4] = inStream.get();
465 utf[5] = inStream.get();
470 ucs4 = utf8_decode(utf, 0);
474 if (digitValue(c) < radix)
481 SexprLexer::ungetChar(ucs4_t c)
484 assert(putbackChar == -1);
487 unsigned len = utf8_encode(c, utf);
488 thisToken.erase( thisToken.length() - len);
492 isCharDelimiter(ucs4_t c)
507 SexprLexer::lex(ParseType *lvalp)
513 nDigits = 0; // until otherwise proven
514 radix = 10; // until otherwise proven
519 case ';': // Comments
522 } while (c != '\n' && c != '\r');
526 case ' ': // White space
530 here.updateWith(thisToken);
534 case '.': // Single character tokens
542 lvalp->tok = LToken(here, thisToken);
543 here.updateWith(thisToken);
547 case '"': // String literal
553 (void) getChar(); // just ignore it -- will validate later
557 unsigned badpos = validate_string(thisToken.c_str());
560 LexLoc badHere = here;
561 badHere.offset += badpos;
562 errStream << badHere.asString()
563 << ": Illegal (non-printing) character in string '"
564 << thisToken << "'\n";
568 LexLoc tokStart = here;
569 here.updateWith(thisToken);
570 lvalp->tok = LToken(here, thisToken.substr(1, thisToken.size()-2));
575 case '#': // character and boolean literals
580 lvalp->tok = LToken(here, thisToken);
581 here.updateWith(thisToken);
584 lvalp->tok = LToken(here, thisToken);
585 here.updateWith(thisToken);
591 if (valid_charpunct(c)) {
592 lvalp->tok = LToken(here, thisToken);
593 here.updateWith(thisToken);
602 } while(digitValue(c) >= 0);
604 if (!isCharDelimiter(c)) {
611 lvalp->tok = LToken(here, thisToken);
612 here.updateWith(thisToken);
615 else if (valid_charpoint(c)) {
616 // Collect more characters in case this is a named character
620 } while (isalpha(c));
623 if (thisToken.size() == 3 || // '#' '\' CHAR
624 thisToken == "#\\space" ||
625 thisToken == "#\\linefeed" ||
626 thisToken == "#\\return" ||
627 thisToken == "#\\tab" ||
628 thisToken == "#\\backspace" ||
629 thisToken == "#\\lbrace" ||
630 thisToken == "#\\rbrace") {
631 lvalp->tok = LToken(here, thisToken);
632 here.updateWith(thisToken);
637 // FIX: this is bad input
641 // FIX: this is bad input
646 case '\'': // Type variables
649 if (!valid_ident_start(c)) {
650 // FIX: this is bad input
657 } while (valid_ident_continue(c));
660 here.updateWith(thisToken);
661 lvalp->tok = LToken(here, thisToken);
665 // Hyphen requires special handling. If it is followed by a digit
666 // then it is the beginning of a numeric literal, else it is part
670 if (digitValue(c) < 0) {
675 /* ELSE fall through to digit processing */
689 } while (digitValue(c) >= 0);
691 /* At this point we could either discover a radix marker 'r', or
692 a decimal point (indicating a floating poing literal). If it
693 is a radix marker, change the radix value here so that we
694 match the succeeding digits in the correct radix. */
696 radix = strtol(thisToken.c_str(), 0, 10);
697 if (radix < 0) radix = -radix; // leading sign not part of radix
703 } while (digitValue(c) >= 0);
705 /* FIX: if count is 0, number is malformed */
708 /* We are either done with the literal, in which case it is an
709 integer literal, or we are about to see a decimal point, in
710 which case it is a floating point literal */
713 lvalp->tok = LToken(here, thisToken);
714 here.updateWith(thisToken);
718 /* We have seen a decimal point, so from here on it must be a
719 floating point literal. */
725 } while (digitValue(c) >= 0);
727 /* FIX: if count is 0, number is malformed */
730 /* We are either done with this token or we are looking at a '^'
731 indicating start of an exponent. */
734 lvalp->tok = LToken(here, thisToken);
735 here.updateWith(thisToken);
739 /* Need to collect the exponent. Revert to radix 10 until
744 if (c != '-' && digitValue(c) < 0) {
745 // FIX: Malformed token
750 } while (digitValue(c) >= 0);
752 /* Check for radix marker on exponent */
754 radix = strtol(thisToken.c_str(), 0, 10);
755 if (radix < 0) radix = -radix; // leading sign not part of radix
761 } while (digitValue(c) >= 0);
763 /* FIX: if count is 0, number is malformed */
767 lvalp->tok = LToken(here, thisToken);
768 here.updateWith(thisToken);
776 if (valid_ident_start(c))
779 // FIX: Malformed token
786 } while (valid_ident_continue(c));
788 lvalp->tok = LToken(here, thisToken);
789 here.updateWith(thisToken);
790 return kwCheck(thisToken.c_str());