diff options
Diffstat (limited to 'tokenize.c')
| -rw-r--r-- | tokenize.c | 331 |
1 files changed, 175 insertions, 156 deletions
@@ -121,6 +121,42 @@ const char *show_string(const struct string *string) return buffer; } +static const char *show_char(const char *s, size_t len, char prefix, char delim) +{ + static char buffer[MAX_STRING + 4]; + char *p = buffer; + if (prefix) + *p++ = prefix; + *p++ = delim; + memcpy(p, s, len); + p += len; + *p++ = delim; + *p++ = '\0'; + return buffer; +} + +static const char *quote_char(const char *s, size_t len, char prefix, char delim) +{ + static char buffer[2*MAX_STRING + 6]; + size_t i; + char *p = buffer; + if (prefix) + *p++ = prefix; + if (delim == '"') + *p++ = '\\'; + *p++ = delim; + for (i = 0; i < len; i++) { + if (s[i] == '"' || s[i] == '\\') + *p++ = '\\'; + *p++ = s[i]; + } + if (delim == '"') + *p++ = '\\'; + *p++ = delim; + *p++ = '\0'; + return buffer; +} + const char *show_token(const struct token *token) { static char buffer[256]; @@ -137,10 +173,6 @@ const char *show_token(const struct token *token) case TOKEN_IDENT: return show_ident(token->ident); - case TOKEN_STRING: - case TOKEN_WIDE_STRING: - return show_string(token->string); - case TOKEN_NUMBER: return token->number; @@ -148,15 +180,23 @@ const char *show_token(const struct token *token) return show_special(token->special); case TOKEN_CHAR: - case TOKEN_WIDE_CHAR: { - char *ptr = buffer; - int c = token->character; - *ptr++ = '\''; - ptr = charstr(ptr, c, '\'', 0); - *ptr++ = '\''; - *ptr++ = '\0'; - return buffer; - } + return show_char(token->string->data, + token->string->length - 1, 0, '\''); + case TOKEN_CHAR+1 ... TOKEN_CHAR+4: + return show_char(token->embedded, + token_type(token) - TOKEN_CHAR, 0, '\''); + case TOKEN_WIDE_CHAR: + return show_char(token->string->data, + token->string->length - 1, 'L', '\''); + case TOKEN_WIDE_CHAR+1 ... TOKEN_WIDE_CHAR+4: + return show_char(token->embedded, + token_type(token) - TOKEN_WIDE_CHAR, 'L', '\''); + case TOKEN_STRING: + return show_char(token->string->data, + token->string->length - 1, 0, '"'); + case TOKEN_WIDE_STRING: + return show_char(token->string->data, + token->string->length - 1, 'L', '"'); case TOKEN_STREAMBEGIN: sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream)); @@ -180,6 +220,47 @@ const char *show_token(const struct token *token) } } +const char *quote_token(const struct token *token) +{ + static char buffer[256]; + + switch (token_type(token)) { + case TOKEN_ERROR: + return "syntax error"; + + case TOKEN_IDENT: + return show_ident(token->ident); + + case TOKEN_NUMBER: + return token->number; + + case TOKEN_SPECIAL: + return show_special(token->special); + + case TOKEN_CHAR: + return quote_char(token->string->data, + token->string->length - 1, 0, '\''); + case TOKEN_CHAR+1 ... TOKEN_CHAR+4: + return quote_char(token->embedded, + token_type(token) - TOKEN_CHAR, 0, '\''); + case TOKEN_WIDE_CHAR: + return quote_char(token->string->data, + token->string->length - 1, 'L', '\''); + case TOKEN_WIDE_CHAR+1 ... TOKEN_WIDE_CHAR+4: + return quote_char(token->embedded, + token_type(token) - TOKEN_WIDE_CHAR, 'L', '\''); + case TOKEN_STRING: + return quote_char(token->string->data, + token->string->length - 1, 0, '"'); + case TOKEN_WIDE_STRING: + return quote_char(token->string->data, + token->string->length - 1, 'L', '"'); + default: + sprintf(buffer, "unhandled token type '%d' ", token_type(token)); + return buffer; + } +} + #define HASHED_INPUT_BITS (6) #define HASHED_INPUT (1 << HASHED_INPUT_BITS) #define HASH_PRIME 0x9e370001UL @@ -384,22 +465,35 @@ enum { Dot = 16, ValidSecond = 32, Quote = 64, + Escape = 128, }; static const long cclass[257] = { - ['0' + 1 ... '9' + 1] = Digit | Hex, + ['0' + 1 ... '7' + 1] = Digit | Hex | Escape, /* \<octal> */ + ['8' + 1 ... '9' + 1] = Digit | Hex, ['A' + 1 ... 'D' + 1] = Letter | Hex, - ['E' + 1] = Letter | Hex | Exp, + ['E' + 1] = Letter | Hex | Exp, /* E<exp> */ ['F' + 1] = Letter | Hex, ['G' + 1 ... 'O' + 1] = Letter, - ['P' + 1] = Letter | Exp, + ['P' + 1] = Letter | Exp, /* P<exp> */ ['Q' + 1 ... 'Z' + 1] = Letter, - ['a' + 1 ... 'd' + 1] = Letter | Hex, - ['e' + 1] = Letter | Hex | Exp, - ['f' + 1] = Letter | Hex, - ['g' + 1 ... 'o' + 1] = Letter, - ['p' + 1] = Letter | Exp, - ['q' + 1 ... 'z' + 1] = Letter, + ['a' + 1 ... 'b' + 1] = Letter | Hex | Escape, /* \a, \b */ + ['c' + 1 ... 'd' + 1] = Letter | Hex, + ['e' + 1] = Letter | Hex | Exp | Escape,/* \e, e<exp> */ + ['f' + 1] = Letter | Hex | Escape, /* \f */ + ['g' + 1 ... 'm' + 1] = Letter, + ['n' + 1] = Letter | Escape, /* \n */ + ['o' + 1] = Letter, + ['p' + 1] = Letter | Exp, /* p<exp> */ + ['q' + 1] = Letter, + ['r' + 1] = Letter | Escape, /* \r */ + ['s' + 1] = Letter, + ['t' + 1] = Letter | Escape, /* \t */ + ['u' + 1] = Letter, + ['v' + 1] = Letter | Escape, /* \v */ + ['w' + 1] = Letter, + ['x' + 1] = Letter | Escape, /* \x<hex> */ + ['y' + 1 ... 'z' + 1] = Letter, ['_' + 1] = Letter, ['.' + 1] = Dot | ValidSecond, ['=' + 1] = ValidSecond, @@ -410,8 +504,10 @@ static const long cclass[257] = { ['&' + 1] = ValidSecond, ['|' + 1] = ValidSecond, ['#' + 1] = ValidSecond, - ['\'' + 1] = Quote, - ['"' + 1] = Quote, + ['\'' + 1] = Quote | Escape, + ['"' + 1] = Quote | Escape, + ['\\' + 1] = Escape, + ['?' + 1] = Escape, }; /* @@ -471,151 +567,74 @@ static int get_one_number(int c, int next, stream_t *stream) return next; } -static int escapechar(int first, int type, stream_t *stream, int *valp) -{ - int next, value; - - next = nextchar(stream); - value = first; - - if (first == '\n') - warning(stream_pos(stream), "Newline in string or character constant"); - - if (first == '\\' && next != EOF) { - value = next; - next = nextchar(stream); - if (value != type) { - switch (value) { - case 'a': - value = '\a'; - break; - case 'b': - value = '\b'; - break; - case 't': - value = '\t'; - break; - case 'n': - value = '\n'; - break; - case 'v': - value = '\v'; - break; - case 'f': - value = '\f'; - break; - case 'r': - value = '\r'; - break; - case 'e': - value = '\e'; - break; - case '\\': - break; - case '?': - break; - case '\'': - break; - case '"': - break; - case '\n': - warning(stream_pos(stream), "Newline in string or character constant"); - break; - case '0'...'7': { - int nr = 2; - value -= '0'; - while (next >= '0' && next <= '7') { - value = (value << 3) + (next-'0'); - next = nextchar(stream); - if (!--nr) - break; - } - value &= 0xff; - break; - } - case 'x': { - int hex = hexval(next); - if (hex < 16) { - value = hex; - next = nextchar(stream); - while ((hex = hexval(next)) < 16) { - value = (value << 4) + hex; - next = nextchar(stream); - } - value &= 0xff; - break; - } - } - /* Fall through */ - default: - warning(stream_pos(stream), "Unknown escape '%c'", value); - } - } - /* Mark it as escaped */ - value |= 0x100; - } - *valp = value; - return next; -} - -static int get_char_token(int next, stream_t *stream, enum token_type type) -{ - int value; - struct token *token; - - next = escapechar(next, '\'', stream, &value); - if (value == '\'' || next != '\'') { - sparse_error(stream_pos(stream), "Bad character constant"); - drop_token(stream); - return next; - } - - token = stream->token; - token_type(token) = type; - token->character = value & 0xff; - - add_token(stream); - return nextchar(stream); -} - -static int get_string_token(int next, stream_t *stream, enum token_type type) +static int eat_string(int next, stream_t *stream, enum token_type type) { static char buffer[MAX_STRING]; struct string *string; - struct token *token; + struct token *token = stream->token; int len = 0; + int escape; + int want_hex = 0; + char delim = type < TOKEN_STRING ? '\'' : '"'; - for (;;) { - int val; - next = escapechar(next, '"', stream, &val); - if (val == '"') - break; + for (escape = 0; escape || next != delim; next = nextchar(stream)) { + if (len < MAX_STRING) + buffer[len] = next; + len++; + if (next == '\n') { + warning(stream_pos(stream), + "Newline in string or character constant"); + if (delim == '\'') /* assume it's lost ' */ + break; + } if (next == EOF) { - warning(stream_pos(stream), "End of file in middle of string"); + warning(stream_pos(stream), + "End of file in middle of string"); return next; } - if (len < MAX_STRING) - buffer[len] = val; - len++; + if (!escape) { + if (want_hex && !(cclass[next + 1] & Hex)) + warning(stream_pos(stream), + "\\x used with no following hex digits"); + want_hex = 0; + escape = next == '\\'; + } else { + if (!(cclass[next + 1] & Escape)) + warning(stream_pos(stream), + "Unknown escape '%c'", next); + escape = 0; + want_hex = next == 'x'; + } } - + if (want_hex) + warning(stream_pos(stream), + "\\x used with no following hex digits"); if (len > MAX_STRING) { warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING); len = MAX_STRING; } - - string = __alloc_string(len+1); - memcpy(string->data, buffer, len); - string->data[len] = '\0'; - string->length = len+1; + if (delim == '\'' && len <= 4) { + if (len == 0) { + sparse_error(stream_pos(stream), + "empty character constant"); + return nextchar(stream); + } + token_type(token) = type + len; + memset(buffer + len, '\0', 4 - len); + memcpy(token->embedded, buffer, 4); + } else { + token_type(token) = type; + string = __alloc_string(len+1); + memcpy(string->data, buffer, len); + string->data[len] = '\0'; + string->length = len+1; + token->string = string; + } /* Pass it on.. */ token = stream->token; - token_type(token) = type; - token->string = string; add_token(stream); - - return next; + return nextchar(stream); } static int drop_stream_eoln(stream_t *stream) @@ -731,9 +750,9 @@ static int get_one_special(int c, stream_t *stream) return get_one_number(c, next, stream); break; case '"': - return get_string_token(next, stream, TOKEN_STRING); + return eat_string(next, stream, TOKEN_STRING); case '\'': - return get_char_token(next, stream, TOKEN_CHAR); + return eat_string(next, stream, TOKEN_CHAR); case '/': if (next == '/') return drop_stream_eoln(stream); @@ -910,10 +929,10 @@ static int get_one_identifier(int c, stream_t *stream) if (cclass[next + 1] & Quote) { if (len == 1 && buf[0] == 'L') { if (next == '\'') - return get_char_token(nextchar(stream), stream, + return eat_string(nextchar(stream), stream, TOKEN_WIDE_CHAR); else - return get_string_token(nextchar(stream), stream, + return eat_string(nextchar(stream), stream, TOKEN_WIDE_STRING); } } |
