about | summary | refs | log | tree | commit | diff | stats | homepage
path: root/tokenize.c
diff options
Diffstat (limited to 'tokenize.c')
-rw-r--r-- tokenize.c 331
1 files changed, 175 insertions, 156 deletions
diff --git a/tokenize.c b/tokenize.c
index 42630212..95f308e0 100644
--- a/tokenize.c
+++ b/tokenize.c
@@ -121,6 +121,42 @@ const char *show_string(const struct string *string)
return buffer;
}
+static const char *show_char(const char *s, size_t len, char prefix, char delim)
+{ /* Render the len bytes at s as <prefix><delim>bytes<delim>, NUL-terminated, in a static buffer that every call overwrites. */
+ static char buffer[MAX_STRING + 4]; /* the extra 4 covers prefix, two delimiters and the NUL */
+ char *p = buffer;
+ if (prefix) /* a prefix of 0 means no prefix character is emitted */
+ *p++ = prefix;
+ *p++ = delim;
+ memcpy(p, s, len); /* bytes are copied verbatim, no escaping; assumes len <= MAX_STRING - TODO confirm at callers */
+ p += len;
+ *p++ = delim;
+ *p++ = '\0';
+ return buffer; /* caller must use or copy the result before the next call */
+}
+
+static const char *quote_char(const char *s, size_t len, char prefix, char delim)
+{ /* Same layout as show_char, but double quotes and backslashes are backslash-escaped. NOTE(review): looks meant for embedding the text inside a string literal - confirm at callers. */
+ static char buffer[2*MAX_STRING + 6]; /* worst case: every byte escaped (2*len) + prefix + two escaped delimiters (4) + NUL */
+ size_t i;
+ char *p = buffer;
+ if (prefix) /* a prefix of 0 means no prefix character is emitted */
+ *p++ = prefix;
+ if (delim == '"') /* a double-quote delimiter is itself escaped; a single-quote delimiter is not */
+ *p++ = '\\';
+ *p++ = delim;
+ for (i = 0; i < len; i++) {
+ if (s[i] == '"' || s[i] == '\\') /* only these two bytes get a backslash in front */
+ *p++ = '\\';
+ *p++ = s[i];
+ }
+ if (delim == '"') /* closing delimiter is escaped the same way as the opening one */
+ *p++ = '\\';
+ *p++ = delim;
+ *p++ = '\0';
+ return buffer; /* static storage, overwritten by the next call */
+}
+
const char *show_token(const struct token *token)
{
static char buffer[256];
@@ -137,10 +173,6 @@ const char *show_token(const struct token *token)
case TOKEN_IDENT:
return show_ident(token->ident);
- case TOKEN_STRING:
- case TOKEN_WIDE_STRING:
- return show_string(token->string);
-
case TOKEN_NUMBER:
return token->number;
@@ -148,15 +180,23 @@ const char *show_token(const struct token *token)
return show_special(token->special);
case TOKEN_CHAR:
- case TOKEN_WIDE_CHAR: {
- char *ptr = buffer;
- int c = token->character;
- *ptr++ = '\'';
- ptr = charstr(ptr, c, '\'', 0);
- *ptr++ = '\'';
- *ptr++ = '\0';
- return buffer;
- }
+ return show_char(token->string->data,
+ token->string->length - 1, 0, '\'');
+ case TOKEN_CHAR+1 ... TOKEN_CHAR+4:
+ return show_char(token->embedded,
+ token_type(token) - TOKEN_CHAR, 0, '\'');
+ case TOKEN_WIDE_CHAR:
+ return show_char(token->string->data,
+ token->string->length - 1, 'L', '\'');
+ case TOKEN_WIDE_CHAR+1 ... TOKEN_WIDE_CHAR+4:
+ return show_char(token->embedded,
+ token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
+ case TOKEN_STRING:
+ return show_char(token->string->data,
+ token->string->length - 1, 0, '"');
+ case TOKEN_WIDE_STRING:
+ return show_char(token->string->data,
+ token->string->length - 1, 'L', '"');
case TOKEN_STREAMBEGIN:
sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
@@ -180,6 +220,47 @@ const char *show_token(const struct token *token)
}
}
+const char *quote_token(const struct token *token)
+{ /* Return a printable form of token; character and string literals go through quote_char, other handled types reuse the show_* helpers. */
+ static char buffer[256]; /* written only by the default case below */
+
+ switch (token_type(token)) {
+ case TOKEN_ERROR:
+ return "syntax error";
+
+ case TOKEN_IDENT:
+ return show_ident(token->ident);
+
+ case TOKEN_NUMBER:
+ return token->number; /* the stored number text is returned as is */
+
+ case TOKEN_SPECIAL:
+ return show_special(token->special);
+
+ case TOKEN_CHAR: /* character constant whose text is held in token->string */
+ return quote_char(token->string->data,
+ token->string->length - 1, 0, '\''); /* eat_string sets length to len+1 to count the NUL, so pass one less */
+ case TOKEN_CHAR+1 ... TOKEN_CHAR+4: /* GNU case range: short constants kept inline in token->embedded */
+ return quote_char(token->embedded,
+ token_type(token) - TOKEN_CHAR, 0, '\''); /* offset from TOKEN_CHAR is the inline byte count (eat_string stores type + len) */
+ case TOKEN_WIDE_CHAR: /* wide variants differ only by the L prefix */
+ return quote_char(token->string->data,
+ token->string->length - 1, 'L', '\'');
+ case TOKEN_WIDE_CHAR+1 ... TOKEN_WIDE_CHAR+4:
+ return quote_char(token->embedded,
+ token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
+ case TOKEN_STRING:
+ return quote_char(token->string->data,
+ token->string->length - 1, 0, '"');
+ case TOKEN_WIDE_STRING:
+ return quote_char(token->string->data,
+ token->string->length - 1, 'L', '"');
+ default: /* any other token type is reported by its numeric value */
+ sprintf(buffer, "unhandled token type '%d' ", token_type(token));
+ return buffer;
+ }
+}
+
#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
#define HASH_PRIME 0x9e370001UL
@@ -384,22 +465,35 @@ enum {
Dot = 16,
ValidSecond = 32,
Quote = 64,
+ Escape = 128,
};
static const long cclass[257] = {
- ['0' + 1 ... '9' + 1] = Digit | Hex,
+ ['0' + 1 ... '7' + 1] = Digit | Hex | Escape, /* \<octal> */
+ ['8' + 1 ... '9' + 1] = Digit | Hex,
['A' + 1 ... 'D' + 1] = Letter | Hex,
- ['E' + 1] = Letter | Hex | Exp,
+ ['E' + 1] = Letter | Hex | Exp, /* E<exp> */
['F' + 1] = Letter | Hex,
['G' + 1 ... 'O' + 1] = Letter,
- ['P' + 1] = Letter | Exp,
+ ['P' + 1] = Letter | Exp, /* P<exp> */
['Q' + 1 ... 'Z' + 1] = Letter,
- ['a' + 1 ... 'd' + 1] = Letter | Hex,
- ['e' + 1] = Letter | Hex | Exp,
- ['f' + 1] = Letter | Hex,
- ['g' + 1 ... 'o' + 1] = Letter,
- ['p' + 1] = Letter | Exp,
- ['q' + 1 ... 'z' + 1] = Letter,
+ ['a' + 1 ... 'b' + 1] = Letter | Hex | Escape, /* \a, \b */
+ ['c' + 1 ... 'd' + 1] = Letter | Hex,
+ ['e' + 1] = Letter | Hex | Exp | Escape,/* \e, e<exp> */
+ ['f' + 1] = Letter | Hex | Escape, /* \f */
+ ['g' + 1 ... 'm' + 1] = Letter,
+ ['n' + 1] = Letter | Escape, /* \n */
+ ['o' + 1] = Letter,
+ ['p' + 1] = Letter | Exp, /* p<exp> */
+ ['q' + 1] = Letter,
+ ['r' + 1] = Letter | Escape, /* \r */
+ ['s' + 1] = Letter,
+ ['t' + 1] = Letter | Escape, /* \t */
+ ['u' + 1] = Letter,
+ ['v' + 1] = Letter | Escape, /* \v */
+ ['w' + 1] = Letter,
+ ['x' + 1] = Letter | Escape, /* \x<hex> */
+ ['y' + 1 ... 'z' + 1] = Letter,
['_' + 1] = Letter,
['.' + 1] = Dot | ValidSecond,
['=' + 1] = ValidSecond,
@@ -410,8 +504,10 @@ static const long cclass[257] = {
['&' + 1] = ValidSecond,
['|' + 1] = ValidSecond,
['#' + 1] = ValidSecond,
- ['\'' + 1] = Quote,
- ['"' + 1] = Quote,
+ ['\'' + 1] = Quote | Escape,
+ ['"' + 1] = Quote | Escape,
+ ['\\' + 1] = Escape,
+ ['?' + 1] = Escape,
};
/*
@@ -471,151 +567,74 @@ static int get_one_number(int c, int next, stream_t *stream)
return next;
}
-static int escapechar(int first, int type, stream_t *stream, int *valp)
-{
- int next, value;
-
- next = nextchar(stream);
- value = first;
-
- if (first == '\n')
- warning(stream_pos(stream), "Newline in string or character constant");
-
- if (first == '\\' && next != EOF) {
- value = next;
- next = nextchar(stream);
- if (value != type) {
- switch (value) {
- case 'a':
- value = '\a';
- break;
- case 'b':
- value = '\b';
- break;
- case 't':
- value = '\t';
- break;
- case 'n':
- value = '\n';
- break;
- case 'v':
- value = '\v';
- break;
- case 'f':
- value = '\f';
- break;
- case 'r':
- value = '\r';
- break;
- case 'e':
- value = '\e';
- break;
- case '\\':
- break;
- case '?':
- break;
- case '\'':
- break;
- case '"':
- break;
- case '\n':
- warning(stream_pos(stream), "Newline in string or character constant");
- break;
- case '0'...'7': {
- int nr = 2;
- value -= '0';
- while (next >= '0' && next <= '7') {
- value = (value << 3) + (next-'0');
- next = nextchar(stream);
- if (!--nr)
- break;
- }
- value &= 0xff;
- break;
- }
- case 'x': {
- int hex = hexval(next);
- if (hex < 16) {
- value = hex;
- next = nextchar(stream);
- while ((hex = hexval(next)) < 16) {
- value = (value << 4) + hex;
- next = nextchar(stream);
- }
- value &= 0xff;
- break;
- }
- }
- /* Fall through */
- default:
- warning(stream_pos(stream), "Unknown escape '%c'", value);
- }
- }
- /* Mark it as escaped */
- value |= 0x100;
- }
- *valp = value;
- return next;
-}
-
-static int get_char_token(int next, stream_t *stream, enum token_type type)
-{
- int value;
- struct token *token;
-
- next = escapechar(next, '\'', stream, &value);
- if (value == '\'' || next != '\'') {
- sparse_error(stream_pos(stream), "Bad character constant");
- drop_token(stream);
- return next;
- }
-
- token = stream->token;
- token_type(token) = type;
- token->character = value & 0xff;
-
- add_token(stream);
- return nextchar(stream);
-}
-
-static int get_string_token(int next, stream_t *stream, enum token_type type)
+static int eat_string(int next, stream_t *stream, enum token_type type) /* Scan the body of a string or character literal starting at next, store its raw text in the current token, and return the character after the literal. */
{
static char buffer[MAX_STRING];
struct string *string;
- struct token *token;
+ struct token *token = stream->token;
int len = 0;
+ int escape; /* nonzero while the previous character was an unescaped backslash */
+ int want_hex = 0; /* nonzero right after a backslash-x, until the following character has been checked */
+ char delim = type < TOKEN_STRING ? '\'' : '"'; /* types below TOKEN_STRING are treated as character constants */
- for (;;) {
- int val;
- next = escapechar(next, '"', stream, &val);
- if (val == '"')
- break;
+ for (escape = 0; escape || next != delim; next = nextchar(stream)) { /* stop at the first delimiter not preceded by a backslash */
+ if (len < MAX_STRING) /* characters are stored raw; escapes are only validated here, not decoded */
+ buffer[len] = next;
+ len++; /* keeps counting past MAX_STRING so the overflow can be reported after the loop */
+ if (next == '\n') {
+ warning(stream_pos(stream),
+ "Newline in string or character constant");
+ if (delim == '\'') /* assume it's lost ' */
+ break;
+ }
if (next == EOF) {
- warning(stream_pos(stream), "End of file in middle of string");
+ warning(stream_pos(stream),
+ "End of file in middle of string");
return next;
}
- if (len < MAX_STRING)
- buffer[len] = val;
- len++;
+ if (!escape) { /* ordinary position: finish any pending hex check, then look for a backslash */
+ if (want_hex && !(cclass[next + 1] & Hex))
+ warning(stream_pos(stream),
+ "\\x used with no following hex digits");
+ want_hex = 0;
+ escape = next == '\\';
+ } else { /* character right after a backslash: must carry the Escape class in cclass */
+ if (!(cclass[next + 1] & Escape))
+ warning(stream_pos(stream),
+ "Unknown escape '%c'", next);
+ escape = 0;
+ want_hex = next == 'x';
+ }
}
-
+ if (want_hex) /* the literal ended immediately after a backslash-x */
+ warning(stream_pos(stream),
+ "\\x used with no following hex digits");
if (len > MAX_STRING) {
warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
len = MAX_STRING;
}
-
- string = __alloc_string(len+1);
- memcpy(string->data, buffer, len);
- string->data[len] = '\0';
- string->length = len+1;
+ if (delim == '\'' && len <= 4) { /* short character constants are kept inline in the token */
+ if (len == 0) {
+ sparse_error(stream_pos(stream),
+ "empty character constant");
+ return nextchar(stream); /* step past the closing quote; add_token is not reached */
+ }
+ token_type(token) = type + len; /* the byte count is folded into the token type */
+ memset(buffer + len, '\0', 4 - len); /* zero-pad so all 4 embedded bytes are defined */
+ memcpy(token->embedded, buffer, 4);
+ } else { /* strings and longer character constants get an allocated struct string */
+ token_type(token) = type;
+ string = __alloc_string(len+1);
+ memcpy(string->data, buffer, len);
+ string->data[len] = '\0';
+ string->length = len+1; /* length counts the terminating NUL */
+ token->string = string;
+ }
/* Pass it on.. */
token = stream->token;
- token_type(token) = type;
- token->string = string;
add_token(stream);
-
- return next;
+ return nextchar(stream); /* next is still the closing delimiter (or the newline that ended the loop), so read one more */
}
static int drop_stream_eoln(stream_t *stream)
@@ -731,9 +750,9 @@ static int get_one_special(int c, stream_t *stream)
return get_one_number(c, next, stream);
break;
case '"':
- return get_string_token(next, stream, TOKEN_STRING);
+ return eat_string(next, stream, TOKEN_STRING);
case '\'':
- return get_char_token(next, stream, TOKEN_CHAR);
+ return eat_string(next, stream, TOKEN_CHAR);
case '/':
if (next == '/')
return drop_stream_eoln(stream);
@@ -910,10 +929,10 @@ static int get_one_identifier(int c, stream_t *stream)
if (cclass[next + 1] & Quote) {
if (len == 1 && buf[0] == 'L') {
if (next == '\'')
- return get_char_token(nextchar(stream), stream,
+ return eat_string(nextchar(stream), stream,
TOKEN_WIDE_CHAR);
else
- return get_string_token(nextchar(stream), stream,
+ return eat_string(nextchar(stream), stream,
TOKEN_WIDE_STRING);
}
}