diff options
| author | Linus Torvalds <torvalds@home.transmeta.com> | 2003-03-13 12:53:56 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-07 20:59:14 -0700 |
| commit | 3ece2ef7c0a3d5975f65aa09911e1944e4125c45 (patch) | |
| tree | af5b6217bec44bbad283f7d35c8959f1c1ed930b | |
| download | sparse-dev-3ece2ef7c0a3d5975f65aa09911e1944e4125c45.tar.gz | |
Yaah. I'm a retard, but I want to at least try to see how hard it is
to do a semantic parser that is smaller than gcc is.
Right now this is just the lexer, though, along with a test app to print
the results back out to verify the thing.
| -rw-r--r-- | Makefile | 10 | ||||
| -rw-r--r-- | test-lexing.c | 126 | ||||
| -rw-r--r-- | token.h | 108 | ||||
| -rw-r--r-- | tokenize.c | 559 |
4 files changed, 803 insertions, 0 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..6599d016 --- /dev/null +++ b/Makefile @@ -0,0 +1,10 @@ +CFLAGS=-g -Wall + +test-lexing: test-lexing.o tokenize.o + gcc -o $@ test-lexing.o tokenize.o + +test-lexing.o: token.h +tokenize.o: token.h + +clean: + rm -f *.o diff --git a/test-lexing.c b/test-lexing.c new file mode 100644 index 00000000..419a3bcc --- /dev/null +++ b/test-lexing.c @@ -0,0 +1,126 @@ +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <unistd.h> +#include <fcntl.h> +#include "token.h" + +void die(const char *fmt, ...) +{ + va_list args; + static char buffer[512]; + + va_start(args, fmt); + vsnprintf(buffer, sizeof(buffer), fmt, args); + va_end(args); + + fprintf(stderr, "%s\n", buffer); + exit(1); +} + +static char *show_value(struct value *value) +{ + static char buffer[256]; + + switch (value->type) { + case TOKEN_ERROR: + return "syntax error"; + + case TOKEN_IDENT: { + struct ident *ident = value->ident; + sprintf(buffer, "%.*s", ident->len, ident->name); + return buffer; + } + + case TOKEN_STRING: { + char *ptr; + int i; + struct string *string = value->string; + + ptr = buffer; + *ptr++ = '"'; + for (i = 0; i < string->length; i++) { + unsigned char c = string->data[i]; + if (isprint(c) && c != '"') { + *ptr++ = c; + continue; + } + *ptr++ = '\\'; + switch (c) { + case '\n': + *ptr++ = 'n'; + continue; + case '\t': + *ptr++ = 't'; + continue; + case '"': + *ptr++ = '"'; + continue; + } + if (!isdigit(string->data[i+1])) { + ptr += sprintf(ptr, "%o", c); + continue; + } + + ptr += sprintf(ptr, "%03o", c); + } + *ptr++ = '"'; + *ptr = '\0'; + return buffer; + } + + case TOKEN_INTEGER: { + char *ptr; + ptr = buffer + sprintf(buffer, "%llu", value->intval); + return buffer; + } + + case TOKEN_FP: { + sprintf(buffer, "%f", value->fpval); + return buffer; + } + + case TOKEN_SPECIAL: { + int val = value->special; + static const char *combinations[] = COMBINATION_STRINGS; + buffer[0] = val; + buffer[1] = 0; + if (val >= SPECIAL_BASE) + strcpy(buffer, combinations[val - SPECIAL_BASE]); + return buffer; + } + + default: + return "WTF???"; + } +} + +void callback(struct token *token) +{ + printf("%s ", show_value(&token->value)); +} + +int main(int argc, char **argv) +{ + int line; + int fd = open(argv[1], O_RDONLY); + struct token *token; + + if (fd < 0) + die("No such file: %s", argv[1]); + token = tokenize(argv[1], fd); + line = token->line; + while (token) { + callback(token); + token = token->next; + if (token && token->line != line) { + line = token->line; + putchar('\n'); + } + } + putchar('\n'); + print_ident_stat(); + return 0; +} diff --git a/token.h b/token.h new file mode 100644 index 00000000..f80e9a18 --- /dev/null +++ b/token.h @@ -0,0 +1,108 @@ +#ifndef TOKEN_H +#define TOKEN_H + +/* + * This describes the pure lexical elements (tokens), with + * no semantic meaning. In other words, an identifier doesn't + * have a type or meaning, it is only a specific string in + * the input stream. + * + * Semantic meaning is handled elsewhere. + */ + +struct stream { + const char *name; +}; + +extern int input_stream_nr; +extern struct stream *input_streams; + +extern int ident_hit, ident_miss; + +struct ident { + struct ident *next; /* Hash chain of identifiers */ + struct symbol *symbol; /* Pointer to semantic meaning list */ + unsigned char len; /* Length of identifier name */ + char name[]; /* Actual identifier */ +}; + +enum token_type { + TOKEN_ERROR = 0, + TOKEN_IDENT, + TOKEN_INTEGER, + TOKEN_FP, + TOKEN_STRING, + TOKEN_SPECIAL, +}; + +/* Combination tokens */ +#define COMBINATION_STRINGS { \ + "+=", "++", \ + "-=", "--", "->", \ + "*=", \ + "/=", "/*", "//", \ + "%=", \ + "..", "...", \ + "<=", "<<", "<<=", \ + ">=", ">>", ">>=", \ + "==", \ + "&&", "&=", \ + "||", "|=", \ + "^=", \ +} + +enum special_token { + SPECIAL_BASE = 256, + SPECIAL_ADD_ASSIGN = 256, + SPECIAL_INCREMENT, + SPECIAL_MINUS_ASSIGN, + SPECIAL_DECREMENT, + SPECIAL_DEREFERENCE, + SPECIAL_TIMES_ASSIGN, + SPECIAL_DIV_ASSIGN, + SPECIAL_COMMENT, + SPECIAL_CPPCOMMENT, + SPECIAL_MOD_ASSIGN, + SPECIAL_DOTDOT, + SPECIAL_ELLIPSIS, + SPECIAL_LTE, + SPECIAL_LEFTSHIFT, + SPECIAL_SHL_ASSIGN, + SPECIAL_GTE, + SPECIAL_RIGHTSHIFT, + SPECIAL_SHR_ASSIGN, + SPECIAL_EQUAL, + SPECIAL_LOGICAL_AND, + SPECIAL_AND_ASSIGN, + SPECIAL_LOGICAL_OR, + SPECIAL_OR_ASSIGN, + SPECIAL_XOR_ASSIGN +}; + +struct string { + unsigned int length; + char data[]; +}; + +struct value { + enum token_type type; + union { + double fpval; + unsigned long long intval; + struct ident *ident; + unsigned int special; + struct string *string; + }; +}; + +struct token { + unsigned int line; + unsigned int pos:16,stream:8,len:8; + struct value value; + struct token *next; +}; + +extern struct token * tokenize(const char *, int); +extern void die(const char *, ...); + +#endif diff --git a/tokenize.c b/tokenize.c new file mode 100644 index 00000000..02768b6d --- /dev/null +++ b/tokenize.c @@ -0,0 +1,559 @@ +/* + * This is a really stupid C tokenizer, intended to run after the + * preprocessor. + * + * A smart preprocessor would be integrated and pass the compiler + * the tokenized input directly, but lacking that we just tokenize + * the preprocessor output. + */ +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <stddef.h> +#include <string.h> +#include <ctype.h> +#include <unistd.h> +#include "token.h" + +#define EOF (-1) + +int input_stream_nr = 0; +struct stream *input_streams; +static int input_streams_allocated; + +static int init_stream(const char *name) +{ + int stream = input_stream_nr; + + if (stream >= input_streams_allocated) { + int newalloc = stream * 4 / 3 + 10; + input_streams = realloc(input_streams, newalloc * sizeof(struct stream)); + if (!input_streams) + die("Unable to allocate more streams space"); + input_streams_allocated = newalloc; + } + input_streams[stream].name = name; + input_stream_nr = stream+1; + return stream; +} + +#define BUFSIZE (4096) +typedef struct { + int fd, line, pos, offset, size; + struct token **tokenlist; + struct token *token; + unsigned char buffer[BUFSIZE]; +} action_t; + +static int nextchar(action_t *action) +{ + int offset = action->offset; + int size = action->size; + int c; + + if (offset >= size) { + size = read(action->fd, action->buffer, sizeof(action->buffer)); + if (size <= 0) + return EOF; + action->size = size; + action->offset = 0; + offset = 0; + } + c = action->buffer[offset]; + action->offset = offset + 1; + action->pos++; + if (c == '\n') { + action->line++; + action->pos = 0; + } + return c; +} + +static void warn(action_t *action, const char *fmt, ...) +{ + static char buffer[512]; + struct stream *stream; + struct token *token = action->token; + + va_list args; + va_start(args, fmt); + vsprintf(buffer, fmt, args); + va_end(args); + + stream = input_streams + token->stream; + fprintf(stderr, "warning: %s:%d: %s\n", + stream->name, token->line, + buffer); +} + +static void add_token(action_t *action) +{ + struct token *token = action->token; + + action->token = NULL; + token->next = NULL; + *action->tokenlist = token; + action->tokenlist = &token->next; +} + +static void drop_token(action_t *action) +{ + struct token *token = action->token; + + action->token = NULL; + free(token); +} + +static int do_integer(unsigned long long value, int next, action_t *action) +{ + struct token *token = action->token; + + token->value.type = TOKEN_INTEGER; + token->value.intval = value; + add_token(action); + return next; +} + +static int get_base_number(unsigned int base, unsigned int __val, action_t *action) +{ + unsigned long long value = __val; + int next; + + for (;;) { + unsigned int n = 1000; + next = nextchar(action); + switch (next) { + case '0'...'9': + n = next-'0'; + break; + case 'a'...'z': + n = next-'a'+10; + break; + case 'A'...'Z': + n = next-'A'+10; + } + if (n >= base) + break; + value = value * base + n; + } + return do_integer(value, next, action); +} + +/* Parse error: return a token with a NULL "value" part */ +static void parse_error(action_t *action) +{ + add_token(action); +} + +static int get_hex_number(int x, action_t *action) +{ + int next = nextchar(action); + + switch (next) { + case '0'...'9': + return get_base_number(16, next-'0', action); + case 'a'...'f': + return get_base_number(16, next-'a'+10, action); + case 'A'...'F': + return get_base_number(16, next-'A'+10, action); + } + parse_error(action); + return next; +} + +static int get_one_number(int c, action_t *action) +{ + int next = nextchar(action); + + switch (next) { + case '0'...'7': + if (c == '0') + return get_base_number(8, next-'0', action); + /* fallthrough */ + case '8'...'9': + return get_base_number(10, c*10+next-'0'*11, action); + case 'x': case 'X': + if (c == '0') + return get_hex_number(next, action); + } + return do_integer(c-'0', next, action); +} + +static int hexval(int c) +{ + int retval = 256; + switch (c) { + case '0'...'9': + retval = c - '0'; + break; + case 'a'...'f': + retval = c - 'a' + 10; + break; + case 'A'...'F': + retval = c - 'A' + 10; + break; + } + return retval; +} + +static int escapechar(int first, int type, action_t *action, int *valp) +{ + int next, value; + + next = nextchar(action); + value = first; + + if (first == '\n') + warn(action, "Newline in string or character constant"); + + if (first == '\\' && next != EOF) { + value = next; + next = nextchar(action); + if (value != type) { + switch (value) { + case 'n': + value = '\n'; + break; + case 't': + value = '\t'; + break; + case '\\': + break; + case '0'...'7': { + int nr = 2; + value -= '0'; + while (next >= '0' && next <= '9') { + value = (value << 3) + (next-'0'); + next = nextchar(action); + if (!--nr) + break; + } + value &= 0xff; + break; + } + case 'x': { + int hex = hexval(next); + if (hex < 16) { + value = hex; + next = nextchar(action); + while ((hex = hexval(next)) < 16) { + value = (value << 4) + hex; + next = nextchar(action); + } + value &= 0xff; + break; + } + } + /* Fallthrough */ + default: + warn(action, "Unknown escape '%c'", value); + } + } + /* Mark it as escaped */ + value |= 0x100; + } + *valp = value; + return next; +} + +static int get_char_token(int next, action_t *action) +{ + int value; + struct token *token; + + next = escapechar(next, '\'', action, &value); + if (value == '\'' || next != '\'') { + warn(action, "Bad character constant"); + drop_token(action); + return next; + } + + token = action->token; + token->value.type = TOKEN_INTEGER; + token->value.intval = value & 0xff; + + add_token(action); + return nextchar(action); +} + +static int get_string_token(int next, action_t *action) +{ + static char buffer[512]; + struct string *string; + struct token *token; + int len = 0; + + for (;;) { + int val; + next = escapechar(next, '"', action, &val); + if (val == '"') + break; + if (next == EOF) { + warn(action, "Enf of file in middle of string"); + return next; + } + if (len < sizeof(buffer)) { + buffer[len] = val; + len++; + } + + } + + if (len > 256) + warn(action, "String too long"); + + string = malloc(sizeof(int)+len); + memcpy(string->data, buffer, len); + string->length = len; + + /* Pass it on.. */ + token = action->token; + token->value.type = TOKEN_STRING; + token->value.string = string; + add_token(action); + + return next; +} + +static int drop_stream_eoln(action_t *action) +{ + int next = nextchar(action); + drop_token(action); + for (;;) { + int curr = next; + if (curr == EOF) + return next; + next = nextchar(action); + if (curr == '\n') + return next; + } +} + +static int drop_stream_comment(action_t *action) +{ + int next = nextchar(action); + drop_token(action); + for (;;) { + int curr = next; + if (curr == EOF) { + warn(action, "End of file in the middle of a comment"); + return curr; + } + next = nextchar(action); + if (curr == '*' && next == '/') + break; + } + return nextchar(action); +} + +unsigned char combinations[][3] = COMBINATION_STRINGS; + +#define NR_COMBINATIONS (sizeof(combinations)/3) + +static int get_one_special(int c, action_t *action) +{ + struct token *token; + unsigned char c1, c2, c3; + int next, value, i; + char *comb; + + next = nextchar(action); + + /* + * Check for strings, character constants, and comments + */ + switch (c) { + case '"': + return get_string_token(next, action); + case '\'': + return get_char_token(next, action); + case '/': + if (next == '/') + return drop_stream_eoln(action); + if (next == '*') + return drop_stream_comment(action); + } + + /* + * Check for combinations + */ + value = c; + comb = combinations[0]; + c1 = c; c2 = next; c3 = 0; + for (i = 0; i < NR_COMBINATIONS; i++) { + if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) { + value = i + SPECIAL_BASE; + c = next; + next = nextchar(action); + if (c3) + break; + c3 = c; + } + comb += 3; + } + + /* Pass it on.. */ + token = action->token; + token->value.type = TOKEN_SPECIAL; + token->value.special = value; + add_token(action); + return next; +} + +#define IDENT_HASH_BITS (10) +#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS) +#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1) +static struct ident *hash_table[IDENT_HASH_SIZE]; +int ident_hit, ident_miss; + +void print_ident_stat(void) +{ + int i; + int distribution[100]; + + fprintf(stderr, "identifiers: %d hits, %d misses\n", + ident_hit, ident_miss); + + for (i = 0; i < 100; i++) + distribution[i] = 0; + + for (i = 0; i < IDENT_HASH_SIZE; i++) { + struct ident * ident = hash_table[i]; + int count = 0; + + while (ident) { + count++; + ident = ident->next; + } + if (count > 99) + count = 99; + distribution[count]++; + } + + for (i = 0; i < 100; i++) { + if (distribution[i]) + fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]); + } +} + +static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash) +{ + struct ident *ident; + + hash = ((hash >> IDENT_HASH_BITS) + hash) & IDENT_HASH_MASK; + ident = hash_table[hash]; + while (ident) { + if (ident->len == len && !memcmp(ident->name, name, len)) { + ident_hit++; + return ident; + } + ident = ident->next; + } + + ident = malloc(offsetof(struct ident,name) + len); + if (!ident) + die("Out of memory for identifiers"); + + ident->symbol = NULL; + ident->len = len; + memcpy(ident->name, name, len); + ident->next = hash_table[hash]; + hash_table[hash] = ident; + ident_miss++; + return ident; +} + +#define ident_hash_init(c) (c) +#define ident_hash_add(oldhash,c) ((oldhash)*11 + (c)) +#define ident_hash_end(hash) (hash) + +static int get_one_identifier(int c, action_t *action) +{ + struct token *token; + struct ident *ident; + unsigned long hash; + char buf[256]; + int len = 1; + int next; + + hash = ident_hash_init(c); + buf[0] = c; + for (;;) { + next = nextchar(action); + switch (next) { + case '0'...'9': + case 'a'...'z': + case 'A'...'Z': + case '_': + if (len < sizeof(buf)) { + hash = ident_hash_add(hash, next); + buf[len] = next; + len++; + } + continue; + } + break; + }; + hash = ident_hash_end(hash); + + ident = create_hashed_ident(buf, len, hash); + + /* Pass it on.. */ + token = action->token; + token->value.type = TOKEN_IDENT; + token->value.ident = ident; + add_token(action); + return next; +} + +static int get_one_token(int c, action_t *action) +{ + switch (c) { + case '0'...'9': + return get_one_number(c, action); + case 'a'...'z': + case 'A'...'Z': + case '_': + return get_one_identifier(c, action); + default: + return get_one_special(c, action); + } +} + +struct token * tokenize(const char *name, int fd) +{ + struct token *retval; + int stream = init_stream(name); + action_t action; + int c; + + retval = NULL; + action.tokenlist = &retval; + action.token = NULL; + action.line = 1; + action.pos = 0; + action.fd = fd; + action.offset = 0; + action.size = 0; + + c = nextchar(&action); + while (c != EOF) { + if (!isspace(c)) { + struct token *token = malloc(sizeof(struct token)); + if (!token) + die("Out of memory for token"); + + memset(token, 0, sizeof(struct token)); + token->line = action.line; + token->pos = action.pos; + token->stream = stream; + + action.token = token; + + c = get_one_token(c, &action); + continue; + } + c = nextchar(&action); + } + return retval; +} |
