Yaah. I'm a retard, but I want to at least try to see how hard it is

to do a semantic parser that is smaller than gcc is. Right now this is just the lexer, though, along with a test app to print the results back out to verify the thing.
author: Linus Torvalds <torvalds@home.transmeta.com> 2003-03-13 12:53:56 -0700
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-07 20:59:14 -0700
commit: 3ece2ef7c0a3d5975f65aa09911e1944e4125c45 (patch)
tree: af5b6217bec44bbad283f7d35c8959f1c1ed930b
download: sparse-dev-3ece2ef7c0a3d5975f65aa09911e1944e4125c45.tar.gz
4 files changed, 803 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..6599d016
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,10 @@
+CFLAGS=-g -Wall
+
+test-lexing: test-lexing.o tokenize.o
+	gcc -o $@ test-lexing.o tokenize.o
+
+test-lexing.o: token.h
+tokenize.o: token.h
+
+clean:
+	rm -f *.o
diff --git a/test-lexing.c b/test-lexing.c
new file mode 100644
index 00000000..419a3bcc
--- /dev/null
+++ b/test-lexing.c
@@ -0,0 +1,126 @@
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include "token.h"
+
+void die(const char *fmt, ...)
+{
+	va_list args;
+	static char buffer[512];
+
+	va_start(args, fmt);
+	vsnprintf(buffer, sizeof(buffer), fmt, args);
+	va_end(args);
+
+	fprintf(stderr, "%s\n", buffer);
+	exit(1);
+}
+
+static char *show_value(struct value *value)
+{
+	static char buffer[256];
+
+	switch (value->type) {
+	case TOKEN_ERROR:
+		return "syntax error";
+
+	case TOKEN_IDENT: {
+		struct ident *ident = value->ident;
+		sprintf(buffer, "%.*s", ident->len, ident->name);
+		return buffer;
+	}
+
+	case TOKEN_STRING: {
+		char *ptr;
+		int i;
+		struct string *string = value->string;
+
+		ptr = buffer;
+		*ptr++ = '"';
+		for (i = 0; i < string->length; i++) {
+			unsigned char c = string->data[i];
+			if (isprint(c) && c != '"') {
+				*ptr++ = c;
+				continue;
+			}
+			*ptr++ = '\\';
+			switch (c) {
+			case '\n':
+				*ptr++ = 'n';
+				continue;
+			case '\t':
+				*ptr++ = 't';
+				continue;
+			case '"':
+				*ptr++ = '"';
+				continue;
+			}
+			if (!isdigit(string->data[i+1])) {
+				ptr += sprintf(ptr, "%o", c);
+				continue;
+			}
+				
+			ptr += sprintf(ptr, "%03o", c);
+		}
+		*ptr++ = '"';
+		*ptr = '\0';
+		return buffer;
+	}
+
+	case TOKEN_INTEGER: {
+		char *ptr;
+		ptr = buffer + sprintf(buffer, "%llu", value->intval);
+		return buffer;
+	}
+
+	case TOKEN_FP: {
+		sprintf(buffer, "%f", value->fpval);
+		return buffer;
+	}
+
+	case TOKEN_SPECIAL: {
+		int val = value->special;
+		static const char *combinations[] = COMBINATION_STRINGS;
+		buffer[0] = val;
+		buffer[1] = 0;
+		if (val >= SPECIAL_BASE)
+			strcpy(buffer, combinations[val - SPECIAL_BASE]);
+		return buffer;
+	}
+	
+	default:
+		return "WTF???";
+	}
+}
+
+void callback(struct token *token)
+{
+	printf("%s ", show_value(&token->value));
+}
+
+int main(int argc, char **argv)
+{
+	int line;
+	int fd = open(argv[1], O_RDONLY);
+	struct token *token;
+
+	if (fd < 0)
+		die("No such file: %s", argv[1]);
+	token = tokenize(argv[1], fd);
+	line = token->line;
+	while (token) {
+		callback(token);
+		token = token->next;
+		if (token && token->line != line) {
+			line = token->line;
+			putchar('\n');
+		}
+	}
+	putchar('\n');
+	print_ident_stat();
+	return 0;
+}
diff --git a/token.h b/token.h
new file mode 100644
index 00000000..f80e9a18
--- /dev/null
+++ b/token.h
@@ -0,0 +1,108 @@
+#ifndef TOKEN_H
+#define TOKEN_H
+
+/*
+ * This describes the pure lexical elements (tokens), with
+ * no semantic meaning. In other words, an identifier doesn't
+ * have a type or meaning, it is only a specific string in
+ * the input stream.
+ *
+ * Semantic meaning is handled elsewhere.
+ */
+
+struct stream {
+	const char *name;
+};
+
+extern int input_stream_nr;
+extern struct stream *input_streams;
+
+extern int ident_hit, ident_miss;
+
+struct ident {
+	struct ident *next;	/* Hash chain of identifiers */
+	struct symbol *symbol;	/* Pointer to semantic meaning list */
+	unsigned char len;	/* Length of identifier name */
+	char name[];		/* Actual identifier */
+};
+
+enum token_type {
+	TOKEN_ERROR = 0,
+	TOKEN_IDENT,
+	TOKEN_INTEGER,
+	TOKEN_FP,
+	TOKEN_STRING,
+	TOKEN_SPECIAL,
+};
+
+/* Combination tokens */
+#define COMBINATION_STRINGS {	\
+	"+=", "++",		\
+	"-=", "--", "->",	\
+	"*=",			\
+	"/=", "/*", "//",	\
+	"%=",			\
+	"..", "...",		\
+	"<=", "<<", "<<=",	\
+	">=", ">>", ">>=",	\
+	"==",			\
+	"&&", "&=",		\
+	"||", "|=",		\
+	"^=",			\
+}
+
+enum special_token {
+	SPECIAL_BASE = 256,
+	SPECIAL_ADD_ASSIGN = 256,
+	SPECIAL_INCREMENT,
+	SPECIAL_MINUS_ASSIGN,
+	SPECIAL_DECREMENT,
+	SPECIAL_DEREFERENCE,
+	SPECIAL_TIMES_ASSIGN,
+	SPECIAL_DIV_ASSIGN,
+	SPECIAL_COMMENT,
+	SPECIAL_CPPCOMMENT,
+	SPECIAL_MOD_ASSIGN,
+	SPECIAL_DOTDOT,
+	SPECIAL_ELLIPSIS,
+	SPECIAL_LTE,
+	SPECIAL_LEFTSHIFT,
+	SPECIAL_SHL_ASSIGN,
+	SPECIAL_GTE,
+	SPECIAL_RIGHTSHIFT,
+	SPECIAL_SHR_ASSIGN,
+	SPECIAL_EQUAL,
+	SPECIAL_LOGICAL_AND,
+	SPECIAL_AND_ASSIGN,
+	SPECIAL_LOGICAL_OR,
+	SPECIAL_OR_ASSIGN,
+	SPECIAL_XOR_ASSIGN
+};
+
+struct string {
+	unsigned int length;
+	char data[];
+};
+
+struct value {
+	enum token_type type;
+	union {
+		double fpval;
+		unsigned long long intval;
+		struct ident *ident;
+		unsigned int special;
+		struct string *string;
+	};
+};
+
+struct token {
+	unsigned int line;
+	unsigned int pos:16,stream:8,len:8;
+	struct value value;
+	struct token *next;
+};
+
+extern struct token * tokenize(const char *, int);
+extern void die(const char *, ...);
+
+#endif
diff --git a/tokenize.c b/tokenize.c
new file mode 100644
index 00000000..02768b6d
--- /dev/null
+++ b/tokenize.c
@@ -0,0 +1,559 @@
+/*
+ * This is a really stupid C tokenizer, intended to run after the
+ * preprocessor.
+ *
+ * A smart preprocessor would be integrated and pass the compiler
+ * the tokenized input directly, but lacking that we just tokenize
+ * the preprocessor output.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <string.h>
+#include <ctype.h>
+#include <unistd.h>
+#include "token.h"
+
+#define EOF (-1)
+
+int input_stream_nr = 0;
+struct stream *input_streams;
+static int input_streams_allocated;
+
+static int init_stream(const char *name)
+{
+	int stream = input_stream_nr;
+
+	if (stream >= input_streams_allocated) {
+		int newalloc = stream * 4 / 3 + 10;
+		input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
+		if (!input_streams)
+			die("Unable to allocate more streams space");
+		input_streams_allocated = newalloc;
+	}
+	input_streams[stream].name = name;
+	input_stream_nr = stream+1;
+	return stream;
+}
+
+#define BUFSIZE (4096)
+typedef struct {
+	int fd, line, pos, offset, size;
+	struct token **tokenlist;
+	struct token *token;
+	unsigned char buffer[BUFSIZE];
+} action_t;
+
+static int nextchar(action_t *action)
+{
+	int offset = action->offset;
+	int size = action->size;
+	int c;
+
+	if (offset >= size) {
+		size = read(action->fd, action->buffer, sizeof(action->buffer));
+		if (size <= 0)
+			return EOF;
+		action->size = size;
+		action->offset = 0;
+		offset = 0;
+	}
+	c = action->buffer[offset];
+	action->offset = offset + 1;
+	action->pos++;
+	if (c == '\n') {
+		action->line++;
+		action->pos = 0;
+	}
+	return c;
+}
+
+static void warn(action_t *action, const char *fmt, ...)
+{
+	static char buffer[512];
+	struct stream *stream;
+	struct token *token = action->token;
+
+	va_list args;
+	va_start(args, fmt);
+	vsprintf(buffer, fmt, args);
+	va_end(args);
+
+	stream = input_streams + token->stream;
+	fprintf(stderr, "warning: %s:%d: %s\n",
+		stream->name, token->line,
+		buffer);
+}
+
+static void add_token(action_t *action)
+{
+	struct token *token = action->token;
+
+	action->token = NULL;
+	token->next = NULL;
+	*action->tokenlist = token;
+	action->tokenlist = &token->next;
+}
+
+static void drop_token(action_t *action)
+{
+	struct token *token = action->token;
+
+	action->token = NULL;
+	free(token);
+}
+
+static int do_integer(unsigned long long value, int next, action_t *action)
+{
+	struct token *token = action->token;
+	
+	token->value.type = TOKEN_INTEGER;
+	token->value.intval = value;
+	add_token(action);
+	return next;
+}
+
+static int get_base_number(unsigned int base, unsigned int __val, action_t *action)
+{
+	unsigned long long value = __val;
+	int next;
+
+	for (;;) {
+		unsigned int n = 1000;
+		next = nextchar(action);
+		switch (next) {
+		case '0'...'9':
+			n = next-'0';
+			break;
+		case 'a'...'z':
+			n = next-'a'+10;
+			break;
+		case 'A'...'Z':
+			n = next-'A'+10;
+		}
+		if (n >= base)
+			break;
+		value = value * base + n;
+	}
+	return do_integer(value, next, action);
+}
+
+/* Parse error: return a token with a NULL "value" part */
+static void parse_error(action_t *action)
+{
+	add_token(action);
+}
+
+static int get_hex_number(int x, action_t *action)
+{
+	int next = nextchar(action);
+
+	switch (next) {
+	case '0'...'9':
+		return get_base_number(16, next-'0', action);
+	case 'a'...'f':		
+		return get_base_number(16, next-'a'+10, action);
+	case 'A'...'F':
+		return get_base_number(16, next-'A'+10, action);
+	}
+	parse_error(action);
+	return next;
+}
+
+static int get_one_number(int c, action_t *action)
+{
+	int next = nextchar(action);
+
+	switch (next) {
+	case '0'...'7':
+		if (c == '0')
+			return get_base_number(8, next-'0', action);
+		/* fallthrough */
+	case '8'...'9':
+		return get_base_number(10, c*10+next-'0'*11, action);
+	case 'x': case 'X':
+		if (c == '0')
+			return get_hex_number(next, action);
+	}
+	return do_integer(c-'0', next, action);
+}
+
+static int hexval(int c)
+{
+	int retval = 256;
+	switch (c) {
+	case '0'...'9':
+		retval = c - '0';
+		break;
+	case 'a'...'f':
+		retval = c - 'a' + 10;
+		break;
+	case 'A'...'F':
+		retval = c - 'A' + 10;
+		break;
+	}
+	return retval;
+}
+
+static int escapechar(int first, int type, action_t *action, int *valp)
+{
+	int next, value;
+
+	next = nextchar(action);
+	value = first;
+
+	if (first == '\n')
+		warn(action, "Newline in string or character constant");
+
+	if (first == '\\' && next != EOF) {
+		value = next;
+		next = nextchar(action);
+		if (value != type) {
+			switch (value) {
+			case 'n':
+				value = '\n';
+				break;
+			case 't':
+				value = '\t';
+				break;
+			case '\\':
+				break;
+			case '0'...'7': {
+				int nr = 2;
+				value -= '0';
+				while (next >= '0' && next <= '9') {
+					value = (value << 3) + (next-'0');
+					next = nextchar(action);
+					if (!--nr)
+						break;
+				}
+				value &= 0xff;
+				break;
+			}
+			case 'x': {
+				int hex = hexval(next);
+				if (hex < 16) {
+					value = hex;
+					next = nextchar(action);
+					while ((hex = hexval(next)) < 16) {
+						value = (value << 4) + hex;
+						next = nextchar(action);
+					}
+					value &= 0xff;
+					break;
+				}
+			}
+			/* Fallthrough */
+			default:
+				warn(action, "Unknown escape '%c'", value);
+			}
+		}
+		/* Mark it as escaped */
+		value |= 0x100;
+	}
+	*valp = value;
+	return next;
+}
+
+static int get_char_token(int next, action_t *action)
+{
+	int value;
+	struct token *token;
+
+	next = escapechar(next, '\'', action, &value);
+	if (value == '\'' || next != '\'') {
+		warn(action, "Bad character constant");
+		drop_token(action);
+		return next;
+	}
+
+	token = action->token;
+	token->value.type = TOKEN_INTEGER;
+	token->value.intval = value & 0xff;
+
+	add_token(action);
+	return nextchar(action);
+}
+
+static int get_string_token(int next, action_t *action)
+{
+	static char buffer[512];
+	struct string *string;
+	struct token *token;
+	int len = 0;
+
+	for (;;) {
+		int val;
+		next = escapechar(next, '"', action, &val);
+		if (val == '"')
+			break;
+		if (next == EOF) {
+			warn(action, "Enf of file in middle of string");
+			return next;
+		}
+		if (len < sizeof(buffer)) {
+			buffer[len] = val;
+			len++;
+		}
+			
+	}
+
+	if (len > 256)
+		warn(action, "String too long");
+
+	string = malloc(sizeof(int)+len);
+	memcpy(string->data, buffer, len);
+	string->length = len;
+
+	/* Pass it on.. */
+	token = action->token;
+	token->value.type = TOKEN_STRING;
+	token->value.string = string;
+	add_token(action);
+	
+	return next;
+}
+
+static int drop_stream_eoln(action_t *action)
+{
+	int next = nextchar(action);
+	drop_token(action);
+	for (;;) {
+		int curr = next;
+		if (curr == EOF)
+			return next;
+		next = nextchar(action);
+		if (curr == '\n')
+			return next;
+	}
+}
+
+static int drop_stream_comment(action_t *action)
+{
+	int next = nextchar(action);
+	drop_token(action);
+	for (;;) {
+		int curr = next;
+		if (curr == EOF) {
+			warn(action, "End of file in the middle of a comment");
+			return curr;
+		}
+		next = nextchar(action);
+		if (curr == '*' && next == '/')
+			break;
+	}
+	return nextchar(action);
+}
+
+unsigned char combinations[][3] = COMBINATION_STRINGS;
+
+#define NR_COMBINATIONS (sizeof(combinations)/3)
+
+static int get_one_special(int c, action_t *action)
+{
+	struct token *token;
+	unsigned char c1, c2, c3;
+	int next, value, i;
+	char *comb;
+
+	next = nextchar(action);
+
+	/*
+	 * Check for strings, character constants, and comments
+	 */
+	switch (c) {
+	case '"':
+		return get_string_token(next, action);
+	case '\'':
+		return get_char_token(next, action);
+	case '/':
+		if (next == '/')
+			return drop_stream_eoln(action);
+		if (next == '*')
+			return drop_stream_comment(action);
+	}
+
+	/*
+	 * Check for combinations
+	 */
+	value = c;
+	comb = combinations[0];
+	c1 = c; c2 = next; c3 = 0;
+	for (i = 0; i < NR_COMBINATIONS; i++) {
+		if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
+			value = i + SPECIAL_BASE;
+			c = next;
+			next = nextchar(action);
+			if (c3)
+				break;
+			c3 = c;
+		}
+		comb += 3;
+	}
+
+	/* Pass it on.. */
+	token = action->token;
+	token->value.type = TOKEN_SPECIAL;
+	token->value.special = value;
+	add_token(action);
+	return next;
+}
+
+#define IDENT_HASH_BITS (10)
+#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
+#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
+static struct ident *hash_table[IDENT_HASH_SIZE];
+int ident_hit, ident_miss;
+
+void print_ident_stat(void)
+{
+	int i;
+	int distribution[100];
+
+	fprintf(stderr, "identifiers: %d hits, %d misses\n",
+		ident_hit, ident_miss);
+
+	for (i = 0; i < 100; i++)
+		distribution[i] = 0;
+
+	for (i = 0; i < IDENT_HASH_SIZE; i++) {
+		struct ident * ident = hash_table[i];
+		int count = 0;
+
+		while (ident) {
+			count++;
+			ident = ident->next;
+		}
+		if (count > 99)
+			count = 99;
+		distribution[count]++;
+	}
+
+	for (i = 0; i < 100; i++) {
+		if (distribution[i])
+			fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
+	}
+}
+
+static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
+{
+	struct ident *ident;
+
+	hash = ((hash >> IDENT_HASH_BITS) + hash) & IDENT_HASH_MASK;
+	ident = hash_table[hash];
+	while (ident) {
+		if (ident->len == len && !memcmp(ident->name, name, len)) {
+			ident_hit++;
+			return ident;
+		}
+		ident = ident->next;
+	}
+
+	ident = malloc(offsetof(struct ident,name) + len);
+	if (!ident)
+		die("Out of memory for identifiers");
+
+	ident->symbol = NULL;
+	ident->len = len;
+	memcpy(ident->name, name, len);
+	ident->next = hash_table[hash];
+	hash_table[hash] = ident;
+	ident_miss++;
+	return ident;
+}
+
+#define ident_hash_init(c)		(c)
+#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
+#define ident_hash_end(hash)		(hash)
+
+static int get_one_identifier(int c, action_t *action)
+{
+	struct token *token;
+	struct ident *ident;
+	unsigned long hash;
+	char buf[256];
+	int len = 1;
+	int next;
+
+	hash = ident_hash_init(c);
+	buf[0] = c;
+	for (;;) {
+		next = nextchar(action);
+		switch (next) {
+		case '0'...'9':
+		case 'a'...'z':
+		case 'A'...'Z':
+		case '_':
+			if (len < sizeof(buf)) {
+				hash = ident_hash_add(hash, next);
+				buf[len] = next;
+				len++;
+			}
+			continue;
+		}
+		break;
+	};
+	hash = ident_hash_end(hash);
+
+	ident = create_hashed_ident(buf, len, hash);
+
+	/* Pass it on.. */
+	token = action->token;
+	token->value.type = TOKEN_IDENT;
+	token->value.ident = ident;
+	add_token(action);
+	return next;
+}		
+
+static int get_one_token(int c, action_t *action)
+{
+	switch (c) {
+	case '0'...'9':
+		return get_one_number(c, action);
+	case 'a'...'z':
+	case 'A'...'Z':
+	case '_':
+		return get_one_identifier(c, action);
+	default:
+		return get_one_special(c, action);
+	}	
+}
+
+struct token * tokenize(const char *name, int fd)
+{
+	struct token *retval;
+	int stream = init_stream(name);
+	action_t action;
+	int c;
+
+	retval = NULL;
+	action.tokenlist = &retval;
+	action.token = NULL;
+	action.line = 1;
+	action.pos = 0;
+	action.fd = fd;
+	action.offset = 0;
+	action.size = 0;
+
+	c = nextchar(&action);
+	while (c != EOF) {
+		if (!isspace(c)) {
+			struct token *token = malloc(sizeof(struct token));
+			if (!token)
+				die("Out of memory for token");
+
+			memset(token, 0, sizeof(struct token));
+			token->line = action.line;
+			token->pos = action.pos;
+			token->stream = stream;
+
+			action.token = token;
+
+			c = get_one_token(c, &action);
+			continue;
+		}
+		c = nextchar(&action);
+	}
+	return retval;
+}
author	Linus Torvalds <torvalds@home.transmeta.com>	2003-03-13 12:53:56 -0700
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-07 20:59:14 -0700
commit	3ece2ef7c0a3d5975f65aa09911e1944e4125c45 (patch)
tree	af5b6217bec44bbad283f7d35c8959f1c1ed930b
download	sparse-dev-3ece2ef7c0a3d5975f65aa09911e1944e4125c45.tar.gz