Add new parser for JSON

Colomban Wendling 2014-11-28 16:22:33 +01:00
parent b0cf3b4e2d
commit 2ff1386d96
8 changed files with 446 additions and 2 deletions


@@ -29,6 +29,7 @@ parsers = \
haxe.c \
html.c \
js.c \
json.c \
latex.c \
lregex.c \
lua.c \

396 tagmanager/ctags/json.c Normal file

@@ -0,0 +1,396 @@
/*
* Copyright (c) 2014, Colomban Wendling <colomban@geany.org>
*
* This source code is released for free distribution under the terms of the
* GNU General Public License.
*/
/*
* This module contains functions for generating tags for JSON files.
*
* http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf
*
* This implementation is forgiving and allows many constructs that are not
* actually valid but that don't conflict with the format. This is intended to
* better support partly broken or unfinished files.
*/
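/*
* For example, a truncated document like
*     { "name": "value", "list": [1, 2,
* can still yield tags for "name", "list" and the array elements, because the
* parser recovers at commas and closing brackets instead of rejecting the
* whole input.
*/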
#include "general.h"
#include <string.h>
#include "main.h"
#include "entry.h"
#include "keyword.h"
#include "parse.h"
#include "read.h"
#include "vstring.h"
typedef enum {
TOKEN_EOF,
TOKEN_UNDEFINED,
TOKEN_OPEN_SQUARE,
TOKEN_CLOSE_SQUARE,
TOKEN_OPEN_CURLY,
TOKEN_CLOSE_CURLY,
TOKEN_COLON,
TOKEN_COMMA,
TOKEN_TRUE,
TOKEN_FALSE,
TOKEN_NULL,
TOKEN_NUMBER,
TOKEN_STRING
} tokenType;
typedef enum {
TAG_NONE = -1,
TAG_OBJECT,
TAG_ARRAY,
TAG_NUMBER,
TAG_STRING,
TAG_BOOLEAN,
TAG_NULL,
TAG_COUNT
} jsonKind;
typedef struct {
tokenType type;
jsonKind scopeKind;
vString *string;
vString *scope;
unsigned long lineNumber;
MIOPos filePosition;
} tokenInfo;
typedef enum {
KEYWORD_true,
KEYWORD_false,
KEYWORD_null
} keywordId;
static langType Lang_json;
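/* Tag kinds, one per JSON value type; the single-letter codes below are what
* appear in the generated tags output. */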
static kindOption JsonKinds [] = {
{ TRUE, 'o', "object", "objects" },
{ TRUE, 'a', "array", "arrays" },
{ TRUE, 'n', "number", "numbers" },
{ TRUE, 's', "string", "strings" },
{ TRUE, 'b', "boolean", "booleans" },
{ TRUE, 'z', "null", "nulls" }
};
static tokenInfo *newToken (void)
{
tokenInfo *const token = xMalloc (1, tokenInfo);
token->type = TOKEN_UNDEFINED;
token->scopeKind = TAG_NONE;
token->string = vStringNew ();
token->scope = vStringNew ();
token->lineNumber = getSourceLineNumber ();
token->filePosition = getInputFilePosition ();
return token;
}
static void deleteToken (tokenInfo *const token)
{
vStringDelete (token->string);
vStringDelete (token->scope);
eFree (token);
}
static void copyToken (tokenInfo *const dest, tokenInfo *const src)
{
dest->type = src->type;
dest->scopeKind = src->scopeKind;
vStringCopy (dest->string, src->string);
vStringCopy (dest->scope, src->scope);
dest->lineNumber = src->lineNumber;
dest->filePosition = src->filePosition;
}
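/* Emits a tag for `token' with the given kind, attaching the enclosing
* object/array scope when one has been recorded. */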
static void makeJsonTag (tokenInfo *const token, const jsonKind kind)
{
tagEntryInfo e;
if (! JsonKinds[kind].enabled)
return;
initTagEntry (&e, vStringValue (token->string));
e.lineNumber = token->lineNumber;
e.filePosition = token->filePosition;
e.kindName = JsonKinds[kind].name;
e.kind = JsonKinds[kind].letter;
if (vStringLength (token->scope) > 0)
{
Assert (token->scopeKind > TAG_NONE && token->scopeKind < TAG_COUNT);
e.extensionFields.scope[0] = JsonKinds[token->scopeKind].name;
e.extensionFields.scope[1] = vStringValue (token->scope);
}
makeTagEntry (&e);
}
static boolean isIdentChar (int c)
{
return (isalnum (c) || c == '+' || c == '-' || c == '.');
}
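/* Reads the next token from the input: skips whitespace, then classifies
* punctuation, strings (honouring backslash escapes and stopping at unescaped
* control characters, '"' or EOF) and bare words, which are resolved to the
* true/false/null keywords or otherwise treated as numbers. */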
static void readToken (tokenInfo *const token)
{
int c;
token->type = TOKEN_UNDEFINED;
vStringClear (token->string);
do
c = fileGetc ();
while (c == '\t' || c == ' ' || c == '\r' || c == '\n');
token->lineNumber = getSourceLineNumber ();
token->filePosition = getInputFilePosition ();
switch (c)
{
case EOF: token->type = TOKEN_EOF; break;
case '[': token->type = TOKEN_OPEN_SQUARE; break;
case ']': token->type = TOKEN_CLOSE_SQUARE; break;
case '{': token->type = TOKEN_OPEN_CURLY; break;
case '}': token->type = TOKEN_CLOSE_CURLY; break;
case ':': token->type = TOKEN_COLON; break;
case ',': token->type = TOKEN_COMMA; break;
case '"':
{
boolean escaped = FALSE;
token->type = TOKEN_STRING;
while (TRUE)
{
c = fileGetc ();
/* we don't handle unicode escapes but they are safe */
if (escaped)
escaped = FALSE;
else if (c == '\\')
escaped = TRUE;
else if (c >= 0x00 && c <= 0x1F)
break; /* break on invalid, unescaped, control characters */
else if (c == '"' || c == EOF)
break;
vStringPut (token->string, c);
}
vStringTerminate (token->string);
break;
}
default:
if (! isIdentChar (c))
token->type = TOKEN_UNDEFINED;
else
{
do
{
vStringPut (token->string, c);
c = fileGetc ();
}
while (c != EOF && isIdentChar (c));
vStringTerminate (token->string);
fileUngetc (c);
switch (lookupKeyword (vStringValue (token->string), Lang_json))
{
case KEYWORD_true: token->type = TOKEN_TRUE; break;
case KEYWORD_false: token->type = TOKEN_FALSE; break;
case KEYWORD_null: token->type = TOKEN_NULL; break;
default: token->type = TOKEN_NUMBER; break;
}
}
break;
}
}
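/* Scope handling: the scope is the dot-separated path of enclosing object
* keys and array indices. pushScope() appends the parent's name to the path,
* popScope() drops the last component again. */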
static void pushScope (tokenInfo *const token,
const tokenInfo *const parent,
const jsonKind parentKind)
{
if (vStringLength (token->scope) > 0)
vStringPut (token->scope, '.');
vStringCat (token->scope, parent->string);
vStringTerminate (token->scope);
token->scopeKind = parentKind;
}
static void popScope (tokenInfo *const token,
const tokenInfo *const parent)
{
char *dot = strrchr (token->scope->buffer, '.');
if (! dot)
vStringClear (token->scope);
else
{
*dot = 0;
token->scope->length = dot - token->scope->buffer;
}
token->scopeKind = parent->scopeKind;
}
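/* Skips tokens until one of the given types (or EOF) is reached, stepping
* over balanced nested {...} and [...] constructs along the way. */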
#define skipToOneOf2(token, type1, type2) \
(skipToOneOf3 (token, type1, type2, TOKEN_EOF /* dummy */))
#define skipTo(token, type) \
(skipToOneOf3 (token, type, /* dummies */ TOKEN_EOF, TOKEN_EOF))
static void skipToOneOf3 (tokenInfo *const token,
const tokenType type1,
const tokenType type2,
const tokenType type3)
{
while (token->type != TOKEN_EOF &&
token->type != type1 &&
token->type != type2 &&
token->type != type3)
{
readToken (token);
if (token->type == TOKEN_OPEN_CURLY)
{
skipTo (token, TOKEN_CLOSE_CURLY);
readToken (token);
}
else if (token->type == TOKEN_OPEN_SQUARE)
{
skipTo (token, TOKEN_CLOSE_SQUARE);
readToken (token);
}
}
}
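/* Maps the opening token of a value to the tag kind used for the key or
* element that owns it; anything unrecognized is reported as null. */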
static jsonKind tokenToKind (const tokenType type)
{
switch (type)
{
case TOKEN_OPEN_CURLY: return TAG_OBJECT;
case TOKEN_OPEN_SQUARE: return TAG_ARRAY;
case TOKEN_STRING: return TAG_STRING;
case TOKEN_TRUE:
case TOKEN_FALSE: return TAG_BOOLEAN;
case TOKEN_NUMBER: return TAG_NUMBER;
default: return TAG_NULL;
}
}
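/* Recursively walks a value. Object members are tagged under their key name,
* array elements are tagged under their zero-based index, and the kind of
* each tag comes from the member's value (null when the value is missing or
* invalid). */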
static void parseValue (tokenInfo *const token)
{
if (token->type == TOKEN_OPEN_CURLY)
{
tokenInfo *name = newToken ();
do
{
readToken (token);
if (token->type == TOKEN_STRING)
{
jsonKind tagKind = TAG_NULL; /* default in case of invalid value */
copyToken (name, token);
/* skip any possible garbage before the value */
skipToOneOf3 (token, TOKEN_CLOSE_CURLY, TOKEN_COLON, TOKEN_COMMA);
if (token->type == TOKEN_COLON)
{
readToken (token);
tagKind = tokenToKind (token->type);
pushScope (token, name, tagKind);
parseValue (token);
popScope (token, name);
}
makeJsonTag (name, tagKind);
}
/* skip to the end of the construct */
skipToOneOf2 (token, TOKEN_CLOSE_CURLY, TOKEN_COMMA);
}
while (token->type != TOKEN_EOF &&
token->type != TOKEN_CLOSE_CURLY);
if (token->type == TOKEN_CLOSE_CURLY)
readToken (token);
deleteToken (name);
}
else if (token->type == TOKEN_OPEN_SQUARE)
{
tokenInfo *name = newToken ();
char buf[32];
unsigned int nth = 0;
readToken (token);
while (token->type != TOKEN_EOF &&
token->type != TOKEN_CLOSE_SQUARE)
{
jsonKind tagKind;
tagKind = tokenToKind (token->type);
copyToken (name, token);
snprintf (buf, sizeof buf, "%u", nth++);
vStringCopyS (name->string, buf);
makeJsonTag (name, tagKind);
pushScope (token, name, tagKind);
parseValue (token);
popScope (token, name);
/* skip to the end of the construct */
skipToOneOf2 (token, TOKEN_CLOSE_SQUARE, TOKEN_COMMA);
if (token->type != TOKEN_CLOSE_SQUARE)
readToken (token);
}
if (token->type == TOKEN_CLOSE_SQUARE)
readToken (token);
deleteToken (name);
}
}
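/* Parser entry point: reads and parses top-level values until EOF. */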
static void findJsonTags (void)
{
tokenInfo *const token = newToken ();
/* We allow multiple top-level elements, although it's not actually valid
* JSON. An interesting side effect of this is that we allow a leading
* Unicode BOM mark -- even though that is harmless here, many JSON parsers will choke on it */
do
{
readToken (token);
parseValue (token);
}
while (token->type != TOKEN_EOF);
deleteToken (token);
}
static void initialize (const langType language)
{
Lang_json = language;
addKeyword ("true", language, KEYWORD_true);
addKeyword ("false", language, KEYWORD_false);
addKeyword ("null", language, KEYWORD_null);
}
/* Create parser definition structure */
extern parserDefinition* JsonParser (void)
{
static const char *const extensions [] = { "json", NULL };
parserDefinition *const def = parserNew ("JSON");
def->extensions = extensions;
def->kinds = JsonKinds;
def->kindCount = KIND_COUNT (JsonKinds);
def->parser = findJsonTags;
def->initialize = initialize;
return def;
}


@@ -44,7 +44,7 @@ all: $(COMPLIB)
clean:
-$(RM) deps.mak *.o $(COMPLIB)
$(COMPLIB): abaqus.o abc.o args.o c.o cobol.o fortran.o make.o conf.o pascal.o perl.o php.o diff.o vhdl.o verilog.o lua.o js.o \
$(COMPLIB): abaqus.o abc.o args.o c.o cobol.o fortran.o make.o conf.o pascal.o perl.o php.o diff.o vhdl.o verilog.o lua.o js.o json.o \
actionscript.o nsis.o objc.o \
haskell.o haxe.o html.o python.o lregex.o asciidoc.o rest.o sh.o ctags.o entry.o get.o keyword.o nestlevel.o \
options.o \


@@ -62,7 +62,8 @@
AsciidocParser, \
AbaqusParser, \
RustParser, \
GoParser
GoParser, \
JsonParser
#endif /* _PARSERS_H */


@@ -69,6 +69,7 @@ typedef enum
TM_PARSER_ABAQUS,
TM_PARSER_RUST,
TM_PARSER_GO,
TM_PARSER_JSON,
TM_PARSER_COUNT
} TMParserType;

25 tests/ctags/simple.json Normal file

@@ -0,0 +1,25 @@
{
"firstName": "John",
"lastName": "Smith",
"isAlive": true,
"age": 25,
"height_cm": 167.6,
"address": {
"streetAddress": "21 2nd Street",
"city": "New York",
"state": "NY",
"postalCode": "10021-3100"
},
"phoneNumbers": [
{
"type": "home",
"number": "212 555-1234"
},
{
"type": "office",
"number": "646 555-4567"
}
],
"children": [],
"spouse": null
}


@@ -0,0 +1,19 @@
0 input.json /^ {$/;" o array:phoneNumbers
1 input.json /^ {$/;" o array:phoneNumbers
address input.json /^ "address": {$/;" o
age input.json /^ "age": 25,$/;" n
children input.json /^ "children": [],$/;" a
city input.json /^ "city": "New York",$/;" s object:address
firstName input.json /^ "firstName": "John",$/;" s
height_cm input.json /^ "height_cm": 167.6,$/;" n
isAlive input.json /^ "isAlive": true,$/;" b
lastName input.json /^ "lastName": "Smith",$/;" s
number input.json /^ "number": "212 555-1234"$/;" s object:phoneNumbers.0
number input.json /^ "number": "646 555-4567"$/;" s object:phoneNumbers.1
phoneNumbers input.json /^ "phoneNumbers": [$/;" a
postalCode input.json /^ "postalCode": "10021-3100"$/;" s object:address
spouse input.json /^ "spouse": null$/;" z
state input.json /^ "state": "NY",$/;" s object:address
streetAddress input.json /^ "streetAddress": "21 2nd Street",$/;" s object:address
type input.json /^ "type": "home",$/;" s object:phoneNumbers.0
type input.json /^ "type": "office",$/;" s object:phoneNumbers.1


@@ -87,6 +87,7 @@ ctags_sources = set([
'tagmanager/ctags/haxe.c',
'tagmanager/ctags/html.c',
'tagmanager/ctags/js.c',
'tagmanager/ctags/json.c',
'tagmanager/ctags/keyword.c',
'tagmanager/ctags/latex.c',
'tagmanager/ctags/lregex.c',