Initial implementation

author: Justin Bedo <cu@cua0.org> 2012-06-12 12:08:35 +0200
committer: Justin Bedo <cu@cua0.org> 2012-06-12 12:08:35 +0200
commit: 06a376f7f913bd57a1256898a04882818deaefa4 (patch)
tree: 33423384477ee90ccf022f29f4685bccc2a3874e
5 files changed, 685 insertions, 0 deletions
diff --git a/README b/README
new file mode 100644
index 0000000..a35851c
--- /dev/null
+++ b/README
@@ -0,0 +1,198 @@
+
+
+
+     JSON(2)                                                   JSON(2)
+
+
+
+
+
+     NAME
+          Jinit, Jtokenise, Jfind, Jnext, Jtokstr - JSON parser
+
+     SYNOPSIS
+          #include <u.h>
+          #include <libc.h>
+          #include <json.h>
+
+          typedef enum{
+               JBool,
+               JNum,
+               JNil,
+               JObj,
+               JArr,
+               JStr,
+          } Jtype;
+
+          typedef struct{
+               Jtype type;
+               char *start;
+               char *end;
+               uint nsub;
+          } Jtok;
+
+          typedef struct{
+               uint ntok, mtok;
+               Jtok *tokens;
+               uint stktop;
+               uint tokstk[JStksz];
+          } Jparser;
+
+          void Jinit(Jparser *p)
+
+          void Jterm(Jparser *p);
+
+          int Jtokenise(Jparser *p, char *s)
+
+          char *Jtokstr(Jtok *t)
+
+          int Jfind(Jparser *p, uint i, char *s)
+
+          uint Jnext(Jparser *p, uint i)
+
+     DESCRIPTION
+          These routines implement a fast, in-memory JSON parser.  The
+          parser operates on a string which is expected to contain the
+          full JSON text.  The string is not altered in any way.
+
+          A parser object p is initialised using Jinit. The parser
+          object contains the number of tokens found ntok and an array
+
+
+
+
+
+
+
+
+
+
+     JSON(2)                                                   JSON(2)
+
+
+
+          of tokens tokens. The other fields are for internal use.  As
+          memory is dynamically allocated, an initialised parser p
+          must be terminated with Jterm after use.
+
+          An initialised parser object is populated with tokens using
+          Jtokenise. Jtokenise accepts parser p and a string s and
+          returns a filled Jparser structure containing tokens found
+          in s. S is unaltered.
+
+          Each token Jtoken has a type Jtype, pointers to the start
+          and end of the token's location in the JSON string, and a
+          count nsub of the number of subtokens.  The types of a token
+          are
+
+          JBool
+               Boolean value.  Token must be either 't' or 'f'.
+
+          JNum A real number.
+
+          JNil Nill token matching 'n'.
+
+          JObj An object (dictionary) containing key:value pairs.
+
+          JArr An array.
+
+          JStr A string.
+
+          The pointers start and end point to the beginning character
+          of the token and to one character after the end of the token
+          respectively.  The exception are JStr tokens, where start
+          points to the first character after the double quote and end
+          points to the terminating double quote.  As arrays JArr and
+          objects JObj are the only types allowing subelements, nsub
+          is necessarily 0 for JBool, JNum, JNil, and JStr.
+
+          The functions Jtokstr, Jfind and Jnext are functions for
+          working with Jtokstr takes a token and extracts out the
+          string pointed to by the token.  The string is null termi-
+          nated and escaped characters are handled appropriately.  As
+          Jtokstr uses a static buffer, the string returned is
+          destroyed on the next call.
+
+          Jfind is used to find specifically named attributes within a
+          JObj. Given an index into the tokens array of p– which must
+          be a JObj – and the name of an attribute s, Jfind returns
+          the index of the token corresponding to the value matching
+          the attribute name.  If no attribute is found then Jfind
+          returns -1.
+
+          Finally, Jnext takes and index and calculates the index of
+          the next element at that depth.  It is used for skipping
+          over JObjs and JArrs which can contain a number of
+
+
+
+
+
+
+
+
+
+
+     JSON(2)                                                   JSON(2)
+
+
+
+          subelements.
+
+     BUGS
+          The parser does not implement a full grammar so some JSON
+          errors are not detected when parsing.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/json.c b/json.c
new file mode 100644
index 0000000..9f52bc7
--- /dev/null
+++ b/json.c
@@ -0,0 +1,248 @@
+#include<u.h>
+#include<libc.h>
+#include<json.h>
+
+static void*
+erealloc(void *ptr, ulong sz)
+{
+	ptr = realloc(ptr, sz);
+	if(ptr == nil)
+		sysfatal("realloc: %r");
+	return ptr;
+}
+
+void
+Jinit(Jparser *p)
+{
+	memset(p, 0, sizeof *p);
+}
+
+void
+Jterm(Jparser *p)
+{
+	if(p->mtok > 0 || p->tokens != nil)
+		free(p->tokens);
+}
+
+static Jtok*
+gettok(Jparser *p)
+{
+	uint i;
+	
+	if(p->ntok == p->mtok){
+		p->mtok = (p->ntok << 1) + 1;
+		p->tokens = erealloc(p->tokens, p->mtok * sizeof *p->tokens);
+		for(i = p->ntok; i < p->mtok; i++)
+			memset(&p->tokens[i], 0, sizeof *p->tokens);
+	}
+	
+	return &p->tokens[p->ntok++];
+}
+
+static char *
+parsestr(Jparser *p, char *s)
+{
+	Jtok *tok;
+	
+	tok = gettok(p);
+	tok->type = JStr;
+	for(tok->start = ++s; *s != '\0' && utfrune("\"", *s) == nil; s++)
+		if(utfrune("\\", *s) != nil)
+			s++;
+	tok->end = s;
+	return s;
+}
+
+static char *
+saccept(char *src, char *str)
+{
+	while(*src != '\0' && utfrune(str, *src) != nil)
+		src++;
+	return src;
+}
+
+static void
+incnsub(Jparser *p)
+{
+	if(p->stktop > 0)
+		p->tokens[p->tokstk[p->stktop - 1]].nsub++;
+}
+
+int
+Jtokenise(Jparser *p, char *s)
+{
+	Jtok *tok;
+	Jtype type;
+	
+	memset(p->tokens, 0, p->mtok * sizeof *p->tokens);
+	
+	for(; *s != '\0'; s++){
+		if(utfrune("\t\r\n ", *s) != nil)
+			continue;
+			
+		if(utfrune(":", *s) != nil)
+			if(p->ntok > 0 && p->tokens[p->ntok - 1].type == JStr)
+				continue;
+			else{
+				werrstr("unexpected :");
+				return -1;
+			}
+		
+		if(utfrune(",", *s) != nil)
+			if(p->stktop != 0)
+				continue;
+			else{
+				werrstr(", outside of object or array");
+				return -1;
+			}
+		
+		if(utfrune("{[", *s) != nil){
+			tok = gettok(p);
+			tok->start = s;
+			if(p->stktop == JStksz){
+				werrstr("stack overflow");
+				return -1;
+			}
+			incnsub(p);
+			p->tokstk[p->stktop++] = p->ntok - 1;
+			tok->type = utfrune("{", *s) != nil ? JObj:JArr;
+			continue;
+		}
+		
+		if(utfrune("}]", *s) != nil){
+			type = utfrune("}", *s) != nil ? JObj:JArr;
+			if(p->ntok == 0)
+				goto Jmatcherr;
+			for(tok = &p->tokens[p->ntok - 1]; tok >= p->tokens; tok--){
+				if(tok->start != nil && tok->end == nil){
+					if(tok->type != type){
+						werrstr("expected %c", type == JObj ? '}':']');
+						return -1;
+					}
+					if(p->stktop == 0){
+						werrstr("stack underflow");
+						return -1;
+					}
+					p->stktop--;
+					tok->end = s + 1;
+					goto Jnext;
+				}
+			}
+Jmatcherr:
+			werrstr("no matching %c", type == JObj ? '{':'[');
+			return -1;
+		}
+		
+		if(utfrune("\"", *s) != nil){
+			s = parsestr(p, s);
+			if(*s == '\0' || utfrune("\"", *s) == nil){
+				werrstr("expected \"");
+				return -1;
+			}
+			incnsub(p);
+			continue;
+		}
+		
+		if(utfrune("tf", *s) != nil){
+			tok = gettok(p);
+			tok->type = JBool;
+			tok->start = s;
+			tok->end = s + 1;
+			incnsub(p);
+			continue;
+		}
+		
+		if(utfrune("n", *s) != nil){
+			tok = gettok(p);
+			tok->type = JNil;
+			tok->start = s;
+			tok->end = s + 1;
+			incnsub(p);
+			continue;
+		}
+		
+		/* Try and parse a number */
+		tok = gettok(p);
+		tok->type = JNum;
+		tok->start = s;
+		s = saccept(s, "+-");
+		s = saccept(s, "0123456789");
+		s = saccept(s, ".");
+		s = saccept(s, "0123456789");
+		s = saccept(s, "eE");
+		s = saccept(s, "+-");
+		s = saccept(s, "0123456789");
+		tok->end = s;
+		
+		if(*s != '\0' && utfrune("\t\r\n }],", *s) == nil){
+			werrstr("number format error");
+			return -1;
+		}
+		
+		incnsub(p);
+		s--;
+Jnext:;
+	}
+	
+	if(p->stktop != 0){
+		werrstr("unexpected end of input");
+		return -1;
+	}
+	
+	return 0;
+}
+
+char *
+Jtokstr(Jtok *t)
+{
+	static char *buf;
+	static ulong sz = 0;
+	ulong len;
+	char *src, *dst;
+	
+	len = t->end - t->start;
+	if(sz <= len){
+		sz = len + 1;
+		buf = erealloc(buf, sz);
+	}
+	
+	for(src = t->start, dst = buf; len > 0; len--){
+		if(*src == '\\')
+			src++, len--;
+		*dst++ = *src++;
+	}
+	*dst = '\0';
+	return buf;
+}
+
+uint
+Jnext(Jparser *p, uint i)
+{
+	int j;
+	if(i >= p->ntok)
+		return i;
+	if(p->tokens[i].type != JObj && p->tokens[i].type != JArr)
+		return i + 1;
+	for(j = i; j < p->ntok && p->tokens[i].end > p->tokens[j].start; j++);
+	return j;
+}
+
+int
+Jfind(Jparser *p, uint r, char *name)
+{
+	uint i, j;
+	assert(r < p->ntok);
+	assert(p->tokens[r].type == JObj);
+	assert(p->tokens[r].nsub % 2 == 0);
+	
+	for(i = 0, j = r + 1; i < p->tokens[r].nsub; i += 2){
+		assert(p->tokens[j].type == JStr);
+		if(strcmp(Jtokstr(&p->tokens[j]), name) == 0)
+			return j + 1;
+		j = Jnext(p, j + 1);
+	}
+	
+	if(j >= p->ntok)
+		return -1;
+	return j;
+}
diff --git a/json.h b/json.h
new file mode 100644
index 0000000..215f716
--- /dev/null
+++ b/json.h
@@ -0,0 +1,38 @@
+#pragma src "/usr/doc/src/json"
+#pragma	lib	"json.a"
+
+enum{
+	JStksz= 32,
+};
+
+typedef enum{
+	JBool,
+	JNum,
+	JNil,
+	JObj,
+	JArr,
+	JStr,
+} Jtype;
+
+typedef struct{
+	Jtype type;
+	char *start;
+	char *end;
+	uint nsub;
+} Jtok;
+
+typedef struct{
+	uint ntok, mtok;
+	Jtok *tokens;
+	uint stktop;
+	uint tokstk[JStksz];
+} Jparser;
+
+void Jinit(Jparser *);
+void Jterm(Jparser *);
+
+int Jtokenise(Jparser *, char *);
+char *Jtokstr(Jtok *);
+
+int Jfind(Jparser *, uint, char *);
+uint Jnext(Jparser *, uint);
diff --git a/json.man b/json.man
new file mode 100644
index 0000000..d6867e9
--- /dev/null
+++ b/json.man
@@ -0,0 +1,190 @@
+.TH JSON 2
+.SH NAME
+Jinit, Jtokenise, Jfind, Jnext, Jtokstr \- JSON parser
+.SH SYNOPSIS
+.B #include <u.h>
+.br
+.B #include <libc.h>
+.br
+.B #include <json.h>
+.PP
+.nf
+.ft L
+typedef enum{
+	JBool,
+	JNum,
+	JNil,
+	JObj,
+	JArr,
+	JStr,
+} Jtype;
+.fi
+.PP
+.nf
+.ft L
+typedef struct{
+	Jtype type;
+	char *start;
+	char *end;
+	uint nsub;
+} Jtok;
+.fi
+.PP
+.nf
+.ft L
+typedef struct{
+	uint ntok, mtok;
+	Jtok *tokens;
+	uint stktop;
+	uint tokstk[JStksz];
+} Jparser;
+.fi
+.PP
+.B
+void Jinit(Jparser *p)
+.PP
+.B
+void Jterm(Jparser *p);
+.PP
+.B
+int Jtokenise(Jparser *p, char *s)
+.PP
+.B
+char *Jtokstr(Jtok *t)
+.PP
+.B
+int Jfind(Jparser *p, uint i, char *s)
+.PP
+.B
+uint Jnext(Jparser *p, uint i)
+.PP
+.SH DESCRIPTION
+These routines implement a fast, in-memory JSON parser.  The parser
+operates on a string which is expected to contain the full JSON text.
+The string is not altered in any way.
+.PP
+A parser object
+.I p
+is initialised using
+.IR Jinit .
+The parser object contains the number of tokens found
+.I ntok
+and an array of tokens
+.IR tokens .
+The other fields are for internal use.  As memory is dynamically
+allocated, an initialised parser
+.I p
+must be terminated with
+.I Jterm
+after use.
+.PP
+An initialised parser object is populated with tokens using
+.IR Jtokenise .
+.I Jtokenise
+accepts parser
+.I p
+and a string
+.I s
+and returns a filled
+.I Jparser
+structure containing tokens found in
+.IR s .
+.I S
+is unaltered.
+.PP
+Each token
+.I Jtoken
+has a type
+.IR Jtype ,
+pointers to the
+.I start
+and 
+.I end
+of the token's location in the JSON string,
+and a count
+.I nsub
+of the number of subtokens.  The types of a token are
+.TP
+.I JBool
+Boolean value.  Token must be either 't' or 'f'.
+.TP
+.I JNum
+A real number.
+.TP
+.I JNil
+Nill token matching 'n'.
+.TP
+.I JObj
+An object (dictionary) containing key:value pairs.
+.TP
+.I JArr
+An array.
+.TP
+.I JStr
+A string.
+.PP
+The pointers
+.I start
+and
+.I end
+point to the beginning character of the token and to one character
+after the end of the token respectively.  The exception are
+.I JStr
+tokens, where
+.I start
+points to the first character after the double quote and
+.I end
+points to the terminating double quote.  As arrays
+.I JArr
+and objects
+.I JObj
+are the only types allowing subelements, 
+.I nsub
+is necessarily 0 for 
+.IR JBool ,
+.IR JNum ,
+.IR JNil ,
+and
+.IR JStr .
+.PP
+The functions
+.IR Jtokstr ,
+.IR Jfind
+and
+.I Jnext are functions for working with the tokens.
+.I Jtokstr
+takes a token and extracts out the string pointed to by the token.
+The string is null terminated and escaped characters are handled
+appropriately.  As
+.I Jtokstr
+uses a static buffer, the string returned is destroyed on the next
+call.
+.PP
+.I Jfind
+is used to find specifically named attributes within a 
+.IR JObj .
+Given an index into the
+.I tokens
+array of
+.IR p –
+which must be a
+.I JObj
+– and the name of an attribute
+.IR s ,
+.I Jfind
+returns the index of the token corresponding to the value matching the
+attribute name.  If no attribute is found then
+.I Jfind
+returns -1.
+.PP
+Finally,
+.I Jnext
+takes and index and calculates the index of the next element at that
+depth.  It is used for skipping over
+.IR JObj s
+and
+.IR JArr s
+which can contain a number of subelements.
+.SH BUGS
+The parser does not implement a full grammar so some JSON errors are
+not detected when parsing.
diff --git a/mkfile b/mkfile
new file mode 100644
index 0000000..f7f40f8
--- /dev/null
+++ b/mkfile
@@ -0,0 +1,11 @@
+</$objtype/mkfile
+
+LIB=/$objtype/lib/json.a
+
+HFILES=\
+	json.h\
+
+OFILES=\
+	json.$O\
+
+</sys/src/cmd/mklib
author	Justin Bedo <cu@cua0.org>	2012-06-12 12:08:35 +0200
committer	Justin Bedo <cu@cua0.org>	2012-06-12 12:08:35 +0200
commit	06a376f7f913bd57a1256898a04882818deaefa4 (patch)
tree	33423384477ee90ccf022f29f4685bccc2a3874e