From 06a376f7f913bd57a1256898a04882818deaefa4 Mon Sep 17 00:00:00 2001 From: Justin Bedo Date: Tue, 12 Jun 2012 12:08:35 +0200 Subject: Initial implementation --- README | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++ json.c | 248 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ json.h | 38 ++++++++++ json.man | 190 ++++++++++++++++++++++++++++++++++++++++++++++++ mkfile | 11 +++ 5 files changed, 685 insertions(+) create mode 100644 README create mode 100644 json.c create mode 100644 json.h create mode 100644 json.man create mode 100644 mkfile diff --git a/README b/README new file mode 100644 index 0000000..a35851c --- /dev/null +++ b/README @@ -0,0 +1,198 @@ + + + + JSON(2) JSON(2) + + + + + + NAME + Jinit, Jtokenise, Jfind, Jnext, Jtokstr - JSON parser + + SYNOPSIS + #include + #include + #include + + typedef enum{ + JBool, + JNum, + JNil, + JObj, + JArr, + JStr, + } Jtype; + + typedef struct{ + Jtype type; + char *start; + char *end; + uint nsub; + } Jtok; + + typedef struct{ + uint ntok, mtok; + Jtok *tokens; + uint stktop; + uint tokstk[JStksz]; + } Jparser; + + void Jinit(Jparser *p) + + void Jterm(Jparser *p); + + int Jtokenise(Jparser *p, char *s) + + char *Jtokstr(Jtok *t) + + int Jfind(Jparser *p, uint i, char *s) + + uint Jnext(Jparser *p, uint i) + + DESCRIPTION + These routines implement a fast, in-memory JSON parser. The + parser operates on a string which is expected to contain the + full JSON text. The string is not altered in any way. + + A parser object p is initialised using Jinit. The parser + object contains the number of tokens found ntok and an array + + + + + + + + + + + JSON(2) JSON(2) + + + + of tokens tokens. The other fields are for internal use. As + memory is dynamically allocated, an initialised parser p + must be terminated with Jterm after use. + + An initialised parser object is populated with tokens using + Jtokenise. Jtokenise accepts parser p and a string s and + returns a filled Jparser structure containing tokens found + in s. S is unaltered. + + Each token Jtoken has a type Jtype, pointers to the start + and end of the token's location in the JSON string, and a + count nsub of the number of subtokens. The types of a token + are + + JBool + Boolean value. Token must be either 't' or 'f'. + + JNum A real number. + + JNil Nill token matching 'n'. + + JObj An object (dictionary) containing key:value pairs. + + JArr An array. + + JStr A string. + + The pointers start and end point to the beginning character + of the token and to one character after the end of the token + respectively. The exception are JStr tokens, where start + points to the first character after the double quote and end + points to the terminating double quote. As arrays JArr and + objects JObj are the only types allowing subelements, nsub + is necessarily 0 for JBool, JNum, JNil, and JStr. + + The functions Jtokstr, Jfind and Jnext are functions for + working with Jtokstr takes a token and extracts out the + string pointed to by the token. The string is null termi- + nated and escaped characters are handled appropriately. As + Jtokstr uses a static buffer, the string returned is + destroyed on the next call. + + Jfind is used to find specifically named attributes within a + JObj. Given an index into the tokens array of pā€“ which must + be a JObj ā€“ and the name of an attribute s, Jfind returns + the index of the token corresponding to the value matching + the attribute name. If no attribute is found then Jfind + returns -1. + + Finally, Jnext takes and index and calculates the index of + the next element at that depth. It is used for skipping + over JObjs and JArrs which can contain a number of + + + + + + + + + + + JSON(2) JSON(2) + + + + subelements. + + BUGS + The parser does not implement a full grammar so some JSON + errors are not detected when parsing. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/json.c b/json.c new file mode 100644 index 0000000..9f52bc7 --- /dev/null +++ b/json.c @@ -0,0 +1,248 @@ +#include +#include +#include + +static void* +erealloc(void *ptr, ulong sz) +{ + ptr = realloc(ptr, sz); + if(ptr == nil) + sysfatal("realloc: %r"); + return ptr; +} + +void +Jinit(Jparser *p) +{ + memset(p, 0, sizeof *p); +} + +void +Jterm(Jparser *p) +{ + if(p->mtok > 0 || p->tokens != nil) + free(p->tokens); +} + +static Jtok* +gettok(Jparser *p) +{ + uint i; + + if(p->ntok == p->mtok){ + p->mtok = (p->ntok << 1) + 1; + p->tokens = erealloc(p->tokens, p->mtok * sizeof *p->tokens); + for(i = p->ntok; i < p->mtok; i++) + memset(&p->tokens[i], 0, sizeof *p->tokens); + } + + return &p->tokens[p->ntok++]; +} + +static char * +parsestr(Jparser *p, char *s) +{ + Jtok *tok; + + tok = gettok(p); + tok->type = JStr; + for(tok->start = ++s; *s != '\0' && utfrune("\"", *s) == nil; s++) + if(utfrune("\\", *s) != nil) + s++; + tok->end = s; + return s; +} + +static char * +saccept(char *src, char *str) +{ + while(*src != '\0' && utfrune(str, *src) != nil) + src++; + return src; +} + +static void +incnsub(Jparser *p) +{ + if(p->stktop > 0) + p->tokens[p->tokstk[p->stktop - 1]].nsub++; +} + +int +Jtokenise(Jparser *p, char *s) +{ + Jtok *tok; + Jtype type; + + memset(p->tokens, 0, p->mtok * sizeof *p->tokens); + + for(; *s != '\0'; s++){ + if(utfrune("\t\r\n ", *s) != nil) + continue; + + if(utfrune(":", *s) != nil) + if(p->ntok > 0 && p->tokens[p->ntok - 1].type == JStr) + continue; + else{ + werrstr("unexpected :"); + return -1; + } + + if(utfrune(",", *s) != nil) + if(p->stktop != 0) + continue; + else{ + werrstr(", outside of object or array"); + return -1; + } + + if(utfrune("{[", *s) != nil){ + tok = gettok(p); + tok->start = s; + if(p->stktop == JStksz){ + werrstr("stack overflow"); + return -1; + } + incnsub(p); + p->tokstk[p->stktop++] = p->ntok - 1; + tok->type = utfrune("{", *s) != nil ? JObj:JArr; + continue; + } + + if(utfrune("}]", *s) != nil){ + type = utfrune("}", *s) != nil ? JObj:JArr; + if(p->ntok == 0) + goto Jmatcherr; + for(tok = &p->tokens[p->ntok - 1]; tok >= p->tokens; tok--){ + if(tok->start != nil && tok->end == nil){ + if(tok->type != type){ + werrstr("expected %c", type == JObj ? '}':']'); + return -1; + } + if(p->stktop == 0){ + werrstr("stack underflow"); + return -1; + } + p->stktop--; + tok->end = s + 1; + goto Jnext; + } + } +Jmatcherr: + werrstr("no matching %c", type == JObj ? '{':'['); + return -1; + } + + if(utfrune("\"", *s) != nil){ + s = parsestr(p, s); + if(*s == '\0' || utfrune("\"", *s) == nil){ + werrstr("expected \""); + return -1; + } + incnsub(p); + continue; + } + + if(utfrune("tf", *s) != nil){ + tok = gettok(p); + tok->type = JBool; + tok->start = s; + tok->end = s + 1; + incnsub(p); + continue; + } + + if(utfrune("n", *s) != nil){ + tok = gettok(p); + tok->type = JNil; + tok->start = s; + tok->end = s + 1; + incnsub(p); + continue; + } + + /* Try and parse a number */ + tok = gettok(p); + tok->type = JNum; + tok->start = s; + s = saccept(s, "+-"); + s = saccept(s, "0123456789"); + s = saccept(s, "."); + s = saccept(s, "0123456789"); + s = saccept(s, "eE"); + s = saccept(s, "+-"); + s = saccept(s, "0123456789"); + tok->end = s; + + if(*s != '\0' && utfrune("\t\r\n }],", *s) == nil){ + werrstr("number format error"); + return -1; + } + + incnsub(p); + s--; +Jnext:; + } + + if(p->stktop != 0){ + werrstr("unexpected end of input"); + return -1; + } + + return 0; +} + +char * +Jtokstr(Jtok *t) +{ + static char *buf; + static ulong sz = 0; + ulong len; + char *src, *dst; + + len = t->end - t->start; + if(sz <= len){ + sz = len + 1; + buf = erealloc(buf, sz); + } + + for(src = t->start, dst = buf; len > 0; len--){ + if(*src == '\\') + src++, len--; + *dst++ = *src++; + } + *dst = '\0'; + return buf; +} + +uint +Jnext(Jparser *p, uint i) +{ + int j; + if(i >= p->ntok) + return i; + if(p->tokens[i].type != JObj && p->tokens[i].type != JArr) + return i + 1; + for(j = i; j < p->ntok && p->tokens[i].end > p->tokens[j].start; j++); + return j; +} + +int +Jfind(Jparser *p, uint r, char *name) +{ + uint i, j; + assert(r < p->ntok); + assert(p->tokens[r].type == JObj); + assert(p->tokens[r].nsub % 2 == 0); + + for(i = 0, j = r + 1; i < p->tokens[r].nsub; i += 2){ + assert(p->tokens[j].type == JStr); + if(strcmp(Jtokstr(&p->tokens[j]), name) == 0) + return j + 1; + j = Jnext(p, j + 1); + } + + if(j >= p->ntok) + return -1; + return j; +} diff --git a/json.h b/json.h new file mode 100644 index 0000000..215f716 --- /dev/null +++ b/json.h @@ -0,0 +1,38 @@ +#pragma src "/usr/doc/src/json" +#pragma lib "json.a" + +enum{ + JStksz= 32, +}; + +typedef enum{ + JBool, + JNum, + JNil, + JObj, + JArr, + JStr, +} Jtype; + +typedef struct{ + Jtype type; + char *start; + char *end; + uint nsub; +} Jtok; + +typedef struct{ + uint ntok, mtok; + Jtok *tokens; + uint stktop; + uint tokstk[JStksz]; +} Jparser; + +void Jinit(Jparser *); +void Jterm(Jparser *); + +int Jtokenise(Jparser *, char *); +char *Jtokstr(Jtok *); + +int Jfind(Jparser *, uint, char *); +uint Jnext(Jparser *, uint); diff --git a/json.man b/json.man new file mode 100644 index 0000000..d6867e9 --- /dev/null +++ b/json.man @@ -0,0 +1,190 @@ +.TH JSON 2 +.SH NAME +Jinit, Jtokenise, Jfind, Jnext, Jtokstr \- JSON parser +.SH SYNOPSIS +.B #include +.br +.B #include +.br +.B #include +.PP +.nf +.ft L +typedef enum{ + JBool, + JNum, + JNil, + JObj, + JArr, + JStr, +} Jtype; +.fi +.PP +.nf +.ft L +typedef struct{ + Jtype type; + char *start; + char *end; + uint nsub; +} Jtok; +.fi +.PP +.nf +.ft L +typedef struct{ + uint ntok, mtok; + Jtok *tokens; + uint stktop; + uint tokstk[JStksz]; +} Jparser; +.fi +.PP +.B +void Jinit(Jparser *p) +.PP +.B +void Jterm(Jparser *p); +.PP +.B +int Jtokenise(Jparser *p, char *s) +.PP +.B +char *Jtokstr(Jtok *t) +.PP +.B +int Jfind(Jparser *p, uint i, char *s) +.PP +.B +uint Jnext(Jparser *p, uint i) +.PP +.SH DESCRIPTION +These routines implement a fast, in-memory JSON parser. The parser +operates on a string which is expected to contain the full JSON text. +The string is not altered in any way. +.PP +A parser object +.I p +is initialised using +.IR Jinit . +The parser object contains the number of tokens found +.I ntok +and an array of tokens +.IR tokens . +The other fields are for internal use. As memory is dynamically +allocated, an initialised parser +.I p +must be terminated with +.I Jterm +after use. +.PP +An initialised parser object is populated with tokens using +.IR Jtokenise . +.I Jtokenise +accepts parser +.I p +and a string +.I s +and returns a filled +.I Jparser +structure containing tokens found in +.IR s . +.I S +is unaltered. +.PP +Each token +.I Jtoken +has a type +.IR Jtype , +pointers to the +.I start +and +.I end +of the token's location in the JSON string, +and a count +.I nsub +of the number of subtokens. The types of a token are +.TP +.I JBool +Boolean value. Token must be either 't' or 'f'. +.TP +.I JNum +A real number. +.TP +.I JNil +Nill token matching 'n'. +.TP +.I JObj +An object (dictionary) containing key:value pairs. +.TP +.I JArr +An array. +.TP +.I JStr +A string. +.PP +The pointers +.I start +and +.I end +point to the beginning character of the token and to one character +after the end of the token respectively. The exception are +.I JStr +tokens, where +.I start +points to the first character after the double quote and +.I end +points to the terminating double quote. As arrays +.I JArr +and objects +.I JObj +are the only types allowing subelements, +.I nsub +is necessarily 0 for +.IR JBool , +.IR JNum , +.IR JNil , +and +.IR JStr . +.PP +The functions +.IR Jtokstr , +.IR Jfind +and +.I Jnext are functions for working with the tokens. +.I Jtokstr +takes a token and extracts out the string pointed to by the token. +The string is null terminated and escaped characters are handled +appropriately. As +.I Jtokstr +uses a static buffer, the string returned is destroyed on the next +call. +.PP +.I Jfind +is used to find specifically named attributes within a +.IR JObj . +Given an index into the +.I tokens +array of +.IR p ā€“ +which must be a +.I JObj +ā€“ and the name of an attribute +.IR s , +.I Jfind +returns the index of the token corresponding to the value matching the +attribute name. If no attribute is found then +.I Jfind +returns -1. +.PP +Finally, +.I Jnext +takes and index and calculates the index of the next element at that +depth. It is used for skipping over +.IR JObj s +and +.IR JArr s +which can contain a number of subelements. +.SH BUGS +The parser does not implement a full grammar so some JSON errors are +not detected when parsing. diff --git a/mkfile b/mkfile new file mode 100644 index 0000000..f7f40f8 --- /dev/null +++ b/mkfile @@ -0,0 +1,11 @@ +