|
中华网络安全联盟 作者:佚名 来源:网络转载 时间:2006-3-20
编程手记之ANSI C篇-(四)XML解析自动机 在实现了通用连接件、哈希表和二叉分析树后,我可以用这些功能做一个XML分析工具了。XML以其标准的结构,严谨的文法,强大的描述能力,以及独立和开放性,在描述数据资源方面得到了广泛的应用。在各种语言中,XML文档(DOM)的分析工具已层出不穷,在此,从文法的角度,来完成一个XML分析自动机的C实现。 1、XML基本文法: /*XML脚本有XML的声明和一个根结点构成*/ XMLScripts --> XMLNote + XMLEntity /*XML声明由XML标识首部、零个或多个XML属性节组成,之间空格符隔开,最后是XML标识尾部*/ XMLNote --> '<?xml' + { BLANK + XMLAttr} + '?>' /*XML节点由节点首部、节点文本、零个或多个子节点和节点尾部组成,没有文本、自节点的节点称为空节点*/ XMLEntity --> TagHead + [ TagText + {XMLEntity} + TagTail ] /*节点首部由节点开始符、节点名称、零个或多个属性节组成,没有文本和子节点的节点可由终止符后缀*/ TagHead --> '<' + TagName + {BLANK + XMLAttr} + [ '/>' | '' ] /*节点文本由字符串组成*/ TagText --> {'a | b c| ...0 | 1 | ..'} /*节点尾部由节点终止符和节点名称组成*/ TagTail --> '</' + TagName + '>' /*节点名称由字符串组成*/ TagName --> {'a | b c| ...0 | 1 | ..'} /*属性节属性名称、赋值符、首尾扩号和属性值组成*/ XMLAttr --> AttrName + '=' + '"' + AttrValue + '"' /*属性名称由字符串组成*/ AttrName --> {'a | b c| ...0 | 1 | ..'} /*属性值由字符串组成*/ AttrValue --> {'a | b c| ...0 | 1 | ..'} /*空格符由下列符号组成*/ BLANK --> {' ' | '\t' | '\r' | '\n'} 2、XML解析自动机的定义: /*定义自动机返回状态码*/ #define XP_SUCCESS 0 #define XP_CONTINUE 1 #define XP_ERROR -1 /*定义自动机当前的操作码*/ typedef enum{paChild = 0,paSibling = 1,paAttr = 2}XMLParseAction; /*自动机器的数据结构*/ typedef struct tagXMLMac{ LINKPTR tree; /*二叉树用以维系节点关系和存储节点属性*/ LINKPTR parent; /*指向当前分析的父节点的,用于回溯*/ XMLParseAction act; /*当前自动机操作码*/ int retcode; /*自动机状态码*/ TCHAR* token; /*当前分析位置的字符串指针*/ }XMLMac; /*定义一些节点的固有属性*/ #define NODENAME _T("NodeName") #define NODETEXT _T("NodeText") #define NODETYPE _T("NodeType") /*定义一些常用的固定符号*/ #define XMLNS _T("xmlns:") #define NSS _T(':') #define ASIGN _T('=') #define QUATE _T('"') /*定义终止符号集合的停止符*/ #define NILL _T('\x02') /*定义空格同意符*/ static TCHAR BlankSign[] = {_T(' '),_T('?'),_T('\t'),'\r',_T('\n'),NILL}; /*定义节点文本终止符*/ static TCHAR TextTerm[] = {_T('<'),_T('\0'),NILL}; /*定义属性终止符*/ static TCHAR AttrTerm[] = {_T('"'),/*_T('>'),*/_T('\0'),NILL}; /*定义节点首部终止符*/ static TCHAR TagHeadTerm[] = {_T(' '),_T('/'),_T('>'),_T('\t'),_T('\r'),_T('\n'),_T('\0'),NILL}; /*定义节点尾部终止符*/ static TCHAR TagTailTerm[] = {_T('>'),_T('\0'),NILL}; /*定义一些节点类型*/ typedef enum{ttENT = 0,ttXML = 1,ttCMT = 2, ttELE = 3, ttCDA = 4, ttDOC = 5, ttEXT = 6, ttNOT = 7}TagType; #define TT_ENT _T("<") /*define normal entity tag*/ #define TT_XML _T("<?xml") /*define xml root tag*/ #define TT_CMT _T("<!--") /*define comments tag*/ #define TT_ELE _T("<!ELEMENT") /*define data definition element tag*/ #define TT_CDA _T("<![CDATA[") /*define fregment data envelope*/ #define TT_DOC _T("<!DOCTYPE") /*define xml dtd source*/ #define TT_EXT _T("<!ENTITY") /*define outside entity*/ #define TT_NOT _T("<!NOTATION") /*define notation tag*/ 3、定义解析过程实现: /*测试字符是否是空格符*/ int _IsBlankSign(TCHAR ch) { int i = 0; while(BlankSign[i] != NILL) { if(ch == BlankSign[i]) return 1; i++; } return 0; } /*测试字符是否是节点首部终止符*/ int _IsTagHeadTerm(TCHAR ch) { int i = 0; while(TagHeadTerm[i] != NILL) { if(ch == TagHeadTerm[i]) return 1; i++; } return 0; } /*测试字符是否是节点尾部终止符*/ int _IsTagTailTerm(TCHAR ch) { int i = 0; while(TagTailTerm[i] != NILL) { if(ch == TagTailTerm[i]) return 1; i++; } return 0; } /*测试字符是否是节点文本终止符*/ int _IsTextTerm(TCHAR ch) { int i = 0; while(TextTerm[i] != NILL) { if(ch == TextTerm[i]) return 1; i++; } return 0; } /*测试字符是否是属性值终止符*/ int _IsAttrTerm(TCHAR ch) { int i = 0; while(AttrTerm[i] != NILL) { if(ch == AttrTerm[i]) return 1; i++; } return 0; } /*测试字符串首是否包括XML名域*/ int _IsNameSpace(TCHAR* key) { TCHAR* token = key; int len; len = _tcslen(XMLNS); if(_tcsncpy(token,XMLNS,len) == 0) return 1; else return 0; } /*测试节点类型*/ int _TagType(TCHAR* sz) { if(!_tcsncmp(sz,TT_XML,_tcslen(TT_XML))) return ttXML; else if(!_tcsncmp(sz,TT_CMT,_tcslen(TT_CMT))) return ttCMT; else if(!_tcsncmp(sz,TT_ELE,_tcslen(TT_ELE))) return ttELE; else if(!_tcsncmp(sz,TT_CDA,_tcslen(TT_CDA))) return ttCDA; else if(!_tcsncmp(sz,TT_DOC,_tcslen(TT_DOC))) return ttDOC; else if(!_tcsncmp(sz,TT_EXT,_tcslen(TT_EXT))) return ttEXT; else if(!_tcsncmp(sz,TT_NOT,_tcslen(TT_NOT))) return ttNOT; else if(!_tcsncmp(sz,TT_ENT,_tcslen(TT_ENT))) return ttENT; else return -1; } /*越过空格符*/ TCHAR* _XMLSkipBlank(TCHAR* szXML) { TCHAR* token = szXML; while(_IsBlankSign(*token)) token ++; if(*token == _T('\0')) return NULL; else return token; } /*越过XML声明节,如 <?xml ...>*/ TCHAR* _XMLSkipXML(TCHAR* szXML) { TCHAR* token = szXML + _tcslen(TT_XML); while(*token != _T('>') && *token != _T('\0')) token ++; if(*token == _T('>')) return token + 1; /*skip '<'*/ else return token; } /*越过注释节,如 <!-- ... -->*/ TCHAR* _XMLSkipCMT(TCHAR* szXML) { TCHAR* token = szXML + _tcslen(TT_CMT); while(*token != _T('>') && *token != _T('\0')) token ++; if(*token == _T('>')) return token + 1; /*skip '<'*/ else return token; } /*越过实体声明节,如 <!ELEMENT ...>*/ TCHAR* _XMLSkipELE(TCHAR* szXML) { TCHAR* token = szXML + _tcslen(TT_ELE); while(*token != _T('>') && *token != _T('\0')) token ++; if(*token == _T('>')) return token + 1; /*skip '<'*/ else return token; } /*越过CDATA节,如 <!CDATA[[...]]>*/ TCHAR* _XMLSkipCDA(TCHAR* szXML) { TCHAR* token = szXML + _tcslen(TT_CDA); while(*token != _T(']') && *(token + 1) != _T(']') && *token != _T('\0')) token ++; if(*token == _T(']')) return token + 2; /*skip ']]'*/ else return token; } /*越过文档声明节,如 <!DOCTYPE ...>*/ TCHAR* _XMLSkipDOC(TCHAR* szXML) { TCHAR* token = szXML + _tcslen(TT_DOC); while(*token != _T('>') && *token != _T('\0')) token ++; if(*token == _T('>')) return token + 1; /*skip '<'*/ else return token; } /*越过外部实体声明节,如 <!ENTITY ...>*/ TCHAR* _XMLSkipEXT(TCHAR* szXML) { TCHAR* token = szXML + _tcslen(TT_EXT); while(*token != _T('>') && *token != _T('\0')) token ++; if(*token == _T('>')) return token + 1; /*skip '<'*/ else return token; } /*越过其他外部声明节,如 <!NOTATION ...>*/ TCHAR* _XMLSkipNOT(TCHAR* szXML) { TCHAR* token = szXML + _tcslen(TT_NOT); while(*token != _T('>') && *token != _T('\0')) token ++; if(*token == _T('>')) return token + 1; /*skip '<'*/ else return token; } /*越过赋值符,如 ... = "..."*/ TCHAR* _XMLSkipAsign(TCHAR* szXML) { TCHAR* token = szXML; token = _XMLSkipBlank(token); if(*token != ASIGN) return NULL; token ++; return _XMLSkipBlank(token); } /*越过节点首部,如 <sometag> */ TCHAR* _XMLSkipTagHeader(TCHAR* szXML) { TCHAR* token = szXML + 1; assert(*szXML == _T('<')); token = _XMLSkipBlank(token); if(token == NULL || *token == _T('>')) return NULL; while(!_IsTagHeadTerm(*token)) token ++; if(*token == _T('\0')) return NULL; return token; } /*越过节点尾部并回溯嵌套节点,如 </sometag1></sometag2>...*/ void _XMLSkipTagTail(XMLMac* pm) { TCHAR* token = pm->token; token = _XMLSkipBlank(token); if(token == NULL) { pm->retcode = XP_SUCCESS; return; } if(*token == _T('/') || (*token == _T('<') && *(token + 1) == _T('/'))) { while(!_IsTagTailTerm(*token)) token ++; if(*token == _T('\0')) { pm->retcode = XP_ERROR; return; } token ++ ;//skip _T('>') /*回溯到父节点*/ pm->parent = GetTreeDataParentItem(pm->parent); pm->token = token; pm->act = paSibling; _XMLSkipTagTail(pm); }else { pm->token = token; } } /*分离名域和节点名称,如 'xsl:entname'*/ void _SplitNameSpace(TCHAR* sz,TCHAR** ns,int* nslen,TCHAR** ent,int* entlen) { TCHAR* token = sz; *ns = *ent = NULL; *nslen = *entlen = 0; while(!_IsTagHeadTerm(*token) && *token != NSS) token ++; if(*token == NSS) { *ns = sz; *nslen = token - sz; token ++; /*skip NSS ':'*/ *ent = token; while(!_IsTagHeadTerm(*token)) { token ++; *entlen = *entlen + 1; } }else { *ent = sz; *entlen = token - sz; } } /*解析节点的属性集合*/ void _XMLParseAttr(XMLMac* pm) { TCHAR* token = pm->token; int keylen,vallen; TCHAR* key; TCHAR* val; key = pm->token; while(!_IsBlankSign(*token) && *token != ASIGN) token ++; keylen = token - pm->token ; token = _XMLSkipAsign(token); if(token == NULL) { pm->retcode = XP_ERROR; return; } if(*token == QUATE) /*skip left QUATE '"'*/ token ++; val = token; while(!_IsAttrTerm(*token)) token ++; vallen = token - val ; WriteTreeDataItemProper(pm->parent,key,keylen,val,vallen); if(*token == QUATE) /*skip right QUATE '"'*/ token ++; token = _XMLSkipBlank(token); if(token == NULL) { pm->retcode = XP_ERROR; return; } /*test entity property set is terminated*/ if(*token == _T('>') || *token == _T('/')) { pm->retcode = XP_SUCCESS; pm->token = token ; return; } pm->token = token; pm->retcode = XP_CONTINUE; _XMLParseAttr(pm); } /*解析节点文本,如 <sometag>sometext</sometag>*/ void _XMLParseTagText(XMLMac* pm) { TCHAR* token = pm->token; TCHAR* val; int vallen; token = _XMLSkipBlank(token); if(token == NULL) { pm->retcode = XP_ERROR; return; } val = token; while(!_IsTextTerm(*token)) token ++; if(*token == _T('\0')) { pm->retcode = XP_ERROR; return; } vallen = token - val; WriteTreeDataItemProper(pm->parent,NODETEXT,-1,val,vallen); pm->token = token; pm->retcode = XP_SUCCESS; } /*解析节点和他的子节点*/ void _XMLParseTagEntity(XMLMac* pm) { TCHAR* token = pm->token; TCHAR* tag; TCHAR* ns; TCHAR* ent; int len,nslen,entlen; int tt; LINKPTR item; token = _XMLSkipBlank(token); if(token == NULL) { if(pm->parent == NULL) pm->retcode = XP_SUCCESS; //no more to parse else pm->retcode = XP_ERROR ; //lost some tag return; } tt = _TagType(token); if(tt < 0) { pm->retcode = XP_ERROR; /*invalid entity header*/ return; } /*部分节点类型在此不作分析*/ if(tt == ttXML) { token = _XMLSkipXML(token); /*do nothing*/ pm->token = token; _XMLParseTagEntity(pm); return; }else if(tt == ttCMT) { token = _XMLSkipCMT(token); /*do nothing*/ pm->token = token; _XMLParseTagEntity(pm); return; }else if(tt == ttELE) { token = _XMLSkipELE(token); /*do nothing*/ pm->token = token; _XMLParseTagEntity(pm); return; }else if(tt == ttCDA) { token = _XMLSkipCDA(token); /*do nothing*/ pm->token = token; _XMLParseTagEntity(pm); return; }else if(tt == ttDOC) { token = _XMLSkipDOC(token); /*do nothing*/ pm->token = token; _XMLParseTagEntity(pm); return; }else if(tt == ttEXT) { token = _XMLSkipEXT(token); /*do nothing*/ pm->token = token; _XMLParseTagEntity(pm); return; }else if(tt == ttNOT) { token = _XMLSkipNOT(token); /*do nothing*/ pm->token = token; _XMLParseTagEntity(pm); return; } /*开始分析节点*/ tag = token + 1; /*skip '<'*/ token = _XMLSkipTagHeader(token); if(token == NULL) { pm->retcode = XP_ERROR; /*invalid entry body*/ return; } len = token - tag; item = InsertTreeDataItem(pm->tree,pm->parent,LINK_LAST); /*分析节点名域和名称*/ _SplitNameSpace(tag,&ns,&nslen,&ent,&entlen); if(nslen == 0) WriteTreeDataItemProper(item,NODENAME,-1,ent,entlen); else { WriteTreeDataItemProper(item,NODENAME,-1,ent,entlen); WriteTreeDataItemProper(item,XMLNS,-1,ns,nslen); } /*新节点并作为当前父节点,接着去分析下一个节点*/ pm->parent = item; token = _XMLSkipBlank(token); if(token == NULL) { pm->retcode = XP_ERROR; return; } if(*token != _T('>')) /*节点首部有属性集合*/ { if(*token != _T('/')) /*非空节点,以下作节点属性集合分析*/ { pm->act = paAttr; pm->token = token; _XMLParseAttr(pm); if(pm->retcode == XP_ERROR) return; }else /*空节点*/ pm->token = token; token = pm->token; if(*token == _T('/')) /*空节点,则完成该节点分析*/ { pm->token = token; _XMLSkipTagTail(pm); /*完成空节点分析并回溯*/ if(pm->retcode == XP_ERROR) { return; } _XMLParseTagEntity(pm); /*分析下一个兄弟节点*/ return; }else token ++; /*skip _T('>')*/ }else { token ++; //skip _T('>') pm->token = token; } /*分析节点文本*/ pm->token = token; pm->retcode = XP_CONTINUE; _XMLParseTagText(pm); if(pm->retcode == XP_ERROR) return; token = pm->token; assert(*token == _T('<')); if(*(token + 1) != _T('/')) /*该节点有子节点*/ { pm->token = token; pm->act = paChild; pm->retcode = XP_CONTINUE; _XMLParseTagEntity(pm); /*分析子节点*/ }else /*没有子节点*/ { pm->token = token; _XMLSkipTagTail(pm); /*完成该节点分析并回溯*/ if(pm->retcode == XP_ERROR) { return; } _XMLParseTagEntity(pm); /*去分析下一个兄弟节点*/ } } 4、以上的实现只是沧海一粟,更多的功能有待进一步去实现,让我们共同努力吧!我的MAIL:jdhot2003@hotmail.com |