XPath: Refactored tokenization/parsing to be more compliant

git-svn-id: http://pugixml.googlecode.com/svn/trunk@210 99668b35-9821-0410-8761-19e4c4f06640
2009-11-08 09:13:58 +00:00 · 2009-11-08 09:13:58 +00:00 · 19293d2558
commit 19293d2558
parent d132a265db
1 changed files with 274 additions and 261 deletions
--- a/src/pugixpath.cpp
+++ b/src/pugixpath.cpp
@ -40,9 +40,9 @@ namespace
 	enum chartype
 	{
 		ct_space = 1,			// \r, \n, space, tab
-		ct_start_symbol = 2,	// Any symbol > 127, a-z, A-Z, _, :
+		ct_start_symbol = 2,	// Any symbol > 127, a-z, A-Z, _
 		ct_digit = 4,			// 0-9
-		ct_symbol = 8			// Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
+		ct_symbol = 8			// Any symbol > 127, a-z, A-Z, 0-9, _, -, .
 	};
 	
 	const unsigned char chartype_table[256] =
@ -50,7 +50,7 @@ namespace
 		0,  0,  0,  0,  0,  0,  0,  0,     0,  1,  1,  0,  0,  1,  0,  0,     // 0-15
 		0,  0,  0,  0,  0,  0,  0,  0,     0,  0,  0,  0,  0,  0,  0,  0,     // 16-31
 		1,  0,  0,  0,  0,  0,  0,  0,     0,  0,  0,  0,  0,  8,  8,  0,     // 32-47
-		12, 12, 12, 12, 12, 12, 12, 12,    12, 12, 10, 0,  0,  0,  0,  0,     // 48-63
+		12, 12, 12, 12, 12, 12, 12, 12,    12, 12, 0,  0,  0,  0,  0,  0,     // 48-63
 		0,  10, 10, 10, 10, 10, 10, 10,    10, 10, 10, 10, 10, 10, 10, 10,    // 64-79
 		10, 10, 10, 10, 10, 10, 10, 10,    10, 10, 10, 0,  0,  0,  0,  10,    // 80-95
 		0,  10, 10, 10, 10, 10, 10, 10,    10, 10, 10, 10, 10, 10, 10, 10,    // 96-111
@ -819,7 +819,8 @@ namespace pugi
 		lex_comma,
 		lex_axis_attribute,
 		lex_dot,
-		lex_double_dot
+		lex_double_dot,
+		lex_double_colon
 	};

 	class xpath_lexer
@ -1063,6 +1064,18 @@ namespace pugi
 				break;
 			}

+			case ':':
+				if (*(m_cur+1) == ':')
+				{
+					m_cur += 2;
+					m_cur_lexeme = lex_double_colon;
+				}
+				else
+				{
+					m_cur_lexeme = lex_none;
+				}
+				break;
+
 			default:
 				if (is_chartype(*m_cur, ct_digit))
 				{
@ -1083,6 +1096,22 @@ namespace pugi
 				{
 					while (is_chartype(*m_cur, ct_symbol))
 						contents_push(*m_cur++);
+
+					if (m_cur[0] == ':')
+					{
+						if (m_cur[1] == '*') // namespace test ncname:*
+						{
+							contents_push(*m_cur++); // :
+							contents_push(*m_cur++); // *
+						}
+						else if (is_chartype(m_cur[1], ct_symbol)) // namespace test qname
+						{
+							contents_push(*m_cur++); // :
+
+							while (is_chartype(*m_cur, ct_symbol))
+								contents_push(*m_cur++);
+						}
+					}
 				
 					while (is_chartype(*m_cur, ct_space)) ++m_cur;

@ -2803,6 +2832,173 @@ namespace pugi

 		xpath_parser(const xpath_parser&);
 		xpath_parser& operator=(const xpath_parser&);
+
+		ast_type_t parse_function_name(const std::string& name, size_t argc)
+		{
+			switch (name[0])
+			{
+			case 'b':
+				if (name == "boolean" && argc == 1)
+					return ast_func_boolean;
+					
+				break;
+			
+			case 'c':
+				if (name == "count" && argc == 1)
+					return ast_func_count;
+				else if (name == "contains" && argc == 2)
+					return ast_func_contains;
+				else if (name == "concat" && argc == 2)
+					return ast_func_concat;
+				else if (name == "ceiling" && argc == 1)
+					return ast_func_ceiling;
+					
+				break;
+			
+			case 'f':
+				if (name == "false" && argc == 0)
+					return ast_func_false;
+				else if (name == "floor" && argc == 1)
+					return ast_func_floor;
+					
+				break;
+			
+			case 'i':
+				if (name == "id" && argc == 1)
+					return ast_func_id;
+					
+				break;
+			
+			case 'l':
+				if (name == "last" && argc == 0)
+					return ast_func_last;
+				else if (name == "lang" && argc == 1)
+					return ast_func_lang;
+				else if (name == "local-name" && argc <= 1)
+					return argc == 0 ? ast_func_local_name_0 : ast_func_local_name_1;
+			
+				break;
+			
+			case 'n':
+				if (name == "name" && argc <= 1)
+					return argc == 0 ? ast_func_name_0 : ast_func_name_1;
+				else if (name == "namespace-uri" && argc <= 1)
+					return argc == 0 ? ast_func_namespace_uri_0 : ast_func_namespace_uri_1;
+				else if (name == "normalize-space" && argc <= 1)
+					return argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1;
+				else if (name == "not" && argc == 1)
+					return ast_func_not;
+				else if (name == "number" && argc <= 1)
+					return argc == 0 ? ast_func_number_0 : ast_func_number_1;
+			
+				break;
+			
+			case 'p':
+				if (name == "position" && argc == 0)
+					return ast_func_position;
+				
+				break;
+			
+			case 'r':
+				if (name == "round" && argc == 1)
+					return ast_func_round;
+
+				break;
+			
+			case 's':
+				if (name == "string" && argc <= 1)
+					return argc == 0 ? ast_func_string_0 : ast_func_string_1;
+				else if (name == "string-length" && argc <= 1)
+					return argc == 0 ? ast_func_string_length_0 : ast_func_string_length_1;
+				else if (name == "starts-with" && argc == 2)
+					return ast_func_starts_with;
+				else if (name == "substring-before" && argc == 2)
+					return ast_func_substring_before;
+				else if (name == "substring-after" && argc == 2)
+					return ast_func_substring_after;
+				else if (name == "substring" && (argc == 2 || argc == 3))
+					return argc == 2 ? ast_func_substring_2 : ast_func_substring_3;
+				else if (name == "sum" && argc == 1)
+					return ast_func_sum;
+
+				break;
+			
+			case 't':
+				if (name == "translate" && argc == 3)
+					return ast_func_translate;
+				else if (name == "true" && argc == 0)
+					return ast_func_true;
+					
+				break;
+			}
+
+			return ast_none;
+		}
+
+		axis_t parse_axis_name(const std::string& name, bool& specified)
+		{
+			specified = true;
+
+			switch (name[0])
+			{
+			case 'a':
+				if (name == "ancestor")
+					return axis_ancestor;
+				else if (name == "ancestor-or-self")
+					return axis_ancestor_or_self;
+				else if (name == "attribute")
+					return axis_attribute;
+				
+				break;
+			
+			case 'c':
+				if (name == "child")
+					return axis_child;
+				
+				break;
+			
+			case 'd':
+				if (name == "descendant")
+					return axis_descendant;
+				else if (name == "descendant-or-self")
+					return axis_descendant_or_self;
+				
+				break;
+			
+			case 'f':
+				if (name == "following")
+					return axis_following;
+				else if (name == "following-sibling")
+					return axis_following_sibling;
+				
+				break;
+			
+			case 'n':
+				if (name == "namespace")
+					return axis_namespace;
+				
+				break;
+			
+			case 'p':
+				if (name == "parent")
+					return axis_parent;
+				else if (name == "preceding")
+					return axis_preceding;
+				else if (name == "preceding-sibling")
+					return axis_preceding_sibling;
+				
+				break;
+			
+			case 's':
+				if (name == "self")
+					return axis_self;
+				
+				break;
+			}
+
+			specified = false;
+			return axis_child;
+		}
 	    
 	    // PrimaryExpr ::= VariableReference | '(' Expr ')' | Literal | Number | FunctionCall
 	    xpath_ast_node* parse_primary_expression()
@ -2895,129 +3091,8 @@ namespace pugi
 				
 				m_lexer.next();
 				
-				ast_type_t type = ast_none;
-				
-				switch (function[0])
-				{
-				case 'b':
-				{
-					if (function == "boolean" && argc == 1)
-						type = ast_func_boolean;
-						
-					break;
-				}
-				
-				case 'c':
-				{
-					if (function == "count" && argc == 1)
-						type = ast_func_count;
-					else if (function == "contains" && argc == 2)
-						type = ast_func_contains;
-					else if (function == "concat" && argc == 2)
-					{
-						// set_next was done earlier
-						return new (m_alloc.node()) xpath_ast_node(ast_func_concat, args[0], args[1]);
-					}
-					else if (function == "ceiling" && argc == 1)
-						type = ast_func_ceiling;
-						
-					break;
-				}
-				
-				case 'f':
-				{
-					if (function == "false" && argc == 0)
-						type = ast_func_false;
-					else if (function == "floor" && argc == 1)
-						type = ast_func_floor;
-						
-					break;
-				}
-				
-				case 'i':
-				{
-					if (function == "id" && argc == 1)
-						type = ast_func_id;
-						
-					break;
-				}
-				
-				case 'l':
-				{
-					if (function == "last" && argc == 0)
-						type = ast_func_last;
-					else if (function == "lang" && argc == 1)
-						type = ast_func_lang;
-					else if (function == "local-name" && argc <= 1)
-						type = argc == 0 ? ast_func_local_name_0 : ast_func_local_name_1;
-				
-					break;
-				}
-				
-				case 'n':
-				{
-					if (function == "name" && argc <= 1)
-						type = argc == 0 ? ast_func_name_0 : ast_func_name_1;
-					else if (function == "namespace-uri" && argc <= 1)
-						type = argc == 0 ? ast_func_namespace_uri_0 : ast_func_namespace_uri_1;
-					else if (function == "normalize-space" && argc <= 1)
-						type = argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1;
-					else if (function == "not" && argc == 1)
-						type = ast_func_not;
-					else if (function == "number" && argc <= 1)
-						type = argc == 0 ? ast_func_number_0 : ast_func_number_1;
-				
-					break;
-				}
-				
-				case 'p':
-				{
-					if (function == "position" && argc == 0)
-						type = ast_func_position;
-					
-					break;
-				}
-				
-				case 'r':
-				{
-					if (function == "round" && argc == 1)
-						type = ast_func_round;
+				ast_type_t type = parse_function_name(function, argc);

-					break;
-				}
-				
-				case 's':
-				{
-					if (function == "string" && argc <= 1)
-						type = argc == 0 ? ast_func_string_0 : ast_func_string_1;
-					else if (function == "string-length" && argc <= 1)
-						type = argc == 0 ? ast_func_string_length_0 : ast_func_string_length_1;
-					else if (function == "starts-with" && argc == 2)
-						type = ast_func_starts_with;
-					else if (function == "substring-before" && argc == 2)
-						type = ast_func_substring_before;
-					else if (function == "substring-after" && argc == 2)
-						type = ast_func_substring_after;
-					else if (function == "substring" && (argc == 2 || argc == 3))
-						type = argc == 2 ? ast_func_substring_2 : ast_func_substring_3;
-					else if (function == "sum" && argc == 1)
-						type = ast_func_sum;
-
-					break;
-				}
-				
-				case 't':
-				{
-					if (function == "translate" && argc == 3)
-						type = ast_func_translate;
-					else if (function == "true" && argc == 0)
-						type = ast_func_true;
-						
-					break;
-				}
-				
-				}
-				
 				if (type != ast_none)
 				{
 					switch (argc)
@ -3069,11 +3144,13 @@ namespace pugi
 	    // AbbreviatedStep ::= '.' | '..'
 	    xpath_ast_node* parse_step(xpath_ast_node* set)
 	    {
-			axis_t axis;
-			
+			bool axis_specified = false;
+			axis_t axis = axis_child; // implied child axis
+
 			if (m_lexer.current() == lex_axis_attribute)
 			{
 				axis = axis_attribute;
+				axis_specified = true;
 				
 				m_lexer.next();
 			}
@ -3089,165 +3166,101 @@ namespace pugi
 				
 				return new (m_alloc.node()) xpath_ast_node(ast_step, set, axis_parent, nodetest_type_node, 0, m_alloc);
 			}
-			else // implied child axis
-				axis = axis_child;
 	    
 			nodetest_t nt_type;
 			std::string nt_name;
 			
 			if (m_lexer.current() == lex_string)
 			{
+				bool nodetest_specified = false;
+
 				// node name test
 				nt_name = m_lexer.contents();
 				m_lexer.next();
-				
-				// possible axis name here - check.
-				if (nt_name.find("::") == std::string::npos && m_lexer.current() == lex_string && m_lexer.contents()[0] == ':' && m_lexer.contents()[1] == ':')
-				{
-					nt_name += m_lexer.contents();
-					m_lexer.next();
-				}
-				
-				// possible namespace test
-				if (m_lexer.current() == lex_string && m_lexer.contents()[0] == ':')
-				{
-					std::string::size_type colon_pos = nt_name.find(':');
-					
-					// either there is no : in current string or there is, but it's :: and there's nothing more
-					if (colon_pos == std::string::npos ||
-						(colon_pos + 1 < nt_name.size() && nt_name[colon_pos + 1] == ':' &&
-						 nt_name.find(':', colon_pos + 2) == std::string::npos))
-					{
-						nt_name += m_lexer.contents();
-						m_lexer.next();
-					}
-				}
-				
-				bool axis_specified = true;
-				
-				switch (nt_name[0])
-				{
-				case 'a':
-					if (starts_with(nt_name, "ancestor::")) axis = axis_ancestor;
-					else if (starts_with(nt_name, "ancestor-or-self::")) axis = axis_ancestor_or_self;
-					else if (starts_with(nt_name, "attribute::")) axis = axis_attribute;
-					else axis_specified = false;
-					
-					break;
-				
-				case 'c':
-					if (starts_with(nt_name, "child::")) axis = axis_child;
-					else axis_specified = false;
-					
-					break;
-				
-				case 'd':
-					if (starts_with(nt_name, "descendant::")) axis = axis_descendant;
-					else if (starts_with(nt_name, "descendant-or-self::")) axis = axis_descendant_or_self;
-					else axis_specified = false;
-					
-					break;
-				
-				case 'f':
-					if (starts_with(nt_name, "following::")) axis = axis_following;
-					else if (starts_with(nt_name, "following-sibling::")) axis = axis_following_sibling;
-					else axis_specified = false;
-					
-					break;
-				
-				case 'n':
-					if (starts_with(nt_name, "namespace::")) axis = axis_namespace;
-					else axis_specified = false;
-					
-					break;
-				
-				case 'p':
-					if (starts_with(nt_name, "parent::")) axis = axis_parent;
-					else if (starts_with(nt_name, "preceding::")) axis = axis_preceding;
-					else if (starts_with(nt_name, "preceding-sibling::")) axis = axis_preceding_sibling;
-					else axis_specified = false;
-					
-					break;
-				
-				case 's':
-					if (starts_with(nt_name, "self::")) axis = axis_self;
-					else axis_specified = false;
-					
-					break;

-				default:
-					axis_specified = false;
-				}
-				
-				if (axis_specified)
+				// was it an axis name?
+				if (m_lexer.current() == lex_double_colon)
 				{
-					nt_name.erase(0, nt_name.find("::") + 2);
-				}
-				
-				if (nt_name.empty() && m_lexer.current() == lex_string)
-				{
-					nt_name += m_lexer.contents();
-					m_lexer.next();
-				}
+					// parse axis name
+					if (axis_specified) throw xpath_exception("Two axis specifiers in one step");

-				// node type test or processing-instruction
-				if (m_lexer.current() == lex_open_brace)
-				{
+					axis = parse_axis_name(nt_name, axis_specified);
+
+					if (!axis_specified) throw xpath_exception("Unknown axis");
+
+					// read actual node test
 					m_lexer.next();
-					
-					if (m_lexer.current() == lex_close_brace)
+
+					if (m_lexer.current() == lex_multiply)
 					{
+						nt_type = nodetest_all;
 						m_lexer.next();
-						
-						if (nt_name == "node")
-							nt_type = nodetest_type_node;
-						else if (nt_name == "text")
-							nt_type = nodetest_type_text;
-						else if (nt_name == "comment")
-							nt_type = nodetest_type_comment;
-						else if (nt_name == "processing-instruction")
-							nt_type = nodetest_type_pi;
-						else
-							throw xpath_exception("Unrecognized node type");
-						
-						nt_name.erase(nt_name.begin(), nt_name.end());
+
+						nodetest_specified = true;
 					}
-					else if (nt_name == "processing-instruction")
+					else if (m_lexer.current() == lex_string)
 					{
-						if (m_lexer.current() != lex_quoted_string)
-							throw xpath_exception("Only literals are allowed as arguments to processing-instruction()");
-					
-						nt_type = nodetest_pi;
 						nt_name = m_lexer.contents();
 						m_lexer.next();
-						
-						if (m_lexer.current() != lex_close_brace)
-							throw xpath_exception("Unmatched brace near processing-instruction()");
-						m_lexer.next();
 					}
-					else
-						throw xpath_exception("Unmatched brace near node type test");
-
+					else throw xpath_exception("Unrecognized node test");
 				}
-				// namespace *
-				else if (m_lexer.current() == lex_multiply)
+				
+				if (!nodetest_specified)
 				{
-					// Only strings of form 'namespace:*' are permitted
-					if (nt_name.empty())
-						nt_type = nodetest_all;
+					// node type test or processing-instruction
+					if (m_lexer.current() == lex_open_brace)
+					{
+						m_lexer.next();
+						
+						if (m_lexer.current() == lex_close_brace)
+						{
+							m_lexer.next();
+							
+							if (nt_name == "node")
+								nt_type = nodetest_type_node;
+							else if (nt_name == "text")
+								nt_type = nodetest_type_text;
+							else if (nt_name == "comment")
+								nt_type = nodetest_type_comment;
+							else if (nt_name == "processing-instruction")
+								nt_type = nodetest_type_pi;
+							else
+								throw xpath_exception("Unrecognized node type");
+							
+							nt_name.erase(nt_name.begin(), nt_name.end());
+						}
+						else if (nt_name == "processing-instruction")
+						{
+							if (m_lexer.current() != lex_quoted_string)
+								throw xpath_exception("Only literals are allowed as arguments to processing-instruction()");
+						
+							nt_type = nodetest_pi;
+							nt_name = m_lexer.contents();
+							m_lexer.next();
+							
+							if (m_lexer.current() != lex_close_brace)
+								throw xpath_exception("Unmatched brace near processing-instruction()");
+							m_lexer.next();
+						}
+						else
+							throw xpath_exception("Unmatched brace near node type test");
+
+					}
+					// QName or NCName:*
 					else
 					{
-						if (nt_name.find(':') != nt_name.size() - 1)
-							throw xpath_exception("Wrong namespace-like node test");
-						
-						nt_name.erase(nt_name.size() - 1);
-						
-						nt_type = nodetest_all_in_namespace;
+						std::string::size_type colon_pos = nt_name.find(':');
+
+						if (nt_name.size() > 2 && colon_pos == nt_name.size() - 2 && nt_name[nt_name.size() - 1] == '*') // NCName:*
+						{
+							nt_name.erase(nt_name.size() - 1);
+							
+							nt_type = nodetest_all_in_namespace;
+						}
+						else nt_type = nodetest_name;
 					}
-					
-					m_lexer.next();
 				}
-				else nt_type = nodetest_name;
 			}
 			else if (m_lexer.current() == lex_multiply)
 			{