| Viewing file:  html5parser.py (114.42 KB)      -rw-r--r-- Select action/file-type:
 
  (+) |  (+) |  (+) | Code (+) | Session (+) |  (+) | SDB (+) |  (+) |  (+) |  (+) |  (+) |  (+) | 
 
from __future__ import absolute_import, division, unicode_literalsfrom pip._vendor.six import with_metaclass, viewkeys, PY3
 
 import types
 
 try:
 from collections import OrderedDict
 except ImportError:
 from pip._vendor.ordereddict import OrderedDict
 
 from . import _inputstream
 from . import _tokenizer
 
 from . import treebuilders
 from .treebuilders.base import Marker
 
 from . import _utils
 from .constants import (
 spaceCharacters, asciiUpper2Lower,
 specialElements, headingElements, cdataElements, rcdataElements,
 tokenTypes, tagTokenTypes,
 namespaces,
 htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
 adjustForeignAttributes as adjustForeignAttributesMap,
 adjustMathMLAttributes, adjustSVGAttributes,
 E,
 ReparseException
 )
 
 
 def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
     """Parse a string or file-like object into a tree

     doc - the HTML input, handed unchanged to HTMLParser.parse()

     treebuilder - name given to treebuilders.getTreeBuilder() to pick the
     tree implementation

     namespaceHTMLElements - forwarded to the HTMLParser constructor

     Any further keyword arguments are forwarded to HTMLParser.parse().
     """
     builder_class = treebuilders.getTreeBuilder(treebuilder)
     parser = HTMLParser(builder_class,
                         namespaceHTMLElements=namespaceHTMLElements)
     return parser.parse(doc, **kwargs)
 
 
 def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
     """Parse a string or file-like object into a tree fragment

     doc - the HTML input, handed unchanged to HTMLParser.parseFragment()

     container - name of the element the fragment is parsed inside of

     treebuilder - name given to treebuilders.getTreeBuilder() to pick the
     tree implementation

     namespaceHTMLElements - forwarded to the HTMLParser constructor

     Any further keyword arguments are forwarded to
     HTMLParser.parseFragment().
     """
     builder_class = treebuilders.getTreeBuilder(treebuilder)
     parser = HTMLParser(builder_class,
                         namespaceHTMLElements=namespaceHTMLElements)
     return parser.parseFragment(doc, container=container, **kwargs)
 
 
 def method_decorator_metaclass(function):
     """Return a metaclass that runs every plain function defined in a class
     body through ``function`` before the class is created.

     Only attributes that are instances of types.FunctionType are replaced;
     every other entry of the class namespace is left as it is.
     """
     class Decorated(type):
         def __new__(meta, classname, bases, classDict):
             # Walk a snapshot so the namespace can be rewritten in place.
             for attr_name, attr_value in list(classDict.items()):
                 if isinstance(attr_value, types.FunctionType):
                     classDict[attr_name] = function(attr_value)
             return type.__new__(meta, classname, bases, classDict)
     return Decorated
 
 
 class HTMLParser(object):
     """HTML parser. Generates a tree structure from a stream of (possibly
     malformed) HTML"""

     def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
         """
         strict - raise an exception when a parse error is encountered

         tree - a treebuilder class controlling the type of tree that will be
         returned. Built in treebuilders can be accessed through
         html5lib.treebuilders.getTreeBuilder(treeType)

         namespaceHTMLElements - passed as the only argument when the
         treebuilder class is instantiated

         debug - forwarded to getPhases() when the phase objects are built
         """

         # Raise an exception on the first error encountered
         self.strict = strict

         if tree is None:
             tree = treebuilders.getTreeBuilder("etree")
         self.tree = tree(namespaceHTMLElements)
         self.errors = []

         # One phase instance per phase name, all sharing this parser and tree
         self.phases = dict([(name, cls(self, self.tree)) for name, cls in
                             getPhases(debug).items()])

     def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
         """Tokenize ``stream`` and run the tree construction loop over it.

         innerHTML - True when parsing a fragment rather than a document

         container - fragment context element name (only read by reset()
         when innerHTML is true)

         scripting - stored on the parser; consulted by the phases

         Remaining keyword arguments go to the HTMLTokenizer constructor.
         """

         self.innerHTMLMode = innerHTML
         self.container = container
         self.scripting = scripting
         self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
         self.reset()

         try:
             self.mainLoop()
         except ReparseException:
             # Start again from a clean state and run the loop a second time
             self.reset()
             self.mainLoop()

     def reset(self):
         """Restore the parser and tree state to what a fresh parse needs."""
         self.tree.reset()
         self.firstStartTag = False
         self.errors = []
         self.log = []  # only used with debug mode
         # "quirks" / "limited quirks" / "no quirks"
         self.compatMode = "no quirks"

         if self.innerHTMLMode:
             # parseFragment() documents that a container of None means
             # 'div'; honour that instead of failing on None.lower().
             container = "div" if self.container is None else self.container
             self.innerHTML = container.lower()

             if self.innerHTML in cdataElements:
                 self.tokenizer.state = self.tokenizer.rcdataState
             elif self.innerHTML in rcdataElements:
                 self.tokenizer.state = self.tokenizer.rawtextState
             elif self.innerHTML == 'plaintext':
                 self.tokenizer.state = self.tokenizer.plaintextState
             else:
                 # state already is data state
                 # self.tokenizer.state = self.tokenizer.dataState
                 pass
             self.phase = self.phases["beforeHtml"]
             self.phase.insertHtmlElement()
             self.resetInsertionMode()
         else:
             self.innerHTML = False  # pylint:disable=redefined-variable-type
             self.phase = self.phases["initial"]

         self.lastPhase = None

         self.beforeRCDataPhase = None

         self.framesetOK = True

     @property
     def documentEncoding(self):
         """The name of the character encoding
         that was used to decode the input stream,
         or :obj:`None` if that is not determined yet.

         """
         if not hasattr(self, 'tokenizer'):
             return None
         return self.tokenizer.stream.charEncoding[0].name

     def isHTMLIntegrationPoint(self, element):
         """Return a true value if ``element`` is an HTML integration point.

         A MathML annotation-xml element qualifies only when it has an
         encoding attribute that, ASCII-lowercased, is "text/html" or
         "application/xhtml+xml"; every other element is looked up in
         htmlIntegrationPointElements.
         """
         if (element.name == "annotation-xml" and
                 element.namespace == namespaces["mathml"]):
             return ("encoding" in element.attributes and
                     element.attributes["encoding"].translate(
                         asciiUpper2Lower) in
                     ("text/html", "application/xhtml+xml"))
         else:
             return (element.namespace, element.name) in htmlIntegrationPointElements

     def isMathMLTextIntegrationPoint(self, element):
         """Return whether ``element`` is listed in
         mathmlTextIntegrationPointElements."""
         return (element.namespace, element.name) in mathmlTextIntegrationPointElements

     def mainLoop(self):
         """Feed every token to the phase responsible for it, then run the
         end-of-file handling.

         A phase method may return a token, in which case that token is
         processed again (by whichever phase is current by then).
         """
         CharactersToken = tokenTypes["Characters"]
         SpaceCharactersToken = tokenTypes["SpaceCharacters"]
         StartTagToken = tokenTypes["StartTag"]
         EndTagToken = tokenTypes["EndTag"]
         CommentToken = tokenTypes["Comment"]
         DoctypeToken = tokenTypes["Doctype"]
         ParseErrorToken = tokenTypes["ParseError"]

         # Built once rather than for every token inspected below
         mathmlGlyphNames = frozenset(["mglyph", "malignmark"])

         for token in self.normalizedTokens():
             prev_token = None
             new_token = token
             while new_token is not None:
                 prev_token = new_token
                 currentNode = self.tree.openElements[-1] if self.tree.openElements else None
                 currentNodeNamespace = currentNode.namespace if currentNode else None
                 currentNodeName = currentNode.name if currentNode else None

                 # Named tokenType so the builtin ``type`` is not shadowed
                 tokenType = new_token["type"]

                 if tokenType == ParseErrorToken:
                     self.parseError(new_token["data"], new_token.get("datavars", {}))
                     new_token = None
                 else:
                     # Decide between the current phase and the foreign
                     # content phase; the order of the tests matters because
                     # the later ones dereference currentNode.
                     if (len(self.tree.openElements) == 0 or
                             currentNodeNamespace == self.tree.defaultNamespace or
                             (self.isMathMLTextIntegrationPoint(currentNode) and
                              ((tokenType == StartTagToken and
                                token["name"] not in mathmlGlyphNames) or
                               tokenType in (CharactersToken, SpaceCharactersToken))) or
                             (currentNodeNamespace == namespaces["mathml"] and
                              currentNodeName == "annotation-xml" and
                              tokenType == StartTagToken and
                              token["name"] == "svg") or
                             (self.isHTMLIntegrationPoint(currentNode) and
                              tokenType in (StartTagToken, CharactersToken, SpaceCharactersToken))):
                         phase = self.phase
                     else:
                         phase = self.phases["inForeignContent"]

                     if tokenType == CharactersToken:
                         new_token = phase.processCharacters(new_token)
                     elif tokenType == SpaceCharactersToken:
                         new_token = phase.processSpaceCharacters(new_token)
                     elif tokenType == StartTagToken:
                         new_token = phase.processStartTag(new_token)
                     elif tokenType == EndTagToken:
                         new_token = phase.processEndTag(new_token)
                     elif tokenType == CommentToken:
                         new_token = phase.processComment(new_token)
                     elif tokenType == DoctypeToken:
                         new_token = phase.processDoctype(new_token)

             # A self-closing start tag that no handler acknowledged
             if (tokenType == StartTagToken and prev_token["selfClosing"] and
                     not prev_token["selfClosingAcknowledged"]):
                 self.parseError("non-void-element-with-trailing-solidus",
                                 {"name": prev_token["name"]})

         # When the loop finishes it's EOF
         reprocess = True
         phases = []
         while reprocess:
             phases.append(self.phase)
             reprocess = self.phase.processEOF()
             if reprocess:
                 # Reprocessing must have moved us to a phase not seen yet
                 assert self.phase not in phases

     def normalizedTokens(self):
         """Yield each tokenizer token after passing it through
         normalizeToken()."""
         for token in self.tokenizer:
             yield self.normalizeToken(token)

     def parse(self, stream, *args, **kwargs):
         """Parse a HTML document into a well-formed tree

         stream - a filelike object or string containing the HTML to be parsed

         The optional encoding parameter must be a string that indicates
         the encoding.  If specified, that encoding will be used,
         regardless of any BOM or later declaration (such as in a meta
         element)

         scripting - treat noscript elements as if javascript was turned on
         """
         self._parse(stream, False, None, *args, **kwargs)
         return self.tree.getDocument()

     def parseFragment(self, stream, *args, **kwargs):
         """Parse a HTML fragment into a well-formed tree fragment

         container - name of the element we're setting the innerHTML property
         if set to None, default to 'div'

         stream - a filelike object or string containing the HTML to be parsed

         The optional encoding parameter must be a string that indicates
         the encoding.  If specified, that encoding will be used,
         regardless of any BOM or later declaration (such as in a meta
         element)

         scripting - treat noscript elements as if javascript was turned on
         """
         self._parse(stream, True, *args, **kwargs)
         return self.tree.getFragment()

     def parseError(self, errorcode="XXX-undefined-error", datavars=None):
         """Record a parse error and, in strict mode, raise ParseError.

         errorcode - key into the E message table

         datavars - mapping interpolated into the message; defaults to {}
         """
         # XXX The idea is to make errorcode mandatory.
         if datavars is None:
             datavars = {}
         self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
         if self.strict:
             raise ParseError(E[errorcode] % datavars)

     def normalizeToken(self, token):
         """ HTML5 specific normalizations to the token stream """

         if token["type"] == tokenTypes["StartTag"]:
             raw = token["data"]
             token["data"] = OrderedDict(raw)
             if len(raw) > len(token["data"]):
                 # we had some duplicated attribute, fix so first wins
                 token["data"].update(raw[::-1])

         return token

     def adjustMathMLAttributes(self, token):
         """Apply the adjustMathMLAttributes table to ``token`` through
         adjust_attributes()."""
         adjust_attributes(token, adjustMathMLAttributes)

     def adjustSVGAttributes(self, token):
         """Apply the adjustSVGAttributes table to ``token`` through
         adjust_attributes()."""
         adjust_attributes(token, adjustSVGAttributes)

     def adjustForeignAttributes(self, token):
         """Apply the adjustForeignAttributes table to ``token`` through
         adjust_attributes()."""
         adjust_attributes(token, adjustForeignAttributesMap)

     def reparseTokenNormal(self, token):
         # pylint:disable=unused-argument
         # NOTE(review): nothing visible in this class sets ``self.parser``,
         # so calling this would presumably raise AttributeError. It looks
         # like leftover code; confirm there are no callers before removing.
         self.parser.phase()

     def resetInsertionMode(self):
         """Choose self.phase from the stack of open elements.

         The stack is walked from the innermost element outwards; the
         bottom-most entry is treated as the fragment context element
         (self.innerHTML) instead of by its own name.
         """
         # The name of this method is mostly historical. (It's also used in the
         # specification.)
         last = False
         newModes = {
             "select": "inSelect",
             "td": "inCell",
             "th": "inCell",
             "tr": "inRow",
             "tbody": "inTableBody",
             "thead": "inTableBody",
             "tfoot": "inTableBody",
             "caption": "inCaption",
             "colgroup": "inColumnGroup",
             "table": "inTable",
             "head": "inBody",
             "body": "inBody",
             "frameset": "inFrameset",
             "html": "beforeHead"
         }
         for node in self.tree.openElements[::-1]:
             nodeName = node.name
             new_phase = None
             if node == self.tree.openElements[0]:
                 assert self.innerHTML
                 last = True
                 nodeName = self.innerHTML
             # Check for conditions that should only happen in the innerHTML
             # case
             if nodeName in ("select", "colgroup", "head", "html"):
                 assert self.innerHTML

             # Elements outside the default namespace never pick the mode
             if not last and node.namespace != self.tree.defaultNamespace:
                 continue

             if nodeName in newModes:
                 new_phase = self.phases[newModes[nodeName]]
                 break
             elif last:
                 new_phase = self.phases["inBody"]
                 break

         self.phase = new_phase

     def parseRCDataRawtext(self, token, contentType):
         """Generic RCDATA/RAWTEXT Parsing algorithm
         contentType - RCDATA or RAWTEXT
         """
         assert contentType in ("RAWTEXT", "RCDATA")

         self.tree.insertElement(token)

         if contentType == "RAWTEXT":
             self.tokenizer.state = self.tokenizer.rawtextState
         else:
             self.tokenizer.state = self.tokenizer.rcdataState

         # Remember the phase we came from before switching to "text"
         self.originalPhase = self.phase

         self.phase = self.phases["text"]
 
 
 @_utils.memoize
 def getPhases(debug):
 def log(function):
     """Logger that records which phase processes each token"""
     # Reverse lookup: token type value -> token type name
     type_names = dict((value, key) for key, value in
                       tokenTypes.items())

     def wrapped(self, *args, **kwargs):
         # Only "process*" methods called with a token are recorded
         if args and function.__name__.startswith("process"):
             token = args[0]
             info = {"type": type_names[token['type']]}
             if token['type'] in tagTokenTypes:
                 info["name"] = token['name']

             parser = self.parser
             parser.log.append((parser.tokenizer.state.__name__,
                                parser.phase.__class__.__name__,
                                self.__class__.__name__,
                                function.__name__,
                                info))
         return function(self, *args, **kwargs)
     return wrapped
 
 def getMetaclass(use_metaclass, metaclass_func):
     """Return the metaclass for the phase classes.

     When ``use_metaclass`` is false this is plain ``type``; otherwise it is
     the metaclass built by method_decorator_metaclass(metaclass_func).
     """
     if not use_metaclass:
         return type
     return method_decorator_metaclass(metaclass_func)
 
 # pylint:disable=unused-argument
 class Phase(with_metaclass(getMetaclass(debug, log))):
     """Base class for helper object that implements each phase of processing
     """

     def __init__(self, parser, tree):
         """Keep references to the owning parser and the tree being built."""
         self.parser = parser
         self.tree = tree

     def processEOF(self):
         """Handle end of input; the base class provides no implementation."""
         raise NotImplementedError

     def processComment(self, token):
         """Insert the comment under the current (last open) element."""
         # For most phases the following is correct. Where it's not it will be
         # overridden.
         current_node = self.tree.openElements[-1]
         self.tree.insertComment(token, current_node)

     def processDoctype(self, token):
         """Report the doctype token as a parse error."""
         self.parser.parseError("unexpected-doctype")

     def processCharacters(self, token):
         """Insert the token's character data as text."""
         self.tree.insertText(token["data"])

     def processSpaceCharacters(self, token):
         """Insert the token's whitespace as text."""
         self.tree.insertText(token["data"])

     def processStartTag(self, token):
         """Dispatch on the tag name through self.startTagHandler."""
         handler = self.startTagHandler[token["name"]]
         return handler(token)

     def startTagHtml(self, token):
         """Copy attributes of an <html> start tag onto the root element,
         without overwriting attributes the root already has."""
         if not self.parser.firstStartTag and token["name"] == "html":
             self.parser.parseError("non-html-root")
         # XXX Need a check here to see if the first start tag token emitted is
         # this token... If it's not, invoke self.parser.parseError().
         for attr_name, attr_value in token["data"].items():
             if attr_name not in self.tree.openElements[0].attributes:
                 self.tree.openElements[0].attributes[attr_name] = attr_value
         self.parser.firstStartTag = False

     def processEndTag(self, token):
         """Dispatch on the tag name through self.endTagHandler."""
         handler = self.endTagHandler[token["name"]]
         return handler(token)
 
 class InitialPhase(Phase):
     """Phase that runs first: looks for a doctype, sets
     parser.compatMode from it, and hands over to the "beforeHtml" phase."""

     # Lowercased public identifier prefixes that select quirks mode
     _quirksPublicIdPrefixes = (
         "+//silmaril//dtd html pro v0r11 19970101//",
         "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
         "-//as//dtd html 3.0 aswedit + extensions//",
         "-//ietf//dtd html 2.0 level 1//",
         "-//ietf//dtd html 2.0 level 2//",
         "-//ietf//dtd html 2.0 strict level 1//",
         "-//ietf//dtd html 2.0 strict level 2//",
         "-//ietf//dtd html 2.0 strict//",
         "-//ietf//dtd html 2.0//",
         "-//ietf//dtd html 2.1e//",
         "-//ietf//dtd html 3.0//",
         "-//ietf//dtd html 3.2 final//",
         "-//ietf//dtd html 3.2//",
         "-//ietf//dtd html 3//",
         "-//ietf//dtd html level 0//",
         "-//ietf//dtd html level 1//",
         "-//ietf//dtd html level 2//",
         "-//ietf//dtd html level 3//",
         "-//ietf//dtd html strict level 0//",
         "-//ietf//dtd html strict level 1//",
         "-//ietf//dtd html strict level 2//",
         "-//ietf//dtd html strict level 3//",
         "-//ietf//dtd html strict//",
         "-//ietf//dtd html//",
         "-//metrius//dtd metrius presentational//",
         "-//microsoft//dtd internet explorer 2.0 html strict//",
         "-//microsoft//dtd internet explorer 2.0 html//",
         "-//microsoft//dtd internet explorer 2.0 tables//",
         "-//microsoft//dtd internet explorer 3.0 html strict//",
         "-//microsoft//dtd internet explorer 3.0 html//",
         "-//microsoft//dtd internet explorer 3.0 tables//",
         "-//netscape comm. corp.//dtd html//",
         "-//netscape comm. corp.//dtd strict html//",
         "-//o'reilly and associates//dtd html 2.0//",
         "-//o'reilly and associates//dtd html extended 1.0//",
         "-//o'reilly and associates//dtd html extended relaxed 1.0//",
         "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
         "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
         "-//spyglass//dtd html 2.0 extended//",
         "-//sq//dtd html 2.0 hotmetal + extensions//",
         "-//sun microsystems corp.//dtd hotjava html//",
         "-//sun microsystems corp.//dtd hotjava strict html//",
         "-//w3c//dtd html 3 1995-03-24//",
         "-//w3c//dtd html 3.2 draft//",
         "-//w3c//dtd html 3.2 final//",
         "-//w3c//dtd html 3.2//",
         "-//w3c//dtd html 3.2s draft//",
         "-//w3c//dtd html 4.0 frameset//",
         "-//w3c//dtd html 4.0 transitional//",
         "-//w3c//dtd html experimental 19960712//",
         "-//w3c//dtd html experimental 970421//",
         "-//w3c//dtd w3 html//",
         "-//w3o//dtd w3 html 3.0//",
         "-//webtechs//dtd mozilla html 2.0//",
         "-//webtechs//dtd mozilla html//")

     # Lowercased public identifiers that select quirks mode on exact match
     _quirksPublicIds = ("-//w3o//dtd w3 html strict 3.0//en//",
                         "-/w3c/dtd html 4.0 transitional/en",
                         "html")

     # HTML 4.01 prefixes: quirks without a system identifier, limited
     # quirks with one
     _html401PublicIdPrefixes = ("-//w3c//dtd html 4.01 frameset//",
                                 "-//w3c//dtd html 4.01 transitional//")

     # Prefixes that always select limited quirks mode
     _limitedQuirksPublicIdPrefixes = ("-//w3c//dtd xhtml 1.0 frameset//",
                                       "-//w3c//dtd xhtml 1.0 transitional//")

     # Lowercased system identifier that selects quirks mode
     _quirksSystemId = "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"

     def processSpaceCharacters(self, token):
         """Whitespace is ignored in this phase."""
         pass

     def processComment(self, token):
         """Attach the comment to the document itself."""
         self.tree.insertComment(token, self.tree.document)

     def processDoctype(self, token):
         """Insert the doctype, derive parser.compatMode from its
         identifiers, and move on to the "beforeHtml" phase."""
         name = token["name"]
         publicId = token["publicId"]
         systemId = token["systemId"]
         correct = token["correct"]

         has_unexpected_system_id = (systemId is not None and
                                     systemId != "about:legacy-compat")
         if name != "html" or publicId is not None or has_unexpected_system_id:
             self.parser.parseError("unknown-doctype")

         if publicId is None:
             publicId = ""

         self.tree.insertDoctype(token)

         # Public identifiers are compared ASCII-case-insensitively
         if publicId != "":
             publicId = publicId.translate(asciiUpper2Lower)

         is_html401 = publicId.startswith(self._html401PublicIdPrefixes)

         if (not correct or
                 token["name"] != "html" or
                 publicId.startswith(self._quirksPublicIdPrefixes) or
                 publicId in self._quirksPublicIds or
                 (is_html401 and systemId is None) or
                 (systemId and systemId.lower() == self._quirksSystemId)):
             self.parser.compatMode = "quirks"
         elif (publicId.startswith(self._limitedQuirksPublicIdPrefixes) or
               (is_html401 and systemId is not None)):
             self.parser.compatMode = "limited quirks"

         self.parser.phase = self.parser.phases["beforeHtml"]

     def anythingElse(self):
         """No doctype was seen: select quirks mode and move on to the
         "beforeHtml" phase."""
         self.parser.compatMode = "quirks"
         self.parser.phase = self.parser.phases["beforeHtml"]

     def processCharacters(self, token):
         """Report the missing doctype and return the token for
         reprocessing."""
         self.parser.parseError("expected-doctype-but-got-chars")
         self.anythingElse()
         return token

     def processStartTag(self, token):
         """Report the missing doctype and return the token for
         reprocessing."""
         tag_name = token["name"]
         self.parser.parseError("expected-doctype-but-got-start-tag",
                                {"name": tag_name})
         self.anythingElse()
         return token

     def processEndTag(self, token):
         """Report the missing doctype and return the token for
         reprocessing."""
         tag_name = token["name"]
         self.parser.parseError("expected-doctype-but-got-end-tag",
                                {"name": tag_name})
         self.anythingElse()
         return token

     def processEOF(self):
         """Report the missing doctype and ask for EOF to be reprocessed."""
         self.parser.parseError("expected-doctype-but-got-eof")
         self.anythingElse()
         return True
 
 class BeforeHtmlPhase(Phase):
     """Phase that creates the root html element."""

     # helper methods
     def insertHtmlElement(self):
         """Insert an implied <html> root and switch to "beforeHead"."""
         root_token = impliedTagToken("html", "StartTag")
         self.tree.insertRoot(root_token)
         self.parser.phase = self.parser.phases["beforeHead"]

     # other
     def processEOF(self):
         """Create the root, then ask for EOF to be reprocessed."""
         self.insertHtmlElement()
         return True

     def processComment(self, token):
         """Attach the comment to the document itself."""
         self.tree.insertComment(token, self.tree.document)

     def processSpaceCharacters(self, token):
         """Whitespace is ignored in this phase."""
         pass

     def processCharacters(self, token):
         """Create the root and return the token for reprocessing."""
         self.insertHtmlElement()
         return token

     def processStartTag(self, token):
         """Create the root and return the token for reprocessing; an
         explicit <html> tag is noted on the parser first."""
         if token["name"] == "html":
             self.parser.firstStartTag = True
         self.insertHtmlElement()
         return token

     def processEndTag(self, token):
         """Only </head>, </body>, </html> and </br> imply the root; any
         other end tag is reported and dropped."""
         if token["name"] in ("head", "body", "html", "br"):
             self.insertHtmlElement()
             return token
         self.parser.parseError("unexpected-end-tag-before-html",
                                {"name": token["name"]})
 
 class BeforeHeadPhase(Phase):
     """Phase between the root element and the head element."""

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         # Start tags: <html> and <head> are special, the rest imply <head>
         start_handlers = _utils.MethodDispatcher([
             ("html", self.startTagHtml),
             ("head", self.startTagHead)
         ])
         start_handlers.default = self.startTagOther
         self.startTagHandler = start_handlers

         # End tags: a few imply <head>, the rest are errors
         end_handlers = _utils.MethodDispatcher([
             (("head", "body", "html", "br"), self.endTagImplyHead)
         ])
         end_handlers.default = self.endTagOther
         self.endTagHandler = end_handlers

     def _insertImpliedHead(self):
         # Private helper: act as if a <head> start tag had been seen
         self.startTagHead(impliedTagToken("head", "StartTag"))

     def processEOF(self):
         """Imply <head>, then ask for EOF to be reprocessed."""
         self._insertImpliedHead()
         return True

     def processSpaceCharacters(self, token):
         """Whitespace is ignored in this phase."""
         pass

     def processCharacters(self, token):
         """Imply <head> and return the token for reprocessing."""
         self._insertImpliedHead()
         return token

     def startTagHtml(self, token):
         """Delegate <html> to the "inBody" phase."""
         in_body = self.parser.phases["inBody"]
         return in_body.processStartTag(token)

     def startTagHead(self, token):
         """Insert the head element, remember it as the tree's head
         pointer and switch to "inHead"."""
         self.tree.insertElement(token)
         self.tree.headPointer = self.tree.openElements[-1]
         self.parser.phase = self.parser.phases["inHead"]

     def startTagOther(self, token):
         """Imply <head> and return the token for reprocessing."""
         self._insertImpliedHead()
         return token

     def endTagImplyHead(self, token):
         """Imply <head> and return the token for reprocessing."""
         self._insertImpliedHead()
         return token

     def endTagOther(self, token):
         """Report the stray end tag and drop it."""
         tag_name = token["name"]
         self.parser.parseError("end-tag-after-implied-root",
                                {"name": tag_name})
 
 class InHeadPhase(Phase):
     """Phase that handles tokens while the head element is open."""

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         start_handlers = _utils.MethodDispatcher([
             ("html", self.startTagHtml),
             ("title", self.startTagTitle),
             (("noframes", "style"), self.startTagNoFramesStyle),
             ("noscript", self.startTagNoscript),
             ("script", self.startTagScript),
             (("base", "basefont", "bgsound", "command", "link"),
              self.startTagBaseLinkCommand),
             ("meta", self.startTagMeta),
             ("head", self.startTagHead)
         ])
         start_handlers.default = self.startTagOther
         self.startTagHandler = start_handlers

         end_handlers = _utils.MethodDispatcher([
             ("head", self.endTagHead),
             (("br", "html", "body"), self.endTagHtmlBodyBr)
         ])
         end_handlers.default = self.endTagOther
         self.endTagHandler = end_handlers

     # the real thing
     def processEOF(self):
         """Close the head, then ask for EOF to be reprocessed."""
         self.anythingElse()
         return True

     def processCharacters(self, token):
         """Close the head and return the token for reprocessing."""
         self.anythingElse()
         return token

     def startTagHtml(self, token):
         """Delegate <html> to the "inBody" phase."""
         in_body = self.parser.phases["inBody"]
         return in_body.processStartTag(token)

     def startTagHead(self, token):
         """A second <head> is a parse error and is otherwise ignored."""
         self.parser.parseError("two-heads-are-not-better-than-one")

     def startTagBaseLinkCommand(self, token):
         """Insert the element and immediately pop it off the stack again,
         acknowledging any self-closing flag."""
         self.tree.insertElement(token)
         self.tree.openElements.pop()
         token["selfClosingAcknowledged"] = True

     def startTagMeta(self, token):
         """Insert <meta> like the other childless head elements and, while
         the stream encoding is still tentative, let a charset or
         http-equiv content-type declaration change it."""
         self.tree.insertElement(token)
         self.tree.openElements.pop()
         token["selfClosingAcknowledged"] = True

         attributes = token["data"]
         stream = self.parser.tokenizer.stream
         if stream.charEncoding[1] != "tentative":
             return

         if "charset" in attributes:
             stream.changeEncoding(attributes["charset"])
         elif ("content" in attributes and
               "http-equiv" in attributes and
               attributes["http-equiv"].lower() == "content-type"):
             # Encoding it as UTF-8 here is a hack, as really we should pass
             # the abstract Unicode string, and just use the
             # ContentAttrParser on that, but using UTF-8 allows all chars
             # to be encoded and as a ASCII-superset works.
             content_bytes = attributes["content"].encode("utf-8")
             data = _inputstream.EncodingBytes(content_bytes)
             content_parser = _inputstream.ContentAttrParser(data)
             codec = content_parser.parse()
             stream.changeEncoding(codec)

     def startTagTitle(self, token):
         """<title> content is parsed as RCDATA."""
         self.parser.parseRCDataRawtext(token, "RCDATA")

     def startTagNoFramesStyle(self, token):
         """<noframes> and <style> content is parsed as RAWTEXT."""
         # Need to decide whether to implement the scripting-disabled case
         self.parser.parseRCDataRawtext(token, "RAWTEXT")

     def startTagNoscript(self, token):
         """With scripting on, <noscript> content is RAWTEXT; otherwise the
         element is inserted and "inHeadNoscript" takes over."""
         if not self.parser.scripting:
             self.tree.insertElement(token)
             self.parser.phase = self.parser.phases["inHeadNoscript"]
         else:
             self.parser.parseRCDataRawtext(token, "RAWTEXT")

     def startTagScript(self, token):
         """Insert <script>, put the tokenizer in its script data state and
         switch to the "text" phase, remembering the current one."""
         self.tree.insertElement(token)
         tokenizer = self.parser.tokenizer
         tokenizer.state = tokenizer.scriptDataState
         self.parser.originalPhase = self.parser.phase
         self.parser.phase = self.parser.phases["text"]

     def startTagOther(self, token):
         """Close the head and return the token for reprocessing."""
         self.anythingElse()
         return token

     def endTagHead(self, token):
         """Pop the head element and switch to "afterHead"."""
         node = self.parser.tree.openElements.pop()
         assert node.name == "head", "Expected head got %s" % node.name
         self.parser.phase = self.parser.phases["afterHead"]

     def endTagHtmlBodyBr(self, token):
         """Close the head and return the token for reprocessing."""
         self.anythingElse()
         return token

     def endTagOther(self, token):
         """Report the stray end tag and drop it."""
         self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

     def anythingElse(self):
         """Act as if a </head> end tag had been seen."""
         self.endTagHead(impliedTagToken("head"))
 
 class InHeadNoscriptPhase(Phase):
     """Phase that handles tokens inside a <noscript> element in the head
     (entered from "inHead" when scripting is off)."""

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         start_handlers = _utils.MethodDispatcher([
             ("html", self.startTagHtml),
             (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
             (("head", "noscript"), self.startTagHeadNoscript),
         ])
         start_handlers.default = self.startTagOther
         self.startTagHandler = start_handlers

         end_handlers = _utils.MethodDispatcher([
             ("noscript", self.endTagNoscript),
             ("br", self.endTagBr),
         ])
         end_handlers.default = self.endTagOther
         self.endTagHandler = end_handlers

     def processEOF(self):
         """Report the error, close <noscript> and ask for EOF to be
         reprocessed."""
         self.parser.parseError("eof-in-head-noscript")
         self.anythingElse()
         return True

     def processComment(self, token):
         """Delegate comments to the "inHead" phase."""
         in_head = self.parser.phases["inHead"]
         return in_head.processComment(token)

     def processCharacters(self, token):
         """Report the error, close <noscript> and return the token for
         reprocessing."""
         self.parser.parseError("char-in-head-noscript")
         self.anythingElse()
         return token

     def processSpaceCharacters(self, token):
         """Delegate whitespace to the "inHead" phase."""
         in_head = self.parser.phases["inHead"]
         return in_head.processSpaceCharacters(token)

     def startTagHtml(self, token):
         """Delegate <html> to the "inBody" phase."""
         in_body = self.parser.phases["inBody"]
         return in_body.processStartTag(token)

     def startTagBaseLinkCommand(self, token):
         """Delegate these start tags to the "inHead" phase."""
         in_head = self.parser.phases["inHead"]
         return in_head.processStartTag(token)

     def startTagHeadNoscript(self, token):
         """<head> and nested <noscript> are reported and dropped."""
         self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

     def startTagOther(self, token):
         """Report the error, close <noscript> and return the token for
         reprocessing."""
         self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
         self.anythingElse()
         return token

     def endTagNoscript(self, token):
         """Pop the noscript element and go back to "inHead"."""
         node = self.parser.tree.openElements.pop()
         assert node.name == "noscript", "Expected noscript got %s" % node.name
         self.parser.phase = self.parser.phases["inHead"]

     def endTagBr(self, token):
         """Report the error, close <noscript> and return the token for
         reprocessing."""
         self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
         self.anythingElse()
         return token

     def endTagOther(self, token):
         """Report the stray end tag and drop it."""
         self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

     def anythingElse(self):
         """Act as if a </noscript> end tag had been seen."""
         # Caller must raise parse error first!
         self.endTagNoscript(impliedTagToken("noscript"))
 
 class AfterHeadPhase(Phase):
     """Phase between the end of the head and the start of the body or
     frameset."""

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         start_handlers = _utils.MethodDispatcher([
             ("html", self.startTagHtml),
             ("body", self.startTagBody),
             ("frameset", self.startTagFrameset),
             (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
               "style", "title"),
              self.startTagFromHead),
             ("head", self.startTagHead)
         ])
         start_handlers.default = self.startTagOther
         self.startTagHandler = start_handlers

         end_handlers = _utils.MethodDispatcher([(("body", "html", "br"),
                                                  self.endTagHtmlBodyBr)])
         end_handlers.default = self.endTagOther
         self.endTagHandler = end_handlers

     def processEOF(self):
         """Imply <body>, then ask for EOF to be reprocessed."""
         self.anythingElse()
         return True

     def processCharacters(self, token):
         """Imply <body> and return the token for reprocessing."""
         self.anythingElse()
         return token

     def startTagHtml(self, token):
         """Delegate <html> to the "inBody" phase."""
         in_body = self.parser.phases["inBody"]
         return in_body.processStartTag(token)

     def startTagBody(self, token):
         """Insert the body element and switch to "inBody"; an explicit
         <body> clears the parser's framesetOK flag."""
         self.parser.framesetOK = False
         self.tree.insertElement(token)
         self.parser.phase = self.parser.phases["inBody"]

     def startTagFrameset(self, token):
         """Insert the frameset element and switch to "inFrameset"."""
         self.tree.insertElement(token)
         self.parser.phase = self.parser.phases["inFrameset"]

     def startTagFromHead(self, token):
         """Process a head-only element that appeared after </head>: the
         head is temporarily put back on the stack, the token is handled
         by "inHead", and the head is removed from the stack again."""
         self.parser.parseError("unexpected-start-tag-out-of-my-head",
                                {"name": token["name"]})
         open_elements = self.tree.openElements
         open_elements.append(self.tree.headPointer)
         self.parser.phases["inHead"].processStartTag(token)
         # Remove the innermost "head" entry; iterate a reversed copy
         # because the stack is modified inside the loop.
         for node in self.tree.openElements[::-1]:
             if node.name == "head":
                 self.tree.openElements.remove(node)
                 break

     def startTagHead(self, token):
         """A <head> start tag here is reported and dropped."""
         self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

     def startTagOther(self, token):
         """Imply <body> and return the token for reprocessing."""
         self.anythingElse()
         return token

     def endTagHtmlBodyBr(self, token):
         """Imply <body> and return the token for reprocessing."""
         self.anythingElse()
         return token

     def endTagOther(self, token):
         """Report the stray end tag and drop it."""
         self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

     def anythingElse(self):
         """Insert an implied <body>, switch to "inBody" and leave the
         parser's framesetOK flag set."""
         body_token = impliedTagToken("body", "StartTag")
         self.tree.insertElement(body_token)
         self.parser.phase = self.parser.phases["inBody"]
         self.parser.framesetOK = True
 
 class InBodyPhase(Phase):
 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
 # the really-really-really-very crazy mode
 def __init__(self, parser, tree):
     """Set up the start-tag and end-tag dispatch tables for this phase."""
     Phase.__init__(self, parser, tree)

     # Set this to the default handler
     self.processSpaceCharacters = self.processSpaceCharactersNonPre

     # Start tags, keyed by tag name (or tuple of names) -> bound handler
     start_handlers = _utils.MethodDispatcher([
         ("html", self.startTagHtml),
         (("base", "basefont", "bgsound", "command", "link", "meta",
           "script", "style", "title"),
          self.startTagProcessInHead),
         ("body", self.startTagBody),
         ("frameset", self.startTagFrameset),
         (("address", "article", "aside", "blockquote", "center", "details",
           "dir", "div", "dl", "fieldset", "figcaption", "figure",
           "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
           "section", "summary", "ul"),
          self.startTagCloseP),
         (headingElements, self.startTagHeading),
         (("pre", "listing"), self.startTagPreListing),
         ("form", self.startTagForm),
         (("li", "dd", "dt"), self.startTagListItem),
         ("plaintext", self.startTagPlaintext),
         ("a", self.startTagA),
         (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
           "strong", "tt", "u"), self.startTagFormatting),
         ("nobr", self.startTagNobr),
         ("button", self.startTagButton),
         (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
         ("xmp", self.startTagXmp),
         ("table", self.startTagTable),
         (("area", "br", "embed", "img", "keygen", "wbr"),
          self.startTagVoidFormatting),
         (("param", "source", "track"), self.startTagParamSource),
         ("input", self.startTagInput),
         ("hr", self.startTagHr),
         ("image", self.startTagImage),
         ("isindex", self.startTagIsIndex),
         ("textarea", self.startTagTextarea),
         ("iframe", self.startTagIFrame),
         ("noscript", self.startTagNoscript),
         (("noembed", "noframes"), self.startTagRawtext),
         ("select", self.startTagSelect),
         (("rp", "rt"), self.startTagRpRt),
         (("option", "optgroup"), self.startTagOpt),
         (("math"), self.startTagMath),
         (("svg"), self.startTagSvg),
         (("caption", "col", "colgroup", "frame", "head",
           "tbody", "td", "tfoot", "th", "thead",
           "tr"), self.startTagMisplaced)
     ])
     start_handlers.default = self.startTagOther
     self.startTagHandler = start_handlers

     # End tags, same layout
     end_handlers = _utils.MethodDispatcher([
         ("body", self.endTagBody),
         ("html", self.endTagHtml),
         (("address", "article", "aside", "blockquote", "button", "center",
           "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
           "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
           "section", "summary", "ul"), self.endTagBlock),
         ("form", self.endTagForm),
         ("p", self.endTagP),
         (("dd", "dt", "li"), self.endTagListItem),
         (headingElements, self.endTagHeading),
         (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
           "strike", "strong", "tt", "u"), self.endTagFormatting),
         (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
         ("br", self.endTagBr),
     ])
     end_handlers.default = self.endTagOther
     self.endTagHandler = end_handlers
 
 def isMatchingFormattingElement(self, node1, node2):
     """Return whether two nodes agree in name, namespace and attributes."""
     if not (node1.name == node2.name):
         return False
     if not (node1.namespace == node2.namespace):
         return False
     return node1.attributes == node2.attributes
 
 # helper
 def addFormattingElement(self, token):
     """Insert the element for ``token`` and append it to the tree's list
     of active formatting elements.

     If three matching elements already sit after the last Marker in that
     list, the earliest of the three is removed first.
     """
     self.tree.insertElement(token)
     new_element = self.tree.openElements[-1]
     active = self.tree.activeFormattingElements

     # Collect matches from the end of the list back to the last Marker
     duplicates = []
     for candidate in reversed(active):
         if candidate is Marker:
             break
         if self.isMatchingFormattingElement(candidate, new_element):
             duplicates.append(candidate)

     duplicate_count = len(duplicates)
     assert duplicate_count <= 3
     if duplicate_count == 3:
         # duplicates is in reverse list order, so [-1] is the earliest
         active.remove(duplicates[-1])
     active.append(new_element)
 
 # the real deal
 def processEOF(self):
     """Report one parse error if any open element is not allowed to be
     left open at end of file. Returns None, which ends EOF processing."""
     may_stay_open = frozenset(("dd", "dt", "li", "p", "tbody", "td",
                                "tfoot", "th", "thead", "tr", "body",
                                "html"))
     if any(node.name not in may_stay_open
            for node in reversed(self.tree.openElements)):
         self.parser.parseError("expected-closing-tag-but-got-eof")
     # Stop parsing
 
 def processSpaceCharactersDropNewline(self, token):
     """One-shot whitespace handler that drops a leading newline.

     It first restores processSpaceCharactersNonPre as the handler, then
     inserts the whitespace, minus a leading "\\n" when the current node is
     an empty pre, listing or textarea element.
     """
     # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
     # want to drop leading newlines
     text = token["data"]
     self.processSpaceCharacters = self.processSpaceCharactersNonPre
     if (text.startswith("\n") and
             self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
             not self.tree.openElements[-1].hasContent()):
         text = text[1:]
     if not text:
         return
     self.tree.reconstructActiveFormattingElements()
     self.tree.insertText(text)
 
 def processCharacters(self, token):
     """Insert character data, and clear the parser's framesetOK flag once
     any character outside spaceCharacters has been seen."""
     text = token["data"]
     if text == "\u0000":
         # The tokenizer should always emit null on its own
         return
     self.tree.reconstructActiveFormattingElements()
     self.tree.insertText(text)
     # This must be bad for performance
     if not self.parser.framesetOK:
         return
     if any(char not in spaceCharacters for char in text):
         self.parser.framesetOK = False
 
 def processSpaceCharactersNonPre(self, token):
     """Default whitespace handler: reconstruct the active formatting
     elements, then insert the whitespace as text."""
     tree = self.tree
     tree.reconstructActiveFormattingElements()
     tree.insertText(token["data"])
 
 def startTagProcessInHead(self, token):
     """Delegate the start tag to the "inHead" phase and return its
     result."""
     in_head = self.parser.phases["inHead"]
     return in_head.processStartTag(token)
 
 def startTagBody(self, token):
     """Handle a <body> start tag seen while already in the body.

     Always a parse error. When the second open element is the body, the
     token's attributes are merged into it (existing attributes win) and
     the parser's framesetOK flag is cleared.
     """
     self.parser.parseError("unexpected-start-tag", {"name": "body"})
     no_body_on_stack = (len(self.tree.openElements) == 1 or
                         self.tree.openElements[1].name != "body")
     if no_body_on_stack:
         # Asserted to happen only when parsing a fragment
         assert self.parser.innerHTML
         return

     self.parser.framesetOK = False
     for attr_name, attr_value in token["data"].items():
         if attr_name not in self.tree.openElements[1].attributes:
             self.tree.openElements[1].attributes[attr_name] = attr_value
 
 def startTagFrameset(self, token):
     """Handle a <frameset> start tag seen in the body.

     Always a parse error. If the second open element is a body element
     and framesetOK is still set, that element is detached from its
     parent, the stack is popped back to the html element, the frameset
     is inserted and the parser switches to the "inFrameset" phase.
     """
     self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
     stack = self.tree.openElements
     if len(stack) == 1 or stack[1].name != "body":
         assert self.parser.innerHTML
         return
     if not self.parser.framesetOK:
         # Token is ignored.
         return
     body = stack[1]
     if body.parent:
         body.parent.removeChild(body)
     while stack[-1].name != "html":
         stack.pop()
     self.tree.insertElement(token)
     self.parser.phase = self.parser.phases["inFrameset"]
 
 def startTagCloseP(self, token):
     """Insert the element, first closing a <p> that is in button scope."""
     builder = self.tree
     if builder.elementInScope("p", variant="button"):
         self.endTagP(impliedTagToken("p"))
     builder.insertElement(token)
 
 def startTagPreListing(self, token):
     """Open a <pre>/<listing> element.

     Closes a <p> in button scope, inserts the element, clears
     framesetOK and arms the newline-dropping whitespace handler.
     """
     builder = self.tree
     if builder.elementInScope("p", variant="button"):
         self.endTagP(impliedTagToken("p"))
     builder.insertElement(token)
     self.parser.framesetOK = False
     # The next whitespace token may begin with a newline to discard.
     self.processSpaceCharacters = self.processSpaceCharactersDropNewline
 
 def startTagForm(self, token):
     """Open a <form>, unless the tree already has a form pointer.

     With a form pointer set the tag is a parse error and is ignored.
     Otherwise a <p> in button scope is closed, the form is inserted
     and recorded as the tree's form pointer.
     """
     builder = self.tree
     if builder.formPointer:
         self.parser.parseError("unexpected-start-tag", {"name": "form"})
         return
     if builder.elementInScope("p", variant="button"):
         self.endTagP(impliedTagToken("p"))
     builder.insertElement(token)
     builder.formPointer = builder.openElements[-1]
 
 def startTagListItem(self, token):
     """Open an <li>, <dd> or <dt>, implying end tags for open siblings.

     Walks the stack of open elements from the current node. The first
     element whose name is in the stop list for this tag gets an implied
     end tag. The walk also stops at any special element other than
     address, div and p. A <p> in button scope is then closed and the
     new element inserted.
     """
     self.parser.framesetOK = False

     closesOnSight = {"li": ["li"],
                      "dt": ["dt", "dd"],
                      "dd": ["dt", "dd"]}[token["name"]]
     for candidate in reversed(self.tree.openElements):
         if candidate.name in closesOnSight:
             impliedEnd = impliedTagToken(candidate.name, "EndTag")
             self.parser.phase.processEndTag(impliedEnd)
             break
         blocksSearch = (candidate.nameTuple in specialElements and
                         candidate.name not in ("address", "div", "p"))
         if blocksSearch:
             break

     if self.tree.elementInScope("p", variant="button"):
         self.parser.phase.processEndTag(impliedTagToken("p", "EndTag"))

     self.tree.insertElement(token)
 
 def startTagPlaintext(self, token):
     """Open <plaintext> and switch the tokenizer to its plaintext state."""
     builder = self.tree
     if builder.elementInScope("p", variant="button"):
         self.endTagP(impliedTagToken("p"))
     builder.insertElement(token)
     tokenizer = self.parser.tokenizer
     tokenizer.state = tokenizer.plaintextState
 
 def startTagHeading(self, token):
     """Open a heading element.

     Closes a <p> in button scope. If the current node is itself a
     heading, that is a parse error and the node is popped before the
     new heading is inserted.
     """
     builder = self.tree
     if builder.elementInScope("p", variant="button"):
         self.endTagP(impliedTagToken("p"))
     stack = builder.openElements
     if stack[-1].name in headingElements:
         self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
         stack.pop()
     builder.insertElement(token)
 
 def startTagA(self, token):
     """Open an <a> element, first closing any still-active <a>.

     An <a> already in the list of active formatting elements is a
     parse error: it is closed through endTagFormatting and then removed
     from both the open-element stack and the active-formatting list if
     it is still present in either.
     """
     builder = self.tree
     openAnchor = builder.elementInActiveFormattingElements("a")
     if openAnchor:
         self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                {"startName": "a", "endName": "a"})
         self.endTagFormatting(impliedTagToken("a"))
         for collection in (builder.openElements,
                            builder.activeFormattingElements):
             if openAnchor in collection:
                 collection.remove(openAnchor)
     builder.reconstructActiveFormattingElements()
     self.addFormattingElement(token)
 
 def startTagFormatting(self, token):
     """Open a formatting element and register it as active."""
     builder = self.tree
     builder.reconstructActiveFormattingElements()
     self.addFormattingElement(token)
 
 def startTagNobr(self, token):
     """Open a <nobr>, implying an end tag for a <nobr> already in scope."""
     builder = self.tree
     builder.reconstructActiveFormattingElements()
     if builder.elementInScope("nobr"):
         self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                {"startName": "nobr", "endName": "nobr"})
         self.processEndTag(impliedTagToken("nobr"))
         # XXX Need tests that trigger the following
         builder.reconstructActiveFormattingElements()
     self.addFormattingElement(token)
 
 def startTagButton(self, token):
     """Open a <button>.

     If a button is already in scope, this is a parse error: an implied
     end tag is processed and the token is returned so it gets
     reprocessed. Otherwise the element is inserted and framesetOK is
     cleared.
     """
     if self.tree.elementInScope("button"):
         self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                {"startName": "button", "endName": "button"})
         self.processEndTag(impliedTagToken("button"))
         return token
     self.tree.reconstructActiveFormattingElements()
     self.tree.insertElement(token)
     self.parser.framesetOK = False
 
 def startTagAppletMarqueeObject(self, token):
     """Open <applet>/<marquee>/<object> and push a formatting Marker."""
     builder = self.tree
     builder.reconstructActiveFormattingElements()
     builder.insertElement(token)
     # The Marker is appended to the list of active formatting elements.
     builder.activeFormattingElements.append(Marker)
     self.parser.framesetOK = False
 
 def startTagXmp(self, token):
     """Open <xmp> and parse its content as RAWTEXT."""
     builder = self.tree
     if builder.elementInScope("p", variant="button"):
         self.endTagP(impliedTagToken("p"))
     builder.reconstructActiveFormattingElements()
     parser = self.parser
     parser.framesetOK = False
     parser.parseRCDataRawtext(token, "RAWTEXT")
 
 def startTagTable(self, token):
     """Open a <table> and switch to the "inTable" phase.

     Outside quirks mode, a <p> in button scope is closed first.
     """
     closeParagraph = (self.parser.compatMode != "quirks" and
                       self.tree.elementInScope("p", variant="button"))
     if closeParagraph:
         self.processEndTag(impliedTagToken("p"))
     self.tree.insertElement(token)
     self.parser.framesetOK = False
     self.parser.phase = self.parser.phases["inTable"]
 
 def startTagVoidFormatting(self, token):
     """Insert a void element and pop it straight off the open stack.

     Also acknowledges the token's self-closing flag and clears
     framesetOK.
     """
     builder = self.tree
     builder.reconstructActiveFormattingElements()
     builder.insertElement(token)
     builder.openElements.pop()
     token["selfClosingAcknowledged"] = True
     self.parser.framesetOK = False
 
 def startTagInput(self, token):
     """Insert an <input>; a type=hidden input leaves framesetOK untouched."""
     savedFramesetOK = self.parser.framesetOK
     self.startTagVoidFormatting(token)
     attrs = token["data"]
     isHidden = ("type" in attrs and
                 attrs["type"].translate(asciiUpper2Lower) == "hidden")
     if isHidden:
         # input type=hidden doesn't change framesetOK
         self.parser.framesetOK = savedFramesetOK
 
 def startTagParamSource(self, token):
     """Insert a void element without touching formatting or framesetOK."""
     builder = self.tree
     builder.insertElement(token)
     builder.openElements.pop()
     token["selfClosingAcknowledged"] = True
 
 def startTagHr(self, token):
     """Insert an <hr>, closing a <p> in button scope beforehand."""
     builder = self.tree
     if builder.elementInScope("p", variant="button"):
         self.endTagP(impliedTagToken("p"))
     builder.insertElement(token)
     # Void element: it never stays on the stack of open elements.
     builder.openElements.pop()
     token["selfClosingAcknowledged"] = True
     self.parser.framesetOK = False
 
 def startTagImage(self, token):
     """Treat <image> as <img>: report a parse error and reprocess."""
     # No really...
     self.parser.parseError("unexpected-start-tag-treated-as",
                            {"originalName": "image", "newName": "img"})
     imgToken = impliedTagToken("img", "StartTag",
                                attributes=token["data"],
                                selfClosing=token["selfClosing"])
     self.processStartTag(imgToken)
 
 def startTagIsIndex(self, token):
     """Expand the deprecated <isindex> tag into form/hr/label/input markup.

     Ignored (after the parse error) when a form pointer is set. The
     "action" attribute goes onto the generated form and "prompt"
     becomes the label text. Every other attribute is passed to the
     generated input, whose name is forced to "isindex".
     """
     self.parser.parseError("deprecated-tag", {"name": "isindex"})
     if self.tree.formPointer:
         return
     sourceAttrs = token["data"]
     formAttrs = {}
     if "action" in sourceAttrs:
         formAttrs["action"] = sourceAttrs["action"]
     self.processStartTag(impliedTagToken("form", "StartTag",
                                          attributes=formAttrs))
     self.processStartTag(impliedTagToken("hr", "StartTag"))
     self.processStartTag(impliedTagToken("label", "StartTag"))
     # XXX Localization ...
     if "prompt" in sourceAttrs:
         labelText = sourceAttrs["prompt"]
     else:
         labelText = "This is a searchable index. Enter search keywords: "
     self.processCharacters(
         {"type": tokenTypes["Characters"], "data": labelText})
     inputAttrs = sourceAttrs.copy()
     for consumed in ("action", "prompt"):
         if consumed in inputAttrs:
             del inputAttrs[consumed]
     inputAttrs["name"] = "isindex"
     self.processStartTag(impliedTagToken("input", "StartTag",
                                          attributes=inputAttrs,
                                          selfClosing=token["selfClosing"]))
     self.processEndTag(impliedTagToken("label"))
     self.processStartTag(impliedTagToken("hr", "StartTag"))
     self.processEndTag(impliedTagToken("form"))
 
 def startTagTextarea(self, token):
     """Open <textarea>: RCDATA tokenizing, newline dropping, no frameset."""
     self.tree.insertElement(token)
     tokenizer = self.parser.tokenizer
     tokenizer.state = tokenizer.rcdataState
     self.processSpaceCharacters = self.processSpaceCharactersDropNewline
     self.parser.framesetOK = False
 
 def startTagIFrame(self, token):
     """Open an <iframe>: clear framesetOK, then treat it as raw text."""
     parser = self.parser
     parser.framesetOK = False
     self.startTagRawtext(token)
 
 def startTagNoscript(self, token):
     """Treat <noscript> as raw text when scripting is on, else as any tag."""
     handler = (self.startTagRawtext if self.parser.scripting
                else self.startTagOther)
     handler(token)
 
 def startTagRawtext(self, token):
     """iframe, noembed noframes, noscript(if scripting enabled)"""
     contentModel = "RAWTEXT"
     self.parser.parseRCDataRawtext(token, contentModel)
 
 def startTagOpt(self, token):
     """Open <option>/<optgroup>, implying an end tag for an open <option>."""
     currentNode = self.tree.openElements[-1]
     if currentNode.name == "option":
         self.parser.phase.processEndTag(impliedTagToken("option"))
     self.tree.reconstructActiveFormattingElements()
     # Insertion goes through the parser's tree reference, as before.
     self.parser.tree.insertElement(token)
 
 def startTagSelect(self, token):
     """Open a <select> and pick the matching select phase.

     When the parser's current phase is one of the table-related
     phases the next phase is "inSelectInTable", otherwise "inSelect".
     """
     self.tree.reconstructActiveFormattingElements()
     self.tree.insertElement(token)
     self.parser.framesetOK = False
     phases = self.parser.phases
     tablePhases = [phases[name] for name in ("inTable",
                                              "inCaption",
                                              "inColumnGroup",
                                              "inTableBody",
                                              "inRow",
                                              "inCell")]
     if self.parser.phase in tablePhases:
         nextPhase = "inSelectInTable"
     else:
         nextPhase = "inSelect"
     self.parser.phase = phases[nextPhase]
 
 def startTagRpRt(self, token):
     """Open <rp>/<rt>; inside a <ruby> in scope, imply end tags first."""
     builder = self.tree
     if builder.elementInScope("ruby"):
         builder.generateImpliedEndTags()
         currentNode = builder.openElements[-1]
         if currentNode.name != "ruby":
             self.parser.parseError()
     builder.insertElement(token)
 
 def startTagMath(self, token):
     """Insert a <math> element in the MathML namespace.

     Attributes are adjusted via the parser before insertion. A
     self-closing tag is popped immediately and acknowledged.
     """
     self.tree.reconstructActiveFormattingElements()
     parser = self.parser
     parser.adjustMathMLAttributes(token)
     parser.adjustForeignAttributes(token)
     token["namespace"] = namespaces["mathml"]
     self.tree.insertElement(token)
     # Need to get the parse error right for the case where the token
     # has a namespace not equal to the xmlns attribute
     if not token["selfClosing"]:
         return
     self.tree.openElements.pop()
     token["selfClosingAcknowledged"] = True
 
 def startTagSvg(self, token):
     """Insert an <svg> element in the SVG namespace.

     Attributes are adjusted via the parser before insertion. A
     self-closing tag is popped immediately and acknowledged.
     """
     self.tree.reconstructActiveFormattingElements()
     parser = self.parser
     parser.adjustSVGAttributes(token)
     parser.adjustForeignAttributes(token)
     token["namespace"] = namespaces["svg"]
     self.tree.insertElement(token)
     # Need to get the parse error right for the case where the token
     # has a namespace not equal to the xmlns attribute
     if not token["selfClosing"]:
         return
     self.tree.openElements.pop()
     token["selfClosingAcknowledged"] = True
 
 def startTagMisplaced(self, token):
     """Ignore a start tag that belongs to a different insertion mode.

     Applies to elements that should be children of other elements:
     "caption", "col", "colgroup", "frame", "frameset", "head",
     "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
     "tr", "noscript". Only a parse error is reported.
     """
     details = {"name": token["name"]}
     self.parser.parseError("unexpected-start-tag-ignored", details)
 
 def startTagOther(self, token):
     """Default start-tag handler: reconstruct formatting, then insert."""
     builder = self.tree
     builder.reconstructActiveFormattingElements()
     builder.insertElement(token)
 
 def endTagP(self, token):
     """Close the <p> in button scope, creating one first if none is open.

     With no <p> in button scope, an implied <p> start tag is processed,
     a parse error is reported and the method calls itself to close the
     new element.
     """
     builder = self.tree
     if builder.elementInScope("p", variant="button"):
         builder.generateImpliedEndTags("p")
         if builder.openElements[-1].name != "p":
             self.parser.parseError("unexpected-end-tag", {"name": "p"})
         # Pop up to and including the <p>.
         while builder.openElements.pop().name != "p":
             pass
         return
     self.startTagCloseP(impliedTagToken("p", "StartTag"))
     self.parser.parseError("unexpected-end-tag", {"name": "p"})
     self.endTagP(impliedTagToken("p", "EndTag"))
 
 def endTagBody(self, token):
     """Handle </body>: switch to the "afterBody" phase if body is in scope.

     Without a body element in scope this is a parse error and the
     token is ignored. Otherwise, if the current node is not the body,
     the open elements from index 2 onwards are scanned and one parse
     error is reported for the first element whose name is outside the
     set below.
     """
     if not self.tree.elementInScope("body"):
         self.parser.parseError()
         return
     if self.tree.openElements[-1].name != "body":
         # Built once here rather than on every loop iteration.
         impliedEndable = frozenset(("dd", "dt", "li", "optgroup",
                                     "option", "p", "rp", "rt",
                                     "tbody", "td", "tfoot",
                                     "th", "thead", "tr", "body",
                                     "html"))
         for node in self.tree.openElements[2:]:
             if node.name not in impliedEndable:
                 # Not sure this is the correct name for the parse error
                 self.parser.parseError(
                     "expected-one-end-tag-but-got-another",
                     {"gotName": "body", "expectedName": node.name})
                 break
     self.parser.phase = self.parser.phases["afterBody"]
 
 def endTagHtml(self, token):
     """Handle </html>: imply </body>, then return the token for reprocessing.

     Returns None (token ignored) when no body element is in scope.
     """
     # We repeat the test for the body end tag token being ignored here
     if not self.tree.elementInScope("body"):
         return None
     self.endTagBody(impliedTagToken("body"))
     return token
 
 def endTagBlock(self, token):
     """Close a block-level element named by the end tag.

     A parse error is reported whenever the current node does not have
     the token's name. Elements are popped only when the named element
     is in scope.
     """
     name = token["name"]
     # Put us back in the right whitespace handling mode
     if name == "pre":
         self.processSpaceCharacters = self.processSpaceCharactersNonPre
     builder = self.tree
     inScope = builder.elementInScope(name)
     if inScope:
         builder.generateImpliedEndTags()
     if builder.openElements[-1].name != name:
         self.parser.parseError("end-tag-too-early", {"name": name})
     if not inScope:
         return
     # Pop up to and including the named element.
     while builder.openElements.pop().name != name:
         pass
 
 def endTagForm(self, token):
     """Handle </form>: clear the form pointer and drop the form element.

     The pointer is always reset. If it was unset or the element is not
     in scope, only a parse error is reported. Otherwise the element is
     removed from the stack of open elements, with a parse error if it
     was not the current node.
     """
     builder = self.tree
     form = builder.formPointer
     builder.formPointer = None
     if form is None or not builder.elementInScope(form):
         self.parser.parseError("unexpected-end-tag",
                                {"name": "form"})
         return
     builder.generateImpliedEndTags()
     if builder.openElements[-1] != form:
         self.parser.parseError("end-tag-too-early-ignored",
                                {"name": "form"})
     builder.openElements.remove(form)
 
 def endTagListItem(self, token):
     """Close an <li>, <dd> or <dt>.

     <li> is looked up with the "list" scope variant, the others with
     the default scope. If the element is not in scope, only a parse
     error is reported.
     """
     name = token["name"]
     scopeVariant = "list" if name == "li" else None
     builder = self.tree
     if not builder.elementInScope(name, variant=scopeVariant):
         self.parser.parseError("unexpected-end-tag", {"name": name})
         return
     builder.generateImpliedEndTags(exclude=name)
     if builder.openElements[-1].name != name:
         self.parser.parseError(
             "end-tag-too-early",
             {"name": name})
     # Pop up to and including the named element.
     while builder.openElements.pop().name != name:
         pass
 
 def endTagHeading(self, token):
     """Close the nearest open heading, whatever its level.

     If any heading element is in scope, implied end tags are generated
     and, after the mismatch check, elements are popped up to and
     including the first heading found. A parse error is reported when
     the current node's name differs from the token's name.
     """
     builder = self.tree
     if any(builder.elementInScope(heading) for heading in headingElements):
         builder.generateImpliedEndTags()
     if builder.openElements[-1].name != token["name"]:
         self.parser.parseError("end-tag-too-early", {"name": token["name"]})

     if any(builder.elementInScope(heading) for heading in headingElements):
         while builder.openElements.pop().name not in headingElements:
             pass
 
 def endTagFormatting(self, token):
 """The much-feared adoption agency algorithm

 Handles an end tag for a formatting element. Depending on the state
 of the list of active formatting elements and the stack of open
 elements this either falls back to endTagOther, drops a stale
 entry from the active formatting list, pops the stack up to the
 formatting element, or re-parents content under clones of the
 misnested formatting elements. Returns None in every case.
 """
 # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
 # XXX Better parseError messages appreciated.
 
 # Step 1
 outerLoopCounter = 0
 
 # Step 2
 # The outer loop runs at most eight times.
 while outerLoopCounter < 8:
 
 # Step 3
 outerLoopCounter += 1
 
 # Step 4:
 
 # Let the formatting element be the last element in
 # the list of active formatting elements that:
 # - is between the end of the list and the last scope
 # marker in the list, if any, or the start of the list
 # otherwise, and
 # - has the same tag name as the token.
 formattingElement = self.tree.elementInActiveFormattingElements(
 token["name"])
 if (not formattingElement or
 (formattingElement in self.tree.openElements and
 not self.tree.elementInScope(formattingElement.name))):
 # If there is no such node, then abort these steps
 # and instead act as described in the "any other
 # end tag" entry below.
 self.endTagOther(token)
 return
 
 # Otherwise, if there is such a node, but that node is
 # not in the stack of open elements, then this is a
 # parse error; remove the element from the list, and
 # abort these steps.
 elif formattingElement not in self.tree.openElements:
 self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
 self.tree.activeFormattingElements.remove(formattingElement)
 return
 
 # Otherwise, if there is such a node, and that node is
 # also in the stack of open elements, but the element
 # is not in scope, then this is a parse error; ignore
 # the token, and abort these steps.
 # NOTE(review): this branch looks unreachable -- the first
 # condition above already catches an element that is open but
 # not in scope and routes it to endTagOther. Confirm before
 # relying on the "adoption-agency-4.4" error being emitted.
 elif not self.tree.elementInScope(formattingElement.name):
 self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
 return
 
 # Otherwise, there is a formatting element and that
 # element is in the stack and is in scope. If the
 # element is not the current node, this is a parse
 # error. In any case, proceed with the algorithm as
 # written in the following steps.
 else:
 if formattingElement != self.tree.openElements[-1]:
 self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
 
 # Step 5:
 
 # Let the furthest block be the topmost node in the
 # stack of open elements that is lower in the stack
 # than the formatting element, and is an element in
 # the special category. There might not be one.
 afeIndex = self.tree.openElements.index(formattingElement)
 furthestBlock = None
 for element in self.tree.openElements[afeIndex:]:
 if element.nameTuple in specialElements:
 furthestBlock = element
 break
 
 # Step 6:
 
 # If there is no furthest block, then the UA must
 # first pop all the nodes from the bottom of the stack
 # of open elements, from the current node up to and
 # including the formatting element, then remove the
 # formatting element from the list of active
 # formatting elements, and finally abort these steps.
 if furthestBlock is None:
 element = self.tree.openElements.pop()
 while element != formattingElement:
 element = self.tree.openElements.pop()
 self.tree.activeFormattingElements.remove(element)
 return
 
 # Step 7
 # The common ancestor is the open element just before the
 # formatting element on the stack.
 commonAncestor = self.tree.openElements[afeIndex - 1]
 
 # Step 8:
 # The bookmark is supposed to help us identify where to reinsert
 # nodes in step 15. We have to ensure that we reinsert nodes after
 # the node before the active formatting element. Note the bookmark
 # can move in step 9.7
 bookmark = self.tree.activeFormattingElements.index(formattingElement)
 
 # Step 9
 lastNode = node = furthestBlock
 innerLoopCounter = 0
 
 index = self.tree.openElements.index(node)
 # The inner loop is capped at three iterations.
 while innerLoopCounter < 3:
 innerLoopCounter += 1
 # Node is element before node in open elements
 index -= 1
 node = self.tree.openElements[index]
 # Nodes that are not active formatting elements are dropped
 # from the stack and the walk continues upwards.
 if node not in self.tree.activeFormattingElements:
 self.tree.openElements.remove(node)
 continue
 # Step 9.6
 if node == formattingElement:
 break
 # Step 9.7
 if lastNode == furthestBlock:
 bookmark = self.tree.activeFormattingElements.index(node) + 1
 # Step 9.8
 clone = node.cloneNode()
 # Replace node with clone
 self.tree.activeFormattingElements[
 self.tree.activeFormattingElements.index(node)] = clone
 self.tree.openElements[
 self.tree.openElements.index(node)] = clone
 node = clone
 # Step 9.9
 # Remove lastNode from its parents, if any
 if lastNode.parent:
 lastNode.parent.removeChild(lastNode)
 node.appendChild(lastNode)
 # Step 9.10
 lastNode = node
 
 # Step 10
 # Foster parent lastNode if commonAncestor is a
 # table, tbody, tfoot, thead, or tr we need to foster
 # parent the lastNode
 if lastNode.parent:
 lastNode.parent.removeChild(lastNode)
 
 if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
 parent, insertBefore = self.tree.getTableMisnestedNodePosition()
 parent.insertBefore(lastNode, insertBefore)
 else:
 commonAncestor.appendChild(lastNode)
 
 # Step 11
 clone = formattingElement.cloneNode()
 
 # Step 12
 # Move the furthest block's children under the new clone.
 furthestBlock.reparentChildren(clone)
 
 # Step 13
 furthestBlock.appendChild(clone)
 
 # Step 14
 # The clone takes the formatting element's place in the active
 # formatting list, at the bookmarked position.
 self.tree.activeFormattingElements.remove(formattingElement)
 self.tree.activeFormattingElements.insert(bookmark, clone)
 
 # Step 15
 # On the open-element stack the clone goes right after the
 # furthest block.
 self.tree.openElements.remove(formattingElement)
 self.tree.openElements.insert(
 self.tree.openElements.index(furthestBlock) + 1, clone)
 
 def endTagAppletMarqueeObject(self, token):
     """Close <applet>/<marquee>/<object> and clear formatting to its marker.

     A parse error is reported when the current node does not have the
     token's name. Popping and clearing the active formatting elements
     happen only if the element is in scope.
     """
     name = token["name"]
     builder = self.tree
     if builder.elementInScope(name):
         builder.generateImpliedEndTags()
     if builder.openElements[-1].name != name:
         self.parser.parseError("end-tag-too-early", {"name": name})

     if not builder.elementInScope(name):
         return
     # Pop up to and including the named element.
     while builder.openElements.pop().name != name:
         pass
     builder.clearActiveFormattingElements()
 
 def endTagBr(self, token):
     """Treat </br> as a <br> start tag, after reporting a parse error."""
     self.parser.parseError("unexpected-end-tag-treated-as",
                            {"originalName": "br", "newName": "br element"})
     builder = self.tree
     builder.reconstructActiveFormattingElements()
     builder.insertElement(impliedTagToken("br", "StartTag"))
     # The inserted br is popped right away.
     builder.openElements.pop()
 
 def endTagOther(self, token):
     """Generic end-tag handling for the body phase.

     Walks the stack of open elements from the current node. On the
     first element with the token's name, implied end tags are
     generated and the stack is popped through that element. Reaching a
     special element first is a parse error and the token is ignored.
     """
     name = token["name"]
     stack = self.tree.openElements
     for node in stack[::-1]:
         if node.name != name:
             if node.nameTuple in specialElements:
                 self.parser.parseError("unexpected-end-tag", {"name": name})
                 break
             continue
         self.tree.generateImpliedEndTags(exclude=name)
         if stack[-1].name != name:
             self.parser.parseError("unexpected-end-tag", {"name": name})
         # Pop up to and including the matched node.
         while stack.pop() != node:
             pass
         break
 
 class TextPhase(Phase):
     """Phase active while the content of an RCDATA/RAWTEXT element is read.

     Every end tag pops the current node and returns the parser to
     ``parser.originalPhase``. Start tags are not expected here.
     """

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)
         startTags = _utils.MethodDispatcher([])
         startTags.default = self.startTagOther
         self.startTagHandler = startTags

         endTags = _utils.MethodDispatcher([
             ("script", self.endTagScript)])
         endTags.default = self.endTagOther
         self.endTagHandler = endTags

     def processCharacters(self, token):
         """Append the character data to the current node."""
         text = token["data"]
         self.tree.insertText(text)

     def processEOF(self):
         """EOF inside the element: report it, pop, and reprocess the EOF."""
         unclosed = self.tree.openElements[-1]
         self.parser.parseError("expected-named-closing-tag-but-got-eof",
                                {"name": unclosed.name})
         self.tree.openElements.pop()
         self.parser.phase = self.parser.originalPhase
         return True

     def startTagOther(self, token):
         """Start tags must never reach this phase."""
         assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

     def endTagScript(self, token):
         """Close the <script> element and restore the original phase."""
         popped = self.tree.openElements.pop()
         assert popped.name == "script"
         self.parser.phase = self.parser.originalPhase
         # The rest of this method is all stuff that only happens if
         # document.write works

     def endTagOther(self, token):
         """Close the current element and restore the original phase."""
         self.tree.openElements.pop()
         self.parser.phase = self.parser.originalPhase
 
 class InTablePhase(Phase):
     """Phase for tokens seen directly inside a <table>.

     Table-structure tags are handled here. Anything else is delegated
     to the "inBody" phase with ``tree.insertFromTable`` set for the
     duration of the call.
     """
     # http://www.whatwg.org/specs/web-apps/current-work/#in-table
     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)
         self.startTagHandler = _utils.MethodDispatcher([
             ("html", self.startTagHtml),
             ("caption", self.startTagCaption),
             ("colgroup", self.startTagColgroup),
             ("col", self.startTagCol),
             (("tbody", "tfoot", "thead"), self.startTagRowGroup),
             (("td", "th", "tr"), self.startTagImplyTbody),
             ("table", self.startTagTable),
             (("style", "script"), self.startTagStyleScript),
             ("input", self.startTagInput),
             ("form", self.startTagForm)
         ])
         self.startTagHandler.default = self.startTagOther

         self.endTagHandler = _utils.MethodDispatcher([
             ("table", self.endTagTable),
             (("body", "caption", "col", "colgroup", "html", "tbody", "td",
               "tfoot", "th", "thead", "tr"), self.endTagIgnore)
         ])
         self.endTagHandler.default = self.endTagOther

     # helper methods
     def clearStackToTableContext(self):
         """Pop open elements until the current node is <table> or <html>."""
         # "clear the stack back to a table context"
         while self.tree.openElements[-1].name not in ("table", "html"):
             # self.parser.parseError("unexpected-implied-end-tag-in-table",
             #  {"name":  self.tree.openElements[-1].name})
             self.tree.openElements.pop()
         # When the current node is <html> it's an innerHTML case

     # processing methods
     def processEOF(self):
         """EOF in a table is a parse error unless the current node is <html>."""
         if self.tree.openElements[-1].name != "html":
             self.parser.parseError("eof-in-table")
         else:
             assert self.parser.innerHTML
         # Stop parsing

     def processSpaceCharacters(self, token):
         """Switch to the "inTableText" phase and let it take the whitespace."""
         originalPhase = self.parser.phase
         self.parser.phase = self.parser.phases["inTableText"]
         self.parser.phase.originalPhase = originalPhase
         self.parser.phase.processSpaceCharacters(token)

     def processCharacters(self, token):
         """Switch to the "inTableText" phase and let it take the characters."""
         originalPhase = self.parser.phase
         self.parser.phase = self.parser.phases["inTableText"]
         self.parser.phase.originalPhase = originalPhase
         self.parser.phase.processCharacters(token)

     def insertText(self, token):
         """Insert text via the "inBody" phase with insertFromTable set."""
         # If we get here there must be at least one non-whitespace character
         # Do the table magic!
         self.tree.insertFromTable = True
         try:
             self.parser.phases["inBody"].processCharacters(token)
         finally:
             # Reset the flag even if the delegated handler raises.
             self.tree.insertFromTable = False

     def startTagCaption(self, token):
         """Open <caption>: push a Marker and switch to "inCaption"."""
         self.clearStackToTableContext()
         self.tree.activeFormattingElements.append(Marker)
         self.tree.insertElement(token)
         self.parser.phase = self.parser.phases["inCaption"]

     def startTagColgroup(self, token):
         """Open <colgroup> and switch to "inColumnGroup"."""
         self.clearStackToTableContext()
         self.tree.insertElement(token)
         self.parser.phase = self.parser.phases["inColumnGroup"]

     def startTagCol(self, token):
         """Imply a <colgroup>, then return the token for reprocessing."""
         self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
         return token

     def startTagRowGroup(self, token):
         """Open <tbody>/<tfoot>/<thead> and switch to "inTableBody"."""
         self.clearStackToTableContext()
         self.tree.insertElement(token)
         self.parser.phase = self.parser.phases["inTableBody"]

     def startTagImplyTbody(self, token):
         """Imply a <tbody>, then return the token for reprocessing."""
         self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
         return token

     def startTagTable(self, token):
         """Nested <table>: imply </table>; reprocess unless parsing a fragment."""
         self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                {"startName": "table", "endName": "table"})
         self.parser.phase.processEndTag(impliedTagToken("table"))
         if not self.parser.innerHTML:
             return token

     def startTagStyleScript(self, token):
         """Delegate <style>/<script> to the "inHead" phase."""
         return self.parser.phases["inHead"].processStartTag(token)

     def startTagInput(self, token):
         """Insert a hidden <input> in place; other inputs go to startTagOther."""
         if ("type" in token["data"] and
                 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
             self.parser.parseError("unexpected-hidden-input-in-table")
             self.tree.insertElement(token)
             # XXX associate with form
             self.tree.openElements.pop()
         else:
             self.startTagOther(token)

     def startTagForm(self, token):
         """<form> in a table: parse error; insert and pop if no form is set."""
         self.parser.parseError("unexpected-form-in-table")
         if self.tree.formPointer is None:
             self.tree.insertElement(token)
             self.tree.formPointer = self.tree.openElements[-1]
             self.tree.openElements.pop()

     def startTagOther(self, token):
         """Any other start tag: parse error, then "inBody" with insertFromTable set."""
         self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
         # Do the table magic!
         self.tree.insertFromTable = True
         try:
             self.parser.phases["inBody"].processStartTag(token)
         finally:
             # Reset the flag even if the delegated handler raises.
             self.tree.insertFromTable = False

     def endTagTable(self, token):
         """Close the <table> and reset the insertion mode."""
         if self.tree.elementInScope("table", variant="table"):
             self.tree.generateImpliedEndTags()
             if self.tree.openElements[-1].name != "table":
                 self.parser.parseError("end-tag-too-early-named",
                                        {"gotName": "table",
                                         "expectedName": self.tree.openElements[-1].name})
             while self.tree.openElements[-1].name != "table":
                 self.tree.openElements.pop()
             self.tree.openElements.pop()
             self.parser.resetInsertionMode()
         else:
             # innerHTML case
             assert self.parser.innerHTML
             self.parser.parseError()

     def endTagIgnore(self, token):
         """Report the end tag as unexpected and ignore it."""
         self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

     def endTagOther(self, token):
         """Any other end tag: parse error, then "inBody" with insertFromTable set."""
         self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
         # Do the table magic!
         self.tree.insertFromTable = True
         try:
             self.parser.phases["inBody"].processEndTag(token)
         finally:
             # Reset the flag even if the delegated handler raises.
             self.tree.insertFromTable = False
 
 class InTableTextPhase(Phase):
     """Buffers character tokens seen inside a table.

     Tokens accumulate in ``characterTokens``. Any non-character token
     (or EOF) flushes the buffer, restores ``originalPhase`` and is
     handed back for reprocessing.
     """

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)
         # Phase to return to; set by whoever switches into this phase.
         self.originalPhase = None
         # Pending character/space tokens, in arrival order.
         self.characterTokens = []

     def flushCharacters(self):
         """Emit the buffered text and empty the buffer.

         Text with at least one non-space character goes through the
         "inTable" phase's insertText. Pure whitespace is inserted
         directly.
         """
         data = "".join([item["data"] for item in self.characterTokens])
         # Generator form lets any() stop at the first non-space character.
         if any(char not in spaceCharacters for char in data):
             token = {"type": tokenTypes["Characters"], "data": data}
             self.parser.phases["inTable"].insertText(token)
         elif data:
             self.tree.insertText(data)
         self.characterTokens = []

     def processComment(self, token):
         """Flush, restore the original phase and reprocess the comment."""
         self.flushCharacters()
         self.parser.phase = self.originalPhase
         return token

     def processEOF(self):
         """Flush, restore the original phase and reprocess the EOF."""
         self.flushCharacters()
         self.parser.phase = self.originalPhase
         return True

     def processCharacters(self, token):
         """Buffer the token; a lone U+0000 token is dropped."""
         if token["data"] == "\u0000":
             return
         self.characterTokens.append(token)

     def processSpaceCharacters(self, token):
         """Buffer a whitespace token."""
         # pretty sure we should never reach here
         self.characterTokens.append(token)
 #        assert False

     def processStartTag(self, token):
         """Flush, restore the original phase and reprocess the start tag."""
         self.flushCharacters()
         self.parser.phase = self.originalPhase
         return token

     def processEndTag(self, token):
         """Flush, restore the original phase and reprocess the end tag."""
         self.flushCharacters()
         self.parser.phase = self.originalPhase
         return token
 
 class InCaptionPhase(Phase):
     """Phase for tokens seen inside a table <caption>.

     Most tokens are handled as in the body. Table-structure tags
     close the caption first.
     """
     # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         startTags = _utils.MethodDispatcher([
             ("html", self.startTagHtml),
             (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
               "thead", "tr"), self.startTagTableElement)
         ])
         startTags.default = self.startTagOther
         self.startTagHandler = startTags

         endTags = _utils.MethodDispatcher([
             ("caption", self.endTagCaption),
             ("table", self.endTagTable),
             (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
               "thead", "tr"), self.endTagIgnore)
         ])
         endTags.default = self.endTagOther
         self.endTagHandler = endTags

     def ignoreEndTagCaption(self):
         """Return True when no <caption> is in table scope."""
         captionOpen = self.tree.elementInScope("caption", variant="table")
         return not captionOpen

     def processEOF(self):
         """EOF is handled by the "inBody" phase."""
         inBody = self.parser.phases["inBody"]
         inBody.processEOF()

     def processCharacters(self, token):
         """Characters are handled by the "inBody" phase."""
         inBody = self.parser.phases["inBody"]
         return inBody.processCharacters(token)

     def startTagTableElement(self, token):
         """Table-structure start tag: imply </caption>, then reprocess."""
         self.parser.parseError()
         # XXX Have to duplicate logic here to find out if the tag is ignored
         captionWasOpen = not self.ignoreEndTagCaption()
         self.parser.phase.processEndTag(impliedTagToken("caption"))
         if captionWasOpen:
             return token

     def startTagOther(self, token):
         """Other start tags are handled by the "inBody" phase."""
         inBody = self.parser.phases["inBody"]
         return inBody.processStartTag(token)

     def endTagCaption(self, token):
         """Close the caption and go back to the "inTable" phase."""
         if self.ignoreEndTagCaption():
             # innerHTML case
             assert self.parser.innerHTML
             self.parser.parseError()
             return
         # AT this code is quite similar to endTagTable in "InTable"
         stack = self.tree.openElements
         self.tree.generateImpliedEndTags()
         if stack[-1].name != "caption":
             self.parser.parseError("expected-one-end-tag-but-got-another",
                                    {"gotName": "caption",
                                     "expectedName": stack[-1].name})
         while stack[-1].name != "caption":
             stack.pop()
         stack.pop()
         self.tree.clearActiveFormattingElements()
         self.parser.phase = self.parser.phases["inTable"]

     def endTagTable(self, token):
         """</table> inside a caption: imply </caption>, then reprocess."""
         self.parser.parseError()
         captionWasOpen = not self.ignoreEndTagCaption()
         self.parser.phase.processEndTag(impliedTagToken("caption"))
         if captionWasOpen:
             return token

     def endTagIgnore(self, token):
         """Report the end tag as unexpected and ignore it."""
         details = {"name": token["name"]}
         self.parser.parseError("unexpected-end-tag", details)

     def endTagOther(self, token):
         """Other end tags are handled by the "inBody" phase."""
         inBody = self.parser.phases["inBody"]
         return inBody.processEndTag(token)
 
 class InColumnGroupPhase(Phase):
     """Phase for tokens seen inside a <colgroup>.

     Only <col> is handled in place. Most other tokens close the
     colgroup and are returned for reprocessing.
     """
     # http://www.whatwg.org/specs/web-apps/current-work/#in-column

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         startTags = _utils.MethodDispatcher([
             ("html", self.startTagHtml),
             ("col", self.startTagCol)
         ])
         startTags.default = self.startTagOther
         self.startTagHandler = startTags

         endTags = _utils.MethodDispatcher([
             ("colgroup", self.endTagColgroup),
             ("col", self.endTagCol)
         ])
         endTags.default = self.endTagOther
         self.endTagHandler = endTags

     def ignoreEndTagColgroup(self):
         """Return True when the current node is the <html> element."""
         currentNode = self.tree.openElements[-1]
         return currentNode.name == "html"

     def processEOF(self):
         """At EOF close the colgroup and reprocess, unless on <html>."""
         if self.tree.openElements[-1].name == "html":
             assert self.parser.innerHTML
             return
         wasIgnored = self.ignoreEndTagColgroup()
         self.endTagColgroup(impliedTagToken("colgroup"))
         if not wasIgnored:
             return True

     def processCharacters(self, token):
         """Close the colgroup; reprocess the token if that was not ignored."""
         wasIgnored = self.ignoreEndTagColgroup()
         self.endTagColgroup(impliedTagToken("colgroup"))
         if not wasIgnored:
             return token

     def startTagCol(self, token):
         """Insert a <col> and pop it immediately."""
         builder = self.tree
         builder.insertElement(token)
         builder.openElements.pop()
         token["selfClosingAcknowledged"] = True

     def startTagOther(self, token):
         """Close the colgroup; reprocess the token if that was not ignored."""
         wasIgnored = self.ignoreEndTagColgroup()
         self.endTagColgroup(impliedTagToken("colgroup"))
         if not wasIgnored:
             return token

     def endTagColgroup(self, token):
         """Pop the colgroup and return to the "inTable" phase."""
         if not self.ignoreEndTagColgroup():
             self.tree.openElements.pop()
             self.parser.phase = self.parser.phases["inTable"]
             return
         # innerHTML case
         assert self.parser.innerHTML
         self.parser.parseError()

     def endTagCol(self, token):
         """</col> is always a parse error."""
         self.parser.parseError("no-end-tag", {"name": "col"})

     def endTagOther(self, token):
         """Close the colgroup; reprocess the token if that was not ignored."""
         wasIgnored = self.ignoreEndTagColgroup()
         self.endTagColgroup(impliedTagToken("colgroup"))
         if not wasIgnored:
             return token
 
 class InTableBodyPhase(Phase):
 # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
 def __init__(self, parser, tree):
     """Set up the start-tag and end-tag dispatch tables for this phase."""
     Phase.__init__(self, parser, tree)
     startTags = _utils.MethodDispatcher([
         ("html", self.startTagHtml),
         ("tr", self.startTagTr),
         (("td", "th"), self.startTagTableCell),
         (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
          self.startTagTableOther)
     ])
     startTags.default = self.startTagOther
     self.startTagHandler = startTags

     endTags = _utils.MethodDispatcher([
         (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
         ("table", self.endTagTable),
         (("body", "caption", "col", "colgroup", "html", "td", "th",
           "tr"), self.endTagIgnore)
     ])
     endTags.default = self.endTagOther
     self.endTagHandler = endTags
 
 # helper methods
 def clearStackToTableBodyContext(self):
     """Pop open elements until the current node is a row group or <html>."""
     stack = self.tree.openElements
     while stack[-1].name not in ("tbody", "tfoot",
                                  "thead", "html"):
         # self.parser.parseError("unexpected-implied-end-tag-in-table",
         #  {"name": self.tree.openElements[-1].name})
         stack.pop()
     if stack[-1].name == "html":
         assert self.parser.innerHTML
 
 # the rest
 def processEOF(self):
 self.parser.phases["inTable"].processEOF()
 
 def processSpaceCharacters(self, token):
 return self.parser.phases["inTable"].processSpaceCharacters(token)
 
 def processCharacters(self, token):
 return self.parser.phases["inTable"].processCharacters(token)
 
 def startTagTr(self, token):
 self.clearStackToTableBodyContext()
 self.tree.insertElement(token)
 self.parser.phase = self.parser.phases["inRow"]
 
 def startTagTableCell(self, token):
 self.parser.parseError("unexpected-cell-in-table-body",
 {"name": token["name"]})
 self.startTagTr(impliedTagToken("tr", "StartTag"))
 return token
 
 def startTagTableOther(self, token):
 # XXX AT Any ideas on how to share this with endTagTable?
 if (self.tree.elementInScope("tbody", variant="table") or
 self.tree.elementInScope("thead", variant="table") or
 self.tree.elementInScope("tfoot", variant="table")):
 self.clearStackToTableBodyContext()
 self.endTagTableRowGroup(
 impliedTagToken(self.tree.openElements[-1].name))
 return token
 else:
 # innerHTML case
 assert self.parser.innerHTML
 self.parser.parseError()
 
 def startTagOther(self, token):
 return self.parser.phases["inTable"].processStartTag(token)
 
 def endTagTableRowGroup(self, token):
 if self.tree.elementInScope(token["name"], variant="table"):
 self.clearStackToTableBodyContext()
 self.tree.openElements.pop()
 self.parser.phase = self.parser.phases["inTable"]
 else:
 self.parser.parseError("unexpected-end-tag-in-table-body",
 {"name": token["name"]})
 
 def endTagTable(self, token):
 if (self.tree.elementInScope("tbody", variant="table") or
 self.tree.elementInScope("thead", variant="table") or
 self.tree.elementInScope("tfoot", variant="table")):
 self.clearStackToTableBodyContext()
 self.endTagTableRowGroup(
 impliedTagToken(self.tree.openElements[-1].name))
 return token
 else:
 # innerHTML case
 assert self.parser.innerHTML
 self.parser.parseError()
 
 def endTagIgnore(self, token):
 self.parser.parseError("unexpected-end-tag-in-table-body",
 {"name": token["name"]})
 
 def endTagOther(self, token):
 return self.parser.phases["inTable"].processEndTag(token)
 
 class InRowPhase(Phase):
     """Token handlers for the phase registered as ``"inRow"``.

     EOF, character data and any tag without a dedicated handler here are
     forwarded to the ``"inTable"`` phase.
     """
     # http://www.whatwg.org/specs/web-apps/current-work/#in-row

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         startTags = _utils.MethodDispatcher([
             ("html", self.startTagHtml),
             (("td", "th"), self.startTagTableCell),
             (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
               "tr"), self.startTagTableOther)
         ])
         startTags.default = self.startTagOther
         self.startTagHandler = startTags

         endTags = _utils.MethodDispatcher([
             ("tr", self.endTagTr),
             ("table", self.endTagTable),
             (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
             (("body", "caption", "col", "colgroup", "html", "td", "th"),
              self.endTagIgnore)
         ])
         endTags.default = self.endTagOther
         self.endTagHandler = endTags

     # helper methods (XXX unify this with other table helper methods)
     def clearStackToTableRowContext(self):
         """Pop open elements until ``tr`` or ``html`` is the current node.

         A parse error is recorded for every element that gets popped.
         """
         while True:
             currentName = self.tree.openElements[-1].name
             if currentName in ("tr", "html"):
                 break
             self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                    {"name": currentName})
             self.tree.openElements.pop()

     def ignoreEndTagTr(self):
         """Return True when no ``tr`` element is in table scope."""
         trInScope = self.tree.elementInScope("tr", variant="table")
         return not trInScope

     def _closeRowAndReprocess(self, token):
         """Imply ``</tr>`` and return *token* unless that tag was ignored."""
         # XXX how are we sure it's always ignored in the innerHTML case?
         rowMissing = self.ignoreEndTagTr()
         self.endTagTr(impliedTagToken("tr"))
         return None if rowMissing else token

     # the rest
     def processEOF(self):
         self.parser.phases["inTable"].processEOF()

     def processSpaceCharacters(self, token):
         tablePhase = self.parser.phases["inTable"]
         return tablePhase.processSpaceCharacters(token)

     def processCharacters(self, token):
         tablePhase = self.parser.phases["inTable"]
         return tablePhase.processCharacters(token)

     def startTagTableCell(self, token):
         """Open a cell, switch to "inCell" and push a formatting marker."""
         self.clearStackToTableRowContext()
         self.tree.insertElement(token)
         self.parser.phase = self.parser.phases["inCell"]
         self.tree.activeFormattingElements.append(Marker)

     def startTagTableOther(self, token):
         return self._closeRowAndReprocess(token)

     def startTagOther(self, token):
         tablePhase = self.parser.phases["inTable"]
         return tablePhase.processStartTag(token)

     def endTagTr(self, token):
         """Close the current row and switch back to "inTableBody"."""
         if self.ignoreEndTagTr():
             # innerHTML case
             assert self.parser.innerHTML
             self.parser.parseError()
             return
         self.clearStackToTableRowContext()
         self.tree.openElements.pop()
         self.parser.phase = self.parser.phases["inTableBody"]

     def endTagTable(self, token):
         # Reprocess the current tag if the tr end tag was not ignored
         return self._closeRowAndReprocess(token)

     def endTagTableRowGroup(self, token):
         """Imply ``</tr>`` when the named row group is in table scope."""
         if not self.tree.elementInScope(token["name"], variant="table"):
             self.parser.parseError()
             return None
         self.endTagTr(impliedTagToken("tr"))
         return token

     def endTagIgnore(self, token):
         errorData = {"name": token["name"]}
         self.parser.parseError("unexpected-end-tag-in-table-row", errorData)

     def endTagOther(self, token):
         tablePhase = self.parser.phases["inTable"]
         return tablePhase.processEndTag(token)
 
 class InCellPhase(Phase):
     """Token handlers for the phase registered as ``"inCell"``.

     EOF, character data and any tag without a dedicated handler here are
     forwarded to the ``"inBody"`` phase.
     """
     # http://www.whatwg.org/specs/web-apps/current-work/#in-cell

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         startTags = _utils.MethodDispatcher([
             ("html", self.startTagHtml),
             (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
               "thead", "tr"), self.startTagTableOther)
         ])
         startTags.default = self.startTagOther
         self.startTagHandler = startTags

         endTags = _utils.MethodDispatcher([
             (("td", "th"), self.endTagTableCell),
             (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
             (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
         ])
         endTags.default = self.endTagOther
         self.endTagHandler = endTags

     # helper
     def closeCell(self):
         """Imply the end tag of whichever cell (``td`` first) is in scope."""
         for cellName in ("td", "th"):
             if self.tree.elementInScope(cellName, variant="table"):
                 self.endTagTableCell(impliedTagToken(cellName))
                 break

     # the rest
     def processEOF(self):
         self.parser.phases["inBody"].processEOF()

     def processCharacters(self, token):
         bodyPhase = self.parser.phases["inBody"]
         return bodyPhase.processCharacters(token)

     def startTagTableOther(self, token):
         """Close the open cell and return *token*; error if none is open."""
         cellInScope = (self.tree.elementInScope("td", variant="table") or
                        self.tree.elementInScope("th", variant="table"))
         if not cellInScope:
             # innerHTML case
             assert self.parser.innerHTML
             self.parser.parseError()
             return None
         self.closeCell()
         return token

     def startTagOther(self, token):
         bodyPhase = self.parser.phases["inBody"]
         return bodyPhase.processStartTag(token)

     def endTagTableCell(self, token):
         """Close the named cell and switch the parser back to "inRow"."""
         cellName = token["name"]
         if not self.tree.elementInScope(cellName, variant="table"):
             self.parser.parseError("unexpected-end-tag", {"name": cellName})
             return
         self.tree.generateImpliedEndTags(cellName)
         if self.tree.openElements[-1].name != cellName:
             self.parser.parseError("unexpected-cell-end-tag",
                                    {"name": cellName})
         # Pop up to and including the cell; when the cell is already the
         # current node this removes exactly one element.
         while self.tree.openElements.pop().name != cellName:
             pass
         self.tree.clearActiveFormattingElements()
         self.parser.phase = self.parser.phases["inRow"]

     def endTagIgnore(self, token):
         errorData = {"name": token["name"]}
         self.parser.parseError("unexpected-end-tag", errorData)

     def endTagImply(self, token):
         """Close the cell and return *token* if its element is in scope."""
         if not self.tree.elementInScope(token["name"], variant="table"):
             # sometimes innerHTML case
             self.parser.parseError()
             return None
         self.closeCell()
         return token

     def endTagOther(self, token):
         bodyPhase = self.parser.phases["inBody"]
         return bodyPhase.processEndTag(token)
 
 class InSelectPhase(Phase):
     """Token handlers for the phase registered as ``"inSelect"``."""
     # http://www.whatwg.org/specs/web-apps/current-work/#in-select

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         startTags = _utils.MethodDispatcher([
             ("html", self.startTagHtml),
             ("option", self.startTagOption),
             ("optgroup", self.startTagOptgroup),
             ("select", self.startTagSelect),
             (("input", "keygen", "textarea"), self.startTagInput),
             ("script", self.startTagScript)
         ])
         startTags.default = self.startTagOther
         self.startTagHandler = startTags

         endTags = _utils.MethodDispatcher([
             ("option", self.endTagOption),
             ("optgroup", self.endTagOptgroup),
             ("select", self.endTagSelect)
         ])
         endTags.default = self.endTagOther
         self.endTagHandler = endTags

     def processEOF(self):
         """Record "eof-in-select" unless only ``html`` is left as current node."""
         if self.tree.openElements[-1].name == "html":
             assert self.parser.innerHTML
         else:
             self.parser.parseError("eof-in-select")

     def processCharacters(self, token):
         """Insert the text, dropping a token that is a lone U+0000."""
         text = token["data"]
         if text != "\u0000":
             self.tree.insertText(text)

     def startTagOption(self, token):
         # We need to imply </option> if <option> is the current node.
         if self.tree.openElements[-1].name == "option":
             self.tree.openElements.pop()
         self.tree.insertElement(token)

     def startTagOptgroup(self, token):
         # Imply </option> first, then </optgroup>, before opening a new group.
         for impliedName in ("option", "optgroup"):
             if self.tree.openElements[-1].name == impliedName:
                 self.tree.openElements.pop()
         self.tree.insertElement(token)

     def startTagSelect(self, token):
         """Treat a nested ``<select>`` as an error that closes the select."""
         self.parser.parseError("unexpected-select-in-select")
         self.endTagSelect(impliedTagToken("select"))

     def startTagInput(self, token):
         """Report the tag, close the select if possible and return *token*."""
         self.parser.parseError("unexpected-input-in-select")
         if not self.tree.elementInScope("select", variant="select"):
             assert self.parser.innerHTML
             return None
         self.endTagSelect(impliedTagToken("select"))
         return token

     def startTagScript(self, token):
         headPhase = self.parser.phases["inHead"]
         return headPhase.processStartTag(token)

     def startTagOther(self, token):
         errorData = {"name": token["name"]}
         self.parser.parseError("unexpected-start-tag-in-select", errorData)

     def endTagOption(self, token):
         if self.tree.openElements[-1].name != "option":
             self.parser.parseError("unexpected-end-tag-in-select",
                                    {"name": "option"})
             return
         self.tree.openElements.pop()

     def endTagOptgroup(self, token):
         stack = self.tree.openElements
         # </optgroup> implicitly closes <option>
         if stack[-1].name == "option" and stack[-2].name == "optgroup":
             stack.pop()
         # It also closes </optgroup>
         if stack[-1].name == "optgroup":
             stack.pop()
             return
         # But nothing else
         self.parser.parseError("unexpected-end-tag-in-select",
                                {"name": "optgroup"})

     def endTagSelect(self, token):
         """Pop through the ``select`` element and reset the insertion mode."""
         if not self.tree.elementInScope("select", variant="select"):
             # innerHTML case
             assert self.parser.innerHTML
             self.parser.parseError()
             return
         # Pop everything up to and including the select element itself.
         while self.tree.openElements.pop().name != "select":
             pass
         self.parser.resetInsertionMode()

     def endTagOther(self, token):
         errorData = {"name": token["name"]}
         self.parser.parseError("unexpected-end-tag-in-select", errorData)
 
 class InSelectInTablePhase(Phase):
     """Token handlers for the phase registered as ``"inSelectInTable"``.

     Table-structure tags get special treatment; everything else is
     forwarded to the ``"inSelect"`` phase.
     """

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         # The same tag names are intercepted as start tags and as end tags.
         tableTagNames = ("caption", "table", "tbody", "tfoot", "thead",
                          "tr", "td", "th")

         startTags = _utils.MethodDispatcher([
             (tableTagNames, self.startTagTable)
         ])
         startTags.default = self.startTagOther
         self.startTagHandler = startTags

         endTags = _utils.MethodDispatcher([
             (tableTagNames, self.endTagTable)
         ])
         endTags.default = self.endTagOther
         self.endTagHandler = endTags

     def processEOF(self):
         self.parser.phases["inSelect"].processEOF()

     def processCharacters(self, token):
         selectPhase = self.parser.phases["inSelect"]
         return selectPhase.processCharacters(token)

     def startTagTable(self, token):
         """Report the tag, imply ``</select>`` and return *token*."""
         errorData = {"name": token["name"]}
         self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", errorData)
         self.endTagOther(impliedTagToken("select"))
         return token

     def startTagOther(self, token):
         selectPhase = self.parser.phases["inSelect"]
         return selectPhase.processStartTag(token)

     def endTagTable(self, token):
         """Report the tag; imply ``</select>`` only if its element is in scope."""
         errorData = {"name": token["name"]}
         self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", errorData)
         if not self.tree.elementInScope(token["name"], variant="table"):
             return None
         self.endTagOther(impliedTagToken("select"))
         return token

     def endTagOther(self, token):
         selectPhase = self.parser.phases["inSelect"]
         return selectPhase.processEndTag(token)
 
 class InForeignContentPhase(Phase):
     """Token handlers for the phase registered as ``"inForeignContent"``.

     Handles tokens while the current node is outside the tree's default
     namespace (the code below distinguishes the MathML and SVG namespaces).
     """

     # Start tags with these names leave foreign content (see processStartTag).
     breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                   "center", "code", "dd", "div", "dl", "dt",
                                   "em", "embed", "h1", "h2", "h3",
                                   "h4", "h5", "h6", "head", "hr", "i", "img",
                                   "li", "listing", "menu", "meta", "nobr",
                                   "ol", "p", "pre", "ruby", "s", "small",
                                   "span", "strong", "strike", "sub", "sup",
                                   "table", "tt", "u", "ul", "var"])

     # A <font> start tag only breaks out when it carries one of these.
     fontBreakoutAttributes = frozenset(["color", "face", "size"])

     # Lowercased SVG tag name -> mixed-case name. Kept at class level so
     # the table is built once rather than on every adjustSVGTagNames() call.
     svgTagNameReplacements = {
         "altglyph": "altGlyph",
         "altglyphdef": "altGlyphDef",
         "altglyphitem": "altGlyphItem",
         "animatecolor": "animateColor",
         "animatemotion": "animateMotion",
         "animatetransform": "animateTransform",
         "clippath": "clipPath",
         "feblend": "feBlend",
         "fecolormatrix": "feColorMatrix",
         "fecomponenttransfer": "feComponentTransfer",
         "fecomposite": "feComposite",
         "feconvolvematrix": "feConvolveMatrix",
         "fediffuselighting": "feDiffuseLighting",
         "fedisplacementmap": "feDisplacementMap",
         "fedistantlight": "feDistantLight",
         "feflood": "feFlood",
         "fefunca": "feFuncA",
         "fefuncb": "feFuncB",
         "fefuncg": "feFuncG",
         "fefuncr": "feFuncR",
         "fegaussianblur": "feGaussianBlur",
         "feimage": "feImage",
         "femerge": "feMerge",
         "femergenode": "feMergeNode",
         "femorphology": "feMorphology",
         "feoffset": "feOffset",
         "fepointlight": "fePointLight",
         "fespecularlighting": "feSpecularLighting",
         "fespotlight": "feSpotLight",
         "fetile": "feTile",
         "feturbulence": "feTurbulence",
         "foreignobject": "foreignObject",
         "glyphref": "glyphRef",
         "lineargradient": "linearGradient",
         "radialgradient": "radialGradient",
         "textpath": "textPath",
     }

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

     def adjustSVGTagNames(self, token):
         """Rewrite ``token["name"]`` in place to its mixed-case SVG form.

         Names without an entry in ``svgTagNameReplacements`` are left alone.
         """
         name = token["name"]
         if name in self.svgTagNameReplacements:
             token["name"] = self.svgTagNameReplacements[name]

     def processCharacters(self, token):
         """Insert character data, replacing a lone U+0000 with U+FFFD.

         Any non-space character also clears the parser's ``framesetOK`` flag.
         """
         if token["data"] == "\u0000":
             token["data"] = "\uFFFD"
         elif (self.parser.framesetOK and
               any(char not in spaceCharacters for char in token["data"])):
             self.parser.framesetOK = False
         Phase.processCharacters(self, token)

     def processStartTag(self, token):
         """Insert a foreign element, or break out of foreign content.

         Returns *token* after popping back to an HTML element or an
         integration point when the tag is a breakout element; otherwise
         the element is inserted in the current node's namespace and
         ``None`` is returned.
         """
         currentNode = self.tree.openElements[-1]
         name = token["name"]
         breaksOut = (
             name in self.breakoutElements or
             (name == "font" and
              not self.fontBreakoutAttributes.isdisjoint(token["data"])))

         if breaksOut:
             self.parser.parseError("unexpected-html-element-in-foreign-content",
                                    {"name": name})
             while (self.tree.openElements[-1].namespace !=
                    self.tree.defaultNamespace and
                    not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                    not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                 self.tree.openElements.pop()
             return token

         if currentNode.namespace == namespaces["mathml"]:
             self.parser.adjustMathMLAttributes(token)
         elif currentNode.namespace == namespaces["svg"]:
             self.adjustSVGTagNames(token)
             self.parser.adjustSVGAttributes(token)
         self.parser.adjustForeignAttributes(token)
         token["namespace"] = currentNode.namespace
         self.tree.insertElement(token)
         if token["selfClosing"]:
             # A self-closing foreign element is closed immediately.
             self.tree.openElements.pop()
             token["selfClosingAcknowledged"] = True
         return None

     def processEndTag(self, token):
         """Close the matching foreign element, or defer to the current phase.

         Walks the open-element stack from the top. A node whose lowercased
         name equals the token name is popped together with everything above
         it. On reaching a node in the default namespace first, the token is
         passed to ``self.parser.phase`` and that phase's result is returned.
         """
         nodeIndex = len(self.tree.openElements) - 1
         node = self.tree.openElements[-1]
         if node.name.translate(asciiUpper2Lower) != token["name"]:
             self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

         while True:
             if node.name.translate(asciiUpper2Lower) == token["name"]:
                 # XXX this isn't in the spec but it seems necessary
                 if self.parser.phase == self.parser.phases["inTableText"]:
                     self.parser.phase.flushCharacters()
                     self.parser.phase = self.parser.phase.originalPhase
                 while self.tree.openElements.pop() != node:
                     assert self.tree.openElements
                 new_token = None
                 break
             nodeIndex -= 1

             node = self.tree.openElements[nodeIndex]
             if node.namespace != self.tree.defaultNamespace:
                 continue
             else:
                 new_token = self.parser.phase.processEndTag(token)
                 break
         return new_token
 
 class AfterBodyPhase(Phase):
     """Token handlers for the phase registered as ``"afterBody"``."""

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         startTags = _utils.MethodDispatcher([
             ("html", self.startTagHtml)
         ])
         startTags.default = self.startTagOther
         self.startTagHandler = startTags

         endTags = _utils.MethodDispatcher([("html", self.endTagHtml)])
         endTags.default = self.endTagOther
         self.endTagHandler = endTags

     def _backToBody(self, token, *errorArgs):
         """Record a parse error, switch to "inBody" and return *token*."""
         self.parser.parseError(*errorArgs)
         self.parser.phase = self.parser.phases["inBody"]
         return token

     def processEOF(self):
         # Stop parsing
         pass

     def processComment(self, token):
         # This is needed because data is to be appended to the <html> element
         # here and not to whatever is currently open.
         htmlElement = self.tree.openElements[0]
         self.tree.insertComment(token, htmlElement)

     def processCharacters(self, token):
         return self._backToBody(token, "unexpected-char-after-body")

     def startTagHtml(self, token):
         bodyPhase = self.parser.phases["inBody"]
         return bodyPhase.processStartTag(token)

     def startTagOther(self, token):
         return self._backToBody(token, "unexpected-start-tag-after-body",
                                 {"name": token["name"]})

     def endTagHtml(self, name):
         # NOTE: despite the parameter name, the end-tag dispatcher presumably
         # passes the whole token here, as for the other handlers.
         if not self.parser.innerHTML:
             self.parser.phase = self.parser.phases["afterAfterBody"]
         else:
             self.parser.parseError("unexpected-end-tag-after-body-innerhtml")

     def endTagOther(self, token):
         return self._backToBody(token, "unexpected-end-tag-after-body",
                                 {"name": token["name"]})
 
 class InFramesetPhase(Phase):
     """Token handlers for the phase registered as ``"inFrameset"``."""
     # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         self.startTagHandler = _utils.MethodDispatcher([
             ("html", self.startTagHtml),
             ("frameset", self.startTagFrameset),
             ("frame", self.startTagFrame),
             ("noframes", self.startTagNoframes)
         ])
         self.startTagHandler.default = self.startTagOther

         self.endTagHandler = _utils.MethodDispatcher([
             ("frameset", self.endTagFrameset)
         ])
         self.endTagHandler.default = self.endTagOther

     def processEOF(self):
         """Record "eof-in-frameset" unless ``html`` is the current node."""
         if self.tree.openElements[-1].name != "html":
             self.parser.parseError("eof-in-frameset")
         else:
             assert self.parser.innerHTML

     def processCharacters(self, token):
         self.parser.parseError("unexpected-char-in-frameset")

     def startTagFrameset(self, token):
         self.tree.insertElement(token)

     def startTagFrame(self, token):
         """Insert a ``frame`` element and close it again straight away."""
         self.tree.insertElement(token)
         self.tree.openElements.pop()
         # The spec's "in frameset" rules acknowledge the self-closing flag
         # for <frame>; without this, <frame/> is reported as an error.
         token["selfClosingAcknowledged"] = True

     def startTagNoframes(self, token):
         return self.parser.phases["inBody"].processStartTag(token)

     def startTagOther(self, token):
         self.parser.parseError("unexpected-start-tag-in-frameset",
                                {"name": token["name"]})

     def endTagFrameset(self, token):
         """Close the current ``frameset``; maybe switch to "afterFrameset"."""
         if self.tree.openElements[-1].name == "html":
             # innerHTML case
             self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
         else:
             self.tree.openElements.pop()
         if (not self.parser.innerHTML and
                 self.tree.openElements[-1].name != "frameset"):
             # If we're not in innerHTML mode and the current node is not a
             # "frameset" element (anymore) then switch.
             self.parser.phase = self.parser.phases["afterFrameset"]

     def endTagOther(self, token):
         self.parser.parseError("unexpected-end-tag-in-frameset",
                                {"name": token["name"]})
 
 class AfterFramesetPhase(Phase):
     """Token handlers for the phase registered as ``"afterFrameset"``."""
     # http://www.whatwg.org/specs/web-apps/current-work/#after3

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         startTags = _utils.MethodDispatcher([
             ("html", self.startTagHtml),
             ("noframes", self.startTagNoframes)
         ])
         startTags.default = self.startTagOther
         self.startTagHandler = startTags

         endTags = _utils.MethodDispatcher([
             ("html", self.endTagHtml)
         ])
         endTags.default = self.endTagOther
         self.endTagHandler = endTags

     def processEOF(self):
         # Stop parsing
         pass

     def processCharacters(self, token):
         self.parser.parseError("unexpected-char-after-frameset")

     def startTagNoframes(self, token):
         headPhase = self.parser.phases["inHead"]
         return headPhase.processStartTag(token)

     def startTagOther(self, token):
         errorData = {"name": token["name"]}
         self.parser.parseError("unexpected-start-tag-after-frameset",
                                errorData)

     def endTagHtml(self, token):
         nextPhase = self.parser.phases["afterAfterFrameset"]
         self.parser.phase = nextPhase

     def endTagOther(self, token):
         errorData = {"name": token["name"]}
         self.parser.parseError("unexpected-end-tag-after-frameset",
                                errorData)
 
 class AfterAfterBodyPhase(Phase):
     """Token handlers for the phase registered as ``"afterAfterBody"``.

     Characters, end tags and start tags other than ``html`` are parse
     errors that switch the parser back to the ``"inBody"`` phase.
     """

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         startTags = _utils.MethodDispatcher([
             ("html", self.startTagHtml)
         ])
         startTags.default = self.startTagOther
         self.startTagHandler = startTags

     def _backToBody(self, token, *errorArgs):
         """Record a parse error, switch to "inBody" and return *token*."""
         self.parser.parseError(*errorArgs)
         self.parser.phase = self.parser.phases["inBody"]
         return token

     def processEOF(self):
         pass

     def processComment(self, token):
         # The comment is attached to the document itself.
         document = self.tree.document
         self.tree.insertComment(token, document)

     def processSpaceCharacters(self, token):
         bodyPhase = self.parser.phases["inBody"]
         return bodyPhase.processSpaceCharacters(token)

     def processCharacters(self, token):
         return self._backToBody(token, "expected-eof-but-got-char")

     def startTagHtml(self, token):
         bodyPhase = self.parser.phases["inBody"]
         return bodyPhase.processStartTag(token)

     def startTagOther(self, token):
         return self._backToBody(token, "expected-eof-but-got-start-tag",
                                 {"name": token["name"]})

     def processEndTag(self, token):
         return self._backToBody(token, "expected-eof-but-got-end-tag",
                                 {"name": token["name"]})
 
 class AfterAfterFramesetPhase(Phase):
     """Token handlers for the phase registered as ``"afterAfterFrameset"``.

     Characters, end tags and unexpected start tags only record a parse
     error; the parser stays in this phase.
     """

     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

         startTags = _utils.MethodDispatcher([
             ("html", self.startTagHtml),
             ("noframes", self.startTagNoFrames)
         ])
         startTags.default = self.startTagOther
         self.startTagHandler = startTags

     def processEOF(self):
         pass

     def processComment(self, token):
         # The comment is attached to the document itself.
         document = self.tree.document
         self.tree.insertComment(token, document)

     def processSpaceCharacters(self, token):
         bodyPhase = self.parser.phases["inBody"]
         return bodyPhase.processSpaceCharacters(token)

     def processCharacters(self, token):
         self.parser.parseError("expected-eof-but-got-char")

     def startTagHtml(self, token):
         bodyPhase = self.parser.phases["inBody"]
         return bodyPhase.processStartTag(token)

     def startTagNoFrames(self, token):
         headPhase = self.parser.phases["inHead"]
         return headPhase.processStartTag(token)

     def startTagOther(self, token):
         errorData = {"name": token["name"]}
         self.parser.parseError("expected-eof-but-got-start-tag", errorData)

     def processEndTag(self, token):
         errorData = {"name": token["name"]}
         self.parser.parseError("expected-eof-but-got-end-tag", errorData)
 # pylint:enable=unused-argument
 
 # Map each phase name (the keys used with parser.phases[...] above) to the
 # class that implements it.
 return {
 "initial": InitialPhase,
 "beforeHtml": BeforeHtmlPhase,
 "beforeHead": BeforeHeadPhase,
 "inHead": InHeadPhase,
 "inHeadNoscript": InHeadNoscriptPhase,
 "afterHead": AfterHeadPhase,
 "inBody": InBodyPhase,
 "text": TextPhase,
 "inTable": InTablePhase,
 "inTableText": InTableTextPhase,
 "inCaption": InCaptionPhase,
 "inColumnGroup": InColumnGroupPhase,
 "inTableBody": InTableBodyPhase,
 "inRow": InRowPhase,
 "inCell": InCellPhase,
 "inSelect": InSelectPhase,
 "inSelectInTable": InSelectInTablePhase,
 "inForeignContent": InForeignContentPhase,
 "afterBody": AfterBodyPhase,
 "inFrameset": InFramesetPhase,
 "afterFrameset": AfterFramesetPhase,
 "afterAfterBody": AfterAfterBodyPhase,
 "afterAfterFrameset": AfterAfterFramesetPhase,
 # NOTE(review): the old "XXX after after frameset" marker looks stale now
 # that "afterAfterFrameset" is registered above -- confirm and drop.
 }
 
 
 def adjust_attributes(token, replacements):
     """Rename attributes of *token* in place according to *replacements*.

     *token* is a mapping whose ``'data'`` entry maps attribute names to
     values; *replacements* maps current attribute names to new names.

     ``token['data']`` is replaced by a new ``OrderedDict`` (same order,
     renamed keys) only when at least one attribute name appears in
     *replacements*; otherwise the existing mapping object is left as is.
     """
     attributes = token['data']
     # One membership scan works on every Python version and stops at the
     # first hit, so no version-specific key-set intersection is needed.
     if any(name in replacements for name in attributes):
         token['data'] = OrderedDict((replacements.get(name, name), value)
                                     for name, value in attributes.items())
 
 
 def impliedTagToken(name, type="EndTag", attributes=None,
                     selfClosing=False):
     """Build a tag token dict for a tag the parser implies itself.

     *type* is looked up in ``tokenTypes``; a missing *attributes* argument
     yields a fresh empty dict for every call.
     """
     token = {
         "type": tokenTypes[type],
         "name": name,
         "data": {} if attributes is None else attributes,
         "selfClosing": selfClosing,
     }
     return token
 
 
 class ParseError(Exception):
     """Exception type for an error in the parsed document."""
 
 |