from __future__ import absolute_import, division, unicode_literals
 from pip._vendor.six import unichr as chr
 
 from collections import deque
 
 from .constants import spaceCharacters
 from .constants import entities
 from .constants import asciiLetters, asciiUpper2Lower
 from .constants import digits, hexDigits, EOF
 from .constants import tokenTypes, tagTokenTypes
 from .constants import replacementCharacters
 
 from ._inputstream import HTMLInputStream
 
 from ._trie import Trie
 
 entitiesTrie = Trie(entities)
 
 
 class HTMLTokenizer(object):
 """ This class takes care of tokenizing HTML.
 
 * self.currentToken
 Holds the token that is currently being processed.
 
* self.state
Holds a reference to the method implementing the current tokenizer
state.

* self.stream
Points to the HTMLInputStream object the tokenizer reads from.
 """
 
 def __init__(self, stream, parser=None, **kwargs):
 
 self.stream = HTMLInputStream(stream, **kwargs)
 self.parser = parser
 
 # Setup the initial tokenizer state
 self.escapeFlag = False
 self.lastFourChars = []
 self.state = self.dataState
 self.escape = False
 
 # The current token being created
 self.currentToken = None
 super(HTMLTokenizer, self).__init__()
 
 def __iter__(self):
 """ This is where the magic happens.
 
 We do our usually processing through the states and when we have a token
 to return we yield the token which pauses processing until the next token
 is requested.
 """
 self.tokenQueue = deque([])
 # Start processing. When EOF is reached self.state will return False
 # instead of True and the loop will terminate.
 while self.state():
 while self.stream.errors:
 yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
 while self.tokenQueue:
 yield self.tokenQueue.popleft()
 
 def consumeNumberEntity(self, isHex):
 """This function returns either U+FFFD or the character based on the
 decimal or hexadecimal representation. It also discards ";" if present.
 If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
 """
 
 allowed = digits
 radix = 10
 if isHex:
 allowed = hexDigits
 radix = 16
 
 charStack = []
 
 # Consume all the characters that are in range while making sure we
 # don't hit an EOF.
 c = self.stream.char()
 while c in allowed and c is not EOF:
 charStack.append(c)
 c = self.stream.char()
 
# Convert the sequence of characters consumed to an int.
 charAsInt = int("".join(charStack), radix)
 
 # Certain characters get replaced with others
 if charAsInt in replacementCharacters:
 char = replacementCharacters[charAsInt]
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "illegal-codepoint-for-numeric-entity",
 "datavars": {"charAsInt": charAsInt}})
 elif ((0xD800 <= charAsInt <= 0xDFFF) or
 (charAsInt > 0x10FFFF)):
 char = "\uFFFD"
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "illegal-codepoint-for-numeric-entity",
 "datavars": {"charAsInt": charAsInt}})
 else:
 # Should speed up this check somehow (e.g. move the set to a constant)
 if ((0x0001 <= charAsInt <= 0x0008) or
 (0x000E <= charAsInt <= 0x001F) or
 (0x007F <= charAsInt <= 0x009F) or
 (0xFDD0 <= charAsInt <= 0xFDEF) or
 charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
 0xFFFFF, 0x10FFFE, 0x10FFFF])):
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data":
 "illegal-codepoint-for-numeric-entity",
 "datavars": {"charAsInt": charAsInt}})
 try:
# Try/except needed as UCS-2 Python builds' unichr only works
# within the BMP.
 char = chr(charAsInt)
 except ValueError:
 v = charAsInt - 0x10000
 char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
 
# Discard the ";" if present. Otherwise, unget the character onto the
# stream and append a ParseError token to the queue.
 if c != ";":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "numeric-entity-without-semicolon"})
 self.stream.unget(c)
 
 return char
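
# Illustrative input/output pairs for this method, assuming the stream
# is positioned just after "&#" (decimal) or "&#x" (hex):
#
#     "65;" with isHex=False   -> "A"      (U+0041)
#     "48;" with isHex=True    -> "H"      (U+0048)
#     "0;"                     -> "\uFFFD" (via replacementCharacters)
#     "1F600;" with isHex=True -> U+1F600, built as a surrogate pair
#                                 on narrow (UCS-2) builds
#     "65" (no semicolon)      -> "A" plus a
#                                 numeric-entity-without-semicolon error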
 
 def consumeEntity(self, allowedChar=None, fromAttribute=False):
 # Initialise to the default output for when no entity is matched
 output = "&"
 
 charStack = [self.stream.char()]
 if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
 (allowedChar is not None and allowedChar == charStack[0])):
 self.stream.unget(charStack[0])
 
 elif charStack[0] == "#":
 # Read the next character to see if it's hex or decimal
 hex = False
 charStack.append(self.stream.char())
 if charStack[-1] in ("x", "X"):
 hex = True
 charStack.append(self.stream.char())
 
 # charStack[-1] should be the first digit
 if (hex and charStack[-1] in hexDigits) \
 or (not hex and charStack[-1] in digits):
 # At least one digit found, so consume the whole number
 self.stream.unget(charStack[-1])
 output = self.consumeNumberEntity(hex)
 else:
 # No digits found
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "expected-numeric-entity"})
 self.stream.unget(charStack.pop())
 output = "&" + "".join(charStack)
 
 else:
# At this point we might have a named entity. Entities are stored in
# the global variable "entities".
#
# Consume characters and compare them to a substring of the entity
# names in the list until the substring no longer matches.
 while (charStack[-1] is not EOF):
 if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
 break
 charStack.append(self.stream.char())
 
 # At this point we have a string that starts with some characters
 # that may match an entity
# Try to find the longest entity the string will match, to take care
# of &noti for instance.
 try:
 entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
 entityLength = len(entityName)
 except KeyError:
 entityName = None
 
 if entityName is not None:
 if entityName[-1] != ";":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "named-entity-without-semicolon"})
 if (entityName[-1] != ";" and fromAttribute and
 (charStack[entityLength] in asciiLetters or
 charStack[entityLength] in digits or
 charStack[entityLength] == "=")):
 self.stream.unget(charStack.pop())
 output = "&" + "".join(charStack)
 else:
 output = entities[entityName]
 self.stream.unget(charStack.pop())
 output += "".join(charStack[entityLength:])
 else:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-named-entity"})
 self.stream.unget(charStack.pop())
 output = "&" + "".join(charStack)
 
 if fromAttribute:
 self.currentToken["data"][-1][1] += output
 else:
 if output in spaceCharacters:
 tokenType = "SpaceCharacters"
 else:
 tokenType = "Characters"
 self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
 
 def processEntityInAttribute(self, allowedChar):
 """This method replaces the need for "entityInAttributeValueState".
 """
 self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
 
 def emitCurrentToken(self):
 """This method is a generic handler for emitting the tags. It also sets
 the state to "data" because that's what's needed after a token has been
 emitted.
 """
 token = self.currentToken
 # Add token to the queue to be yielded
 if (token["type"] in tagTokenTypes):
 token["name"] = token["name"].translate(asciiUpper2Lower)
 if token["type"] == tokenTypes["EndTag"]:
 if token["data"]:
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "attributes-in-end-tag"})
 if token["selfClosing"]:
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "self-closing-flag-on-end-tag"})
 self.tokenQueue.append(token)
 self.state = self.dataState
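
# For example, an end tag written as "</div class='x'>" triggers the
# attributes-in-end-tag parse error above, and "</div/>" triggers
# self-closing-flag-on-end-tag; both tokens are still emitted.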
 
# Below are the various tokenizer states, one method per state.
 def dataState(self):
 data = self.stream.char()
 if data == "&":
 self.state = self.entityDataState
 elif data == "<":
 self.state = self.tagOpenState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "\u0000"})
 elif data is EOF:
 # Tokenization ends.
 return False
 elif data in spaceCharacters:
 # Directly after emitting a token you switch back to the "data
 # state". At that point spaceCharacters are important so they are
 # emitted separately.
 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
 data + self.stream.charsUntil(spaceCharacters, True)})
 # No need to update lastFourChars here, since the first space will
 # have already been appended to lastFourChars and will have broken
 # any <!-- or --> sequences
 else:
 chars = self.stream.charsUntil(("&", "<", "\u0000"))
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
 data + chars})
 return True
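
# A small illustration of the branches above: tokenizing " a b" from
# the data state yields SpaceCharacters " " (leading whitespace is
# grouped), then Characters "a b" (the else branch runs until the next
# "&", "<" or NULL, so interior spaces stay in the Characters token).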
 
 def entityDataState(self):
 self.consumeEntity()
 self.state = self.dataState
 return True
 
 def rcdataState(self):
 data = self.stream.char()
 if data == "&":
 self.state = self.characterReferenceInRcdata
 elif data == "<":
 self.state = self.rcdataLessThanSignState
 elif data == EOF:
 # Tokenization ends.
 return False
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "\uFFFD"})
 elif data in spaceCharacters:
 # Directly after emitting a token you switch back to the "data
 # state". At that point spaceCharacters are important so they are
 # emitted separately.
 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
 data + self.stream.charsUntil(spaceCharacters, True)})
 # No need to update lastFourChars here, since the first space will
 # have already been appended to lastFourChars and will have broken
 # any <!-- or --> sequences
 else:
 chars = self.stream.charsUntil(("&", "<", "\u0000"))
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
 data + chars})
 return True
 
 def characterReferenceInRcdata(self):
 self.consumeEntity()
 self.state = self.rcdataState
 return True
 
 def rawtextState(self):
 data = self.stream.char()
 if data == "<":
 self.state = self.rawtextLessThanSignState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "\uFFFD"})
 elif data == EOF:
 # Tokenization ends.
 return False
 else:
 chars = self.stream.charsUntil(("<", "\u0000"))
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
 data + chars})
 return True
 
 def scriptDataState(self):
 data = self.stream.char()
 if data == "<":
 self.state = self.scriptDataLessThanSignState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "\uFFFD"})
 elif data == EOF:
 # Tokenization ends.
 return False
 else:
 chars = self.stream.charsUntil(("<", "\u0000"))
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
 data + chars})
 return True
 
 def plaintextState(self):
 data = self.stream.char()
 if data == EOF:
 # Tokenization ends.
 return False
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "\uFFFD"})
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
 data + self.stream.charsUntil("\u0000")})
 return True
 
 def tagOpenState(self):
 data = self.stream.char()
 if data == "!":
 self.state = self.markupDeclarationOpenState
 elif data == "/":
 self.state = self.closeTagOpenState
 elif data in asciiLetters:
 self.currentToken = {"type": tokenTypes["StartTag"],
 "name": data, "data": [],
 "selfClosing": False,
 "selfClosingAcknowledged": False}
 self.state = self.tagNameState
 elif data == ">":
 # XXX In theory it could be something besides a tag name. But
 # do we really care?
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-tag-name-but-got-right-bracket"})
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
 self.state = self.dataState
 elif data == "?":
 # XXX In theory it could be something besides a tag name. But
 # do we really care?
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-tag-name-but-got-question-mark"})
 self.stream.unget(data)
 self.state = self.bogusCommentState
 else:
 # XXX
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-tag-name"})
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
 self.stream.unget(data)
 self.state = self.dataState
 return True
 
 def closeTagOpenState(self):
 data = self.stream.char()
 if data in asciiLetters:
 self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
 "data": [], "selfClosing": False}
 self.state = self.tagNameState
 elif data == ">":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-closing-tag-but-got-right-bracket"})
 self.state = self.dataState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-closing-tag-but-got-eof"})
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
 self.state = self.dataState
 else:
 # XXX data can be _'_...
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-closing-tag-but-got-char",
 "datavars": {"data": data}})
 self.stream.unget(data)
 self.state = self.bogusCommentState
 return True
 
 def tagNameState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 self.state = self.beforeAttributeNameState
 elif data == ">":
 self.emitCurrentToken()
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-tag-name"})
 self.state = self.dataState
 elif data == "/":
 self.state = self.selfClosingStartTagState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["name"] += "\uFFFD"
 else:
 self.currentToken["name"] += data
 # (Don't use charsUntil here, because tag names are
 # very short and it's faster to not do anything fancy)
 return True
 
 def rcdataLessThanSignState(self):
 data = self.stream.char()
 if data == "/":
 self.temporaryBuffer = ""
 self.state = self.rcdataEndTagOpenState
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
 self.stream.unget(data)
 self.state = self.rcdataState
 return True
 
 def rcdataEndTagOpenState(self):
 data = self.stream.char()
 if data in asciiLetters:
 self.temporaryBuffer += data
 self.state = self.rcdataEndTagNameState
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
 self.stream.unget(data)
 self.state = self.rcdataState
 return True
 
 def rcdataEndTagNameState(self):
 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
 data = self.stream.char()
 if data in spaceCharacters and appropriate:
 self.currentToken = {"type": tokenTypes["EndTag"],
 "name": self.temporaryBuffer,
 "data": [], "selfClosing": False}
 self.state = self.beforeAttributeNameState
 elif data == "/" and appropriate:
 self.currentToken = {"type": tokenTypes["EndTag"],
 "name": self.temporaryBuffer,
 "data": [], "selfClosing": False}
 self.state = self.selfClosingStartTagState
 elif data == ">" and appropriate:
 self.currentToken = {"type": tokenTypes["EndTag"],
 "name": self.temporaryBuffer,
 "data": [], "selfClosing": False}
 self.emitCurrentToken()
 self.state = self.dataState
 elif data in asciiLetters:
 self.temporaryBuffer += data
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "</" + self.temporaryBuffer})
 self.stream.unget(data)
 self.state = self.rcdataState
 return True
 
 def rawtextLessThanSignState(self):
 data = self.stream.char()
 if data == "/":
 self.temporaryBuffer = ""
 self.state = self.rawtextEndTagOpenState
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
 self.stream.unget(data)
 self.state = self.rawtextState
 return True
 
 def rawtextEndTagOpenState(self):
 data = self.stream.char()
 if data in asciiLetters:
 self.temporaryBuffer += data
 self.state = self.rawtextEndTagNameState
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
 self.stream.unget(data)
 self.state = self.rawtextState
 return True
 
 def rawtextEndTagNameState(self):
 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
 data = self.stream.char()
 if data in spaceCharacters and appropriate:
 self.currentToken = {"type": tokenTypes["EndTag"],
 "name": self.temporaryBuffer,
 "data": [], "selfClosing": False}
 self.state = self.beforeAttributeNameState
 elif data == "/" and appropriate:
 self.currentToken = {"type": tokenTypes["EndTag"],
 "name": self.temporaryBuffer,
 "data": [], "selfClosing": False}
 self.state = self.selfClosingStartTagState
 elif data == ">" and appropriate:
 self.currentToken = {"type": tokenTypes["EndTag"],
 "name": self.temporaryBuffer,
 "data": [], "selfClosing": False}
 self.emitCurrentToken()
 self.state = self.dataState
 elif data in asciiLetters:
 self.temporaryBuffer += data
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "</" + self.temporaryBuffer})
 self.stream.unget(data)
 self.state = self.rawtextState
 return True
 
 def scriptDataLessThanSignState(self):
 data = self.stream.char()
 if data == "/":
 self.temporaryBuffer = ""
 self.state = self.scriptDataEndTagOpenState
 elif data == "!":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
 self.state = self.scriptDataEscapeStartState
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
 self.stream.unget(data)
 self.state = self.scriptDataState
 return True
 
 def scriptDataEndTagOpenState(self):
 data = self.stream.char()
 if data in asciiLetters:
 self.temporaryBuffer += data
 self.state = self.scriptDataEndTagNameState
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
 self.stream.unget(data)
 self.state = self.scriptDataState
 return True
 
 def scriptDataEndTagNameState(self):
 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
 data = self.stream.char()
 if data in spaceCharacters and appropriate:
 self.currentToken = {"type": tokenTypes["EndTag"],
 "name": self.temporaryBuffer,
 "data": [], "selfClosing": False}
 self.state = self.beforeAttributeNameState
 elif data == "/" and appropriate:
 self.currentToken = {"type": tokenTypes["EndTag"],
 "name": self.temporaryBuffer,
 "data": [], "selfClosing": False}
 self.state = self.selfClosingStartTagState
 elif data == ">" and appropriate:
 self.currentToken = {"type": tokenTypes["EndTag"],
 "name": self.temporaryBuffer,
 "data": [], "selfClosing": False}
 self.emitCurrentToken()
 self.state = self.dataState
 elif data in asciiLetters:
 self.temporaryBuffer += data
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "</" + self.temporaryBuffer})
 self.stream.unget(data)
 self.state = self.scriptDataState
 return True
 
 def scriptDataEscapeStartState(self):
 data = self.stream.char()
 if data == "-":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
 self.state = self.scriptDataEscapeStartDashState
 else:
 self.stream.unget(data)
 self.state = self.scriptDataState
 return True
 
 def scriptDataEscapeStartDashState(self):
 data = self.stream.char()
 if data == "-":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
 self.state = self.scriptDataEscapedDashDashState
 else:
 self.stream.unget(data)
 self.state = self.scriptDataState
 return True
 
 def scriptDataEscapedState(self):
 data = self.stream.char()
 if data == "-":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
 self.state = self.scriptDataEscapedDashState
 elif data == "<":
 self.state = self.scriptDataEscapedLessThanSignState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "\uFFFD"})
 elif data == EOF:
 self.state = self.dataState
 else:
 chars = self.stream.charsUntil(("<", "-", "\u0000"))
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
 data + chars})
 return True
 
 def scriptDataEscapedDashState(self):
 data = self.stream.char()
 if data == "-":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
 self.state = self.scriptDataEscapedDashDashState
 elif data == "<":
 self.state = self.scriptDataEscapedLessThanSignState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "\uFFFD"})
 self.state = self.scriptDataEscapedState
 elif data == EOF:
 self.state = self.dataState
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
 self.state = self.scriptDataEscapedState
 return True
 
 def scriptDataEscapedDashDashState(self):
 data = self.stream.char()
 if data == "-":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
 elif data == "<":
 self.state = self.scriptDataEscapedLessThanSignState
 elif data == ">":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
 self.state = self.scriptDataState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "\uFFFD"})
 self.state = self.scriptDataEscapedState
 elif data == EOF:
 self.state = self.dataState
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
 self.state = self.scriptDataEscapedState
 return True
 
 def scriptDataEscapedLessThanSignState(self):
 data = self.stream.char()
 if data == "/":
 self.temporaryBuffer = ""
 self.state = self.scriptDataEscapedEndTagOpenState
 elif data in asciiLetters:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
 self.temporaryBuffer = data
 self.state = self.scriptDataDoubleEscapeStartState
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
 self.stream.unget(data)
 self.state = self.scriptDataEscapedState
 return True
 
 def scriptDataEscapedEndTagOpenState(self):
 data = self.stream.char()
 if data in asciiLetters:
 self.temporaryBuffer = data
 self.state = self.scriptDataEscapedEndTagNameState
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
 self.stream.unget(data)
 self.state = self.scriptDataEscapedState
 return True
 
 def scriptDataEscapedEndTagNameState(self):
 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
 data = self.stream.char()
 if data in spaceCharacters and appropriate:
 self.currentToken = {"type": tokenTypes["EndTag"],
 "name": self.temporaryBuffer,
 "data": [], "selfClosing": False}
 self.state = self.beforeAttributeNameState
 elif data == "/" and appropriate:
 self.currentToken = {"type": tokenTypes["EndTag"],
 "name": self.temporaryBuffer,
 "data": [], "selfClosing": False}
 self.state = self.selfClosingStartTagState
 elif data == ">" and appropriate:
 self.currentToken = {"type": tokenTypes["EndTag"],
 "name": self.temporaryBuffer,
 "data": [], "selfClosing": False}
 self.emitCurrentToken()
 self.state = self.dataState
 elif data in asciiLetters:
 self.temporaryBuffer += data
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "</" + self.temporaryBuffer})
 self.stream.unget(data)
 self.state = self.scriptDataEscapedState
 return True
 
 def scriptDataDoubleEscapeStartState(self):
 data = self.stream.char()
 if data in (spaceCharacters | frozenset(("/", ">"))):
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
 if self.temporaryBuffer.lower() == "script":
 self.state = self.scriptDataDoubleEscapedState
 else:
 self.state = self.scriptDataEscapedState
 elif data in asciiLetters:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
 self.temporaryBuffer += data
 else:
 self.stream.unget(data)
 self.state = self.scriptDataEscapedState
 return True
 
 def scriptDataDoubleEscapedState(self):
 data = self.stream.char()
 if data == "-":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
 self.state = self.scriptDataDoubleEscapedDashState
 elif data == "<":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
 self.state = self.scriptDataDoubleEscapedLessThanSignState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "\uFFFD"})
 elif data == EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-script-in-script"})
 self.state = self.dataState
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
 return True
 
 def scriptDataDoubleEscapedDashState(self):
 data = self.stream.char()
 if data == "-":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
 self.state = self.scriptDataDoubleEscapedDashDashState
 elif data == "<":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
 self.state = self.scriptDataDoubleEscapedLessThanSignState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "\uFFFD"})
 self.state = self.scriptDataDoubleEscapedState
 elif data == EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-script-in-script"})
 self.state = self.dataState
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
 self.state = self.scriptDataDoubleEscapedState
 return True
 
 def scriptDataDoubleEscapedDashDashState(self):
 data = self.stream.char()
 if data == "-":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
 elif data == "<":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
 self.state = self.scriptDataDoubleEscapedLessThanSignState
 elif data == ">":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
 self.state = self.scriptDataState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": "\uFFFD"})
 self.state = self.scriptDataDoubleEscapedState
 elif data == EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-script-in-script"})
 self.state = self.dataState
 else:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
 self.state = self.scriptDataDoubleEscapedState
 return True
 
 def scriptDataDoubleEscapedLessThanSignState(self):
 data = self.stream.char()
 if data == "/":
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
 self.temporaryBuffer = ""
 self.state = self.scriptDataDoubleEscapeEndState
 else:
 self.stream.unget(data)
 self.state = self.scriptDataDoubleEscapedState
 return True
 
 def scriptDataDoubleEscapeEndState(self):
 data = self.stream.char()
 if data in (spaceCharacters | frozenset(("/", ">"))):
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
 if self.temporaryBuffer.lower() == "script":
 self.state = self.scriptDataEscapedState
 else:
 self.state = self.scriptDataDoubleEscapedState
 elif data in asciiLetters:
 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
 self.temporaryBuffer += data
 else:
 self.stream.unget(data)
 self.state = self.scriptDataDoubleEscapedState
 return True
 
 def beforeAttributeNameState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 self.stream.charsUntil(spaceCharacters, True)
 elif data in asciiLetters:
 self.currentToken["data"].append([data, ""])
 self.state = self.attributeNameState
 elif data == ">":
 self.emitCurrentToken()
 elif data == "/":
 self.state = self.selfClosingStartTagState
 elif data in ("'", '"', "=", "<"):
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "invalid-character-in-attribute-name"})
 self.currentToken["data"].append([data, ""])
 self.state = self.attributeNameState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["data"].append(["\uFFFD", ""])
 self.state = self.attributeNameState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-attribute-name-but-got-eof"})
 self.state = self.dataState
 else:
 self.currentToken["data"].append([data, ""])
 self.state = self.attributeNameState
 return True
 
 def attributeNameState(self):
 data = self.stream.char()
 leavingThisState = True
 emitToken = False
 if data == "=":
 self.state = self.beforeAttributeValueState
 elif data in asciiLetters:
 self.currentToken["data"][-1][0] += data +\
 self.stream.charsUntil(asciiLetters, True)
 leavingThisState = False
 elif data == ">":
 # XXX If we emit here the attributes are converted to a dict
 # without being checked and when the code below runs we error
 # because data is a dict not a list
 emitToken = True
 elif data in spaceCharacters:
 self.state = self.afterAttributeNameState
 elif data == "/":
 self.state = self.selfClosingStartTagState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["data"][-1][0] += "\uFFFD"
 leavingThisState = False
 elif data in ("'", '"', "<"):
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data":
 "invalid-character-in-attribute-name"})
 self.currentToken["data"][-1][0] += data
 leavingThisState = False
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "eof-in-attribute-name"})
 self.state = self.dataState
 else:
 self.currentToken["data"][-1][0] += data
 leavingThisState = False
 
 if leavingThisState:
 # Attributes are not dropped at this stage. That happens when the
 # start tag token is emitted so values can still be safely appended
 # to attributes, but we do want to report the parse error in time.
 self.currentToken["data"][-1][0] = (
 self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
 for name, _ in self.currentToken["data"][:-1]:
 if self.currentToken["data"][-1][0] == name:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "duplicate-attribute"})
 break
 # XXX Fix for above XXX
 if emitToken:
 self.emitCurrentToken()
 return True
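
# For example, '<a href="1" HREF="2">' reports duplicate-attribute
# here: on leaving the state the second name is lowercased to "href"
# and compared against the names already collected.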
 
 def afterAttributeNameState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 self.stream.charsUntil(spaceCharacters, True)
 elif data == "=":
 self.state = self.beforeAttributeValueState
 elif data == ">":
 self.emitCurrentToken()
 elif data in asciiLetters:
 self.currentToken["data"].append([data, ""])
 self.state = self.attributeNameState
 elif data == "/":
 self.state = self.selfClosingStartTagState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["data"].append(["\uFFFD", ""])
 self.state = self.attributeNameState
 elif data in ("'", '"', "<"):
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "invalid-character-after-attribute-name"})
 self.currentToken["data"].append([data, ""])
 self.state = self.attributeNameState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-end-of-tag-but-got-eof"})
 self.state = self.dataState
 else:
 self.currentToken["data"].append([data, ""])
 self.state = self.attributeNameState
 return True
 
 def beforeAttributeValueState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 self.stream.charsUntil(spaceCharacters, True)
 elif data == "\"":
 self.state = self.attributeValueDoubleQuotedState
 elif data == "&":
 self.state = self.attributeValueUnQuotedState
 self.stream.unget(data)
 elif data == "'":
 self.state = self.attributeValueSingleQuotedState
 elif data == ">":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-attribute-value-but-got-right-bracket"})
 self.emitCurrentToken()
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["data"][-1][1] += "\uFFFD"
 self.state = self.attributeValueUnQuotedState
 elif data in ("=", "<", "`"):
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "equals-in-unquoted-attribute-value"})
 self.currentToken["data"][-1][1] += data
 self.state = self.attributeValueUnQuotedState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-attribute-value-but-got-eof"})
 self.state = self.dataState
 else:
 self.currentToken["data"][-1][1] += data
 self.state = self.attributeValueUnQuotedState
 return True
 
 def attributeValueDoubleQuotedState(self):
 data = self.stream.char()
 if data == "\"":
 self.state = self.afterAttributeValueState
 elif data == "&":
 self.processEntityInAttribute('"')
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["data"][-1][1] += "\uFFFD"
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-attribute-value-double-quote"})
 self.state = self.dataState
 else:
 self.currentToken["data"][-1][1] += data +\
 self.stream.charsUntil(("\"", "&", "\u0000"))
 return True
 
 def attributeValueSingleQuotedState(self):
 data = self.stream.char()
 if data == "'":
 self.state = self.afterAttributeValueState
 elif data == "&":
 self.processEntityInAttribute("'")
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["data"][-1][1] += "\uFFFD"
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-attribute-value-single-quote"})
 self.state = self.dataState
 else:
 self.currentToken["data"][-1][1] += data +\
 self.stream.charsUntil(("'", "&", "\u0000"))
 return True
 
 def attributeValueUnQuotedState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 self.state = self.beforeAttributeNameState
 elif data == "&":
 self.processEntityInAttribute(">")
 elif data == ">":
 self.emitCurrentToken()
 elif data in ('"', "'", "=", "<", "`"):
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-character-in-unquoted-attribute-value"})
 self.currentToken["data"][-1][1] += data
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["data"][-1][1] += "\uFFFD"
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-attribute-value-no-quotes"})
 self.state = self.dataState
 else:
 self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
 frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
 return True
 
 def afterAttributeValueState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 self.state = self.beforeAttributeNameState
 elif data == ">":
 self.emitCurrentToken()
 elif data == "/":
 self.state = self.selfClosingStartTagState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-EOF-after-attribute-value"})
 self.stream.unget(data)
 self.state = self.dataState
 else:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-character-after-attribute-value"})
 self.stream.unget(data)
 self.state = self.beforeAttributeNameState
 return True
 
 def selfClosingStartTagState(self):
 data = self.stream.char()
 if data == ">":
 self.currentToken["selfClosing"] = True
 self.emitCurrentToken()
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data":
 "unexpected-EOF-after-solidus-in-tag"})
 self.stream.unget(data)
 self.state = self.dataState
 else:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-character-after-solidus-in-tag"})
 self.stream.unget(data)
 self.state = self.beforeAttributeNameState
 return True
 
 def bogusCommentState(self):
 # Make a new comment token and give it as value all the characters
 # until the first > or EOF (charsUntil checks for EOF automatically)
 # and emit it.
 data = self.stream.charsUntil(">")
 data = data.replace("\u0000", "\uFFFD")
 self.tokenQueue.append(
 {"type": tokenTypes["Comment"], "data": data})
 
 # Eat the character directly after the bogus comment which is either a
 # ">" or an EOF.
 self.stream.char()
 self.state = self.dataState
 return True
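
# For example, "<?xml version='1.0'?>" arrives here via the "?" branch
# of tagOpenState and is emitted as a Comment token whose data is
# "?xml version='1.0'?" (everything up to, but not including, the
# first ">").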
 
 def markupDeclarationOpenState(self):
 charStack = [self.stream.char()]
 if charStack[-1] == "-":
 charStack.append(self.stream.char())
 if charStack[-1] == "-":
 self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
 self.state = self.commentStartState
 return True
 elif charStack[-1] in ('d', 'D'):
 matched = True
 for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
 ('y', 'Y'), ('p', 'P'), ('e', 'E')):
 charStack.append(self.stream.char())
 if charStack[-1] not in expected:
 matched = False
 break
 if matched:
 self.currentToken = {"type": tokenTypes["Doctype"],
 "name": "",
 "publicId": None, "systemId": None,
 "correct": True}
 self.state = self.doctypeState
 return True
 elif (charStack[-1] == "[" and
 self.parser is not None and
 self.parser.tree.openElements and
 self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
 matched = True
 for expected in ["C", "D", "A", "T", "A", "["]:
 charStack.append(self.stream.char())
 if charStack[-1] != expected:
 matched = False
 break
 if matched:
 self.state = self.cdataSectionState
 return True
 
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-dashes-or-doctype"})
 
 while charStack:
 self.stream.unget(charStack.pop())
 self.state = self.bogusCommentState
 return True
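
# Dispatch summary for "<!": "<!--" starts a comment, a
# case-insensitive "<!doctype" starts a DOCTYPE, and "<![CDATA[" is
# honoured only inside foreign (non-HTML-namespace) content; anything
# else reports expected-dashes-or-doctype and falls back to
# bogusCommentState with the characters replayed.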
 
 def commentStartState(self):
 data = self.stream.char()
 if data == "-":
 self.state = self.commentStartDashState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["data"] += "\uFFFD"
 elif data == ">":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "incorrect-comment"})
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-comment"})
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.currentToken["data"] += data
 self.state = self.commentState
 return True
 
 def commentStartDashState(self):
 data = self.stream.char()
 if data == "-":
 self.state = self.commentEndState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["data"] += "-\uFFFD"
 elif data == ">":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "incorrect-comment"})
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-comment"})
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.currentToken["data"] += "-" + data
 self.state = self.commentState
 return True
 
 def commentState(self):
 data = self.stream.char()
 if data == "-":
 self.state = self.commentEndDashState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["data"] += "\uFFFD"
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "eof-in-comment"})
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.currentToken["data"] += data + \
 self.stream.charsUntil(("-", "\u0000"))
 return True
 
 def commentEndDashState(self):
 data = self.stream.char()
 if data == "-":
 self.state = self.commentEndState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["data"] += "-\uFFFD"
 self.state = self.commentState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-comment-end-dash"})
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.currentToken["data"] += "-" + data
 self.state = self.commentState
 return True
 
 def commentEndState(self):
 data = self.stream.char()
 if data == ">":
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["data"] += "--\uFFFD"
 self.state = self.commentState
 elif data == "!":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-bang-after-double-dash-in-comment"})
 self.state = self.commentEndBangState
 elif data == "-":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-dash-after-double-dash-in-comment"})
 self.currentToken["data"] += data
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-comment-double-dash"})
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 # XXX
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-char-in-comment"})
 self.currentToken["data"] += "--" + data
 self.state = self.commentState
 return True
 
 def commentEndBangState(self):
 data = self.stream.char()
 if data == ">":
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data == "-":
 self.currentToken["data"] += "--!"
 self.state = self.commentEndDashState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["data"] += "--!\uFFFD"
 self.state = self.commentState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-comment-end-bang-state"})
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.currentToken["data"] += "--!" + data
 self.state = self.commentState
 return True
 
 def doctypeState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 self.state = self.beforeDoctypeNameState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-doctype-name-but-got-eof"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "need-space-after-doctype"})
 self.stream.unget(data)
 self.state = self.beforeDoctypeNameState
 return True
 
 def beforeDoctypeNameState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 pass
 elif data == ">":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-doctype-name-but-got-right-bracket"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["name"] = "\uFFFD"
 self.state = self.doctypeNameState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-doctype-name-but-got-eof"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.currentToken["name"] = data
 self.state = self.doctypeNameState
 return True
 
 def doctypeNameState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
 self.state = self.afterDoctypeNameState
 elif data == ">":
 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["name"] += "\uFFFD"
 self.state = self.doctypeNameState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-doctype-name"})
 self.currentToken["correct"] = False
 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.currentToken["name"] += data
 return True
 
 def afterDoctypeNameState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 pass
 elif data == ">":
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data is EOF:
 self.currentToken["correct"] = False
 self.stream.unget(data)
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-doctype"})
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 if data in ("p", "P"):
 matched = True
 for expected in (("u", "U"), ("b", "B"), ("l", "L"),
 ("i", "I"), ("c", "C")):
 data = self.stream.char()
 if data not in expected:
 matched = False
 break
 if matched:
 self.state = self.afterDoctypePublicKeywordState
 return True
 elif data in ("s", "S"):
 matched = True
 for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
 ("e", "E"), ("m", "M")):
 data = self.stream.char()
 if data not in expected:
 matched = False
 break
 if matched:
 self.state = self.afterDoctypeSystemKeywordState
 return True
 
 # All the characters read before the current 'data' will be
 # [a-zA-Z], so they're garbage in the bogus doctype and can be
 # discarded; only the latest character might be '>' or EOF
 # and needs to be ungetted
 self.stream.unget(data)
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "expected-space-or-right-bracket-in-doctype", "datavars":
 {"data": data}})
 self.currentToken["correct"] = False
 self.state = self.bogusDoctypeState
 
 return True
 
 def afterDoctypePublicKeywordState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 self.state = self.beforeDoctypePublicIdentifierState
 elif data in ("'", '"'):
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-char-in-doctype"})
 self.stream.unget(data)
 self.state = self.beforeDoctypePublicIdentifierState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.stream.unget(data)
 self.state = self.beforeDoctypePublicIdentifierState
 return True
 
 def beforeDoctypePublicIdentifierState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 pass
 elif data == "\"":
 self.currentToken["publicId"] = ""
 self.state = self.doctypePublicIdentifierDoubleQuotedState
 elif data == "'":
 self.currentToken["publicId"] = ""
 self.state = self.doctypePublicIdentifierSingleQuotedState
 elif data == ">":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-end-of-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-char-in-doctype"})
 self.currentToken["correct"] = False
 self.state = self.bogusDoctypeState
 return True
 
 def doctypePublicIdentifierDoubleQuotedState(self):
 data = self.stream.char()
 if data == "\"":
 self.state = self.afterDoctypePublicIdentifierState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["publicId"] += "\uFFFD"
 elif data == ">":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-end-of-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.currentToken["publicId"] += data
 return True
 
 def doctypePublicIdentifierSingleQuotedState(self):
 data = self.stream.char()
 if data == "'":
 self.state = self.afterDoctypePublicIdentifierState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["publicId"] += "\uFFFD"
 elif data == ">":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-end-of-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.currentToken["publicId"] += data
 return True
 
 def afterDoctypePublicIdentifierState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 self.state = self.betweenDoctypePublicAndSystemIdentifiersState
 elif data == ">":
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data == '"':
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-char-in-doctype"})
 self.currentToken["systemId"] = ""
 self.state = self.doctypeSystemIdentifierDoubleQuotedState
 elif data == "'":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-char-in-doctype"})
 self.currentToken["systemId"] = ""
 self.state = self.doctypeSystemIdentifierSingleQuotedState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-char-in-doctype"})
 self.currentToken["correct"] = False
 self.state = self.bogusDoctypeState
 return True
 
 def betweenDoctypePublicAndSystemIdentifiersState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 pass
 elif data == ">":
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data == '"':
 self.currentToken["systemId"] = ""
 self.state = self.doctypeSystemIdentifierDoubleQuotedState
 elif data == "'":
 self.currentToken["systemId"] = ""
 self.state = self.doctypeSystemIdentifierSingleQuotedState
 elif data == EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-char-in-doctype"})
 self.currentToken["correct"] = False
 self.state = self.bogusDoctypeState
 return True
 
 def afterDoctypeSystemKeywordState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 self.state = self.beforeDoctypeSystemIdentifierState
 elif data in ("'", '"'):
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-char-in-doctype"})
 self.stream.unget(data)
 self.state = self.beforeDoctypeSystemIdentifierState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.stream.unget(data)
 self.state = self.beforeDoctypeSystemIdentifierState
 return True
 
 def beforeDoctypeSystemIdentifierState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 pass
 elif data == "\"":
 self.currentToken["systemId"] = ""
 self.state = self.doctypeSystemIdentifierDoubleQuotedState
 elif data == "'":
 self.currentToken["systemId"] = ""
 self.state = self.doctypeSystemIdentifierSingleQuotedState
 elif data == ">":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-char-in-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-char-in-doctype"})
 self.currentToken["correct"] = False
 self.state = self.bogusDoctypeState
 return True
 
 def doctypeSystemIdentifierDoubleQuotedState(self):
 data = self.stream.char()
 if data == "\"":
 self.state = self.afterDoctypeSystemIdentifierState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["systemId"] += "\uFFFD"
 elif data == ">":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-end-of-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.currentToken["systemId"] += data
 return True
 
 def doctypeSystemIdentifierSingleQuotedState(self):
 data = self.stream.char()
 if data == "'":
 self.state = self.afterDoctypeSystemIdentifierState
 elif data == "\u0000":
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 self.currentToken["systemId"] += "\uFFFD"
 elif data == ">":
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-end-of-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.currentToken["systemId"] += data
 return True
 
 def afterDoctypeSystemIdentifierState(self):
 data = self.stream.char()
 if data in spaceCharacters:
 pass
 elif data == ">":
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data is EOF:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "eof-in-doctype"})
 self.currentToken["correct"] = False
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
 "unexpected-char-in-doctype"})
 self.state = self.bogusDoctypeState
 return True
 
 def bogusDoctypeState(self):
 data = self.stream.char()
 if data == ">":
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 elif data is EOF:
 # XXX EMIT
 self.stream.unget(data)
 self.tokenQueue.append(self.currentToken)
 self.state = self.dataState
 else:
 pass
 return True
 
 def cdataSectionState(self):
 data = []
 while True:
 data.append(self.stream.charsUntil("]"))
 data.append(self.stream.charsUntil(">"))
 char = self.stream.char()
 if char == EOF:
 break
 else:
 assert char == ">"
 if data[-1][-2:] == "]]":
 data[-1] = data[-1][:-2]
 break
 else:
 data.append(char)
 
 data = "".join(data)  # pylint:disable=redefined-variable-type
 # Deal with null here rather than in the parser
 nullCount = data.count("\u0000")
 if nullCount > 0:
 for _ in range(nullCount):
 self.tokenQueue.append({"type": tokenTypes["ParseError"],
 "data": "invalid-codepoint"})
 data = data.replace("\u0000", "\uFFFD")
 if data:
 self.tokenQueue.append({"type": tokenTypes["Characters"],
 "data": data})
 self.state = self.dataState
 return True
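
# For example, inside foreign content "<![CDATA[x < y]]>" reaches this
# state and is emitted as Characters "x < y"; any U+0000 inside the
# section is reported as invalid-codepoint and replaced with U+FFFD.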
 