// // HTMLTokenizer.m // HTMLKit // // Created by Iska on 19/09/14. // Copyright (c) 2014 BrainCookie. All rights reserved. // #import "HTMLTokenizer.h" #import "HTMLInputStreamReader.h" #import "HTMLTokens.h" #import "HTMLParser.h" #import "HTMLTokenizerStates.h" #import "HTMLTokenizerCharacters.h" #import "HTMLTokenizerEntities.h" #import "HTMLParser+Private.h" @interface HTMLTokenizer () { HTMLTokenizerState _currentState; /* Input Stream & Tokens Queue */ HTMLInputStreamReader *_inputStreamReader; NSMutableArray *_tokens; /* Character Reference */ HTMLTokenizerState _characterReferenceReturnState; unsigned long long _characterReferenceCode; BOOL _characterReferenceOverflow; /* Pending Tokens & Attributes*/ HTMLTagToken *_currentTagToken; HTMLCharacterToken *_currentCharacterToken; HTMLCommentToken *_currentCommentToken; HTMLDOCTYPEToken *_currentDoctypeToken; NSMutableString *_currentAttributeName; NSMutableString *_currentAttributeValue; BOOL _selfClosingFlagAknowledged; /* Aux */ NSString *_lastStartTagName; NSMutableString *_temporaryBuffer; BOOL _eof; } @end @implementation HTMLTokenizer @synthesize state = _currentState; @synthesize parseErrorCallback = _parseErrorCallback; #pragma mark - Lifecycle - (instancetype)initWithString:(NSString *)string { self = [super init]; if (self) { _currentState = HTMLTokenizerStateData; _characterReferenceReturnState = _currentState; _tokens = [NSMutableArray new]; _inputStreamReader = [[HTMLInputStreamReader alloc] initWithString:string]; __weak HTMLTokenizer *weakSelf = self; _inputStreamReader.errorCallback = ^ (NSString *code, NSString *details) { [weakSelf emitParseError:code details:@"%@", details]; }; } return self; } #pragma mark - Accessor - (NSString *)string { return _inputStreamReader.string; } #pragma mark - State Machine - (id)nextObject { @autoreleasepool { while (_eof == NO && _tokens.count == 0) { [self read]; } HTMLToken *nextToken = [_tokens firstObject]; if (_tokens.count > 0) { [_tokens removeObjectAtIndex:0]; } return nextToken; } } - (void)read { switch (_currentState) { case HTMLTokenizerStateData: return [self HTMLTokenizerStateData]; case HTMLTokenizerStateRCDATA: return [self HTMLTokenizerStateRCDATA]; case HTMLTokenizerStateRAWTEXT: return [self HTMLTokenizerStateRAWTEXT]; case HTMLTokenizerStateScriptData: return [self HTMLTokenizerStateScriptData]; case HTMLTokenizerStatePLAINTEXT: return [self HTMLTokenizerStatePLAINTEXT]; case HTMLTokenizerStateTagOpen: return [self HTMLTokenizerStateTagOpen]; case HTMLTokenizerStateEndTagOpen: return [self HTMLTokenizerStateEndTagOpen]; case HTMLTokenizerStateTagName: return [self HTMLTokenizerStateTagName]; case HTMLTokenizerStateRCDATALessThanSign: return [self HTMLTokenizerStateRCDATALessThanSign]; case HTMLTokenizerStateRCDATAEndTagOpen: return [self HTMLTokenizerStateRCDATAEndTagOpen]; case HTMLTokenizerStateRCDATAEndTagName: return [self HTMLTokenizerStateRCDATAEndTagName]; case HTMLTokenizerStateRAWTEXTLessThanSign: return [self HTMLTokenizerStateRAWTEXTLessThanSign]; case HTMLTokenizerStateRAWTEXTEndTagOpen: return [self HTMLTokenizerStateRAWTEXTEndTagOpen]; case HTMLTokenizerStateRAWTEXTEndTagName: return [self HTMLTokenizerStateRAWTEXTEndTagName]; case HTMLTokenizerStateScriptDataLessThanSign: return [self HTMLTokenizerStateScriptDataLessThanSign]; case HTMLTokenizerStateScriptDataEndTagOpen: return [self HTMLTokenizerStateScriptDataEndTagOpen]; case HTMLTokenizerStateScriptDataEndTagName: return [self HTMLTokenizerStateScriptDataEndTagName]; case HTMLTokenizerStateScriptDataEscapeStart: return [self HTMLTokenizerStateScriptDataEscapeStart]; case HTMLTokenizerStateScriptDataEscapeStartDash: return [self HTMLTokenizerStateScriptDataEscapeStartDash]; case HTMLTokenizerStateScriptDataEscaped: return [self HTMLTokenizerStateScriptDataEscaped]; case HTMLTokenizerStateScriptDataEscapedDash: return [self HTMLTokenizerStateScriptDataEscapedDash]; case HTMLTokenizerStateScriptDataEscapedDashDash: return [self HTMLTokenizerStateScriptDataEscapedDashDash]; case HTMLTokenizerStateScriptDataEscapedLessThanSign: return [self HTMLTokenizerStateScriptDataEscapedLessThanSign]; case HTMLTokenizerStateScriptDataEscapedEndTagOpen: return [self HTMLTokenizerStateScriptDataEscapedEndTagOpen]; case HTMLTokenizerStateScriptDataEscapedEndTagName: return [self HTMLTokenizerStateScriptDataEscapedEndTagName]; case HTMLTokenizerStateScriptDataDoubleEscapeStart: return [self HTMLTokenizerStateScriptDataDoubleEscapeStart]; case HTMLTokenizerStateScriptDataDoubleEscaped: return [self HTMLTokenizerStateScriptDataDoubleEscaped]; case HTMLTokenizerStateScriptDataDoubleEscapedDash: return [self HTMLTokenizerStateScriptDataDoubleEscapedDash]; case HTMLTokenizerStateScriptDataDoubleEscapedDashDash: return [self HTMLTokenizerStateScriptDataDoubleEscapedDashDash]; case HTMLTokenizerStateScriptDataDoubleEscapedLessThanSign: return [self HTMLTokenizerStateScriptDataDoubleEscapedLessThanSign]; case HTMLTokenizerStateScriptDataDoubleEscapeEnd: return [self HTMLTokenizerStateScriptDataDoubleEscapeEnd]; case HTMLTokenizerStateBeforeAttributeName: return [self HTMLTokenizerStateBeforeAttributeName]; case HTMLTokenizerStateAttributeName: return [self HTMLTokenizerStateAttributeName]; case HTMLTokenizerStateAfterAttributeName: return [self HTMLTokenizerStateAfterAttributeName]; case HTMLTokenizerStateBeforeAttributeValue: return [self HTMLTokenizerStateBeforeAttributeValue]; case HTMLTokenizerStateAttributeValueDoubleQuoted: return [self HTMLTokenizerStateAttributeValueDoubleQuoted]; case HTMLTokenizerStateAttributeValueSingleQuoted: return [self HTMLTokenizerStateAttributeValueSingleQuoted]; case HTMLTokenizerStateAttributeValueUnquoted: return [self HTMLTokenizerStateAttributeValueUnquoted]; case HTMLTokenizerStateAfterAttributeValueQuoted: return [self HTMLTokenizerStateAfterAttributeValueQuoted]; case HTMLTokenizerStateSelfClosingStartTag: return [self HTMLTokenizerStateSelfClosingStartTag]; case HTMLTokenizerStateBogusComment: return [self HTMLTokenizerStateBogusComment]; case HTMLTokenizerStateMarkupDeclarationOpen: return [self HTMLTokenizerStateMarkupDeclarationOpen]; case HTMLTokenizerStateCommentStart: return [self HTMLTokenizerStateCommentStart]; case HTMLTokenizerStateCommentStartDash: return [self HTMLTokenizerStateCommentStartDash]; case HTMLTokenizerStateComment: return [self HTMLTokenizerStateComment]; case HTMLTokenizerStateCommentLessThanSign: return [self HTMLTokenizerStateCommentLessThanSign]; case HTMLTokenizerStateCommentLessThanSignBang: return [self HTMLTokenizerStateCommentLessThanSignBang]; case HTMLTokenizerStateCommentLessThanSignBangDash: return [self HTMLTokenizerStateCommentLessThanSignBangDash]; case HTMLTokenizerStateCommentLessThanSignBangDashDash: return [self HTMLTokenizerStateCommentLessThanSignBangDashDash]; case HTMLTokenizerStateCommentEndDash: return [self HTMLTokenizerStateCommentEndDash]; case HTMLTokenizerStateCommentEnd: return [self HTMLTokenizerStateCommentEnd]; case HTMLTokenizerStateCommentEndBang: return [self HTMLTokenizerStateCommentEndBang]; case HTMLTokenizerStateDOCTYPE: return [self HTMLTokenizerStateDOCTYPE]; case HTMLTokenizerStateBeforeDOCTYPEName: return [self HTMLTokenizerStateBeforeDOCTYPEName]; case HTMLTokenizerStateDOCTYPEName: return [self HTMLTokenizerStateDOCTYPEName]; case HTMLTokenizerStateAfterDOCTYPEName: return [self HTMLTokenizerStateAfterDOCTYPEName]; case HTMLTokenizerStateAfterDOCTYPEPublicKeyword: return [self HTMLTokenizerStateAfterDOCTYPEPublicKeyword]; case HTMLTokenizerStateBeforeDOCTYPEPublicIdentifier: return [self HTMLTokenizerStateBeforeDOCTYPEPublicIdentifier]; case HTMLTokenizerStateDOCTYPEPublicIdentifierDoubleQuoted: return [self HTMLTokenizerStateDOCTYPEPublicIdentifierDoubleQuoted]; case HTMLTokenizerStateDOCTYPEPublicIdentifierSingleQuoted: return [self HTMLTokenizerStateDOCTYPEPublicIdentifierSingleQuoted]; case HTMLTokenizerStateAfterDOCTYPEPublicIdentifier: return [self HTMLTokenizerStateAfterDOCTYPEPublicIdentifier]; case HTMLTokenizerStateBetweenDOCTYPEPublicAndSystemIdentifiers: return [self HTMLTokenizerStateBetweenDOCTYPEPublicAndSystemIdentifiers]; case HTMLTokenizerStateAfterDOCTYPESystemKeyword: return [self HTMLTokenizerStateAfterDOCTYPESystemKeyword]; case HTMLTokenizerStateBeforeDOCTYPESystemIdentifier: return [self HTMLTokenizerStateBeforeDOCTYPESystemIdentifier]; case HTMLTokenizerStateDOCTYPESystemIdentifierDoubleQuoted: return [self HTMLTokenizerStateDOCTYPESystemIdentifierDoubleQuoted]; case HTMLTokenizerStateDOCTYPESystemIdentifierSingleQuoted: return [self HTMLTokenizerStateDOCTYPESystemIdentifierSingleQuoted]; case HTMLTokenizerStateAfterDOCTYPESystemIdentifier: return [self HTMLTokenizerStateAfterDOCTYPESystemIdentifier]; case HTMLTokenizerStateBogusDOCTYPE: return [self HTMLTokenizerStateBogusDOCTYPE]; case HTMLTokenizerStateCDATASection: return [self HTMLTokenizerStateCDATASection]; case HTMLTokenizerStateCDATASectionBracket: return [self HTMLTokenizerStateCDATASectionBracket]; case HTMLTokenizerStateCDATASectionEnd: return [self HTMLTokenizerStateCDATASectionEnd]; case HTMLTokenizerStateCharacterReference: return [self HTMLTokenizerStateCharacterReference]; case HTMLTokenizerStateNamedCharacterReference: return [self HTMLTokenizerStateNamedCharacterReference]; case HTMLTokenizerStateAmbiguousAmpersand: return [self HTMLTokenizerStateAmbiguousAmpersand]; case HTMLTokenizerStateNumericCharacterReference: return [self HTMLTokenizerStateNumericCharacterReference]; case HTMLTokenizerStateHexadecimalCharacterReferenceStart: return [self HTMLTokenizerStateHexadecimalCharacterReferenceStart]; case HTMLTokenizerStateDecimalCharacterReferenceStart: return [self HTMLTokenizerStateDecimalCharacterReferenceStart]; case HTMLTokenizerStateHexadecimalCharacterReference: return [self HTMLTokenizerStateHexadecimalCharacterReference]; case HTMLTokenizerStateDecimalCharacterReference: return [self HTMLTokenizerStateDecimalCharacterReference]; case HTMLTokenizerStateNumericCharacterReferenceEnd: return [self HTMLTokenizerStateNumericCharacterReferenceEnd]; default: break; } } - (void)switchToState:(HTMLTokenizerState)state { _currentState = state; } #pragma mark - Emits - (void)emitToken:(HTMLToken *)token { if (_currentCharacterToken != nil) { [_tokens addObject:_currentCharacterToken]; _currentCharacterToken = nil; } [_tokens addObject:token]; } - (void)emitEOFToken { [self emitToken:[HTMLEOFToken token]]; _eof = YES; } - (void)emitCurrentTagToken { [self finalizeCurrentAttribute]; switch (_currentTagToken.type) { case HTMLTokenTypeStartTag: _lastStartTagName = _currentTagToken.tagName; if (_currentTagToken.isSelfClosing) { _selfClosingFlagAknowledged = NO; } break; case HTMLTokenTypeEndTag: if (_currentTagToken.attributes != nil) { [self emitParseError:@"end-tag-with-attributes" details:@"End tag [%@]", _currentTagToken.tagName]; } if (_currentTagToken.isSelfClosing) { [self emitParseError:@"end-tag-with-trailing-solidus" details:@"End tag [%@]", _currentTagToken.tagName]; } break; default: break; } [self emitToken:_currentTagToken]; _currentTagToken = nil; } - (void)emitCharacterToken:(UTF32Char)character { [self emitCharacterTokenWithString:StringFromUTF32Char(character)]; } - (void)emitCharacterTokenWithString:(NSString *)string { if (string.length == 0) { return; } if (_currentCharacterToken == nil) { _currentCharacterToken = [HTMLCharacterToken new]; } [_currentCharacterToken appendString:string]; } - (void)emitParseError:(NSString *)code details:(NSString *)format, ... NS_FORMAT_FUNCTION(2, 3) { if (!self.parseErrorCallback) { return; } NSString *details = nil; if (format) { va_list args; va_start(args, format); details = [[NSString alloc] initWithFormat:format arguments:args]; va_end(args); } HTMLParseErrorToken *token = [[HTMLParseErrorToken alloc] initWithCode:code details:details location:_inputStreamReader.currentLocation]; self.parseErrorCallback(token); } #pragma mark - Token Checks - (BOOL)isCurrentEndTagTokenAppropriate { return ([_currentTagToken isKindOfClass:[HTMLEndTagToken class]] && [_currentTagToken.tagName isEqualToString:_lastStartTagName]); } #pragma mark - Attributes - (void)appendToCurrentAttributeName:(NSString *)string { if (_currentAttributeName == nil) { _currentAttributeName = [NSMutableString new]; } [_currentAttributeName appendString:string]; } - (void)appendToCurrentAttributeValue:(NSString *)string { if (_currentAttributeValue == nil) { _currentAttributeValue = [NSMutableString new]; } [_currentAttributeValue appendString:string]; } - (void)finalizeCurrentAttribute { if (_currentAttributeName != nil) { if (_currentTagToken.attributes == nil) { _currentTagToken.attributes = [HTMLOrderedDictionary new]; } if (_currentTagToken.attributes[_currentAttributeName] != nil) { [self emitParseError:@"duplicate-attribute" details:@"Tag [%@] already contains an attrbitue with name [%@]", _currentTagToken, _currentAttributeName]; } else { _currentTagToken.attributes[_currentAttributeName] = _currentAttributeValue ?: @""; } } _currentAttributeName = nil; _currentAttributeValue = nil; } #pragma mark - Character Reference - (void)flushCodePointsConsumedAsCharacterReference { if (_characterReferenceReturnState == HTMLTokenizerStateAttributeValueUnquoted || _characterReferenceReturnState == HTMLTokenizerStateAttributeValueSingleQuoted || _characterReferenceReturnState == HTMLTokenizerStateAttributeValueDoubleQuoted) { [self appendToCurrentAttributeValue:_temporaryBuffer]; } else { [self emitCharacterTokenWithString:_temporaryBuffer]; } } #pragma mark - States - (void)HTMLTokenizerStateData { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case AMPERSAND: _characterReferenceReturnState = HTMLTokenizerStateData; [self switchToState:HTMLTokenizerStateCharacterReference]; break; case LESS_THAN_SIGN: [self switchToState:HTMLTokenizerStateTagOpen]; break; case NULL_CHAR: [self emitParseError:@"unexpected-null-character" details:nil]; [self emitCharacterToken:character]; break; case EOF: [self emitEOFToken]; break; default: [self emitCharacterToken:character]; break; } } - (void)HTMLTokenizerStateRCDATA { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case AMPERSAND: _characterReferenceReturnState = HTMLTokenizerStateRCDATA; [self switchToState:HTMLTokenizerStateCharacterReference]; break; case LESS_THAN_SIGN: [self switchToState:HTMLTokenizerStateRCDATALessThanSign]; break; case NULL_CHAR: [self emitParseError:@"unexpected-null-character" details:nil]; [self emitCharacterToken:REPLACEMENT_CHAR]; break; case EOF: [self emitEOFToken]; break; default: [self emitCharacterToken:character]; break; } } - (void)HTMLTokenizerStateRAWTEXT { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case LESS_THAN_SIGN: [self switchToState:HTMLTokenizerStateRAWTEXTLessThanSign]; break; case NULL_CHAR: [self emitParseError:@"unexpected-null-character" details:nil]; [self emitCharacterToken:REPLACEMENT_CHAR]; break; case EOF: [self emitEOFToken]; break; default: [self emitCharacterToken:character]; break; } } - (void)HTMLTokenizerStateScriptData { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case LESS_THAN_SIGN: [self switchToState:HTMLTokenizerStateScriptDataLessThanSign]; break; case NULL_CHAR: [self emitParseError:@"unexpected-null-character" details:nil]; [self emitCharacterToken:REPLACEMENT_CHAR]; break; case EOF: [self emitEOFToken]; break; default: [self emitCharacterToken:character]; break; } } - (void)HTMLTokenizerStatePLAINTEXT { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case NULL_CHAR: [self emitParseError:@"unexpected-null-character" details:nil]; [self emitCharacterToken:REPLACEMENT_CHAR]; break; case EOF: [self emitEOFToken]; break; default: [self emitCharacterToken:character]; break; } } - (void)HTMLTokenizerStateTagOpen { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case EXCLAMATION_MARK: [self switchToState:HTMLTokenizerStateMarkupDeclarationOpen]; break; case SOLIDUS: [self switchToState:HTMLTokenizerStateEndTagOpen]; break; case LATIN_CAPITAL_LETTER_A ... LATIN_CAPITAL_LETTER_Z: _currentTagToken = [[HTMLStartTagToken alloc] initWithTagName:StringFromUniChar(character + 0x0020)]; [self switchToState:HTMLTokenizerStateTagName]; break; case LATIN_SMALL_LETTER_A ... LATIN_SMALL_LETTER_Z: _currentTagToken = [[HTMLStartTagToken alloc] initWithTagName:StringFromUniChar(character)]; [self switchToState:HTMLTokenizerStateTagName]; break; case QUESTION_MARK: [self emitParseError:@"unexpected-question-mark-instead-of-tag-name" details:@"Unexpected (0x003F, ?) instead of tag name"]; _currentCommentToken = [[HTMLCommentToken alloc] initWithData:@""]; [self switchToState:HTMLTokenizerStateBogusComment]; [_inputStreamReader reconsumeCurrentInputCharacter]; break; case EOF: [self emitParseError:@"eof-before-tag-name" details:nil]; [self emitCharacterToken:LESS_THAN_SIGN]; [self emitEOFToken]; break; default: [self emitParseError:@"invalid-first-character-of-tag-name" details:@"Unexpected first character (0x%X) of tag name", (unsigned int)character]; [self switchToState:HTMLTokenizerStateData]; [self emitCharacterToken:LESS_THAN_SIGN]; [_inputStreamReader reconsumeCurrentInputCharacter]; break; } } - (void)HTMLTokenizerStateEndTagOpen { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case LATIN_CAPITAL_LETTER_A ... LATIN_CAPITAL_LETTER_Z: _currentTagToken = [[HTMLEndTagToken alloc] initWithTagName:StringFromUniChar(character + 0x0020)]; [self switchToState:HTMLTokenizerStateTagName]; break; case LATIN_SMALL_LETTER_A ... LATIN_SMALL_LETTER_Z: _currentTagToken = [[HTMLEndTagToken alloc] initWithTagName:StringFromUniChar(character)]; [self switchToState:HTMLTokenizerStateTagName]; break; case GREATER_THAN_SIGN: [self emitParseError:@"missing-end-tag-name" details:@"Unexpected (0x003E, >) missing end tag name"]; [self switchToState:HTMLTokenizerStateData]; break; case EOF: [self emitParseError:@"eof-before-tag-name" details:nil]; [self emitCharacterTokenWithString:@") in comment start"]; [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentCommentToken]; return; default: [self switchToState:HTMLTokenizerStateComment]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateCommentStartDash { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case HYPHEN_MINUS: [self switchToState:HTMLTokenizerStateCommentEnd]; return; case GREATER_THAN_SIGN: [self emitParseError:@"abrupt-closing-of-empty-comment" details:@"Unexpeted character (0x003E, >) in comment start"]; [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentCommentToken]; return; case EOF: [self emitParseError:@"eof-in-comment" details:nil]; [self emitToken:_currentCommentToken]; [self emitEOFToken]; return; default: [_currentCommentToken appendStringToData:StringFromUniChar(HYPHEN_MINUS)]; [self switchToState:HTMLTokenizerStateComment]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateComment { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case LESS_THAN_SIGN: [_currentCommentToken appendStringToData:StringFromUniChar(LESS_THAN_SIGN)]; [self switchToState:HTMLTokenizerStateCommentLessThanSign]; return; case HYPHEN_MINUS: [self switchToState:HTMLTokenizerStateCommentEndDash]; return; case NULL_CHAR: [self emitParseError:@"unexpected-null-character" details:nil]; [_currentCommentToken appendStringToData:StringFromUniChar(REPLACEMENT_CHAR)]; return; case EOF: [self emitParseError:@"eof-in-commnet" details:nil]; [self emitToken:_currentCommentToken]; [self emitEOFToken]; return; default: [_currentCommentToken appendStringToData:StringFromUTF32Char(character)]; return; } } - (void)HTMLTokenizerStateCommentLessThanSign { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case EXCLAMATION_MARK: [_currentCommentToken appendStringToData:StringFromUniChar(EXCLAMATION_MARK)]; [self switchToState:HTMLTokenizerStateCommentLessThanSignBang]; return; case LESS_THAN_SIGN: [_currentCommentToken appendStringToData:StringFromUniChar(LESS_THAN_SIGN)]; return; default: [self switchToState:HTMLTokenizerStateComment]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateCommentLessThanSignBang { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case HYPHEN_MINUS: [self switchToState:HTMLTokenizerStateCommentLessThanSignBangDash]; return; default: [self switchToState:HTMLTokenizerStateComment]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateCommentLessThanSignBangDash { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case HYPHEN_MINUS: [self switchToState:HTMLTokenizerStateCommentLessThanSignBangDashDash]; return; default: [self switchToState:HTMLTokenizerStateCommentEndDash]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateCommentLessThanSignBangDashDash { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case GREATER_THAN_SIGN: case EOF: [self switchToState:HTMLTokenizerStateCommentEnd]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; default: [self emitParseError:@"nested-comment" details:nil]; [self switchToState:HTMLTokenizerStateCommentEnd]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateCommentEndDash { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case HYPHEN_MINUS: [self switchToState:HTMLTokenizerStateCommentEnd]; return; case EOF: [self emitParseError:@"eof-in-comment" details:nil]; [self emitToken:_currentCommentToken]; [self emitEOFToken]; return; default: [_currentCommentToken appendStringToData:StringFromUniChar(HYPHEN_MINUS)]; [self switchToState:HTMLTokenizerStateComment]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateCommentEnd { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case GREATER_THAN_SIGN: [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentCommentToken]; return; case EXCLAMATION_MARK: [self switchToState:HTMLTokenizerStateCommentEndBang]; return; case HYPHEN_MINUS: [_currentCommentToken appendStringToData:StringFromUniChar(HYPHEN_MINUS)]; return; case EOF: [self emitParseError:@"eof-in-comment" details:nil]; [self emitToken:_currentCommentToken]; [self emitEOFToken]; return; default: [_currentCommentToken appendStringToData:@"--"]; [self switchToState:HTMLTokenizerStateComment]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateCommentEndBang { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case HYPHEN_MINUS: [_currentCommentToken appendStringToData:@"--!"]; [self switchToState:HTMLTokenizerStateCommentEndDash]; return; case GREATER_THAN_SIGN: [self emitParseError:@"incorrectly-closed-comment" details:@"Unexpeted character (0x003E, >) in comment end"]; [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentCommentToken]; return; case EOF: [self emitParseError:@"eof-in-comment" details:nil]; [self emitToken:_currentCommentToken]; [self emitEOFToken]; return; default: [_currentCommentToken appendStringToData:@"--!"]; [self switchToState:HTMLTokenizerStateComment]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateDOCTYPE { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case CHARACTER_TABULATION: case LINE_FEED: case FORM_FEED: case SPACE: [self switchToState:HTMLTokenizerStateBeforeDOCTYPEName]; return; case GREATER_THAN_SIGN: [self switchToState:HTMLTokenizerStateBeforeDOCTYPEName]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; _currentDoctypeToken = [HTMLDOCTYPEToken new]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: [self emitParseError:@"missing-whitespace-before-doctype-name" details:@"Unexpected character (%@) instead of whitespace before DOCTYPE name", StringFromUTF32Char(character)]; [self switchToState:HTMLTokenizerStateBeforeDOCTYPEName]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateBeforeDOCTYPEName { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case CHARACTER_TABULATION: case LINE_FEED: case FORM_FEED: case SPACE: return; case LATIN_CAPITAL_LETTER_A ... LATIN_CAPITAL_LETTER_Z: _currentDoctypeToken = [[HTMLDOCTYPEToken alloc] initWithName:StringFromUniChar(character + 0x0020)]; [self switchToState:HTMLTokenizerStateDOCTYPEName]; return; case NULL_CHAR: [self emitParseError:@"unexpected-null-character" details:nil]; _currentDoctypeToken = [[HTMLDOCTYPEToken alloc] initWithName:StringFromUniChar(REPLACEMENT_CHAR)]; [self switchToState:HTMLTokenizerStateDOCTYPEName]; return; case GREATER_THAN_SIGN: [self emitParseError:@"missing-doctype-name" details:@"Unexpected character (0x003E, >) before DOCTYPE name"]; _currentDoctypeToken = [HTMLDOCTYPEToken new]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken = [HTMLDOCTYPEToken new]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: _currentDoctypeToken = [[HTMLDOCTYPEToken alloc] initWithName:StringFromUTF32Char(character)]; [self switchToState:HTMLTokenizerStateDOCTYPEName]; return; } } - (void)HTMLTokenizerStateDOCTYPEName { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case CHARACTER_TABULATION: case LINE_FEED: case FORM_FEED: case SPACE: [self switchToState:HTMLTokenizerStateAfterDOCTYPEName]; return; case GREATER_THAN_SIGN: [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case LATIN_CAPITAL_LETTER_A ... LATIN_CAPITAL_LETTER_Z: [_currentDoctypeToken appendStringToName:StringFromUTF32Char(character + 0x0020)]; return; case NULL_CHAR: [self emitParseError:@"unexpected-null-character" details:nil]; [_currentDoctypeToken appendStringToName:StringFromUniChar(REPLACEMENT_CHAR)]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: [_currentDoctypeToken appendStringToName:StringFromUTF32Char(character)]; return; } } - (void)HTMLTokenizerStateAfterDOCTYPEName { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case CHARACTER_TABULATION: case LINE_FEED: case FORM_FEED: case SPACE: return; case GREATER_THAN_SIGN: [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: { if ((character == LATIN_SMALL_LETTER_P || character == LATIN_CAPITAL_LETTER_P) && [_inputStreamReader consumeString:@"UBLIC" caseSensitive:NO]) { [self switchToState:HTMLTokenizerStateAfterDOCTYPEPublicKeyword]; } else if ((character == LATIN_SMALL_LETTER_S || character == LATIN_CAPITAL_LETTER_S) && [_inputStreamReader consumeString:@"YSTEM" caseSensitive:NO]) { [self switchToState:HTMLTokenizerStateAfterDOCTYPESystemKeyword]; } else { [self emitParseError:@"invalid-character-sequence-after-doctype-name" details:@"Expected PUBLIC or SYSTEM after DOCTYPE name"]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateBogusDOCTYPE]; [_inputStreamReader reconsumeCurrentInputCharacter]; } return; } } } - (void)HTMLTokenizerStateAfterDOCTYPEPublicKeyword { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case CHARACTER_TABULATION: case LINE_FEED: case FORM_FEED: case SPACE: [self switchToState:HTMLTokenizerStateBeforeDOCTYPEPublicIdentifier]; return; case QUOTATION_MARK: [self emitParseError:@"missing-whitespace-after-doctype-public-keyword" details:@"Unexpected character (0x0022, \") instead of whitespace after DOCTYPE PUBLIC keyword"]; _currentDoctypeToken.publicIdentifier = [NSMutableString string]; [self switchToState:HTMLTokenizerStateDOCTYPEPublicIdentifierDoubleQuoted]; return; case APOSTROPHE: [self emitParseError:@"missing-whitespace-after-doctype-public-keyword" details:@"Unexpected character (0x0027, ') instead of whitespace after DOCTYPE PUBLIC keyword"]; _currentDoctypeToken.publicIdentifier = [NSMutableString string]; [self switchToState:HTMLTokenizerStateDOCTYPEPublicIdentifierSingleQuoted]; return; case GREATER_THAN_SIGN: [self emitParseError:@"missing-doctype-public-identifier" details:nil]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: [self emitParseError:@"missing-quote-before-doctype-public-identifier" details:@"Unexpected character (%@) instead of quote before DOCTYPE Public identifier", StringFromUTF32Char(character)]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateBogusDOCTYPE]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateBeforeDOCTYPEPublicIdentifier { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case CHARACTER_TABULATION: case LINE_FEED: case FORM_FEED: case SPACE: return; case QUOTATION_MARK: _currentDoctypeToken.publicIdentifier = [NSMutableString string]; [self switchToState:HTMLTokenizerStateDOCTYPEPublicIdentifierDoubleQuoted]; break; case APOSTROPHE: _currentDoctypeToken.publicIdentifier = [NSMutableString string]; [self switchToState:HTMLTokenizerStateDOCTYPEPublicIdentifierSingleQuoted]; break; case GREATER_THAN_SIGN: [self emitParseError:@"missing-doctype-public-identifier" details:nil]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; break; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: [self emitParseError:@"missing-quote-before-doctype-public-identifier" details:@"Unexpected character (%@) instead of quote before DOCTYPE Public identifier", StringFromUTF32Char(character)]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateBogusDOCTYPE]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateDOCTYPEPublicIdentifierDoubleQuoted { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case QUOTATION_MARK: [self switchToState:HTMLTokenizerStateAfterDOCTYPEPublicIdentifier]; return; case NULL_CHAR: [self emitParseError:@"unexpected-null-character" details:nil]; [_currentDoctypeToken appendStringToPublicIdentifier:StringFromUniChar(REPLACEMENT_CHAR)]; return; case GREATER_THAN_SIGN: [self emitParseError:@"abrupt-doctype-public-identifier" details:@"Unexpected character (0x003E, >) in DOCTYPE Public identifier"]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: [_currentDoctypeToken appendStringToPublicIdentifier:StringFromUTF32Char(character)]; return; } } - (void)HTMLTokenizerStateDOCTYPEPublicIdentifierSingleQuoted { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case APOSTROPHE: [self switchToState:HTMLTokenizerStateAfterDOCTYPEPublicIdentifier]; break; case NULL_CHAR: [self emitParseError:@"unexpected-null-character" details:nil]; [_currentDoctypeToken appendStringToPublicIdentifier:StringFromUniChar(REPLACEMENT_CHAR)]; return; case GREATER_THAN_SIGN: [self emitParseError:@"abrupt-doctype-public-identifier" details:@"Unexpected character (0x003E, >) in DOCTYPE Public identifier"]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: [_currentDoctypeToken appendStringToPublicIdentifier:StringFromUTF32Char(character)]; return; } } - (void)HTMLTokenizerStateAfterDOCTYPEPublicIdentifier { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case CHARACTER_TABULATION: case LINE_FEED: case FORM_FEED: case SPACE: [self switchToState:HTMLTokenizerStateBetweenDOCTYPEPublicAndSystemIdentifiers]; return; case GREATER_THAN_SIGN: [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case QUOTATION_MARK: [self emitParseError:@"missing-whitespace-between-doctype-public-and-system-identifiers" details:@"Unexpected character (0x0022, \") instead of whitespace between DOCTYPE Public and System identifiers"]; _currentDoctypeToken.systemIdentifier = [NSMutableString string]; [self switchToState:HTMLTokenizerStateDOCTYPESystemIdentifierDoubleQuoted]; return; case APOSTROPHE: [self emitParseError:@"missing-whitespace-between-doctype-public-and-system-identifiers" details:@"Unexpected character (0x0027, ') instead of whitespace between DOCTYPE Public and System identifiers"]; _currentDoctypeToken.systemIdentifier = [NSMutableString string]; [self switchToState:HTMLTokenizerStateDOCTYPESystemIdentifierSingleQuoted]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: [self emitParseError:@"missing-quote-before-doctype-system-identifier" details:@"Unexpected character (%@) instead of quote before DOCTYPE System identifier", StringFromUTF32Char(character)]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateBogusDOCTYPE]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateBetweenDOCTYPEPublicAndSystemIdentifiers { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case CHARACTER_TABULATION: case LINE_FEED: case FORM_FEED: case SPACE: return; case GREATER_THAN_SIGN: [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case QUOTATION_MARK: _currentDoctypeToken.systemIdentifier = [NSMutableString string]; [self switchToState:HTMLTokenizerStateDOCTYPESystemIdentifierDoubleQuoted]; return; case APOSTROPHE: _currentDoctypeToken.systemIdentifier = [NSMutableString string]; [self switchToState:HTMLTokenizerStateDOCTYPESystemIdentifierSingleQuoted]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: [self emitParseError:@"missing-quote-before-doctype-system-identifier" details:@"Unexpected character (%@) instead of quote before DOCTYPE System identifier", StringFromUTF32Char(character)]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateBogusDOCTYPE]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateAfterDOCTYPESystemKeyword { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case CHARACTER_TABULATION: case LINE_FEED: case FORM_FEED: case SPACE: [self switchToState:HTMLTokenizerStateBeforeDOCTYPESystemIdentifier]; return; case QUOTATION_MARK: [self emitParseError:@"missing-whitespace-after-doctype-system-keyword" details:@"Unexpected character (0x0022, \") after DOCTYPE System identifier"]; _currentDoctypeToken.systemIdentifier = [NSMutableString string]; [self switchToState:HTMLTokenizerStateDOCTYPESystemIdentifierDoubleQuoted]; return; case APOSTROPHE: [self emitParseError:@"missing-whitespace-after-doctype-system-keyword" details:@"Unexpected character (0x0027, ') after DOCTYPE System identifier"]; _currentDoctypeToken.systemIdentifier = [NSMutableString string]; [self switchToState:HTMLTokenizerStateDOCTYPESystemIdentifierSingleQuoted]; return; case GREATER_THAN_SIGN: [self emitParseError:@"missing-doctype-system-identifier" details:nil]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: [self emitParseError:@"missing-quote-before-doctype-system-identifier" details:@"Unexpected character (%@) instead of quote before DOCTYPE System identifier", StringFromUTF32Char(character)]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateBogusDOCTYPE]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateBeforeDOCTYPESystemIdentifier { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case CHARACTER_TABULATION: case LINE_FEED: case FORM_FEED: case SPACE: break; case QUOTATION_MARK: _currentDoctypeToken.systemIdentifier = [NSMutableString string]; [self switchToState:HTMLTokenizerStateDOCTYPESystemIdentifierDoubleQuoted]; break; case APOSTROPHE: _currentDoctypeToken.systemIdentifier = [NSMutableString string]; [self switchToState:HTMLTokenizerStateDOCTYPESystemIdentifierSingleQuoted]; break; case GREATER_THAN_SIGN: [self emitParseError:@"missing-doctype-system-identifier" details:nil]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: [self emitParseError:@"missing-quote-before-doctype-system-identifier" details:@"Unexpected character (%@) instead of quote before DOCTYPE System identifier", StringFromUTF32Char(character)]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateBogusDOCTYPE]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateDOCTYPESystemIdentifierDoubleQuoted { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case QUOTATION_MARK: [self switchToState:HTMLTokenizerStateAfterDOCTYPESystemIdentifier]; break; case NULL_CHAR: [self emitParseError:@"unexpected-null-character" details:nil]; [_currentDoctypeToken appendStringToSystemIdentifier:StringFromUniChar(REPLACEMENT_CHAR)]; break; case GREATER_THAN_SIGN: [self emitParseError:@"abrupt-doctype-system-identifier" details:@"Unexpected character (0x003E, >) in DOCTYPE System identifier"]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: [_currentDoctypeToken appendStringToSystemIdentifier:StringFromUTF32Char(character)]; return; } } - (void)HTMLTokenizerStateDOCTYPESystemIdentifierSingleQuoted { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case APOSTROPHE: [self switchToState:HTMLTokenizerStateAfterDOCTYPESystemIdentifier]; return; case NULL_CHAR: [self emitParseError:@"unexpected-null-character" details:nil]; [_currentDoctypeToken appendStringToSystemIdentifier:StringFromUniChar(REPLACEMENT_CHAR)]; return; case GREATER_THAN_SIGN: [self emitParseError:@"abrupt-doctype-system-identifier" details:@"Unexpected character (0x003E, >) in DOCTYPE System identifier"]; _currentDoctypeToken.forceQuirks = YES; [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: [_currentDoctypeToken appendStringToSystemIdentifier:StringFromUTF32Char(character)]; return; } } - (void)HTMLTokenizerStateAfterDOCTYPESystemIdentifier { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case CHARACTER_TABULATION: case LINE_FEED: case FORM_FEED: case SPACE: return; case GREATER_THAN_SIGN: [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case EOF: [self emitParseError:@"eof-in-doctype" details:nil]; [self switchToState:HTMLTokenizerStateData]; _currentDoctypeToken.forceQuirks = YES; [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: [self emitParseError:@"unexpected-character-after-doctype-system-identifier" details:@"Unexpected character (%@) after DOCTYPE System identifier", StringFromUTF32Char(character)]; [self switchToState:HTMLTokenizerStateBogusDOCTYPE]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateBogusDOCTYPE { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case GREATER_THAN_SIGN: [self switchToState:HTMLTokenizerStateData]; [self emitToken:_currentDoctypeToken]; return; case NULL_CHAR: [self emitParseError:@"unexpected-null-character" details:nil]; return; case EOF: [self emitToken:_currentDoctypeToken]; [self emitEOFToken]; return; default: return; } } - (void)HTMLTokenizerStateCDATASection { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case RIGHT_SQUARE_BRACKET: [self switchToState:HTMLTokenizerStateCDATASectionBracket]; return; case EOF: [self emitParseError:@"eof-in-cdata" details:nil]; [self emitEOFToken]; return; default: [self emitCharacterToken:character]; return; } } - (void)HTMLTokenizerStateCDATASectionBracket { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case RIGHT_SQUARE_BRACKET: [self switchToState:HTMLTokenizerStateCDATASectionEnd]; return; default: [self emitCharacterToken:RIGHT_SQUARE_BRACKET]; [self switchToState:HTMLTokenizerStateCDATASection]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateCDATASectionEnd { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case RIGHT_SQUARE_BRACKET: [self emitCharacterToken:RIGHT_SQUARE_BRACKET]; return; case GREATER_THAN_SIGN: [self switchToState:HTMLTokenizerStateData]; return; default: [self emitCharacterTokenWithString:@"]]"]; [self switchToState:HTMLTokenizerStateCDATASection]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateCharacterReference { _temporaryBuffer = [NSMutableString new]; [_temporaryBuffer appendString:@"&"]; UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; if (isAlphanumeric(character)) { [self switchToState:HTMLTokenizerStateNamedCharacterReference]; [_inputStreamReader unconsumeCurrentInputCharacter]; return; } switch (character) { case NUMBER_SIGN: [_temporaryBuffer appendString:@"#"]; [self switchToState:HTMLTokenizerStateNumericCharacterReference]; return; default: [self flushCodePointsConsumedAsCharacterReference]; [self switchToState:_characterReferenceReturnState]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateNamedCharacterReference { NSArray *entities = [HTMLTokenizerEntities entities]; NSMutableString *name = [NSMutableString stringWithString:@""]; NSString *foundEntityName = nil; NSString *foundEntityReplacement = nil; UTF32Char lastConsumedCharacter = EOF; NSUInteger searchIndex = 0; [_inputStreamReader markCurrentLocation]; while (YES) { lastConsumedCharacter = [_inputStreamReader consumeNextInputCharacter]; if (lastConsumedCharacter == EOF) break; NSString *lastCharacterString = StringFromUTF32Char(lastConsumedCharacter); [name appendString:lastCharacterString]; searchIndex= [entities indexOfObject:name inSortedRange:NSMakeRange(searchIndex, entities.count - searchIndex) options:NSBinarySearchingInsertionIndex | NSBinarySearchingFirstEqual usingComparator:^ NSComparisonResult (id obj1, id obj2) { return [obj1 compare:obj2]; }]; if (searchIndex >= entities.count || ![[entities objectAtIndex:searchIndex] hasPrefix:name]) { break; } if ([[entities objectAtIndex:searchIndex] isEqualToString:name]) { foundEntityName = [name copy]; foundEntityReplacement = [HTMLTokenizerEntities replacementAtIndex:searchIndex]; } } [_inputStreamReader rewindToMarkedLocation]; if (foundEntityName) { [_inputStreamReader consumeString:foundEntityName caseSensitive:YES]; [_temporaryBuffer appendString:foundEntityName]; BOOL inAttribute = (_characterReferenceReturnState == HTMLTokenizerStateAttributeValueUnquoted || _characterReferenceReturnState == HTMLTokenizerStateAttributeValueSingleQuoted || _characterReferenceReturnState == HTMLTokenizerStateAttributeValueDoubleQuoted); unichar lastMatchedCharacter = [foundEntityName characterAtIndex:foundEntityName.length - 1]; UTF32Char nextCharacter = [_inputStreamReader nextInputCharacter]; if (inAttribute && lastMatchedCharacter != SEMICOLON && (nextCharacter == EQUALS_SIGN || isAlphanumeric(nextCharacter))) { [self flushCodePointsConsumedAsCharacterReference]; [self switchToState:_characterReferenceReturnState]; return; } if (lastMatchedCharacter != SEMICOLON) { [self emitParseError:@"missing-semicolon-after-character-reference" details:nil]; } _temporaryBuffer = [NSMutableString new]; [_temporaryBuffer appendString:foundEntityReplacement]; [self flushCodePointsConsumedAsCharacterReference]; [self switchToState:_characterReferenceReturnState]; } else { NSString *unknownEntity = name; if (lastConsumedCharacter == SEMICOLON) { unknownEntity = [name substringToIndex:name.length -1]; } [_inputStreamReader consumeString:unknownEntity caseSensitive:YES]; [_temporaryBuffer appendString:unknownEntity]; [self flushCodePointsConsumedAsCharacterReference]; [self switchToState:HTMLTokenizerStateAmbiguousAmpersand]; return; } } - (void)HTMLTokenizerStateAmbiguousAmpersand { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; if (isAlphanumeric(character)) { if (_characterReferenceReturnState == HTMLTokenizerStateAttributeValueUnquoted || _characterReferenceReturnState == HTMLTokenizerStateAttributeValueSingleQuoted || _characterReferenceReturnState == HTMLTokenizerStateAttributeValueDoubleQuoted) { [self appendToCurrentAttributeValue:StringFromUTF32Char(character)]; } else { [self emitCharacterToken:character]; } return; } switch (character) { case SEMICOLON: [self emitParseError:@"unknown-named-character-reference" details:@"Ambiguous ampersand followed by a semicolon encountered"]; [self switchToState:_characterReferenceReturnState]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; default: [self switchToState:_characterReferenceReturnState]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateNumericCharacterReference { _characterReferenceCode = 0; _characterReferenceOverflow = NO; UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; switch (character) { case LATIN_CAPITAL_LETTER_X: case LATIN_SMALL_LETTER_X: [_temporaryBuffer appendString:StringFromUniChar(character)]; [self switchToState:HTMLTokenizerStateHexadecimalCharacterReferenceStart]; return; default: [self switchToState:HTMLTokenizerStateDecimalCharacterReferenceStart]; [_inputStreamReader reconsumeCurrentInputCharacter]; return; } } - (void)HTMLTokenizerStateHexadecimalCharacterReferenceStart { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; if (isHexDigit(character)) { [self switchToState:HTMLTokenizerStateHexadecimalCharacterReference]; [_inputStreamReader reconsumeCurrentInputCharacter]; } else { [self emitParseError:@"absence-of-digits-in-numeric-character-reference" details:@"Expected a hexadecimal digit but got character (%@) ", StringFromUTF32Char(character)]; [self flushCodePointsConsumedAsCharacterReference]; [self switchToState:_characterReferenceReturnState]; [_inputStreamReader reconsumeCurrentInputCharacter]; } } - (void)HTMLTokenizerStateDecimalCharacterReferenceStart { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; if (isDigit(character)) { [self switchToState:HTMLTokenizerStateDecimalCharacterReference]; [_inputStreamReader reconsumeCurrentInputCharacter]; } else { [self emitParseError:@"absence-of-digits-in-numeric-character-reference" details:@"Expected a decimal digit but got character (%@) ", StringFromUTF32Char(character)]; [self flushCodePointsConsumedAsCharacterReference]; [self switchToState:_characterReferenceReturnState]; [_inputStreamReader reconsumeCurrentInputCharacter]; } } - (void)HTMLTokenizerStateHexadecimalCharacterReference { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; if (isDigit(character)) { if (_characterReferenceCode > (ULLONG_MAX >> 4)) { _characterReferenceOverflow = YES; } _characterReferenceCode <<= 4; if (_characterReferenceCode > (ULLONG_MAX - (character - 0x0030))) { _characterReferenceOverflow = YES; } _characterReferenceCode += (character - 0x0030); } else if (isUpperHexDigit(character)) { _characterReferenceCode <<= 4; _characterReferenceCode += (character - 0x0037); } else if (isLowerHexDigit(character)) { _characterReferenceCode <<= 4; _characterReferenceCode += (character - 0x0057); } else if (character == SEMICOLON) { [self switchToState:HTMLTokenizerStateNumericCharacterReferenceEnd]; } else { [self emitParseError:@"missing-semicolon-after-character-reference" details:@"Expected semicolon but got (%@)", StringFromUTF32Char(character)]; [self switchToState:HTMLTokenizerStateNumericCharacterReferenceEnd]; [_inputStreamReader reconsumeCurrentInputCharacter]; } } - (void)HTMLTokenizerStateDecimalCharacterReference { UTF32Char character = [_inputStreamReader consumeNextInputCharacter]; if (isDigit(character)) { if (_characterReferenceCode > (ULLONG_MAX / 10)) { _characterReferenceOverflow = YES; } _characterReferenceCode = (_characterReferenceCode << 3) + (_characterReferenceCode << 1); if (_characterReferenceCode > (ULLONG_MAX - (character - 0x0030))) { _characterReferenceOverflow = YES; } _characterReferenceCode += (character - 0x0030); } else if (character == SEMICOLON) { [self switchToState:HTMLTokenizerStateNumericCharacterReferenceEnd]; } else { [self emitParseError:@"missing-semicolon-after-character-reference" details:@"Expected semicolon but got (%@)", StringFromUTF32Char(character)]; [self switchToState:HTMLTokenizerStateNumericCharacterReferenceEnd]; [_inputStreamReader reconsumeCurrentInputCharacter]; } } - (void)HTMLTokenizerStateNumericCharacterReferenceEnd { if (_characterReferenceOverflow) { [self emitParseError:@"character-reference-outside-unicode-range" details:nil]; _characterReferenceCode = REPLACEMENT_CHAR; } else if (_characterReferenceCode == NULL_CHAR) { [self emitParseError:@"null-character-reference" details:nil]; _characterReferenceCode = REPLACEMENT_CHAR; } else if (_characterReferenceCode > 0x10FFFF) { [self emitParseError:@"character-reference-outside-unicode-range" details:nil]; _characterReferenceCode = REPLACEMENT_CHAR; } else if (isSurrogate(_characterReferenceCode)) { [self emitParseError:@"surrogate-character-reference" details:nil]; _characterReferenceCode = REPLACEMENT_CHAR; } else if (isNoncharacter(_characterReferenceCode)) { [self emitParseError:@"noncharacter-character-reference" details:nil]; } else if (_characterReferenceCode == CARRIAGE_RETURN || isControlCharacter(_characterReferenceCode)) { [self emitParseError:@"control-character-reference" details:nil]; UTF32Char reference = NumericReplacementCharacter((UTF32Char)_characterReferenceCode); if (reference != NULL_CHAR) { _characterReferenceCode = reference; } } _temporaryBuffer = [NSMutableString new]; [_temporaryBuffer appendString:StringFromUTF32Char((UTF32Char)_characterReferenceCode)]; [self flushCodePointsConsumedAsCharacterReference]; [self switchToState:_characterReferenceReturnState]; } @end