diff --git a/HTMLKit/HTMLTokenizer.m b/HTMLKit/HTMLTokenizer.m
index a0247e7..c019a44 100644
--- a/HTMLKit/HTMLTokenizer.m
+++ b/HTMLKit/HTMLTokenizer.m
@@ -317,36 +317,48 @@
 
 		NSString *entityName = nil;
 
+#warning Improve Named Entity Search
 		UTF32Char inputCharacter = [_inputStreamReader consumeNextInputCharacter];
-		NSArray *names = [HTMLTokenizerEntities entityNames];
+//		NSArray *names = [HTMLTokenizerEntities entityNames];
+		NSArray *names = NAMES();
 		NSMutableString *name = [NSMutableString stringWithString:StringFromUTF32Char(inputCharacter)];
 
+		// Index of the first entity name that can still match `name`. `name` only grows by
+		// appending, so it never sorts earlier and each search can resume from here.
+		NSUInteger searchIndex = 0;
+
 		while (YES) {
-			NSPredicate *predicate = [NSPredicate predicateWithFormat:@"SELF BEGINSWITH %@", name];
-			names = [names filteredArrayUsingPredicate:predicate];
-			if (names.count == 0) break;
+			// With NSBinarySearchingInsertionIndex the result is the index of the first entry
+			// that does not sort before `name`, or names.count when every entry does.
+			searchIndex = [names indexOfObject:name
+				inSortedRange:NSMakeRange(searchIndex, names.count - searchIndex)
+				options:NSBinarySearchingInsertionIndex | NSBinarySearchingFirstEqual
+				usingComparator:^NSComparisonResult(id obj1, id obj2) {
+					return [obj1 compare:obj2];
+				}];
+
+			// Stop once no entry starts with the text read so far; the BEGINSWITH filter this
+			// replaces stopped at the same point. Entries sharing a prefix are contiguous in
+			// sorted order, so checking the entry at searchIndex is enough.
+			if (searchIndex >= names.count) break;
+			if (![[names objectAtIndex:searchIndex] hasPrefix:name]) break;
+
+			if ([[names objectAtIndex:searchIndex] isEqualToString:name]) {
+				entityName = [name copy];
+			}
+
+			if ([name hasSuffix:@";"]) break;
 
 			inputCharacter = [_inputStreamReader consumeNextInputCharacter];
 			if (inputCharacter == EOF) break;
 			[name appendString:StringFromUTF32Char(inputCharacter)];
-
-			if ([names containsObject:name]) {
-				entityName = [name copy];
-				if ([entityName hasSuffix:@";"]) {
-					break;
-				}
-			}
 		}
 
 
 		if (entityName == nil) {
-			if ([name hasSuffix:@";"]) {
-				[self emitParseError:@"Undefined named entity with semicolon found"];
-			} else {
-				NSString *nextAlphanumeric = [_inputStreamReader consumeAlphanumericCharacters];
-				if (nextAlphanumeric != nil) {
-					[name appendString:nextAlphanumeric];
-				}
+			[_inputStreamReader rewindToMarkedLocation];
+
+			if ([_inputStreamReader consumeAlphanumericCharacters] != nil) {
 				if ([_inputStreamReader consumeString:@";" caseSensitive:NO]) {
 					[self emitParseError:@"Undefined named entity with semicolon found"];
 				}
diff --git a/HTMLKit/HTMLTokenizerEntities.h b/HTMLKit/HTMLTokenizerEntities.h
index 49cee29..5c60c70 100644
--- a/HTMLKit/HTMLTokenizerEntities.h
+++ b/HTMLKit/HTMLTokenizerEntities.h
@@ -14,3 +14,7 @@
 + (NSString *)replacementForNamedCharacterEntity:(NSString *)entity;
 
 @end
+
+/// Returns the named character reference names, sorted with -[NSString compare:]
+/// so that callers can binary-search them. The same array is returned on every call.
+extern NSArray * NAMES(void);
diff --git a/HTMLKit/HTMLTokenizerEntities.m b/HTMLKit/HTMLTokenizerEntities.m
index cdddce3..f9e43dc 100644
--- a/HTMLKit/HTMLTokenizerEntities.m
+++ b/HTMLKit/HTMLTokenizerEntities.m
@@ -2243,6 +2243,27 @@
 
 static NSDictionary *_entities;
 
+// One `@name` element per NAMED_CHARACTER_REFERENCES entry (macro defined outside this hunk).
+static NSString * x[] = {
+#define NAMED_CHARACTER_REFERENCE( name, value ) @name,
+	NAMED_CHARACTER_REFERENCES
+#undef NAMED_CHARACTER_REFERENCE
+};
+
+// Builds the name array once and hands back the same instance afterwards, so a lookup
+// does not re-create the whole table. NOTE(review): the order of the macro table is not
+// visible here, so the array is sorted explicitly to satisfy the binary search's precondition.
+NSArray * NAMES(void) {
+	static NSArray *names = nil;
+	static dispatch_once_t onceToken;
+	dispatch_once(&onceToken, ^{
+		// Derive the element count from the table so it cannot drift from a literal.
+		NSArray *unsorted = [NSArray arrayWithObjects:x count:sizeof(x) / sizeof(x[0])];
+		names = [[unsorted sortedArrayUsingSelector:@selector(compare:)] copy];
+	});
+	return names;
+}
+
 @implementation HTMLTokenizerEntities
 
 + (void)initialize